In [195]:
import numpy as np
import pandas as pd

# Read every column as text and drop the "Unnamed: 251" column straight away;
# the numeric column families are cast to float explicitly further down.
data = pd.read_csv("data/unimelb_training.csv", header=0, dtype="str").drop("Unnamed: 251", axis=1)

# Year is the third "/"-separated field of Start.date, kept as a float
# (e.g. "8/11/05" gives 5.0).
data["Year"] = data["Start.date"].map(lambda start_date: float(start_date.split("/")[2]))
data["Year"].dtype


# Families of numbered columns to cast to float, as (prefix, count) pairs:
# each pair stands for the columns prefix+"1" ... prefix+str(count).
numerical_data_feature_category = [
    (prefix, 5) for prefix in ("RFCD.Percentage.", "SEO.Percentage.")
] + [
    (prefix, 15)
    for prefix in (
        "Year.of.Birth.",
        "Number.of.Successful.Grant.",
        "Number.of.Unsuccessful.Grant.",
        "A..",
        "A.",
        "B.",
        "C.",
    )
]

# Cast each numbered column of every numerical family from text to float.
for feature_name, feature_range in numerical_data_feature_category:
    for position in range(1, feature_range + 1):
        numerical_data_feature = feature_name + str(position)
        data[numerical_data_feature] = data[numerical_data_feature].astype(float)

# Show the resulting dtypes to check the casts.
data.dtypes

Grant.Application.ID                        object
Grant.Status                                object
Sponsor.Code                                object
Grant.Category.Code                         object
Contract.Value.Band...see.note.A            object
Start.date                                  object
RFCD.Code.1                                 object
RFCD.Percentage.1                          float64
RFCD.Code.2                                 object
RFCD.Percentage.2                          float64
RFCD.Code.3                                 object
RFCD.Percentage.3                          float64
RFCD.Code.4                                 object
RFCD.Percentage.4                          float64
RFCD.Code.5                                 object
RFCD.Percentage.5                          float64
SEO.Code.1                                  object
SEO.Percentage.1                           float64
SEO.Code.2                                  object
SEO.Percentage.2               

In [196]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Grant.Application.ID,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,Start.date,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,...,Faculty.No..15,With.PHD.15,No..of.Years.in.Uni.at.Time.of.Grant.15,Number.of.Successful.Grant.15,Number.of.Unsuccessful.Grant.15,A..15,A.15,B.15,C.15,Year
0,1,1,,,A,8/11/05,280199,100.0,0,0.0,...,,,,,,,,,,5.0
1,2,1,2B,10A,B,11/11/05,280103,30.0,280106,30.0,...,,,,,,,,,,5.0
2,3,1,29A,10B,A,14/11/05,321004,60.0,321216,40.0,...,,,,,,,,,,5.0
3,4,1,40D,10B,C,15/11/05,270602,50.0,320602,50.0,...,,,,,,,,,,5.0
4,5,0,59C,10A,A,16/11/05,260500,34.0,280000,33.0,...,,,,,,,,,,5.0


# Contingency for persons

In [197]:
# Build the tuple of all unique values of a given spread column
def process_column(data, prefix, nb_max):
    """Collect the distinct values found in columns prefix+"1" ... prefix+str(nb_max).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the numbered columns.
    prefix : str
        Common prefix of the numbered columns.
    nb_max : int
        Highest column number to read (numbering starts at 1).

    Returns
    -------
    tuple
        The distinct values, in unspecified order. Missing values appear at
        most once, always as the ``np.nan`` object itself.
    """
    unique_values = set()
    has_nan = False
    for i in range(1, nb_max + 1):
        for value in data[prefix + str(i)].unique():
            # NaN != NaN, and float columns hand back a fresh NaN object per
            # column, so a plain set would keep one NaN per column. Track NaN
            # separately and add the np.nan singleton once at the end.
            if isinstance(value, (float, np.floating)) and np.isnan(value):
                has_nan = True
            else:
                unique_values.add(value)
    if has_nan:
        unique_values.add(np.nan)
    return tuple(unique_values)

# Build the row function that turns a spread column into contingency frequencies
def add_contigency(prefix, nb_max, unique_values, add_cardinal):
    """Create a row function computing the relative frequency of each value.

    Parameters
    ----------
    prefix : str
        Common prefix of the spread columns; the row function reads
        prefix+"1" ... prefix+str(nb_max) from each row.
    nb_max : int
        Highest column number to read (numbering starts at 1).
    unique_values : tuple
        Values to count. The output has one entry per element, labelled
        prefix+str(value), in the same order. It may or may not contain NaN.
    add_cardinal : bool
        When true, a last entry "cardinal_"+prefix holds the number of
        non-missing cells of the row.

    Returns
    -------
    callable
        ``nested_func(x)`` taking a label-indexable row and returning a
        ``pd.Series``. Frequencies are counts divided by the number of
        non-missing cells (left at 0 when every cell is missing); entries for
        NaN are always 0. It raises ``ValueError`` when a non-missing cell
        holds a value absent from ``unique_values``.
    """
    def is_nan(value):
        # NaN cannot be found through equality, so it is detected explicitly.
        return isinstance(value, (float, np.floating)) and np.isnan(value)

    nb_unique_values = len(unique_values)
    # Position of each non-NaN value (first occurrence wins, like tuple.index).
    value_positions = {}
    for position, value in enumerate(unique_values):
        if not is_nan(value):
            value_positions.setdefault(value, position)
    new_feature_labels = [prefix+str(value) for value in unique_values]
    if add_cardinal:
        new_feature_labels.append("cardinal_"+prefix)

    def nested_func(x):
        values_frequency = [0]*nb_unique_values
        for i in range(1, nb_max+1):
            value = x[prefix+str(i)]
            if is_nan(value):
                # Missing cells count neither as a value nor in the total.
                continue
            position = value_positions.get(value)
            if position is None:
                raise ValueError(
                    "%r (column %r) is not in unique_values" % (value, prefix+str(i))
                )
            values_frequency[position] += 1
        summe = sum(values_frequency)
        if summe != 0:
            values_frequency = [value/summe for value in values_frequency]
        if add_cardinal:
            values_frequency.append(summe)
        return pd.Series(values_frequency, index=new_feature_labels)
    return nested_func

# Build the contingency-frequency dataframe of a given spread column
def build_contigency(data, prefix, nb_max, add_cardinal=False):
    """Compute, for every row, the frequency of each value of a spread column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding columns prefix+"1" ... prefix+str(nb_max).
    prefix : str
        Common prefix of the spread columns.
    nb_max : int
        Highest column number to read (numbering starts at 1).
    add_cardinal : bool, default False
        Forwarded to ``add_contigency``.

    Returns
    -------
    tuple
        ``(unique_values, frequencies)``: the values returned by
        ``process_column`` and the row-wise result of applying the function
        built by ``add_contigency``.
    """
    unique_values = process_column(data, prefix, nb_max)
    row_to_frequencies = add_contigency(prefix, nb_max, unique_values, add_cardinal)
    # No raw=True here: the row function looks cells up by column label, which
    # needs each row passed as a Series rather than as a bare ndarray.
    return unique_values, data.apply(row_to_frequencies, axis=1)

# Accumulates the names of the contingency columns added to `data` by the cells below.
contigency_features_names = list()

## Role

In [198]:
# Frequency of each "Role." value per row, plus the "cardinal_Role." count column.
unique_roles, role_contigency_df = build_contigency(
    data, prefix="Role.", nb_max=15, add_cardinal=True
)
print(unique_roles)

# Remember the new column names, then attach the columns to the main frame.
contigency_features_names += role_contigency_df.columns.tolist()
data = data.join(role_contigency_df)

(nan, 'CHIEF_INVESTIGATOR', 'STUD_CHIEF_INVESTIGATOR', 'EXT_CHIEF_INVESTIGATOR', 'PRINCIPAL_SUPERVISOR', 'HONVISIT', 'EXTERNAL_ADVISOR', 'DELEGATED_RESEARCHER', 'STUDRES')


## Number of years at uni at time of grant

In [199]:
# Frequency of each "No..of.Years.in.Uni.at.Time.of.Grant." value per row.
unique_yearsinuni, yearsinuni_contigency_df = build_contigency(
    data, prefix="No..of.Years.in.Uni.at.Time.of.Grant.", nb_max=15
)
print(unique_yearsinuni)

# Remember the new column names, then attach the columns to the main frame.
contigency_features_names += yearsinuni_contigency_df.columns.tolist()
data = data.join(yearsinuni_contigency_df)

(nan, '>10 to 15', 'more than 15', 'Less than 0', '>=0 to 5', '>5 to 10')


## Country of birth

In [200]:
# Frequency of each "Country.of.Birth." value per row.
unique_cob, cob_contigency_df = build_contigency(
    data, prefix="Country.of.Birth.", nb_max=15
)
print(unique_cob)

# Remember the new column names, then attach the columns to the main frame.
contigency_features_names += cob_contigency_df.columns.tolist()
data = data.join(cob_contigency_df)

(nan, 'Western Europe', 'Eastern Europe', 'Middle East and Africa', 'South Africa', 'The Americas', 'North America', 'Australia', 'Asia Pacific', 'Great Britain', 'New Zealand')


# Aggregations for persons

In [202]:
# Build the row function that aggregates (means) the numbered columns of each prefix
def add_aggregation(prefixes, nb_max):
    """Create a row function averaging the numbered columns of each prefix.

    For every prefix the returned function reads prefix+"1" ... prefix+str(nb_max)
    from the row, keeps the cells that are real and not NaN, and stores their
    mean under the label "mean_"+prefix (NaN when no cell qualifies).

    Parameters
    ----------
    prefixes : list of str
        Column-name prefixes to aggregate.
    nb_max : int
        Highest column number to read (numbering starts at 1).

    Returns
    -------
    callable
        ``nested_func(x)`` taking a label-indexable row and returning a
        ``pd.Series`` with one entry per prefix.
    """
    aggregators = [np.mean]
    columns_per_prefix = []
    output_labels = []
    for prefix in prefixes:
        columns_per_prefix.append([prefix + str(rank) for rank in range(1, nb_max + 1)])
        output_labels.append("mean_" + prefix)

    def nested_func(x):
        aggregated = []
        for columns in columns_per_prefix:
            usable = []
            for column in columns:
                cell = x[column]
                if np.isreal(cell) and not np.isnan(cell):
                    usable.append(cell)
            if len(usable) > 0:
                aggregated.extend(aggregate(usable) for aggregate in aggregators)
            else:
                # Nothing to aggregate: one NaN per aggregation method.
                aggregated.extend([np.nan] * len(aggregators))
        return pd.Series(data=aggregated, index=output_labels)

    return nested_func

# Build a dataframe with the row-wise aggregations of the given spread columns
def build_aggregation(data, prefix, nb_max):
    """Apply the function built by ``add_aggregation`` to every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the numbered columns of every prefix.
    prefix : list of str
        Column-name prefixes, forwarded as ``add_aggregation``'s ``prefixes``
        (the parameter name is singular, but a list is expected).
    nb_max : int
        Highest column number to read (numbering starts at 1).

    Returns
    -------
    The row-wise result of ``DataFrame.apply``, one column per aggregated label.
    """
    # No raw=True here: the row function looks cells up by column label, which
    # needs each row passed as a Series rather than as a bare ndarray.
    return data.apply(add_aggregation(prefix, nb_max), axis=1)

## Year of birth / Number of successful grants / Number of unsuccessful grants / journals publications (A*, A, B, C)

In [203]:
# Row-wise mean of each numerical family over its 15 numbered columns.
aggregation_df = build_aggregation(
    data,
    [
        "Year.of.Birth.",
        "Number.of.Successful.Grant.",
        "Number.of.Unsuccessful.Grant.",
        "A..",
        "A.",
        "B.",
        "C.",
    ],
    15,
)
aggregation_features_names = aggregation_df.columns.tolist()
print(aggregation_features_names)

# Attach the aggregated columns to the main frame.
data = data.join(aggregation_df)

['mean_Year.of.Birth.', 'mean_Number.of.Successful.Grant.', 'mean_Number.of.Unsuccessful.Grant.', 'mean_A..', 'mean_A.', 'mean_B.', 'mean_C.']


In [204]:
# Preview the first 20 rows, now including the contingency and aggregation columns.
data.head(n=20)

Unnamed: 0,Grant.Application.ID,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,Start.date,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,...,Country.of.Birth.Asia Pacific,Country.of.Birth.Great Britain,Country.of.Birth.New Zealand,mean_Year.of.Birth.,mean_Number.of.Successful.Grant.,mean_Number.of.Unsuccessful.Grant.,mean_A..,mean_A.,mean_B.,mean_C.
0,1,1,,,A,8/11/05,280199,100.0,0,0.0,...,1.0,0.0,0.0,1965.0,0.0,0.0,4.0,2.0,0.0,0.0
1,2,1,2B,10A,B,11/11/05,280103,30.0,280106,30.0,...,0.0,0.0,0.0,1960.0,0.0,0.0,6.0,12.0,2.0,2.0
2,3,1,29A,10B,A,14/11/05,321004,60.0,321216,40.0,...,0.25,0.0,0.0,1951.25,0.0,0.0,1.75,5.0,5.0,1.75
3,4,1,40D,10B,C,15/11/05,270602,50.0,320602,50.0,...,0.0,0.0,0.0,1965.0,0.0,0.0,0.0,1.5,6.5,1.5
4,5,0,59C,10A,A,16/11/05,260500,34.0,280000,33.0,...,0.0,0.0,0.0,1965.0,0.0,0.0,3.0,0.0,1.0,0.0
5,6,1,4D,10A,,19/11/05,321204,100.0,0,0.0,...,0.0,0.0,0.0,1950.0,2.0,0.0,7.0,27.0,27.0,6.0
6,7,0,2B,10A,,19/11/05,270708,50.0,270203,30.0,...,0.0,0.333333,0.0,1956.666667,0.0,0.333333,2.666667,4.0,4.666667,1.333333
7,8,0,28D,30B,A,19/11/05,321405,100.0,0,0.0,...,0.0,0.0,0.0,1952.5,0.0,1.5,0.0,0.0,0.0,0.0
8,9,1,2B,10A,H,19/11/05,260108,50.0,260109,50.0,...,0.0,0.0,0.0,1947.5,1.0,0.5,4.0,0.5,5.5,0.0
9,10,1,2B,10A,,19/11/05,270708,40.0,270704,30.0,...,0.0,0.0,0.0,1930.0,0.0,0.0,1.0,0.0,0.0,0.0


In [167]:
# NOTE(review): same call as the previous cell, but this cell's execution count
# (167) is lower than the cells above it, so its saved output is from an
# earlier run; re-run top to bottom or remove the cell.
data.head(n=20)

Unnamed: 0,Grant.Application.ID,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,Start.date,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,...,Country.of.Birth.South Africa,Country.of.Birth.The Americas,Country.of.Birth.North America,Country.of.Birth.Australia,Country.of.Birth.Asia Pacific,Country.of.Birth.Great Britain,Country.of.Birth.New Zealand,mean_Year.of.Birth.,mean_Number.of.Successful.Grant.,mean_Number.of.Unsuccessful.Grant.
0,1,1,,,A,8/11/05,280199,100.0,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1965.0,0.0,0.0
1,2,1,2B,10A,B,11/11/05,280103,30.0,280106,30.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1960.0,0.0,0.0
2,3,1,29A,10B,A,14/11/05,321004,60.0,321216,40.0,...,0.0,0.0,0.0,0.75,0.25,0.0,0.0,1951.25,0.0,0.0
3,4,1,40D,10B,C,15/11/05,270602,50.0,320602,50.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1965.0,0.0,0.0
4,5,0,59C,10A,A,16/11/05,260500,34.0,280000,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1965.0,0.0,0.0
5,6,1,4D,10A,,19/11/05,321204,100.0,0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1950.0,2.0,0.0
6,7,0,2B,10A,,19/11/05,270708,50.0,270203,30.0,...,0.0,0.0,0.0,0.666667,0.0,0.333333,0.0,1956.666667,0.0,0.333333
7,8,0,28D,30B,A,19/11/05,321405,100.0,0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1952.5,0.0,1.5
8,9,1,2B,10A,H,19/11/05,260108,50.0,260109,50.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1947.5,1.0,0.5
9,10,1,2B,10A,,19/11/05,270708,40.0,270704,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1930.0,0.0,0.0


# Building dataset

In [207]:
# Application-level columns kept as they are in the final dataset.
application_feature_names = [
    "Grant.Application.ID",
    "Grant.Status",
    "Sponsor.Code",
    "Grant.Category.Code",
    "Contract.Value.Band...see.note.A",
    "Start.date",
]

# Full list of selected columns: application fields, then contingency
# frequencies, then aggregations.
(
    application_feature_names
    + contigency_features_names
    + aggregation_features_names
)

['Grant.Application.ID',
 'Grant.Status',
 'Sponsor.Code',
 'Grant.Category.Code',
 'Contract.Value.Band...see.note.A',
 'Start.date',
 'Role.nan',
 'Role.CHIEF_INVESTIGATOR',
 'Role.STUD_CHIEF_INVESTIGATOR',
 'Role.EXT_CHIEF_INVESTIGATOR',
 'Role.PRINCIPAL_SUPERVISOR',
 'Role.HONVISIT',
 'Role.EXTERNAL_ADVISOR',
 'Role.DELEGATED_RESEARCHER',
 'Role.STUDRES',
 'cardinal_Role.',
 'No..of.Years.in.Uni.at.Time.of.Grant.nan',
 'No..of.Years.in.Uni.at.Time.of.Grant.>10 to 15',
 'No..of.Years.in.Uni.at.Time.of.Grant.more than 15',
 'No..of.Years.in.Uni.at.Time.of.Grant.Less than 0',
 'No..of.Years.in.Uni.at.Time.of.Grant.>=0 to 5',
 'No..of.Years.in.Uni.at.Time.of.Grant.>5 to 10',
 'Country.of.Birth.nan',
 'Country.of.Birth.Western Europe',
 'Country.of.Birth.Eastern Europe',
 'Country.of.Birth.Middle East and Africa',
 'Country.of.Birth.South Africa',
 'Country.of.Birth.The Americas',
 'Country.of.Birth.North America',
 'Country.of.Birth.Australia',
 'Country.of.Birth.Asia Pacific',
 'Count