In [95]:
import numpy as np
import pandas as pd

data = pd.read_csv("data/unimelb_training.csv", header=0, dtype="str")
data = data.drop("Unnamed: 251", axis=1)
data["Year"] = data["Start.date"].map(lambda x: float(x.split("/")[2]))
data["Year"].dtype


numerical_data_feature_category = [
    ("RFCD.Percentage.", 5), 
    ("SEO.Percentage.", 5), 
    ("Year.of.Birth.", 15),
    ("Number.of.Successful.Grant.", 15),
    ("Number.of.Unsuccessful.Grant.", 15),
    ("A..", 15),
    ("A.", 15),
    ("B.", 15),
    ("C.", 15)
]

for feature_name, feature_range in numerical_data_feature_category:
    numerical_data_features = [feature_name+str(i) for i in range(1, feature_range+1)]
    for numerical_data_feature in numerical_data_features:
        data[numerical_data_feature] = data[numerical_data_feature].astype(float)

data.dtypes

Grant.Application.ID                        object
Grant.Status                                object
Sponsor.Code                                object
Grant.Category.Code                         object
Contract.Value.Band...see.note.A            object
Start.date                                  object
RFCD.Code.1                                 object
RFCD.Percentage.1                          float64
RFCD.Code.2                                 object
RFCD.Percentage.2                          float64
RFCD.Code.3                                 object
RFCD.Percentage.3                          float64
RFCD.Code.4                                 object
RFCD.Percentage.4                          float64
RFCD.Code.5                                 object
RFCD.Percentage.5                          float64
SEO.Code.1                                  object
SEO.Percentage.1                           float64
SEO.Code.2                                  object
SEO.Percentage.2               

In [99]:
data.head()

Unnamed: 0,Grant.Application.ID,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,Start.date,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,...,Role.HONVISIT,Role.EXTERNAL_ADVISOR,Role.DELEGATED_RESEARCHER,Role.STUDRES,No..of.Years.in.Uni.at.Time.of.Grant.nan,No..of.Years.in.Uni.at.Time.of.Grant.>10 to 15,No..of.Years.in.Uni.at.Time.of.Grant.more than 15,No..of.Years.in.Uni.at.Time.of.Grant.Less than 0,No..of.Years.in.Uni.at.Time.of.Grant.>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.>5 to 10
0,1,1,,,A,8/11/05,280199,100.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2,1,2B,10A,B,11/11/05,280103,30.0,280106,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,1,29A,10B,A,14/11/05,321004,60.0,321216,40.0,...,0.0,0.0,0.142857,0.0,0.0,0.0,0.5,0.25,0.0,0.25
3,4,1,40D,10B,C,15/11/05,270602,50.0,320602,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5
4,5,0,59C,10A,A,16/11/05,260500,34.0,280000,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Contigency for persons

In [96]:
# build the list of all unique values of a given spread column, as a tuple
def process_column(data, prefix, nb_max=15):
    unique_values = set([])
    for i in range(1, nb_max+1):
        unique_values.update(data[prefix+str(i)].unique())
    return tuple(unique_values)

# Build a dataframe with contingecy frequencies from a given spread column
def add_contigency(prefix, nb_max, unique_values, add_cardinal=False):
    nb_unique_values = len(unique_values)
    has_nans = np.nan in unique_values
    nan_index = unique_values.index(np.nan)
    new_feature_labels = [prefix+str(value) for value in unique_values]
    if add_cardinal:
        new_feature_labels += ["cardinal_"+prefix]
    def nested_func(x):
        values_frequency = [0]*nb_unique_values
        for i in range(1, nb_max+1):
            values_frequency[unique_values.index(x[prefix+str(i)])] += 1
        if has_nans:
            values_frequency[nan_index] = 0
        summe = sum(values_frequency)
        if summe != 0:
            values_frequency = [value/summe for value in values_frequency]
        if add_cardinal:
            values_frequency += summe
        return pd.Series(values_frequency, index=new_feature_labels)
    return nested_func

# Build a dataframe with contingecy frequencies from a given spread column
def build_contigency(data, prefix, nb_max):
    unique_values = process_column(data, prefix, nb_max)
    return unique_values, data.apply(add_contigency(prefix, nb_max, unique_values), axis=1, raw=True)

## Role

In [97]:
unique_roles, role_contigency_df = build_contigency(data, "Role.", 15)

print(unique_roles)
data = data.join(role_contigency_df)
data.head()

(nan, 'CHIEF_INVESTIGATOR', 'STUD_CHIEF_INVESTIGATOR', 'EXT_CHIEF_INVESTIGATOR', 'PRINCIPAL_SUPERVISOR', 'HONVISIT', 'EXTERNAL_ADVISOR', 'DELEGATED_RESEARCHER', 'STUDRES')


Unnamed: 0,Grant.Application.ID,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,Start.date,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,...,Year,Role.nan,Role.CHIEF_INVESTIGATOR,Role.STUD_CHIEF_INVESTIGATOR,Role.EXT_CHIEF_INVESTIGATOR,Role.PRINCIPAL_SUPERVISOR,Role.HONVISIT,Role.EXTERNAL_ADVISOR,Role.DELEGATED_RESEARCHER,Role.STUDRES
0,1,1,,,A,8/11/05,280199,100.0,0,0.0,...,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,2B,10A,B,11/11/05,280103,30.0,280106,30.0,...,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,29A,10B,A,14/11/05,321004,60.0,321216,40.0,...,5.0,0.0,0.428571,0.0,0.428571,0.0,0.0,0.0,0.142857,0.0
3,4,1,40D,10B,C,15/11/05,270602,50.0,320602,50.0,...,5.0,0.0,0.166667,0.166667,0.5,0.166667,0.0,0.0,0.0,0.0
4,5,0,59C,10A,A,16/11/05,260500,34.0,280000,33.0,...,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Number of years at uni at time of grant

In [None]:
No..of.Years.in.Uni.at.Time.of.Grant.

In [98]:
unique_yearsinuni, yearsinuni_contigency_df = build_contigency(data, "No..of.Years.in.Uni.at.Time.of.Grant.", 15)

print(unique_yearsinuni)
data = data.join(yearsinuni_contigency_df)
data.head()

(nan, '>10 to 15', 'more than 15', 'Less than 0', '>=0 to 5', '>5 to 10')


Unnamed: 0,Grant.Application.ID,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,Start.date,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,...,Role.HONVISIT,Role.EXTERNAL_ADVISOR,Role.DELEGATED_RESEARCHER,Role.STUDRES,No..of.Years.in.Uni.at.Time.of.Grant.nan,No..of.Years.in.Uni.at.Time.of.Grant.>10 to 15,No..of.Years.in.Uni.at.Time.of.Grant.more than 15,No..of.Years.in.Uni.at.Time.of.Grant.Less than 0,No..of.Years.in.Uni.at.Time.of.Grant.>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.>5 to 10
0,1,1,,,A,8/11/05,280199,100.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2,1,2B,10A,B,11/11/05,280103,30.0,280106,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,1,29A,10B,A,14/11/05,321004,60.0,321216,40.0,...,0.0,0.0,0.142857,0.0,0.0,0.0,0.5,0.25,0.0,0.25
3,4,1,40D,10B,C,15/11/05,270602,50.0,320602,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5
4,5,0,59C,10A,A,16/11/05,260500,34.0,280000,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
