In [1]:
import numpy as np
import pandas as pd

In [230]:
data = pd.read_csv("data/unimelb_training.csv", header=0, dtype="str")
data["Grant.Status"] = data["Grant.Status"].astype(int)


numerical_data_feature_category = [
    ("RFCD.Percentage.", 5), 
    ("SEO.Percentage.", 5), 
    ("Year.of.Birth.", 15),
    ("Number.of.Successful.Grant.", 15),
    ("Number.of.Unsuccessful.Grant.", 15),
    ("A..", 15),
    ("A.", 15),
    ("B.", 15),
    ("C.", 15)
]

for feature_name, feature_range in numerical_data_feature_category:
    numerical_data_features = [feature_name+str(i) for i in range(1, feature_range+1)]
    for numerical_data_feature in numerical_data_features:
        data[numerical_data_feature] = data[numerical_data_feature].astype(float)

data.dtypes

Grant.Application.ID                        object
Grant.Status                                 int64
Sponsor.Code                                object
Grant.Category.Code                         object
Contract.Value.Band...see.note.A            object
Start.date                                  object
RFCD.Code.1                                 object
RFCD.Percentage.1                          float64
RFCD.Code.2                                 object
RFCD.Percentage.2                          float64
RFCD.Code.3                                 object
RFCD.Percentage.3                          float64
RFCD.Code.4                                 object
RFCD.Percentage.4                          float64
RFCD.Code.5                                 object
RFCD.Percentage.5                          float64
SEO.Code.1                                  object
SEO.Percentage.1                           float64
SEO.Code.2                                  object
SEO.Percentage.2               

In [231]:
data.head()

Unnamed: 0,Grant.Application.ID,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,Start.date,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,...,Faculty.No..15,With.PHD.15,No..of.Years.in.Uni.at.Time.of.Grant.15,Number.of.Successful.Grant.15,Number.of.Unsuccessful.Grant.15,A..15,A.15,B.15,C.15,Unnamed: 251
0,1,1,,,A,8/11/05,280199,100.0,0,0.0,...,,,,,,,,,,
1,2,1,2B,10A,B,11/11/05,280103,30.0,280106,30.0,...,,,,,,,,,,
2,3,1,29A,10B,A,14/11/05,321004,60.0,321216,40.0,...,,,,,,,,,,
3,4,1,40D,10B,C,15/11/05,270602,50.0,320602,50.0,...,,,,,,,,,,
4,5,0,59C,10A,A,16/11/05,260500,34.0,280000,33.0,...,,,,,,,,,,


In [233]:
data["Unnamed: 251"].isnull().all()

True

In [209]:
def process_column(data, prefix, nb_max=15):
    unique_values = set([])
    all_values = []
    for i in range(1, nb_max+1):
        unique_values.update(data[prefix+str(i)].unique())
        all_values.extend(list(data[prefix+str(i)]))
    return unique_values, all_values

In [237]:
unique_roles, all_roles = process_column(data, "Role.")
print(list(unique_roles))
print(len(all_roles)) # 8708*15
print(sum([True if np.isreal(x) and np.isnan(x) else False for x in all_roles])/len(all_roles)) # this value is high because we concatenate all the 15 persons

pd.Series(all_roles).value_counts()

[nan, 'CHIEF_INVESTIGATOR', 'EXTERNAL_ADVISOR', 'HONVISIT', 'STUD_CHIEF_INVESTIGATOR', 'STUDRES', 'EXT_CHIEF_INVESTIGATOR', 'DELEGATED_RESEARCHER', 'PRINCIPAL_SUPERVISOR']
130620
0.866054203031695


CHIEF_INVESTIGATOR         12136
EXT_CHIEF_INVESTIGATOR      3732
STUD_CHIEF_INVESTIGATOR      586
PRINCIPAL_SUPERVISOR         536
DELEGATED_RESEARCHER         315
STUDRES                      157
HONVISIT                      29
EXTERNAL_ADVISOR               5
dtype: int64

In [239]:
unique_cob, all_cob = process_column(data, "Country.of.Birth.")
print(list(unique_cob))
print(sum([True if np.isreal(x) and np.isnan(x) else False for x in all_cob])/len(all_cob))

pd.Series(all_cob).value_counts()

[nan, 'Western Europe', 'Middle East and Africa', 'South Africa', 'Great Britain', 'North America', 'New Zealand', 'Eastern Europe', 'The Americas', 'Asia Pacific', 'Australia']
0.911751645996019


Australia                 8500
Great Britain             1011
Asia Pacific               549
Western Europe             500
North America              352
Eastern Europe             193
New Zealand                134
Middle East and Africa     100
South Africa                97
The Americas                91
dtype: int64

In [240]:
print(sum(data["Role.1"].isnull()))

data["Sponsor.Code"].unique()

98


array([nan, '2B', '29A', '40D', '59C', '4D', '28D', '136D', '100D', '1A',
       '21A', '24D', '148D', '12D', '62B', '138B', '60D', '143C', '90B',
       '18B', '89A', '34B', '47C', '36D', '32D', '172D', '184D', '5A',
       '63C', '74B', '6B', '170B', '166B', '101A', '33A', '164D', '75C',
       '161A', '157A', '9A', '126B', '149A', '53A', '94B', '97A', '160D',
       '51C', '167C', '135C', '65A', '42B', '87C', '169A', '95C', '103C',
       '147C', '133A', '150B', '155C', '174B', '83C', '175C', '139C',
       '202B', '13A', '168D', '113A', '134B', '20D', '141A', '80D', '91C',
       '187C', '26B', '162B', '93A', '84D', '158B', '55C', '77A', '429A',
       '140D', '23C', '142B', '112D', '111C', '154B', '185A', '151C', '3C',
       '428D', '137A', '37A', '146B', '176D', '145A', '85A', '14B', '69A',
       '41A', '427C', '49A', '7C', '44D', '163C', '144D', '177A', '39C',
       '153A', '173A', '159C', '178B', '156D', '186B', '152D', '171C',
       '201A', '182B', '183C', '52D', '132D', '

In [242]:
data["Year"] = data["Start.date"].map(lambda x: float(x.split("/")[2]))

# Descriptive analysis

In [72]:
def process_column(data, prefix, nb_max=15):
    unique_values = set([])
    all_values = []
    for i in range(1, nb_max+1):
        unique_values.update(data[prefix+str(i)].unique())
        all_values.extend(list(data[prefix+str(i)]))
    return unique_values, all_values

## Person

In [243]:
person_features = [
    'Person.ID.', 'Role.', 'Year.of.Birth.', 'Country.of.Birth.', 'Home.Language.', 'Dept.No..', 'Faculty.No..', 
    'With.PHD.', 'No..of.Years.in.Uni.at.Time.of.Grant.', 'Number.of.Successful.Grant.', 'Number.of.Unsuccessful.Grant.',
    'A..', 'A.', 'B.', 'C.',
]

In [250]:
# Id and person
person_unique_id, person_all_id = process_column(data, 'Person.ID.')
if np.nan in person_unique_id: 
    person_unique_id.remove(np.nan)

print(len(person_unique_id))
list(person_unique_id)[:20]

2875


['36242',
 '13512',
 '143452',
 '80212',
 '497907',
 '19627',
 '147717',
 '1547',
 '72622',
 '907227',
 '11397',
 '131862',
 '139532',
 '5812',
 '27707',
 '671847',
 '76682',
 '6712',
 '64357',
 '18817']

In [251]:
# from collections import defaultdict

# persons_dict = defaultdict(dict)
# for person_id in person_unique_id:
#     column_number = 1
#     while(column_number <= 15):
#         column_name = 'Person.ID.'+str(column_number)
#         if (data[column_name] == person_id).sum() >= 1:
#             considered_features = [feature_name+str(column_number) for feature_name in person_features]
#             persons_dict[person_id][column_number] = data[data[column_name] == person_id][considered_features + ["Year"]]
#         column_number += 1

# len(persons_dict)

# persons_dict['36242'].keys()
# persons_dict['36242'][1]
# persons_dict['36242'][2]
# persons_dict['36242'][3]

In [252]:
# person_DF = pd.DataFrame(columns=person_features + ["Year"] + ["Rank"])

# new_df = pd.DataFrame(data=persons_dict['36242'][1].values, columns=person_features + ["Year"])
# new_df["Rank"] = 1
# person_DF = person_DF.append(new_df)

# new_df = pd.DataFrame(data=persons_dict['36242'][2].values, columns=person_features + ["Year"])
# new_df["Rank"] = 2
# person_DF = person_DF.append(new_df)
# person_DF

In [248]:
person_DF = pd.DataFrame(columns=person_features + ["Year"] + ["Rank"])

for person_id in person_unique_id:
    column_number = 1
    while(column_number <= 15):
        column_name = 'Person.ID.'+str(column_number)
        if (data[column_name] == person_id).sum() >= 1:
            considered_features = [feature_name+str(column_number) for feature_name in person_features]
            new_df = pd.DataFrame(data=data[data[column_name] == person_id][considered_features + ["Year"]].values, columns=person_features + ["Year"])
            new_df["Rank"] = column_number
            person_DF = person_DF.append(new_df)
            
        column_number += 1

In [172]:
person_DF.head(20)

Unnamed: 0,Person.ID.,Role.,Year.of.Birth.,Country.of.Birth.,Home.Language.,Dept.No..,Faculty.No..,With.PHD.,No..of.Years.in.Uni.at.Time.of.Grant.,Number.of.Successful.Grant.,Number.of.Unsuccessful.Grant.,A..,A.,B.,C.,Year,Rank
0,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>=0 to 5,1,0,2,1,0,0,6,1.0
1,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>=0 to 5,1,0,2,1,0,0,6,1.0
2,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>=0 to 5,1,0,2,1,0,0,6,1.0
3,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,1,2,2,0,0,7,1.0
4,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,2,2,2,0,0,8,1.0
5,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,2,2,2,0,0,8,1.0
6,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,3,2,2,0,0,8,1.0
0,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,1,0,0,0,0,7,2.0
0,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,1,2,2,0,0,7,3.0
0,13512,CHIEF_INVESTIGATOR,1960,Australia,English,3708,10,Yes,>5 to 10,0,1,0,0,6,0,6,1.0


In [163]:
person_DF[person_DF["No..of.Years.in.Uni.at.Time.of.Grant."].isnull()]

Unnamed: 0,Person.ID.,Role.,Year.of.Birth.,Country.of.Birth.,Home.Language.,Dept.No..,Faculty.No..,With.PHD.,No..of.Years.in.Uni.at.Time.of.Grant.,Number.of.Successful.Grant.,Number.of.Unsuccessful.Grant.,A..,A.,B.,C.,Year,Rank
0,11397,CHIEF_INVESTIGATOR,1970,,,,,,,0,0,0,2,2,1,06,4.0
0,131862,CHIEF_INVESTIGATOR,1945,,,,,,,0,0,0,0,0,0,08,2.0
1,131862,CHIEF_INVESTIGATOR,1945,,,,,,,0,0,0,0,0,0,08,2.0
0,5812,CHIEF_INVESTIGATOR,1965,North America,English,2153,19,,,0,0,0,1,0,1,05,1.0
1,5812,CHIEF_INVESTIGATOR,1965,North America,English,2153,19,,,0,0,0,1,0,1,06,1.0
2,5812,CHIEF_INVESTIGATOR,1965,North America,English,2153,19,,,0,0,0,1,0,1,06,1.0
3,5812,CHIEF_INVESTIGATOR,1965,North America,English,2153,19,,,0,0,0,1,1,1,07,1.0
4,5812,CHIEF_INVESTIGATOR,1965,North America,English,2153,19,,,0,0,0,1,1,1,08,1.0
0,5812,CHIEF_INVESTIGATOR,1965,North America,English,2153,19,,,0,0,0,0,0,0,07,2.0
0,671847,CHIEF_INVESTIGATOR,1945,,,,,,,0,0,0,0,0,0,06,1.0


In [None]:
person_DF[person_DF["Person.ID."] == "5812"]

In [154]:
person_DF.shape

(13035, 17)

In [156]:
person_DF.describe()

Unnamed: 0,Rank
count,13035.0
mean,1.620637
std,1.08617
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,15.0


In [159]:
nb_rows = len(person_DF)
[(column_name, person_DF[column_name].isnull().sum() / nb_rows) for column_name in person_DF.columns]

[('Person.ID.', 0.0),
 ('Role.', 7.6716532412734941e-05),
 ('Year.of.Birth.', 0.00030686612965093976),
 ('Country.of.Birth.', 0.11591868047564251),
 ('Home.Language.', 0.90586881472957426),
 ('Dept.No..', 0.082086689681626385),
 ('Faculty.No..', 0.072036823935558117),
 ('With.PHD.', 0.38711162255466053),
 ('No..of.Years.in.Uni.at.Time.of.Grant.', 0.13302646720368239),
 ('Number.of.Successful.Grant.', 0.0),
 ('Number.of.Unsuccessful.Grant.', 0.0),
 ('A..', 0.0),
 ('A.', 0.0),
 ('B.', 0.0),
 ('C.', 0.00015343306482546988),
 ('Year', 0.0),
 ('Rank', 0.0)]

## Building contigency table by people

In [191]:
unique_roles, all_roles = process_column(data, "Role.")
unique_cob, all_cob = process_column(data, "Country.of.Birth.")
unique_homelanguage, all_homelanguage = process_column(data, "Home.Language.")
unique_nbyears, all_nbyears = process_column(data, "No..of.Years.in.Uni.at.Time.of.Grant.")

contigency_features = {
    "numeric": ["Year.of.Birth.", "Number.of.Successful.Grant.", "Number.of.Unsuccessful.Grant.", "A..", "A.", "B.", "C.", "Year", "Rank"],
    "categorical": [
        ("Role.", unique_roles), 
        ("Country.of.Birth.", unique_cob),
        ("Home.Language.", unique_homelanguage),
        ("No..of.Years.in.Uni.at.Time.of.Grant.", unique_nbyears)
    ]
}

contigency_df = pd.DataFrame(columns=contigency_features["numeric"] + [feature_name+str(unique_value) for feature_name, unique_values in contigency_features["categorical"] for unique_value in unique_values])
contigency_df

Unnamed: 0,Year.of.Birth.,Number.of.Successful.Grant.,Number.of.Unsuccessful.Grant.,A..,A.,B.,C.,Year,Rank,Role.nan,...,Country.of.Birth.Australia,Home.Language.nan,Home.Language.Other,Home.Language.English,No..of.Years.in.Uni.at.Time.of.Grant.nan,No..of.Years.in.Uni.at.Time.of.Grant.>5 to 10,No..of.Years.in.Uni.at.Time.of.Grant.more than 15,No..of.Years.in.Uni.at.Time.of.Grant.>10 to 15,No..of.Years.in.Uni.at.Time.of.Grant.>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.Less than 0


In [188]:
person_ID = "36242"
person_applications = person_DF[person_DF["Person.ID."] == person_ID]
person_applications

new_values = []
for feature_name in contigency_features["numeric"]:
    new_values.append()

Unnamed: 0,Person.ID.,Role.,Year.of.Birth.,Country.of.Birth.,Home.Language.,Dept.No..,Faculty.No..,With.PHD.,No..of.Years.in.Uni.at.Time.of.Grant.,Number.of.Successful.Grant.,Number.of.Unsuccessful.Grant.,A..,A.,B.,C.,Year,Rank
0,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>=0 to 5,1,0,2,1,0,0,6,1.0
1,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>=0 to 5,1,0,2,1,0,0,6,1.0
2,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>=0 to 5,1,0,2,1,0,0,6,1.0
3,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,1,2,2,0,0,7,1.0
4,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,2,2,2,0,0,8,1.0
5,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,2,2,2,0,0,8,1.0
6,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,3,2,2,0,0,8,1.0
0,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,1,0,0,0,0,7,2.0
0,36242,CHIEF_INVESTIGATOR,1965,Australia,,2713,25,Yes,>5 to 10,1,1,2,2,0,0,7,3.0


In [199]:
person_applications["Year.of.Birth."].values

array(['1965', '1965', '1965', '1965', '1965', '1965', '1965', '1965',
       '1965'], dtype=object)