# Modelling

## Prerequisite Models

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy

from sklearn.model_selection import train_test_split

## Required Metrics

In [2]:
from sklearn.metrics import f1_score

## Classifier Models

In [3]:
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

In [5]:
from xgboost import XGBClassifier

## Data Import

In [6]:
train_df = pd.read_csv('../cleaned_data/train.csv')
test_df = pd.read_csv('../cleaned_data/test.csv')

In [11]:
train_df.isnull().any().value_counts()

False    63
dtype: int64

In [12]:
test_df.isnull().any().value_counts()

False    62
dtype: int64

In [13]:
train_df.columns

Index(['Unnamed: 0', 'employee_id', 'gender', 'no_of_trainings', 'age',
       'previous_year_rating', 'length_of_service', 'KPIs_met >80%',
       'awards_won?', 'avg_training_score', 'is_promoted',
       'department_analytics', 'department_finance', 'department_hr',
       'department_legal', 'department_operations', 'department_procurement',
       'department_r_and_d', 'department_sales_and_martketing',
       'department_technology', 'region_1', 'region_10', 'region_11',
       'region_12', 'region_13', 'region_14', 'region_15', 'region_16',
       'region_17', 'region_18', 'region_19', 'region_2', 'region_20',
       'region_21', 'region_22', 'region_23', 'region_24', 'region_25',
       'region_26', 'region_27', 'region_28', 'region_29', 'region_3',
       'region_30', 'region_31', 'region_32', 'region_33', 'region_34',
       'region_4', 'region_5', 'region_6', 'region_7', 'region_8', 'region_9',
       'Bachelor's', 'Below Secondary', 'Master's & above', 'Unknown', 'other',
 

In [16]:
train_df.set_index('Unnamed: 0', inplace = True)
train_df.set_index('employee_id', inplace = True)
train_df.head()

Unnamed: 0_level_0,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department_analytics,...,region_9,Bachelor's,Below Secondary,Master's & above,Unknown,other,referred,sourcing,rating_missing,predicted_rating
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65438,0,1,35,5.0,8,1,0,49,0,0,...,0,0,0,1,0,0,0,1,0,4.0
65141,1,1,30,5.0,4,0,0,60,0,0,...,0,1,0,0,0,1,0,0,0,3.0
7513,1,1,34,3.0,7,0,0,50,0,0,...,0,1,0,0,0,0,0,1,0,3.0
2542,1,2,39,1.0,10,0,0,50,0,0,...,0,1,0,0,0,1,0,0,0,2.0
48945,1,1,45,3.0,2,0,0,73,0,0,...,0,1,0,0,0,1,0,0,0,3.0


In [17]:
test_df.set_index('Unnamed: 0', inplace = True)
test_df.set_index('employee_id', inplace = True)
test_df.head()

Unnamed: 0_level_0,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_analytics,department_finance,...,region_9,Bachelor's,Below Secondary,Master's & above,Unknown,other,referred,sourcing,rating_missing,predicted_rating
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8724,1,1,24,3.0,1,1,0,77,0,0,...,0,1,0,0,0,0,0,1,1,3.0
74430,0,1,31,3.0,5,0,0,51,0,0,...,0,1,0,0,0,1,0,0,0,3.0
72255,1,1,31,1.0,4,0,0,47,0,0,...,0,1,0,0,0,1,0,0,0,3.0
38562,0,3,31,2.0,9,0,0,65,0,0,...,0,1,0,0,0,1,0,0,0,3.0
64486,1,1,30,4.0,7,0,0,61,0,1,...,0,1,0,0,0,0,0,1,0,3.0


## Data Splitting

In [19]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [46]:
clf = RandomForestClassifier(max_depth=60, random_state=0, min_samples_split=10)
clf.fit(X, Y)
print(f1_score(Y, clf.predict(X)))
pred = clf.predict(test_df)

0.5960620525059666


In [50]:
sam = pd.read_csv('../data/sample_submission_M0L0uXE.csv')
sam.head()

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0
2,72255,0
3,38562,0
4,64486,0


In [55]:
predictions = pd.DataFrame(pred)
predictions['employee_id'] = test_df.index.values

In [62]:
predictions.columns = ['is_promoted', 'employee_id']
predictions.set_index('employee_id', inplace = True)

In [63]:
predictions.is_promoted.value_counts()

0    22848
1      642
Name: is_promoted, dtype: int64

In [64]:
predictions.head()

Unnamed: 0_level_0,is_promoted
employee_id,Unnamed: 1_level_1
8724,0
74430,0
72255,0
38562,0
64486,0


In [65]:
predictions.to_csv('../output/14_15.50.csv')

In [44]:
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10, 12, 15, 17, 20]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_train, y_train)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.93940  0.40383
50 5 train: 0.75025  0.43851
50 8 train: 0.63393  0.42940
50 10 train: 0.59901  0.43105
50 12 train: 0.54793  0.42496
50 15 train: 0.51875  0.41126
50 17 train: 0.52713  0.42097
50 20 train: 0.52006  0.43960
60 2 train: 0.93939  0.38678
60 5 train: 0.75685  0.43052
60 8 train: 0.63551  0.42669
60 10 train: 0.60158  0.44171
60 12 train: 0.55426  0.42424
60 15 train: 0.51835  0.40474
60 17 train: 0.53130  0.42500
60 20 train: 0.52006  0.43960
70 2 train: 0.93939  0.38678
70 5 train: 0.75685  0.43052
70 8 train: 0.63551  0.42669
70 10 train: 0.60158  0.44171
70 12 train: 0.55426  0.42424
70 15 train: 0.51835  0.40474
70 17 train: 0.53130  0.42500
70 20 train: 0.52006  0.43960
80 2 train: 0.93939  0.38678
80 5 train: 0.75685  0.43052
80 8 train: 0.63551  0.42669
80 10 train: 0.60158  0.44171
80 12 train: 0.55426  0.42424
80 15 train: 0.51835  0.40474
80 17 train: 0.53130  0.42500
80 20 train: 0.52006  0.43960
90 2 train: 0.93939  0.38678
90 5 train: 0.75685  0.

In [23]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [24]:
ros = RandomOverSampler(random_state=0)
ros.fit(X_train, y_train)
X_resampled, y_resampled = ros.sample(X_train, y_train)

In [43]:
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10, 12, 15, 17, 20]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.74912  0.40471
50 5 train: 0.67346  0.39460
50 8 train: 0.63754  0.40363
50 10 train: 0.61898  0.41009
50 12 train: 0.60605  0.41368
50 15 train: 0.58225  0.40790
50 17 train: 0.57530  0.39917
50 20 train: 0.56512  0.40127
60 2 train: 0.74766  0.40918
60 5 train: 0.67468  0.39860
60 8 train: 0.64212  0.39687
60 10 train: 0.62529  0.40296
60 12 train: 0.61622  0.41422
60 15 train: 0.58244  0.40421
60 17 train: 0.57643  0.42033
60 20 train: 0.56942  0.40342
70 2 train: 0.74817  0.40974
70 5 train: 0.67333  0.39667
70 8 train: 0.64269  0.39810
70 10 train: 0.62529  0.40296
70 12 train: 0.61622  0.41422
70 15 train: 0.58153  0.40049
70 17 train: 0.57132  0.40555
70 20 train: 0.56882  0.39757
80 2 train: 0.74817  0.40974
80 5 train: 0.67333  0.39667
80 8 train: 0.64269  0.39810
80 10 train: 0.62529  0.40296
80 12 train: 0.61622  0.41422
80 15 train: 0.58153  0.40049
80 17 train: 0.57132  0.40555
80 20 train: 0.56882  0.39757
90 2 train: 0.74817  0.40974
90 5 train: 0.67333  0.

In [34]:
rus = RandomUnderSampler(random_state=0)
rus.fit(X_train, y_train)
X_resampled, y_resampled = rus.sample(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.46536  0.36158
50 5 train: 0.41867  0.35262
50 8 train: 0.41015  0.35560
50 10 train: 0.40225  0.35163
60 2 train: 0.46536  0.36158
60 5 train: 0.42012  0.35433
60 8 train: 0.41015  0.35560
60 10 train: 0.40225  0.35163
70 2 train: 0.46536  0.36158
70 5 train: 0.42012  0.35433
70 8 train: 0.41015  0.35560
70 10 train: 0.40225  0.35163
80 2 train: 0.46536  0.36158
80 5 train: 0.42012  0.35433
80 8 train: 0.41015  0.35560
80 10 train: 0.40225  0.35163
90 2 train: 0.46536  0.36158
90 5 train: 0.42012  0.35433
90 8 train: 0.41015  0.35560
90 10 train: 0.40225  0.35163


In [35]:
rus = RandomUnderSampler(random_state=0, ratio={0: 20, 1: 80})
rus.fit(X_train, y_train)
X_resampled, y_resampled = rus.sample(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.26262  0.25951
50 5 train: 0.20508  0.19975
50 8 train: 0.20195  0.19618
50 10 train: 0.18241  0.17815
60 2 train: 0.26262  0.25951
60 5 train: 0.20508  0.19975
60 8 train: 0.20195  0.19618
60 10 train: 0.18241  0.17815
70 2 train: 0.26262  0.25951
70 5 train: 0.20508  0.19975
70 8 train: 0.20195  0.19618
70 10 train: 0.18241  0.17815
80 2 train: 0.26262  0.25951
80 5 train: 0.20508  0.19975
80 8 train: 0.20195  0.19618
80 10 train: 0.18241  0.17815
90 2 train: 0.26262  0.25951
90 5 train: 0.20508  0.19975
90 8 train: 0.20195  0.19618
90 10 train: 0.18241  0.17815


In [37]:
from imblearn.over_sampling import SMOTE, ADASYN
rus = SMOTE(random_state=0)
rus.fit(X_train, y_train)
X_resampled, y_resampled = rus.sample(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.95467  0.39595
50 5 train: 0.83155  0.39909
50 8 train: 0.75823  0.40627
50 10 train: 0.72314  0.40678
60 2 train: 0.95853  0.39070
60 5 train: 0.83369  0.41353
60 8 train: 0.75297  0.40185
60 10 train: 0.72745  0.41821
70 2 train: 0.95781  0.38665
70 5 train: 0.83229  0.40982
70 8 train: 0.75350  0.40801
70 10 train: 0.72739  0.40520
80 2 train: 0.95781  0.38665
80 5 train: 0.83229  0.40982
80 8 train: 0.75350  0.40801
80 10 train: 0.72739  0.40520
90 2 train: 0.95781  0.38665
90 5 train: 0.83229  0.40982
90 8 train: 0.75350  0.40801
90 10 train: 0.72739  0.40520


In [38]:
from imblearn.over_sampling import ADASYN
rus = ADASYN(random_state=0)
rus.fit(X_train, y_train)
X_resampled, y_resampled = rus.sample(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.95510  0.37812
50 5 train: 0.82537  0.38462
50 8 train: 0.76118  0.40000
50 10 train: 0.73184  0.39877
60 2 train: 0.95523  0.35052
60 5 train: 0.82865  0.40541
60 8 train: 0.75818  0.40625
60 10 train: 0.73691  0.39876
70 2 train: 0.95481  0.36321
70 5 train: 0.82865  0.40541
70 8 train: 0.75784  0.41634
70 10 train: 0.73351  0.40000
80 2 train: 0.95481  0.36321
80 5 train: 0.82865  0.40541
80 8 train: 0.75784  0.41634
80 10 train: 0.73351  0.40000
90 2 train: 0.95481  0.36321
90 5 train: 0.82865  0.40541
90 8 train: 0.75784  0.41634
90 10 train: 0.73351  0.40000


In [39]:
from imblearn.under_sampling import NearMiss
rus = NearMiss(random_state=0)
rus.fit(X_train, y_train)
X_resampled, y_resampled = rus.sample(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.25859  0.18118
50 5 train: 0.21681  0.16967
50 8 train: 0.20324  0.16692
50 10 train: 0.20509  0.17420
60 2 train: 0.25624  0.18010
60 5 train: 0.21681  0.16967
60 8 train: 0.20324  0.16692
60 10 train: 0.20509  0.17420
70 2 train: 0.25624  0.18010
70 5 train: 0.21681  0.16967
70 8 train: 0.20324  0.16692
70 10 train: 0.20509  0.17420
80 2 train: 0.25624  0.18010
80 5 train: 0.21681  0.16967
80 8 train: 0.20324  0.16692
80 10 train: 0.20509  0.17420
90 2 train: 0.25624  0.18010
90 5 train: 0.21681  0.16967
90 8 train: 0.20324  0.16692
90 10 train: 0.20509  0.17420


In [40]:
from imblearn.combine import SMOTEENN 
rus = SMOTEENN(random_state=0)
rus.fit(X_train, y_train)
X_resampled, y_resampled = rus.sample(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.74912  0.40471
50 5 train: 0.67346  0.39460
50 8 train: 0.63754  0.40363
50 10 train: 0.61898  0.41009
60 2 train: 0.74766  0.40918
60 5 train: 0.67468  0.39860
60 8 train: 0.64212  0.39687
60 10 train: 0.62529  0.40296
70 2 train: 0.74817  0.40974
70 5 train: 0.67333  0.39667
70 8 train: 0.64269  0.39810
70 10 train: 0.62529  0.40296
80 2 train: 0.74817  0.40974
80 5 train: 0.67333  0.39667
80 8 train: 0.64269  0.39810
80 10 train: 0.62529  0.40296
90 2 train: 0.74817  0.40974
90 5 train: 0.67333  0.39667
90 8 train: 0.64269  0.39810
90 10 train: 0.62529  0.40296


In [42]:
from imblearn.combine import SMOTETomek 
rus = SMOTEENN(random_state=0)
rus.fit(X_train, y_train)
X_resampled, y_resampled = rus.sample(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
for i in [50, 60, 70, 80, 90]:
    for j in [2, 5, 8, 10, 15, 20]:    
        clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
        clf.fit(X_resampled, y_resampled)
        predtrain = clf.predict(X_train)
        predtest = clf.predict(X_test)
        print(i, j, 'train: {0:.5f}'.format(f1_score(y_train, predtrain)), ' {0:.5f}'.format(f1_score(y_test, predtest)))

50 2 train: 0.74912  0.40471
50 5 train: 0.67346  0.39460
50 8 train: 0.63754  0.40363
50 10 train: 0.61898  0.41009
50 15 train: 0.58225  0.40790
50 20 train: 0.56512  0.40127
60 2 train: 0.74766  0.40918
60 5 train: 0.67468  0.39860
60 8 train: 0.64212  0.39687
60 10 train: 0.62529  0.40296
60 15 train: 0.58244  0.40421
60 20 train: 0.56942  0.40342
70 2 train: 0.74817  0.40974
70 5 train: 0.67333  0.39667
70 8 train: 0.64269  0.39810
70 10 train: 0.62529  0.40296
70 15 train: 0.58153  0.40049
70 20 train: 0.56882  0.39757
80 2 train: 0.74817  0.40974
80 5 train: 0.67333  0.39667
80 8 train: 0.64269  0.39810
80 10 train: 0.62529  0.40296
80 15 train: 0.58153  0.40049
80 20 train: 0.56882  0.39757
90 2 train: 0.74817  0.40974
90 5 train: 0.67333  0.39667
90 8 train: 0.64269  0.39810
90 10 train: 0.62529  0.40296
90 15 train: 0.58153  0.40049
90 20 train: 0.56882  0.39757
