In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pylab
import statsmodels.api as sm
from pprint import pprint as pp
%matplotlib inline

In [77]:
from scipy import stats
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [51]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [52]:
def avg_training_score_grading(x):
    grade = 0;
    if x < 59 :
        grade = 0
    elif x < 79:
        grade = 1
    else :
        grade = 2
    return grade    

In [53]:
def no_of_trainings_grade(x):
    grade = 0
    if x <= 5:
        grade = 0
    elif x <= 7:
        grade = 1
    else :
        grade = 2
    return grade   

In [54]:
def education_encode(x):
    x = str(x)
    x = ''.join(list(x)[0:4])
    grade = np.NaN
    if x[0:4] == 'Bach':
        grade = 1
    elif x[0:4] == 'Mast':
        grade = 2
    elif x[0:4] == 'Belo':
        grade = 0
    return grade 

In [55]:
def region_encode(x):
    return int(x[len('region_'):])

In [56]:
def age_bining(x):
    x = int(x)
    x = (x//10)*10
    return x
    

In [57]:
df_train['age'] = df_train['age'].apply(age_bining)
df_test['age'] = df_test['age'].apply(age_bining)

In [58]:
df_train['region'] = df_train['region'].apply(region_encode)
df_test['region'] = df_test['region'].apply(region_encode)

In [59]:
df_train['no_of_trainings'] = df_train['no_of_trainings'].apply(no_of_trainings_grade)
df_test['no_of_trainings'] = df_test['no_of_trainings'].apply(no_of_trainings_grade)

In [60]:
df_train['avg_training_score'] = df_train['avg_training_score'].apply(avg_training_score_grading)
df_test['avg_training_score'] = df_test['avg_training_score'].apply(avg_training_score_grading)

In [61]:
df_train['education'] = df_train['education'].apply(education_encode)
df_test['education'] = df_test['education'].apply(education_encode)

In [62]:
imputer = Imputer(missing_values='NaN', strategy="most_frequent", axis=0)
train_previous_year_rating = imputer.fit_transform(df_train.previous_year_rating.values.reshape(-1, 1))
df_train['previous_year_rating'] = train_previous_year_rating
test_previous_year_rating = imputer.transform(df_test.previous_year_rating.values.reshape(-1, 1))
df_test['previous_year_rating'] = test_previous_year_rating

In [63]:
df_train['education'] = df_train['education'].replace(r'', np.NaN)
df_test['education'] = df_test['education'].replace(r'', np.NaN)

In [64]:
df_train['education'] = df_train['education'].astype(object)
df_test['education'] = df_test['education'].astype(object)

In [65]:
imputer = Imputer(missing_values='NaN', strategy="most_frequent", axis=0)
train_education_filled = imputer.fit_transform(df_train.education.values.reshape(-1, 1))
df_train['education'] = train_education_filled
test_education_filled = imputer.transform(df_test.education.values.reshape(-1, 1))
df_test['education'] = test_education_filled

In [66]:
dict_labelencode={}
for col in ['department', 'region', 'gender', 'recruitment_channel', 'no_of_trainings', 'age', 
                    'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score', 
                    'previous_year_rating', 'education']:
#for col in ['department', 'gender', 'recruitment_channel']:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    dict_labelencode[col] = le.classes_
    print("One hot encoding  for -----> ", col)                
    onehotencoder = OneHotEncoder(categorical_features = 'all')
    dummy_var = onehotencoder.fit_transform(df_train[col].values.reshape(-1,1)).toarray()
    valueList = dict_labelencode[col]
    i = 0 
    for value in valueList:
        print("One hot encoding  for value-----> ", value) 
        newcolumn = col + str("_") + str(value)
        print("One hot encoding  for label class -----> ", newcolumn)
        df_train[newcolumn] = dummy_var[:, i]
        i = i+1
    df_train = df_train.drop([newcolumn], axis=1)    
    df_train = df_train.drop([col], axis=1)
    print("dropped column -----> ", newcolumn, col)
    #df_test = df_test.drop([col], axis=1)
    

One hot encoding  for ----->  department
One hot encoding  for value----->  Analytics
One hot encoding  for label class ----->  department_Analytics
One hot encoding  for value----->  Finance
One hot encoding  for label class ----->  department_Finance
One hot encoding  for value----->  HR
One hot encoding  for label class ----->  department_HR
One hot encoding  for value----->  Legal
One hot encoding  for label class ----->  department_Legal
One hot encoding  for value----->  Operations
One hot encoding  for label class ----->  department_Operations
One hot encoding  for value----->  Procurement
One hot encoding  for label class ----->  department_Procurement
One hot encoding  for value----->  R&D
One hot encoding  for label class ----->  department_R&D
One hot encoding  for value----->  Sales & Marketing
One hot encoding  for label class ----->  department_Sales & Marketing
One hot encoding  for value----->  Technology
One hot encoding  for label class ----->  department_Technology
d

In [67]:
df_train.columns

Index(['employee_id', 'is_promoted', 'department_Analytics',
       'department_Finance', 'department_HR', 'department_Legal',
       'department_Operations', 'department_Procurement', 'department_R&D',
       'department_Sales & Marketing', 'region_1', 'region_2', 'region_3',
       'region_4', 'region_5', 'region_6', 'region_7', 'region_8', 'region_9',
       'region_10', 'region_11', 'region_12', 'region_13', 'region_14',
       'region_15', 'region_16', 'region_17', 'region_18', 'region_19',
       'region_20', 'region_21', 'region_22', 'region_23', 'region_24',
       'region_25', 'region_26', 'region_27', 'region_28', 'region_29',
       'region_30', 'region_31', 'region_32', 'region_33', 'gender_f',
       'recruitment_channel_other', 'recruitment_channel_referred',
       'no_of_trainings_0', 'no_of_trainings_1', 'age_20', 'age_30', 'age_40',
       'age_50', 'length_of_service_1', 'length_of_service_2',
       'length_of_service_3', 'length_of_service_4', 'length_of_service_5'

In [68]:
X1 = df_train.loc[:,['department_Analytics',
       'department_Finance', 'department_HR', 'department_Legal',
       'department_Operations', 'department_Procurement', 'department_R&D',
       'department_Sales & Marketing', 'region_1', 'region_2', 'region_3',
       'region_4', 'region_5', 'region_6', 'region_7', 'region_8', 'region_9',
       'region_10', 'region_11', 'region_12', 'region_13', 'region_14',
       'region_15', 'region_16', 'region_17', 'region_18', 'region_19',
       'region_20', 'region_21', 'region_22', 'region_23', 'region_24',
       'region_25', 'region_26', 'region_27', 'region_28', 'region_29',
       'region_30', 'region_31', 'region_32', 'region_33', 'gender_f',
       'recruitment_channel_other', 'recruitment_channel_referred',
       'no_of_trainings_0', 'no_of_trainings_1', 'age_20', 'age_30', 'age_40',
       'age_50', 'length_of_service_1', 'length_of_service_2',
       'length_of_service_3', 'length_of_service_4', 'length_of_service_5',
       'length_of_service_6', 'length_of_service_7', 'length_of_service_8',
       'length_of_service_9', 'length_of_service_10', 'length_of_service_11',
       'length_of_service_12', 'length_of_service_13', 'length_of_service_14',
       'length_of_service_15', 'length_of_service_16', 'length_of_service_17',
       'length_of_service_18', 'length_of_service_19', 'length_of_service_20',
       'length_of_service_21', 'length_of_service_22', 'length_of_service_23',
       'length_of_service_24', 'length_of_service_25', 'length_of_service_26',
       'length_of_service_27', 'length_of_service_28', 'length_of_service_29',
       'length_of_service_30', 'length_of_service_31', 'length_of_service_32',
       'length_of_service_33', 'length_of_service_34', 'KPIs_met >80%_0',
       'awards_won?_0', 'avg_training_score_0', 'avg_training_score_1',
       'previous_year_rating_1.0', 'previous_year_rating_2.0',
       'previous_year_rating_3.0', 'previous_year_rating_4.0', 'education_0.0',
       'education_1.0']].values
y1 = df_train.loc[:,['is_promoted']].values
#y_train = df_train.loc[:,['is_promoted']].values

#X2 = df_test.loc[:,['department', 'region', 'gender', 'recruitment_channel', 'no_of_trainings', 'age', 
                    #'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score', 
                    #'previous_year_rating', 'education']].values

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size = 0.25, random_state = 0)

In [75]:
df_train.describe()

Unnamed: 0,employee_id,is_promoted,department_Analytics,department_Finance,department_HR,department_Legal,department_Operations,department_Procurement,department_R&D,department_Sales & Marketing,...,KPIs_met >80%_0,awards_won?_0,avg_training_score_0,avg_training_score_1,previous_year_rating_1.0,previous_year_rating_2.0,previous_year_rating_3.0,previous_year_rating_4.0,education_0.0,education_1.0
count,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,...,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,0.08517,0.09765,0.046271,0.044118,0.018957,0.20705,0.130236,0.018227,0.307254,...,0.648026,0.976828,0.434316,0.352503,0.113542,0.077087,0.414939,0.180211,0.014688,0.712998
std,22586.581449,0.279137,0.296844,0.210072,0.205358,0.136375,0.405195,0.336566,0.133774,0.46136,...,0.47759,0.15045,0.495671,0.477754,0.317257,0.266732,0.492716,0.384367,0.1203,0.452367
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19669.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39225.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,58730.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
max,78298.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [78]:
ESTIMATORS = {
"XGB": XGBClassifier(),
"RandomForest": RandomForestClassifier(),
"SVC": SVC(),
"DecisionTree": DecisionTreeClassifier(),
"LogisticRegression":LogisticRegression(),
"KNN":KNeighborsClassifier(),
"AdaBoostClassifier":AdaBoostClassifier(),
"GradientBoost":GradientBoostingClassifier(),
"ExtraTreesClassifier":ExtraTreesClassifier(),
"BaggingClassifier":BaggingClassifier()
}

In [79]:
y_test_predict = dict()
Model_Accuracy = dict()
estimator_dict = dict()
f1_score_dict = {}
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)
    Model_Accuracy[name] = accuracy_score(y_test,y_test_predict[name])
    f1_score_dict[name] = f1_score(y_test,y_test_predict[name])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [84]:
pp(Model_Accuracy)
pp(f1_score_dict)

{'AdaBoostClassifier': 0.91891694643117794,
 'BaggingClassifier': 0.91789519778134576,
 'DecisionTree': 0.90045248868778283,
 'ExtraTreesClassifier': 0.91731134141001314,
 'GradientBoost': 0.93285651729674501,
 'KNN': 0.91337031090351772,
 'LogisticRegression': 0.92504743833017078,
 'RandomForest': 0.91950080280251056,
 'SVC': 0.92475551014450441,
 'XGB': 0.93176178660049624}
{'AdaBoostClassifier': 0.18488628026412326,
 'BaggingClassifier': 0.3787962451684152,
 'DecisionTree': 0.37545787545787546,
 'ExtraTreesClassifier': 0.32438878950506861,
 'GradientBoost': 0.3475177304964539,
 'KNN': 0.11351755041075429,
 'LogisticRegression': 0.25848375451263539,
 'RandomForest': 0.3228974831184776,
 'SVC': 0.19390148553557465,
 'XGB': 0.32393347794649313}


In [80]:
eclf2 = VotingClassifier(estimators=[("XGB",ESTIMATORS['XGB']),
                                     ("RandomForest",ESTIMATORS['RandomForest']),
                                     ("SVC",ESTIMATORS['SVC']),
                                     ("DecisionTree",ESTIMATORS['DecisionTree']),
                                     ("LogisticRegression",ESTIMATORS['LogisticRegression']),
                                     ("KNN",ESTIMATORS['KNN']),
                                     ("AdaBoostClassifier",ESTIMATORS['AdaBoostClassifier']),
                                     ("GradientBoost",ESTIMATORS['GradientBoost']),
                                     ("ExtraTreesClassifier",ESTIMATORS['ExtraTreesClassifier']),
                                     ("BaggingClassifier",ESTIMATORS['BaggingClassifier'])
                                    ],
         voting='hard')

eclf2 = eclf2.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [82]:
vc_y_pred = eclf2.predict(X_test)
print(accuracy_score(y_test,vc_y_pred))
print(f1_score(y_test,vc_y_pred))

0.929645307254
0.294289897511


# Bagging Classifier

In [86]:
bag_classifeir = BaggingClassifier()
bag_classifeir.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [87]:
bg_y_pred = bag_classifeir.predict(X_test)
print(accuracy_score(y_test, bg_y_pred))
print(f1_score(y_test, bg_y_pred))

0.918041161874
0.383982446517


In [None]:
bg_y_pred2 = bag_classifeir.predict(X2)
df_test['is_promoted'] = bg_y_pred2
header = ['employee_id', 'is_promoted']
df_test[header].to_csv('bag_submission.csv', columns = header, index=False)

# Random Forest Classifier

In [73]:
rf_classifier = RandomForestClassifier(max_depth=200, n_estimators=90, criterion='entropy')
rf_classifier.fit(X_train, y_train)

  from ipykernel import kernelapp as app


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=200, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=90, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [74]:
rf_y_pred = rf_classifier.predict(X_test)
print(accuracy_score(y_test, rf_y_pred))
print(f1_score(y_test, rf_y_pred))

0.923149905123
0.34879406308


# Support vector machine

In [76]:
from sklearn.svm import SVC
svm_classifier = SVC(kernel = 'rbf', random_state = 0)
svm_classifier.fit(X_train, y_train)
# Predicting the Test set results
svm_y_pred = svm_classifier.predict(X_test)
print(accuracy_score(y_test, svm_y_pred))
print(f1_score(y_test, svm_y_pred))

  y = column_or_1d(y, warn=True)


0.924755510145
0.193901485536


NameError: name 'classification_report' is not defined

In [25]:
rf_y_pred2 = rf_classifier.predict(X2)

In [41]:
rf_y_pred2.shape

(23490,)

In [43]:
df_test.shape

(23490, 14)

In [44]:
df_test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,8724,8,26,1.0,1,2,0,20,3.0,1,1,0,1,0
1,74430,2,4,1.0,0,0,0,30,3.0,5,0,0,0,0
2,72255,7,13,1.0,1,0,0,30,1.0,4,0,0,0,0
3,38562,5,2,1.0,0,0,0,30,2.0,9,0,0,1,0
4,64486,1,29,1.0,1,2,0,30,4.0,7,0,0,1,0


In [49]:
df_train.head(20)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,7,2.0,0,2,0,30,5.0,8,1,0,0,0
1,65141,4,22,1.0,1,0,0,30,5.0,4,0,0,1,0
2,7513,7,19,1.0,1,2,0,30,3.0,7,0,0,0,0
3,2542,7,23,1.0,1,0,0,30,1.0,10,0,0,0,0
4,48945,8,26,1.0,1,0,0,40,3.0,2,0,0,1,0
5,58896,0,2,1.0,1,2,0,30,3.0,7,0,0,2,0
6,20379,4,20,1.0,0,0,0,30,3.0,5,0,0,1,0
7,16290,4,34,2.0,1,2,0,30,3.0,6,0,0,1,0
8,73202,0,20,1.0,1,0,0,20,4.0,5,0,0,2,0
9,28911,7,1,2.0,1,2,0,30,5.0,5,1,0,0,0


In [27]:
df_test['is_promoted'] = rf_y_pred2
header = ['employee_id', 'is_promoted']
df_test[header].to_csv('sample_submission.csv', columns = header, index=False)