# Feature Engineering

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy

from sklearn.model_selection import train_test_split

In [2]:
from sklearn.metrics import f1_score

In [3]:
train_df = pd.read_csv('../cleaned_data/train.csv')
test_df = pd.read_csv('../cleaned_data/test.csv')


train_df.drop('Unnamed: 0', axis = 1, inplace = True)
train_df.set_index('employee_id', inplace = True)
test_df.drop('Unnamed: 0', axis = 1, inplace = True)
test_df.set_index('employee_id', inplace = True)

## New Features

In [4]:
train_df.describe().iloc[:, :10]

Unnamed: 0,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department_analytics
count,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,0.702379,1.253011,34.803915,3.304481,5.865512,0.351974,0.023172,63.38675,0.08517,0.09765
std,0.457216,0.609264,7.660169,1.21477,4.265094,0.47759,0.15045,13.371559,0.279137,0.296844
min,0.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0,0.0
25%,0.0,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0,0.0
50%,1.0,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0,0.0
75%,1.0,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0,0.0
max,1.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0,1.0


In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

features_to_scale = ['age', 'length_of_service', 'avg_training_score', 'no_of_trainings', 'previous_year_rating', 'predicted_rating']
features_out = []
for i in features_to_scale:
    train_df[i + '_sc'] = train_df[i]
    test_df[i + '_sc'] = test_df[i]
    features_out.append(i + '_sc')

scaled_features = scaler.fit_transform(train_df.loc[:, features_to_scale])
train_df[features_out] = scaled_features
scaled_features = scaler.transform(test_df.loc[:, features_to_scale])
test_df[features_out] = scaled_features

In [7]:
test_df.columns.values

array(['gender', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'department_analytics', 'department_finance',
       'department_hr', 'department_legal', 'department_operations',
       'department_procurement', 'department_r_and_d',
       'department_sales_and_martketing', 'department_technology',
       'region_1', 'region_10', 'region_11', 'region_12', 'region_13',
       'region_14', 'region_15', 'region_16', 'region_17', 'region_18',
       'region_19', 'region_2', 'region_20', 'region_21', 'region_22',
       'region_23', 'region_24', 'region_25', 'region_26', 'region_27',
       'region_28', 'region_29', 'region_3', 'region_30', 'region_31',
       'region_32', 'region_33', 'region_34', 'region_4', 'region_5',
       'region_6', 'region_7', 'region_8', 'region_9', "Bachelor's",
       'Below Secondary', "Master's & above", 'Unknown', 'other',
       'referred', 'sourcing', 'rating_missin

In [None]:
train_df.describe().iloc[:, 60:]

In [None]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis=1)
# X = scaler.fit_transform(X)
# newtest_df = scaler.transform(test_df)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=40, random_state=0, min_samples_split=10)
clf.fit(X_train, y_train)
print(f1_score(y_test, clf.predict(X_test)))

In [None]:
# for i in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60]:
#     for j in [2, 5, 10, 15]:
#         clf = RandomForestClassifier(max_depth=i, random_state=0, min_samples_split=j)
#         clf.fit(X_train, y_train)
#         print(i, j, f1_score(y_test, clf.predict(X_test)))

## New Features

In [None]:
train_df.age.plot.hist()

In [8]:
def age_group(i):
    if i <= 30:
        return 1
    elif i <= 40:
        return 2
    elif i <= 50:
        return 3
    else:
        return 4
    
test_df['age_group'] = [age_group(i) for i in test_df.age]
train_df['age_group'] = [age_group(i) for i in train_df.age]

In [9]:
def training_score_groups(i):
    if i <= 45:
        return 1
    elif i <= 55:
        return 2
    elif i <= 65:
        return 3
    elif i <= 75:
        return 4
    elif i <= 85:
        return 5
    else:
        return 6

test_df['training_score_groups'] = [training_score_groups(i) for i in test_df['avg_training_score']]
train_df['training_score_groups'] = [training_score_groups(i) for i in train_df['avg_training_score']]

In [None]:
train_df['avg_training_score'].hist(bins=20)

In [None]:
train_df['length_of_service'].hist(bins = 20)

In [10]:
def length_of_service_groups(i):
    if i <= 4:
        return 1
    elif i <= 8:
        return 2
    elif i <= 12:
        return 3
    elif i <= 16:
        return 4
    elif i <= 20:
        return 5
    else:
        return 6
    
train_df['length_of_service_groups'] = [length_of_service_groups(i) for i in train_df['length_of_service']]
test_df['length_of_service_groups'] = [length_of_service_groups(i) for i in test_df['length_of_service']]

In [None]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis=1)
X = train_df.drop(['age', 'length_of_service', 'avg_training_score', 'no_of_trainings', 'previous_year_rating', 'predicted_rating'], axis = 1)
# X = scaler.fit_transform(X)
# newtest_df = scaler.transform(test_df)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=36)

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=40, random_state=0, min_samples_split=10)
clf.fit(X_train, y_train)
print(f1_score(y_train, clf.predict(X_train)))
print(f1_score(y_test, clf.predict(X_test)))

In [None]:
pd.Series(clf.predict(X_train)).value_counts()
pd.Series(clf.predict(X_test)).value_counts()

In [None]:
y_test.value_counts()

In [None]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted', 'age', 'length_of_service', 'avg_training_score', 'no_of_trainings', 'previous_year_rating', 'predicted_rating'], axis = 1)
X_test_new = test_df.drop(['age', 'length_of_service', 'avg_training_score', 'no_of_trainings', 'previous_year_rating', 'predicted_rating'], axis = 1)

In [None]:
X_test_new.columns.values

In [None]:
X.columns.values

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=40, random_state=0, min_samples_split=10)
clf.fit(X, Y)
print(f1_score(Y, clf.predict(X)))
pred = clf.predict(X_test_new)

In [None]:
pred = pd.DataFrame(pred, index = X_test_new.index)

In [None]:
pred.columns = ['is_promoted']

In [None]:
pred.is_promoted.value_counts()

In [None]:
pred.to_csv('../output/sub_14_23-42.csv')

In [13]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis = 1)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=40, random_state=0, min_samples_split=10)
clf.fit(X, Y)
print(f1_score(Y, clf.predict(X)))
pred = clf.predict(test_df)
pred = pd.DataFrame(pred, index = test_df.index)
pred.columns = ['is_promoted']
pred.to_csv('../output/sub_14_23-53.csv')

0.6190545880286843


In [None]:
X.columns.values

In [None]:
X_test.columns.values

In [18]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis = 1)
from sklearn.ensemble import RandomForestClassifier
for i in [2, 5, 6, 8, 10, 15, 20]:
    clf = RandomForestClassifier(max_depth=50, random_state=0, min_samples_split=2)
    clf.fit(X, Y)
    print(i, f1_score(Y, clf.predict(X)))
#     pred = clf.predict(test_df)
#     pred = pd.DataFrame(pred, index = test_df.index)
#     pred.columns = ['is_promoted']
#     pred.to_csv('../output/sub_14_23-53.csv')

2 0.939122548463893
5 0.939122548463893
6 0.939122548463893
8 0.939122548463893
10 0.939122548463893
15 0.939122548463893
20 0.939122548463893


In [21]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis = 1)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=50, random_state=0, min_samples_split=10)
clf.fit(X, Y)
print(i, f1_score(Y, clf.predict(X)))
pred = clf.predict(test_df)
pred = pd.DataFrame(pred, index = test_df.index)
pred.columns = ['is_promoted']
pred.to_csv('../output/sub_14_23-581.csv')

20 0.6247630158961645


In [22]:
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis = 1)
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(X, Y)
print(i, f1_score(Y, clf.predict(X)))
pred = clf.predict(test_df)
pred = pd.DataFrame(pred, index = test_df.index)
pred.columns = ['is_promoted']
pred.to_csv('../output/sub_14_23-59.csv')

20 0.4551429510351626


  if diff:
  if diff:


## XGBOOST 

In [23]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4



In [43]:
def modelfit(alg, dtrain, predictors, target, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], 
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds, 
                          verbose_eval=True)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [44]:
#Choose all predictors except target & IDcols
predictors = X.columns.values.tolist()
xgb1 = XGBClassifier(learning_rate =0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    nthread=4,
                    scale_pos_weight = 0.0833,
                    seed=27)
# Y = train_df['is_promoted']
# X = train_df.drop(['is_promoted'], axis = 1)
modelfit(xgb1, train_df, predictors, 'is_promoted')

[0]	train-auc:0.766605+0.00147147	test-auc:0.766505+0.00699217
[1]	train-auc:0.767024+0.00163004	test-auc:0.766943+0.00680031
[2]	train-auc:0.767228+0.00177074	test-auc:0.767314+0.00657873
[3]	train-auc:0.767494+0.00207293	test-auc:0.767557+0.0061852
[4]	train-auc:0.767818+0.001939	test-auc:0.767842+0.00632773
[5]	train-auc:0.768134+0.00172498	test-auc:0.768108+0.00642183
[6]	train-auc:0.769388+0.00227879	test-auc:0.769109+0.00663466
[7]	train-auc:0.77774+0.0113318	test-auc:0.77767+0.0161731
[8]	train-auc:0.783072+0.0107364	test-auc:0.783394+0.0117274
[9]	train-auc:0.786418+0.00826824	test-auc:0.785648+0.0113672
[10]	train-auc:0.787467+0.00788814	test-auc:0.786729+0.0105488
[11]	train-auc:0.789038+0.00700605	test-auc:0.788582+0.0108453
[12]	train-auc:0.795214+0.00713531	test-auc:0.795421+0.0110242
[13]	train-auc:0.802882+0.00591797	test-auc:0.803765+0.0100972
[14]	train-auc:0.807066+0.00241545	test-auc:0.807336+0.00463837
[15]	train-auc:0.806754+0.00229179	test-auc:0.80728+0.00467752
[

[129]	train-auc:0.915185+0.00104101	test-auc:0.903372+0.00339803
[130]	train-auc:0.915375+0.00105293	test-auc:0.903451+0.00334625
[131]	train-auc:0.915514+0.00107081	test-auc:0.903574+0.00328984
[132]	train-auc:0.915735+0.00100685	test-auc:0.903763+0.00315769
[133]	train-auc:0.915935+0.000952487	test-auc:0.903998+0.00319319
[134]	train-auc:0.916141+0.000810874	test-auc:0.904133+0.00307314
[135]	train-auc:0.916368+0.000853044	test-auc:0.904303+0.002987
[136]	train-auc:0.916513+0.000831541	test-auc:0.90437+0.00302686
[137]	train-auc:0.916735+0.000841694	test-auc:0.90448+0.00300884
[138]	train-auc:0.916846+0.000881228	test-auc:0.904518+0.00302767
[139]	train-auc:0.916991+0.00089549	test-auc:0.904539+0.00300816
[140]	train-auc:0.917108+0.00088438	test-auc:0.904577+0.00296658
[141]	train-auc:0.917294+0.000803976	test-auc:0.904749+0.00305884
[142]	train-auc:0.917498+0.000876093	test-auc:0.904938+0.00298622
[143]	train-auc:0.917577+0.000876324	test-auc:0.904978+0.00299872
[144]	train-auc:0.91

[256]	train-auc:0.928055+0.000981986	test-auc:0.908577+0.00342439
[257]	train-auc:0.928157+0.000964908	test-auc:0.908602+0.00343147
[258]	train-auc:0.928204+0.000975541	test-auc:0.908642+0.00343447
[259]	train-auc:0.928322+0.00100127	test-auc:0.908609+0.00340311
[260]	train-auc:0.928419+0.000981177	test-auc:0.908653+0.00343512
[261]	train-auc:0.928478+0.000982106	test-auc:0.908672+0.00339156
[262]	train-auc:0.928566+0.00100747	test-auc:0.908646+0.00343024
[263]	train-auc:0.928628+0.00101271	test-auc:0.908662+0.00349001
[264]	train-auc:0.928697+0.00100763	test-auc:0.90875+0.00358481
[265]	train-auc:0.928772+0.00103113	test-auc:0.908804+0.00361659
[266]	train-auc:0.928844+0.00103218	test-auc:0.908834+0.00365149
[267]	train-auc:0.928907+0.00102822	test-auc:0.908853+0.00368307
[268]	train-auc:0.928972+0.00105737	test-auc:0.908837+0.00372047
[269]	train-auc:0.929072+0.00101634	test-auc:0.908911+0.00377906
[270]	train-auc:0.929142+0.00102395	test-auc:0.908921+0.00377696
[271]	train-auc:0.929

  if diff:



Model Report
Accuracy : 0.9407
AUC Score (Train): 0.930820


TypeError: 'str' object is not callable

In [49]:
pred = xgb1.predict(test_df)
pred = pd.DataFrame(pred, index = test_df.index)
pred.columns = ['is_promoted']
pred.to_csv('../output/sub_15_00-29.csv')

  if diff:
