In [6]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [21]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR

from cost_function import cost_function, cost_function_cutoff

s3= boto3.resource('s3')
bucket_name= 'morgan-gant-data448-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'turnover.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [22]:
#changing to dummy variables
turnover= pd.concat([turnover.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(turnover[['sales', 'salary']])], axis=1)

#engineering features from the decisoon tree model
turnover['interaction_1']=np.where(((turnover['satisfaction_level'] >= .115) &
                                   (turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] > 2.5)), 1, 0)

turnover['interaction_2']=np.where(((turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] <=2.5) &
                                   (turnover['last_evaluation'] <= .575)), 1, 0)
        
turnover['interaction_3']=np.where(((turnover['satisfaction_level'] >= .465) &
                                    (turnover['time_spend_company'] <=4.5) &
                                    (turnover['average_montly_hours'] <=290.5)), 1,0)


In [23]:
#Defining input and target variables
x= turnover[['satisfaction_level', 'number_project', 'time_spend_company', 'interaction_1', 'interaction_3']]
y= turnover['left']

#splitting the data
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= .2, stratify=y)

RF GridSearchCV

In [27]:
rf_param_grid = {'n_estimators': [100, 300, 500],
'min_samples_split': [10, 15],
'min_samples_leaf': [5, 7],
'max_depth' : [3, 5, 7]}

#defining customized scoring function
my_score_function= make_scorer(cost_function, greater_is_better= True, need_proba= True)

#runnning gridsearchCV
rf_grid_search= GridSearchCV(estimator= RandomForestClassifier(), param_grid= rf_param_grid, cv= 3, scoring= my_score_function, n_jobs=-1).fit(x_train, y_train)
    
#Extracting best hyperparamter combo
rf_md= rf_grid_search.best_estimator_

#prediciting on test
rf_test_pred= rf_md.predict_proba(x_test)[:,1]

#identify optimal cutoff
opt_cutoff= cost_function_cutoff(y_test, rf_test_pred)

#changing likelyhoodss to lables
rf_label= np.where(rf_test_pred < opt_cutoff , 0, 1)

#computing confusion matrix 
cm= confusion_matrix(y_test, rf_label)

print(cm)
print('Cost of RF model:', -1500*cm[1,0] - 1000*cm[0,1] + 500*cm[1,1])

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 220, in __call__
    return self._score(
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 268, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
TypeError: cost_function() got an unexpected keyword argument 'need_proba'

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 220, in __call__
    return self

[[2225   61]
 [ 105  609]]
Cost of RF model: 86000


In [28]:
print(cm)
print('Cost of RF model:', -1500*cm[1,0] - 1000*cm[0,1] + 500*cm[1,1])

[[2225   61]
 [ 105  609]]
Cost of RF model: 86000


XGBoost

In [30]:
xgb_param_grid = {'n_estimators': [500],
'max_depth': [3, 5, 7],
'min_child_weight': [5, 7],
'learning_rate': [0.01],
'gamma': [0.3, 0.1],
'subsample': [0.8, 1],
'colsample_bytree': [1]}

#defining customized scoring function
my_score_function= make_scorer(cost_function, greater_is_better= True, need_proba= True)

#runnning gridsearchCV
xgb_grid_search= GridSearchCV(estimator= XGBClassifier(), param_grid= xgb_param_grid, cv= 3, scoring= my_score_function, n_jobs=-1).fit(x_train, y_train)
    
#Extracting best hyperparamter combo
xgb_md= xgb_grid_search.best_estimator_

#prediciting on test
xgb_test_pred= xgb_md.predict_proba(x_test)[:,1]

#identify optimal cutoff
opt_cutoff= cost_function_cutoff(y_test, xgb_test_pred)

#changing likelyhoodss to lables
xgb_label= np.where(xgb_test_pred < opt_cutoff , 0, 1)

#computing confusion matrix 
cm= confusion_matrix(y_test, xgb_label)

 nan nan nan nan nan nan]


In [31]:
print(cm)
print('Cost of XGB model:', -1500*cm[1,0] - 1000*cm[0,1] + 500*cm[1,1])

[[2233   53]
 [  60  654]]
Cost of XGB model: 184000


In [None]:
#The model I would use to predict on left is the xgboost because it gives us the greatest return when running this model. 