In [1]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
     |████████████████████████████████| 173.6 MB 80.8 MB/s            
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC

from cost_function import cost_function, cost_function_cutoff
from xgboost import XGBClassifier

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'In_Class_Assignments/turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
turnover = pd.read_csv(file_content_stream)
turnover.head()



Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [11]:
## Changing sales and salary to dummies
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [12]:
## Engineering features from the decision tree model 
turnover['interaction_1'] = np.where(((turnover['satisfaction_level'] >= 0.115) & 
                                      (turnover['satisfaction_level'] <= 0.465) & 
                                      (turnover['number_project'] > 2.5)), 1, 0)

turnover['interaction_2'] = np.where(((turnover['satisfaction_level'] <= 0.465) & 
                                      (turnover['number_project'] <= 2.5) & 
                                      (turnover['last_evaluation'] <= 0.575)), 1, 0)

turnover['interaction_3'] = np.where(((turnover['satisfaction_level'] > 0.465) & 
                                      (turnover['time_spend_company'] <= 4.5) & 
                                      (turnover['average_montly_hours'] <= 290.5)), 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [13]:
## Defining input and target
X = turnover[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company',
              'number_project']]
Y = turnover['left']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

# Random Forest

In [15]:
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

## Defining the customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## Performing the grid-search cv
RF_grid_search = GridSearchCV(estimator = RandomForestClassifier(), 
                              param_grid = RF_param_grid, cv = 3,
                              scoring = my_score_function, 
                              n_jobs = -1).fit(X_train, Y_train)

## Extracting the best model
RF_md = RF_grid_search.best_estimator_

## Predicting on test
RF_test_pred = RF_md.predict_proba(X_test)[:, 1]

## Identifying the optimal cut-off
opt_cutoff = cost_function_cutoff(Y_test, RF_test_pred)

## Changing likelihoods into labels
RF_label = np.where(RF_test_pred < opt_cutoff, 0, 1)

## Computing the confusion matrix
X = confusion_matrix(Y_test, RF_label)
print(X)
print('The cost of the RF model is ', 1500*X[1, 0] - 1000*X[0, 1] + 500*X[1, 1])

[[2260   26]
 [  55  659]]
The cost of the RF model is  386000


# XGBoost

In [16]:
XGBoost_param_grid = {'n_estimators': [500],
                      'max_depth': [3, 5, 7],
                      'min_child_weight': [5, 7],
                      'learning_rate': [0.01],
                      'gamma': [0.3, 0.1],
                      'subsample': [0.8, 1],
                      'colsample_bytree': [1]}

## Defining the customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## Performing the grid-search cv
xgb_grid_search = GridSearchCV(estimator = XGBClassifier(), 
                               param_grid = XGBoost_param_grid, 
                               cv = 3,
                               scoring = my_score_function, 
                               n_jobs = -1).fit(X_train, Y_train)

## Extracting the best model
xgb_md = xgb_grid_search.best_estimator_

## Predicting on test
xgb_test_pred = xgb_md.predict_proba(X_test)[:, 1]

## Identifying the optimal cut-off
opt_cutoff = cost_function_cutoff(Y_test, xgb_test_pred)

## Changing likelihoods into labels
xgb_label = np.where(xgb_test_pred < opt_cutoff, 0, 1)

## Computing the confusion matrix
X = confusion_matrix(Y_test, xgb_label)
print(X)
print('The cost of the XGBoost model is ', 1500*X[1, 0] - 1000*X[0, 1] + 500*X[1, 1])



[[2257   29]
 [  56  658]]
The cost of the XGBoost model is  384000
