In [10]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from precision_recall_cutoff import precision_recall_cutoff
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR

s3= boto3.resource('s3')
bucket_name= 'morgan-gant-data448-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'turnover.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [11]:
#changing to dummy variables
turnover= pd.concat([turnover.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(turnover[['sales', 'salary']])], axis=1)

#engineering features from the decisoon tree model
turnover['interaction_1']=np.where(((turnover['satisfaction_level'] >= .115) &
                                   (turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] > 2.5)), 1, 0)

turnover['interaction_2']=np.where(((turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] <=2.5) &
                                   (turnover['last_evaluation'] <= .575)), 1, 0)
        
turnover['interaction3']=np.where(((turnover['satisfaction_level'] >= .465) &
                                    (turnover['time_spend_company'] <=4.5) &
                                    (turnover['average_montly_hours'] <=290.5)), 1,0)


In [12]:
#Defining input and target variables
x= turnover[['satisfaction_level', 'number_project', 'time_spend_company', 'interaction_1', 'interaction3']]
y= turnover['left']

#splitting the data
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= .2, stratify=y)

GridsearchCv with Ada

In [22]:
ada_param_grid = {'n_estimators': [100, 300],
'base_estimator__min_samples_split': [10, 15],
'base_estimator__min_samples_leaf': [5, 7],
'base_estimator__max_depth': [3, 5, 7],
'learning_rate': [0.01]}

ada_grid_search=RandomizedSearchCV(estimator= AdaBoostClassifier(base_estimator= DecisionTreeClassifier()), 
                                   param_distributions= ada_param_grid, 
                                   cv= 3, 
                                   scoring= 'f1', 
                                   n_jobs=-1,
                                   n_iter= 10).fit(x_train, y_train)



In [15]:
ada_grid_search.best_params_

{'base_estimator__max_depth': 7,
 'base_estimator__min_samples_leaf': 7,
 'base_estimator__min_samples_split': 10,
 'learning_rate': 0.01,
 'n_estimators': 300}

In [17]:
ada= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(min_samples_split=10, min_samples_leaf= 7, max_depth= 7),
                        n_estimators= 300 , 
                        learning_rate= .01).fit(x_train, y_train)

#predicting on the test
ada_pred= ada.predict_proba(x_test)[:,1]

#changing likelyhoods to labels
ada_label= precision_recall_cutoff(y_test, ada_pred)

print(classification_report(y_test,ada_label))



              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2286
           1       0.95      0.94      0.95       714

    accuracy                           0.98      3000
   macro avg       0.97      0.96      0.97      3000
weighted avg       0.98      0.98      0.98      3000



RandomizedSearchCV with Ada

In [22]:
ada_param_grid = {'n_estimators': [100, 300],
'base_estimator__min_samples_split': [10, 15],
'base_estimator__min_samples_leaf': [5, 7],
'base_estimator__max_depth': [3, 5, 7],
'learning_rate': [0.01]}

ada_grid_search=RandomizedSearchCV(estimator= AdaBoostClassifier(base_estimator= DecisionTreeClassifier()), 
                                   param_distributions= ada_param_grid, 
                                   cv= 3, 
                                   scoring= 'f1', 
                                   n_jobs=-1,
                                   n_iter= 10).fit(x_train, y_train)



In [23]:
ada_grid_search.best_params_

{'n_estimators': 300,
 'learning_rate': 0.01,
 'base_estimator__min_samples_split': 15,
 'base_estimator__min_samples_leaf': 7,
 'base_estimator__max_depth': 7}

In [24]:
ada= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(min_samples_split= 15, min_samples_leaf= 7, max_depth= 7),
                        n_estimators= 300 , 
                        learning_rate= .01).fit(x_train, y_train)

#predicting on the test
ada_pred= ada.predict_proba(x_test)[:,1]

#changing likelyhoods to labels
ada_label= precision_recall_cutoff(y_test, ada_pred)

print(classification_report(y_test,ada_label))



              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2286
           1       0.95      0.94      0.95       714

    accuracy                           0.97      3000
   macro avg       0.97      0.96      0.97      3000
weighted avg       0.97      0.97      0.97      3000



In [None]:
#both accuracy scores were very similar when predicting on left. However, the Gridsearch did a little better so if I had to choose based 
#on 1 iteration, I would use Gridsreach to predict on left.