In [9]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from precision_recall_cutoff import precision_recall_cutoff
from sklearn.feature_selection import RFE, RFECV

s3= boto3.resource('s3')
bucket_name= 'morgan-gant-data448-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'turnover.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [10]:
#changing to dummy variables
turnover= pd.concat([turnover.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(turnover[['sales', 'salary']])], axis=1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [11]:
#engineering features from the decisoon tree model
turnover['interaction_1']=np.where(((turnover['satisfaction_level'] >= .115) &
                                   (turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] > 2.5)), 1, 0)

turnover['interaction_2']=np.where(((turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] <=2.5) &
                                   (turnover['last_evaluation'] <= .575)), 1, 0)
        
turnover['interaction3']=np.where(((turnover['satisfaction_level'] >= .465) &
                                    (turnover['time_spend_company'] <=4.5) &
                                    (turnover['average_montly_hours'] <=290.5)), 1,0)

In [12]:
#Defining input and target variables
x= turnover.drop(columns= 'left', axis=1)
y= turnover['left']

#splitting the data
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= .2, stratify= y)

RFE with Random Forest

In [13]:
#running RFE with Random forest model
rf_rfe = RFE(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3), n_features_to_select = 5).fit(x_train, y_train)

#extracting feature names 
print(x_train.columns[rf_rfe.support_])

Index(['satisfaction_level', 'number_project', 'time_spend_company',
       'interaction_1', 'interaction3'],
      dtype='object')


In [16]:
#defining the inputs and target
x_train1= x_train[['satisfaction_level', 'number_project', 'time_spend_company',
       'interaction_1', 'interaction3']]
x_test1= x_test[['satisfaction_level', 'number_project', 'time_spend_company',
       'interaction_1', 'interaction3']]

#building the model
rf_md1= RandomForestClassifier(n_estimators= 500, max_depth=3).fit(x_train1, y_train)

#predicting on test
rf_pred1= rf_md1.predict_proba(x_test1)[:,1]

#changing liklihoods to labels
rf_labels1= precision_recall_cutoff(y_test, rf_pred1)

print(classification_report(y_test, rf_labels1))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.91      0.89      0.90       714

    accuracy                           0.95      3000
   macro avg       0.94      0.93      0.94      3000
weighted avg       0.95      0.95      0.95      3000



RFE with Random Forest

In [18]:
#running RFECV with Random forest model
afs = RFECV(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3), step=1, min_features_to_select = 2, cv=3).fit(x_train, y_train)

#extracting feature names 
print(x_train.columns[afs.support_])

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'salary_low', 'interaction_1', 'interaction3'],
      dtype='object')


In [19]:
#defining the inputs and target
x_train2= x_train[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'salary_low', 'interaction_1', 'interaction3']]
x_test2= x_test[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'salary_low', 'interaction_1', 'interaction3']]

#building the model
rf_md2= RandomForestClassifier(n_estimators= 500, max_depth=3).fit(x_train2, y_train)

#predicting on test
rf_pred2= rf_md2.predict_proba(x_test2)[:,1]

#changing liklihoods to labels
rf_labels2= precision_recall_cutoff(y_test, rf_pred2)

print(classification_report(y_test, rf_labels2))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2286
           1       0.94      0.92      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.95      0.95      3000
weighted avg       0.97      0.97      0.97      3000



In [None]:
#Based on my results and  my split, the best model to predict on the left variable is model two with the RFECV!