In [16]:
pip install imblearn

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [17]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from scipy.stats import boxcox
from precision_recall_cutoff import precision_recall_cutoff

s3= boto3.resource('s3')
bucket_name= 'morgan-gant-data448-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'turnover.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [18]:
#changing to dummy variables
turnover= pd.concat([turnover.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(turnover[['sales', 'salary']])], axis=1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [19]:
#Defining scaler
scaler= MinMaxScaler()

#changing to a 0-1 scale
turnover[['number_project', 'average_montly_hours']] = scaler.fit_transform(turnover[['number_project', 'average_montly_hours']])

#BoxCox transformation
transformed_time_spend= boxcox(turnover['time_spend_company'])
turnover['time_spend_company'] = transformed_time_spend[0]
turnover.head

<bound method NDFrame.head of        satisfaction_level  last_evaluation  number_project  \
0                    0.38             0.53             0.0   
1                    0.80             0.86             0.6   
2                    0.11             0.88             1.0   
3                    0.72             0.87             0.6   
4                    0.37             0.52             0.0   
...                   ...              ...             ...   
14994                0.40             0.57             0.0   
14995                0.37             0.48             0.0   
14996                0.37             0.53             0.0   
14997                0.11             0.96             0.8   
14998                0.37             0.52             0.0   

       average_montly_hours  time_spend_company  Work_accident  left  \
0                  0.285047            0.804651              0     1   
1                  0.775701            1.098118              0     1   
2        

In [20]:
#Engineering interactions
turnover['interaction_1']= turnover['satisfaction_level'] * turnover['time_spend_company']
turnover['interaction_2']= turnover['last_evaluation'] * turnover['promotion_last_5years']
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2
0,0.38,0.53,0.0,0.285047,0.804651,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.305767,0.0
1,0.8,0.86,0.6,0.775701,1.098118,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0.878494,0.0
2,0.11,0.88,1.0,0.82243,0.941381,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0.103552,0.0
3,0.72,0.87,0.6,0.593458,1.03233,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.743278,0.0
4,0.37,0.52,0.0,0.294393,0.804651,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.297721,0.0


In [21]:
#Defining input and target variables
X= turnover.drop(columns= 'left', axis=1)
Y= turnover['left']

#splitting the data
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size= .2, stratify= Y)

Random Forest Models

In [22]:
#Building model
rf_md= RandomForestClassifier(n_estimators= 500, max_depth=3).fit(X_train, Y_train)

#Prediction on the test
rf_pred= rf_md.predict_proba(X_test)[:,1]

#Changing likelyhoods to labels
rf_label= precision_recall_cutoff(Y_test, rf_pred)

print(classification_report(Y_test, rf_label))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92      2286
           1       0.71      0.89      0.79       714

    accuracy                           0.89      3000
   macro avg       0.84      0.89      0.86      3000
weighted avg       0.90      0.89      0.89      3000



In [24]:
#Without interactions
X_train_new= X_train.drop(columns=['interaction_1', 'interaction_2'], axis=1)
X_test_new= X_test.drop(columns=['interaction_1', 'interaction_2'], axis=1)

#Building model
rf_md= RandomForestClassifier(n_estimators= 500, max_depth=3).fit(X_train_new, Y_train)

#Prediction on the test
rf_pred= rf_md.predict_proba(X_test_new)[:,1]

#Changing likelyhoods to labels
rf_label= precision_recall_cutoff(Y_test, rf_pred)

print(classification_report(Y_test, rf_label))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92      2286
           1       0.71      0.87      0.78       714

    accuracy                           0.89      3000
   macro avg       0.83      0.88      0.85      3000
weighted avg       0.90      0.89      0.89      3000



In [None]:
#Based on my models and data split, the models have very close to the same performance but
#because models without interactions are less complex and these results show the interactions
#aren't significant I would use the second model to predict on the variable left!