In [5]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from precision_recall_cutoff import precision_recall_cutoff
from sklearn.feature_selection import RFE, RFECV

s3= boto3.resource('s3')
bucket_name= 'morgan-gant-data448-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'turnover.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [6]:
#changing to dummy variables
turnover= pd.concat([turnover.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(turnover[['sales', 'salary']])], axis=1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [7]:
#engineering features from the decisoon tree model
turnover['interaction_1']=np.where(((turnover['satisfaction_level'] >= .115) &
                                   (turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] > 2.5)), 1, 0)

turnover['interaction_2']=np.where(((turnover['satisfaction_level'] >= .465) &
                                   (turnover['number_project'] <=2.5) &
                                   (turnover['last_evaluation'] <= .575)), 1, 0)
        
turnover['interaction3']=np.where(((turnover['satisfaction_level'] >= .465) &
                                    (turnover['time_spend_company'] <=4.5) &
                                    (turnover['average_montly_hours'] <=290.5)), 1,0)

In [13]:
#Defining input and target variables
x= turnover.drop(columns= 'left', axis=1)
y= turnover['left']

#splitting the data
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= .2, stratify= y)

In [14]:
#Scaling inputs
scaler=MinMaxScaler()
x_train= pd.DataFrame(scaler.fit_transform(x_train), columns= x_train.columns)
x_test= pd.DataFrame(scaler.fit_transform(x_test), columns= x_test.columns)

RFECV with Logistic

In [15]:
#running RFE with Random forest model
lr_rfecv = RFECV(estimator =LogisticRegression(), min_features_to_select = 2, cv=3).fit(x_train, y_train)

#extracting feature names 
print(x_train.columns[lr_rfecv.support_])

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary_high', 'interaction_2',
       'interaction3'],
      dtype='object')


In [21]:
#defining input and target variables
x_train1= x_train[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary_high', 'interaction_2',
       'interaction3']]
x_test1= x_test[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary_high', 'interaction_2',
       'interaction3']]

#building the model
lr_md= LogisticRegression().fit(x_train1, y_train)

#predicting on the test
lr_pred= lr_md.predict_proba(x_test1)[:,1]

#changing likelyhoods to labels
lr_label= precision_recall_cutoff(y_test, lr_pred)

print(classification_report(y_test,lr_label))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      2286
           1       0.81      0.86      0.83       714

    accuracy                           0.92      3000
   macro avg       0.88      0.90      0.89      3000
weighted avg       0.92      0.92      0.92      3000



RFECV with Regression

In [20]:
#running RFE with Random forest model
rf_rfecv = RFECV(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3), step=1, min_features_to_select = 2, cv=3 ).fit(x_train, y_train)
#extracting feature names 
print(x_train.columns[rf_rfecv.support_])

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'salary_low', 'interaction_1', 'interaction3'],
      dtype='object')


In [23]:
#defining input and target variables
x_train2= x_train[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'salary_low', 'interaction_1', 'interaction3']]
x_test2= x_test[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'salary_low', 'interaction_1', 'interaction3']]

#building the model
rf_md= LogisticRegression().fit(x_train2, y_train)

#predicting on the test
rf_pred= rf_md.predict_proba(x_test2)[:,1]

#changing likelyhoods to labels
rf_label= precision_recall_cutoff(y_test, rf_pred)

print(classification_report(y_test,rf_label))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94      2286
           1       0.78      0.89      0.83       714

    accuracy                           0.91      3000
   macro avg       0.87      0.90      0.89      3000
weighted avg       0.92      0.91      0.91      3000



In [None]:
#Based on my data split and models, if I were to predict on left I would use the logistic regression model because it had a slightly better performance 