In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import precision_recall_cutoff

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'In_Class_Assignments/turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Changing sales to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales'], axis = 1), pd.get_dummies(turnover['sales'])], axis = 1)

## Changing salary to dummy variables
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'])], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
## Creating interactions/features from the decision tree
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= 0.575), 1, 0)
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] >= 2.5) & (turnover['satisfaction_level'] >= 0.115), 1, 0)
turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= 0.465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5), 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0


In [4]:
## Defining the input and target variables
X = turnover.drop(columns = ['left', 'salary'], axis = 1)
Y = turnover['left']

## Spliting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [6]:
## Defining the list to store results 
results = list()

for i in range(0, 10):
    
    ## Splitting the data
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train)
    
    ## Builiding the random forest model 
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_training, Y_training)
    
    ## Extracting feature importances
    results.append(RF.feature_importances_)
    
## Changing to data-frame
results = pd.DataFrame(results)
results.columns = X.columns
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.177666,0.041856,0.098648,0.060158,0.089154,0.012305,0.000987,8.6e-05,0.000142,3.6e-05,0.000225,0.000524,1.6e-05,4.4e-05,8.6e-05,2.8e-05,0.000105,0.005091,0.006846,0.001163,0.190975,0.041011,0.272849
1,0.179979,0.04262,0.108236,0.068912,0.081178,0.013366,0.000861,9.7e-05,0.000317,3.5e-05,0.00017,0.000305,3e-06,6.8e-05,9.8e-05,6.1e-05,5.5e-05,0.005416,0.005229,0.000483,0.206248,0.040073,0.246191
2,0.195512,0.045019,0.102167,0.076107,0.086961,0.009154,0.001051,7.8e-05,0.000276,2.1e-05,0.000391,0.000406,1.9e-05,1.9e-05,5.2e-05,4.6e-05,0.000126,0.00707,0.005633,0.000758,0.190754,0.039207,0.239171
3,0.184717,0.038522,0.096593,0.069911,0.089596,0.009744,0.000927,3e-05,0.000206,4.5e-05,0.000117,0.000401,7e-06,3.7e-05,4.2e-05,7.6e-05,0.000133,0.004929,0.006361,0.001074,0.206347,0.045627,0.24456
4,0.179668,0.050394,0.115354,0.065023,0.086766,0.011606,0.000739,0.000106,0.000197,3.5e-05,5.8e-05,0.000277,1.4e-05,5.4e-05,7.5e-05,7.1e-05,0.000105,0.006482,0.004462,0.001065,0.206568,0.033346,0.237532
5,0.189727,0.041365,0.088232,0.061183,0.081425,0.010131,0.001135,6.1e-05,0.000153,1.9e-05,0.000148,0.000623,1e-05,3.4e-05,6.5e-05,3.3e-05,5.9e-05,0.007618,0.008038,0.001155,0.199219,0.041263,0.268303
6,0.181064,0.040577,0.121911,0.069147,0.090024,0.008008,0.000971,2e-05,0.000268,3.7e-05,0.000247,0.00015,1.7e-05,5e-06,0.000103,9e-05,5.6e-05,0.004695,0.006084,0.00085,0.197683,0.037839,0.240153
7,0.197604,0.045556,0.105151,0.072498,0.095967,0.011755,0.000688,2.9e-05,0.000288,4.8e-05,0.00016,0.000644,1.6e-05,0.000123,9.5e-05,5e-05,8.6e-05,0.006133,0.005895,0.000946,0.187266,0.038573,0.230429
8,0.175417,0.042693,0.116492,0.063659,0.084603,0.013052,0.000647,4.2e-05,0.000214,0.000105,0.000134,0.000598,3.8e-05,6.9e-05,9.3e-05,3.5e-05,6.6e-05,0.003907,0.00698,0.001226,0.203993,0.038809,0.247126
9,0.187704,0.038353,0.101118,0.079251,0.08717,0.01257,0.000762,3.6e-05,0.000494,1.5e-05,0.000263,0.000472,9e-06,3.9e-05,6.2e-05,5.5e-05,6.5e-05,0.006245,0.007779,0.001119,0.182168,0.039109,0.255144


In [7]:
## Computing averages and sorting by importance
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results = pd.DataFrame({'Feature':results.index, 'Importance': results[0].values})
results = results.sort_values(by = 'Importance', ascending = False)
results

Unnamed: 0,Feature,Importance
22,interaction_3,0.248146
20,interaction_1,0.197122
0,satisfaction_level,0.184906
2,number_project,0.10539
4,time_spend_company,0.087284
3,average_montly_hours,0.068585
1,last_evaluation,0.042696
21,interaction_2,0.039486
5,Work_accident,0.011169
18,low,0.006331


In [8]:
X_train_1 = X_train[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]
X_test_1 = X_test[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]

## Random forest with top 5
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_1, Y_train)

## Predicting on test
RF_pred = RF.predict_proba(X_test_1)[:, 1]

## Predicting the labels
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_pred)

## Computing the classification report
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2286
           1       0.88      0.91      0.89       714

    accuracy                           0.95      3000
   macro avg       0.93      0.93      0.93      3000
weighted avg       0.95      0.95      0.95      3000



In [9]:
X_train_2 = X_train[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]
X_test_2 = X_test[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]

## Random forest with top 6
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_2, Y_train)

## Predicting on test
RF_pred = RF.predict_proba(X_test_2)[:, 1]

## Predicting the labels
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_pred)

## Computing the classification report
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2286
           1       0.90      0.92      0.91       714

    accuracy                           0.96      3000
   macro avg       0.94      0.95      0.94      3000
weighted avg       0.96      0.96      0.96      3000



In [None]:
## Based on my results, I would use the second model (with top 6 features) to predict 
## employee turnover.