In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import precision_recall_cutoff

## Where the data lives: the bucket name and the key of the csv file inside it
bucket_name = 'data-448'
file_key = 'In_Class_Assignments/turnover.csv'

## Getting a handle on the S3 bucket and on the csv object stored in it
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

## Requesting the object; the 'Body' entry of the response is what gets handed to pandas
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Parsing the csv into a data-frame and previewing its first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Building the dummy variables for the 'sales' column and for the 'salary' column
department_dummies = pd.get_dummies(turnover['sales'])
salary_dummies = pd.get_dummies(turnover['salary'])

## Replacing the raw 'sales' column by its dummies and appending the salary dummies
## (the raw 'salary' column is kept in the data-frame at this point)
turnover = pd.concat([turnover.drop(columns = ['sales']), department_dummies, salary_dummies], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
## Creating interactions/features from the decision tree
## Each interaction is a 0/1 flag marking the rows that satisfy one conjunction of threshold rules
## NOTE(review): the thresholds presumably come from a tree fitted elsewhere -- confirm their origin
satisfaction = turnover['satisfaction_level']
projects = turnover['number_project']

## Rule 1: satisfaction, number of projects and last evaluation all at or below their thresholds
rule_1 = satisfaction.le(0.465) & projects.le(2.5) & turnover['last_evaluation'].le(0.575)

## Rule 2: satisfaction between 0.115 and 0.465 with number of projects at or above 2.5
rule_2 = satisfaction.le(0.465) & projects.ge(2.5) & satisfaction.ge(0.115)

## Rule 3: satisfaction at or above 0.465 with time in company and monthly hours at or below their thresholds
rule_3 = satisfaction.ge(0.465) & turnover['time_spend_company'].le(4.5) & turnover['average_montly_hours'].le(290.5)

## Storing the three flags as new columns
turnover['interaction_1'] = np.where(rule_1, 1, 0)
turnover['interaction_2'] = np.where(rule_2, 1, 0)
turnover['interaction_3'] = np.where(rule_3, 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0


In [6]:
## Defining the input and target variables
## ('left' is the target; the raw 'salary' column is dropped together with it)
X = turnover.drop(columns = ['left', 'salary'])
Y = turnover['left']

## Seed used by every random step below, so that re-running the cell gives the same importances
SEED = 42

## Splitting the data (X_test and Y_test are held out and not used in this cell)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = SEED)

## Defining list to store the feature importances of each repetition
importances = list()

for i in range(0, 10):

    ## Splitting the training data again; the seed changes with i so each repetition
    ## sees a different (but reproducible) split
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train, random_state = SEED + i)

    ## Building the random forest model (seeded for the same reason as the split)
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = SEED + i).fit(X_training, Y_training)

    ## Extracting feature importances
    importances.append(RF.feature_importances_)

## Changing to data-frame: one row per repetition, one column per input variable
results = pd.DataFrame(importances, columns = X.columns)
results.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.164705,0.040827,0.099073,0.060966,0.077724,0.009481,0.001082,4e-05,0.00026,3.7e-05,7.2e-05,0.000376,2.1e-05,1.6e-05,4.6e-05,3.8e-05,0.000195,0.004594,0.00554,0.001436,0.198354,0.041464,0.293652
1,0.175094,0.04072,0.110575,0.069191,0.078686,0.013343,0.001152,0.000104,0.000442,0.000103,7.4e-05,0.000298,1.3e-05,3.7e-05,3.1e-05,4.2e-05,9.8e-05,0.004959,0.006635,0.001124,0.207796,0.035233,0.254251
2,0.194414,0.038677,0.087673,0.07153,0.079798,0.012381,0.000657,4.5e-05,0.000434,2.3e-05,0.00019,0.000372,6e-06,3.9e-05,8.7e-05,4.4e-05,0.000155,0.00338,0.006084,0.000815,0.205264,0.039603,0.25833
3,0.208901,0.041736,0.107293,0.07365,0.077561,0.009978,0.000878,3.6e-05,0.000236,2.2e-05,9.7e-05,0.000277,1.8e-05,1.5e-05,2.7e-05,4.8e-05,8.4e-05,0.00532,0.005977,0.001108,0.184786,0.05135,0.230604
4,0.200071,0.036927,0.124973,0.070988,0.083851,0.009733,0.000667,3.2e-05,0.000191,2e-05,8.2e-05,0.000275,4e-06,3e-05,8.1e-05,6.6e-05,0.000203,0.005042,0.0065,0.000923,0.186182,0.035579,0.237579


In [7]:
## Averaging each column of results (one column per input variable) over its rows
results.mean(axis = 0)

satisfaction_level       0.184835
last_evaluation          0.041727
number_project           0.107661
average_montly_hours     0.069435
time_spend_company       0.082121
Work_accident            0.010976
promotion_last_5years    0.000865
IT                       0.000049
RandD                    0.000315
accounting               0.000051
hr                       0.000083
management               0.000377
marketing                0.000027
product_mng              0.000027
sales                    0.000056
support                  0.000062
technical                0.000134
high                     0.004903
low                      0.006872
medium                   0.001189
interaction_1            0.193139
interaction_2            0.042121
interaction_3            0.252974
dtype: float64