In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import precision_recall_cutoff

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'In_Class_Assignments/turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Changing sales to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales'], axis = 1), pd.get_dummies(turnover['sales'])], axis = 1)

## Changing salary to dummy variables
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'])], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
## Creating interactions/features from the decision tree
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= 0.575), 1, 0)
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] >= 2.5) & (turnover['satisfaction_level'] >= 0.115), 1, 0)
turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= 0.465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5), 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0


In [18]:
## Defining the input and target variables
X = turnover.drop(columns = ['left', 'salary'], axis = 1)
Y = turnover['left']

## Spliting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Defining list to store results
results = list()

for i in range(0, 10):

    ## Spliting the data 
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train)
    
    ## Building the random forest model 
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_training, Y_training)
    
    ## Extracting feature importances
    results.append(RF.feature_importances_)
    
## Changing to data-frame
results = pd.DataFrame(results)
results.columns = X.columns

## Computing the average and sorting by importance
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results = pd.DataFrame({'Feature': results.index, 'Importance': results[0].values})
results = results.sort_values(by = 'Importance' ascending = False)
results.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.18605,0.041698,0.105107,0.072144,0.085928,0.009882,0.001185,8.4e-05,0.000356,7e-05,5.8e-05,0.000394,2.4e-05,3.8e-05,9.3e-05,1.5e-05,0.000104,0.004342,0.004978,0.000803,0.211211,0.03811,0.237326
1,0.192848,0.046615,0.10834,0.05918,0.078915,0.010315,0.000848,6.2e-05,0.000336,4.1e-05,0.000152,0.000334,2.2e-05,1.7e-05,0.000101,3.4e-05,0.000142,0.00412,0.004216,0.000637,0.201019,0.035195,0.25651
2,0.179584,0.043283,0.118467,0.071578,0.092762,0.008606,0.00059,6e-06,0.000366,2.4e-05,0.00019,0.000456,1e-05,3.6e-05,6.6e-05,3e-05,0.000129,0.003751,0.005907,0.001224,0.190621,0.038842,0.243474
3,0.140559,0.042188,0.111007,0.075751,0.084293,0.010117,0.00045,7.3e-05,0.000317,5.5e-05,0.000276,0.000283,3.2e-05,2.6e-05,3.2e-05,6.9e-05,0.000123,0.004527,0.005796,0.001093,0.188204,0.04881,0.28592
4,0.210786,0.033427,0.101107,0.07508,0.074347,0.011973,0.000685,2.9e-05,0.000139,3.9e-05,8e-05,0.00063,1.6e-05,4.6e-05,4.9e-05,6.4e-05,0.000117,0.003397,0.005744,0.000708,0.185218,0.047466,0.248851


In [8]:
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results = pd.DataFrame({'Feature': results.index, 'Importance': results[0].values})
results = results.sort_values(by = 'Importance' ascending = False)
results.head(10)

(23,)

In [19]:
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results.head()

Unnamed: 0,0
satisfaction_level,0.184877
last_evaluation,0.040648
number_project,0.109288
average_montly_hours,0.069424
time_spend_company,0.083429


In [27]:
results[0].values

array([1.84877451e-01, 4.06481403e-02, 1.09288069e-01, 6.94241599e-02,
       8.34288228e-02, 1.14379712e-02, 8.29312050e-04, 4.69945997e-05,
       3.36602163e-04, 4.30611510e-05, 1.42243301e-04, 4.63496281e-04,
       2.08546594e-05, 3.52332110e-05, 6.83144429e-05, 4.01483546e-05,
       1.54490528e-04, 4.52927094e-03, 5.74011638e-03, 9.83466762e-04,
       1.92499420e-01, 4.15500860e-02, 2.53412275e-01])