In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

from precision_recall_cutoff import precision_recall_cutoff

## Names of the S3 bucket and of the csv file stored inside it
bucket_name = 'data-448'
file_key = 'In_Class_Assignments/turnover.csv'

## Opening the bucket through the boto3 resource API
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetching the object and taking the 'Body' entry of the response,
## which is handed to pandas as the file-like input
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file and previewing the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()



Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Replacing sales and salary with their dummy (one-hot) columns;
## the remaining columns keep their order and the dummies are appended at the end
turnover = pd.get_dummies(turnover, columns = ['sales', 'salary'])
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [3]:
## Engineering features from the decision tree model 
## Each interaction flag is 1 when every condition of its rule holds and 0 otherwise
satisfaction = turnover['satisfaction_level']
projects = turnover['number_project']

## Rule 1: satisfaction inside [0.115, 0.465] (both ends included) and more than 2.5 projects
rule_1 = satisfaction.between(0.115, 0.465) & (projects > 2.5)

## Rule 2: satisfaction at most 0.465, at most 2.5 projects and last evaluation at most 0.575
rule_2 = (satisfaction <= 0.465) & (projects <= 2.5) & (turnover['last_evaluation'] <= 0.575)

## Rule 3: satisfaction above 0.465, at most 4.5 years in the company and at most 290.5 monthly hours
rule_3 = (satisfaction > 0.465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5)

## Storing the rules as 0/1 columns
turnover['interaction_1'] = np.where(rule_1, 1, 0)
turnover['interaction_2'] = np.where(rule_2, 1, 0)
turnover['interaction_3'] = np.where(rule_3, 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [4]:
## Defining input and target
## ('left' is the target; every other column is used as an input)
X = turnover.drop(columns = 'left')
Y = turnover['left']

## Splitting the data: 20% is held out for testing, stratified on the target.
## The fixed random_state makes the split identical on every run, so results
## computed from it can be reproduced after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

In [5]:
## Defining the list to store feature importances
results = list()

## Number of repeated split-and-fit rounds
n_runs = 10

for i in range(0, n_runs):
    
    ## Splitting the train data
    ## (the loop index is used as the seed: every round gets a different split,
    ## yet the whole experiment is reproducible from one run to the next)
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train, random_state = i)
    
    ## Building the RF model (seeded for the same reason as the split)
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i).fit(X_training, Y_training)
    
    ## Extracting feature importances
    results.append(RF.feature_importances_)

## Transforming list into data-frame: one row per round, one column per input feature
results = pd.DataFrame(results, columns = X_train.columns)

In [6]:
## Displaying the importances collected in the loop (one row per fitted random forest)
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.203181,0.039742,0.120154,0.066002,0.081863,0.009655,0.000758,5.1e-05,0.000221,0.000148,...,2.6e-05,0.000158,7.3e-05,9.5e-05,0.00586,0.007377,0.00102,0.032184,0.206308,0.224689
1,0.188291,0.041264,0.118597,0.063275,0.072266,0.01034,0.001048,6.6e-05,0.000376,6e-05,...,2.5e-05,0.000224,4.1e-05,0.000209,0.003846,0.005297,0.000665,0.050592,0.194927,0.248122
2,0.193393,0.040651,0.11116,0.06728,0.072748,0.009494,0.000619,4e-05,0.000314,5.9e-05,...,4.2e-05,5.1e-05,5.2e-05,9.5e-05,0.004022,0.004348,0.000888,0.046926,0.182926,0.264371
3,0.183048,0.041157,0.129536,0.05865,0.073378,0.010098,0.001069,9.9e-05,0.000482,0.00012,...,3.5e-05,0.000114,2.8e-05,0.000169,0.005721,0.00628,0.000933,0.044236,0.20424,0.240226
4,0.21234,0.042089,0.115142,0.065072,0.079396,0.011835,0.0008,6.3e-05,0.000541,2.9e-05,...,1.3e-05,6.6e-05,3.4e-05,0.000187,0.004959,0.005286,0.000949,0.042093,0.183291,0.235365
5,0.211275,0.042252,0.10706,0.060227,0.092499,0.009037,0.000854,5.7e-05,0.000342,6.7e-05,...,7.2e-05,4.4e-05,4.3e-05,0.000138,0.004284,0.005913,0.000847,0.044223,0.188014,0.232358
6,0.191096,0.050121,0.11028,0.065534,0.079929,0.010574,0.000509,1.7e-05,0.000379,1.8e-05,...,3.9e-05,5.2e-05,4.3e-05,0.000105,0.004406,0.005195,0.000656,0.041674,0.211751,0.22716
7,0.211877,0.040321,0.100558,0.061184,0.083071,0.009382,0.000555,3.9e-05,0.000388,1.7e-05,...,3.5e-05,8.3e-05,4.8e-05,0.000174,0.005948,0.006959,0.000855,0.043889,0.185798,0.248171
8,0.190223,0.039477,0.096594,0.062543,0.07487,0.00825,0.000529,7.3e-05,0.000406,5.3e-05,...,4.2e-05,0.000132,2.2e-05,0.000156,0.004742,0.005444,0.000928,0.044236,0.191215,0.279676
9,0.193466,0.048976,0.126327,0.07408,0.080476,0.010925,0.000823,4.7e-05,0.000477,4.5e-05,...,4.2e-05,9.9e-05,2.5e-05,0.000137,0.006041,0.006297,0.001128,0.03994,0.157304,0.252401


In [7]:
## Averaging the importance of every feature across the rows (column-wise mean)
mean_importances = results.mean(axis = 0)

## Storing the averages as a single-column data-frame indexed by feature name
results = pd.DataFrame(mean_importances)
results

Unnamed: 0,0
satisfaction_level,0.197819
last_evaluation,0.042605
number_project,0.113541
average_montly_hours,0.064385
time_spend_company,0.079049
Work_accident,0.009959
promotion_last_5years,0.000756
sales_IT,5.5e-05
sales_RandD,0.000393
sales_accounting,6.2e-05


In [8]:
## Moving the feature names out of the index and into a regular column,
## next to the values held in column 0
feature_names = results.index
importance_scores = results[0].values

results = pd.DataFrame({'Feature': feature_names, 'Importance': importance_scores})
results

Unnamed: 0,Feature,Importance
0,satisfaction_level,0.197819
1,last_evaluation,0.042605
2,number_project,0.113541
3,average_montly_hours,0.064385
4,time_spend_company,0.079049
5,Work_accident,0.009959
6,promotion_last_5years,0.000756
7,sales_IT,5.5e-05
8,sales_RandD,0.000393
9,sales_accounting,6.2e-05


In [9]:
## Ranking the features from the largest to the smallest importance
results = results.sort_values('Importance', ascending = False)
results

Unnamed: 0,Feature,Importance
22,interaction_3,0.245254
0,satisfaction_level,0.197819
21,interaction_2,0.190577
2,number_project,0.113541
4,time_spend_company,0.079049
3,average_montly_hours,0.064385
20,interaction_1,0.042999
1,last_evaluation,0.042605
5,Work_accident,0.009959
18,salary_low,0.005839
