In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

from precision_recall_cutoff import precision_recall_cutoff

## Where the data lives: bucket name and the key of the csv file inside it
bucket_name = 'omar-vargas-bucket'
file_key = 'turnover.csv'

## Opening the bucket through the S3 resource API
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetching the object; its 'Body' entry is the stream handed to pandas
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Parsing the streamed csv into a data-frame and previewing the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
## Changing sales and salary to dummies:
## the two text columns are one-hot encoded on their own, removed from the
## frame, and the indicator columns are appended on the right
categorical_columns = ['sales', 'salary']
dummy_columns = pd.get_dummies(turnover[categorical_columns])
turnover = pd.concat([turnover.drop(columns = categorical_columns), dummy_columns], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
## Engineering features from the decision tree model
## (each flag is 1 when all three of its conditions hold, 0 otherwise)
satisfaction = turnover['satisfaction_level']
projects = turnover['number_project']

## interaction_1: satisfaction within [0.115, 0.465] and more than 2.5 projects
turnover['interaction_1'] = np.where(
    (satisfaction >= 0.115) & (satisfaction <= 0.465) & (projects > 2.5),
    1,
    0,
)

## interaction_2: satisfaction <= 0.465, at most 2.5 projects, evaluation <= 0.575
turnover['interaction_2'] = np.where(
    (satisfaction <= 0.465) & (projects <= 2.5) & (turnover['last_evaluation'] <= 0.575),
    1,
    0,
)

## interaction_3: satisfaction > 0.465, tenure <= 4.5, monthly hours <= 290.5
turnover['interaction_3'] = np.where(
    (satisfaction > 0.465)
    & (turnover['time_spend_company'] <= 4.5)
    & (turnover['average_montly_hours'] <= 290.5),
    1,
    0,
)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [5]:
## Defining input and target: every column except 'left' is an input
X = turnover.drop(columns = 'left')
Y = turnover['left']

## Splitting the data: 80% train / 20% test, stratified on the target so both
## parts keep the same class proportions of 'left'. The fixed random_state
## makes the split identical after a kernel restart, so everything computed
## from X_train / X_test downstream is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

In [6]:
## Number of repeated random-forest fits
n_runs = 10

## Defining the list to store feature importances (one array per run)
importance_rows = []

for i in range(n_runs):

    ## Splitting the train data; seeding with the run index gives a different
    ## split on every run while keeping each run repeatable
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train, random_state = i)

    ## Building the RF model (seeded too, so the forest itself is repeatable)
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i).fit(X_training, Y_training)

    ## Extracting feature importances
    importance_rows.append(RF.feature_importances_)

## Transforming list into data-frame: one row per run, one column per feature,
## labelled with the columns of the frame the forest was actually fitted on
results = pd.DataFrame(importance_rows, columns = X_training.columns)

In [7]:
## Displaying the current contents of results as the cell output
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.19909,0.041333,0.118263,0.065589,0.081171,0.010549,0.000485,3e-05,0.000411,7e-05,...,7.2e-05,8.7e-05,3.5e-05,0.000209,0.005901,0.006076,0.00179,0.042989,0.183915,0.241473
1,0.184665,0.044968,0.125082,0.070911,0.090231,0.010665,0.000805,2.7e-05,0.000324,5.9e-05,...,1.8e-05,4.6e-05,2.4e-05,0.000208,0.004619,0.00808,0.001549,0.037043,0.170175,0.249634
2,0.175852,0.04013,0.110863,0.069472,0.082927,0.010671,0.001236,2.5e-05,0.000342,1.4e-05,...,1.5e-05,0.000103,4.2e-05,0.000153,0.003663,0.004834,0.001158,0.049022,0.206954,0.242011
3,0.187787,0.038626,0.126542,0.070124,0.080641,0.010474,0.000948,2.1e-05,0.000415,2.9e-05,...,7e-05,8e-05,2.5e-05,0.000204,0.003551,0.006669,0.00162,0.04238,0.174366,0.254834
4,0.19824,0.046174,0.127615,0.073562,0.076367,0.008408,0.001039,5e-05,0.000322,6.1e-05,...,4.8e-05,5.8e-05,8.4e-05,9.1e-05,0.004219,0.007058,0.001374,0.043878,0.176632,0.234383
5,0.185643,0.042595,0.111128,0.067004,0.077173,0.008379,0.001112,1.6e-05,0.000354,1.6e-05,...,4.4e-05,0.000102,6.9e-05,0.000101,0.004663,0.007913,0.000983,0.034556,0.218306,0.239109
6,0.208881,0.041637,0.096564,0.06696,0.089925,0.010354,0.001215,4.1e-05,0.000868,4e-05,...,1.1e-05,7.8e-05,5.7e-05,0.000169,0.004191,0.007481,0.001349,0.043318,0.175924,0.250415
7,0.181628,0.043723,0.112773,0.066689,0.092408,0.011173,0.000627,4.8e-05,0.000252,3.2e-05,...,2.6e-05,0.000112,7.6e-05,0.000125,0.004758,0.007146,0.001413,0.043657,0.179718,0.253152
8,0.212813,0.041371,0.108905,0.072675,0.087944,0.012455,0.000823,1.6e-05,0.000498,5.3e-05,...,3.6e-05,0.000107,5.8e-05,0.000108,0.006586,0.005029,0.001357,0.037136,0.202576,0.209011
9,0.182575,0.042244,0.1174,0.074886,0.081488,0.011822,0.000702,1.7e-05,0.000121,2.3e-05,...,2.9e-05,7.4e-05,9.2e-05,0.000185,0.005032,0.006127,0.001364,0.044347,0.198232,0.232532


In [8]:
## Average of the importances: column-wise mean over the rows of results,
## wrapped back into a one-column data-frame indexed by feature name
mean_importance = results.mean(axis = 0)
results = pd.DataFrame(mean_importance)
results

Unnamed: 0,0
satisfaction_level,0.191717
last_evaluation,0.04228
number_project,0.115513
average_montly_hours,0.069787
time_spend_company,0.084028
Work_accident,0.010495
promotion_last_5years,0.000899
sales_IT,2.9e-05
sales_RandD,0.000391
sales_accounting,4e-05


In [9]:
## Turning the index into a 'Feature' column and column 0 into 'Importance',
## leaving a fresh 0..n-1 row index
results = results[0].rename_axis('Feature').reset_index(name = 'Importance')
results

Unnamed: 0,Feature,Importance
0,satisfaction_level,0.191717
1,last_evaluation,0.04228
2,number_project,0.115513
3,average_montly_hours,0.069787
4,time_spend_company,0.084028
5,Work_accident,0.010495
6,promotion_last_5years,0.000899
7,sales_IT,2.9e-05
8,sales_RandD,0.000391
9,sales_accounting,4e-05


In [11]:
## Ordering the rows from the largest to the smallest 'Importance'
sort_key = 'Importance'
results = results.sort_values(sort_key, ascending = False)
results

Unnamed: 0,Feature,Importance
22,interaction_3,0.240655
0,satisfaction_level,0.191717
21,interaction_2,0.18868
2,number_project,0.115513
4,time_spend_company,0.084028
3,average_montly_hours,0.069787
1,last_evaluation,0.04228
20,interaction_1,0.041833
5,Work_accident,0.010495
18,salary_low,0.006641
