In [7]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from precision_recall_cutoff import precision_recall_cutoff

# Where the CSV lives in S3
bucket_name = 'morgan-gant-data448-bucket'
file_key = 'turnover.csv'

# Resolve the bucket and the object inside it
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

# Fetch the object and take the 'Body' entry of the response
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Read the data file into a DataFrame and preview the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [8]:
# Replace the categorical columns with their dummy (indicator) columns,
# appended after the remaining columns
categorical_cols = ['sales', 'salary']
dummy_cols = pd.get_dummies(turnover[categorical_cols])
remaining_cols = turnover.drop(columns=categorical_cols)
turnover = pd.concat([remaining_cols, dummy_cols], axis=1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [9]:
# Engineering 0/1 interaction features from the decision tree model's split rules
satisfaction = turnover['satisfaction_level']
projects = turnover['number_project']

# NOTE(review): `>= .115` is implied by `>= .465`, so it never changes the
# result. It is kept so the rule reads exactly as before -- confirm against
# the tree whether a different comparison was intended here.
rule_1 = (satisfaction >= .115) & (satisfaction >= .465) & (projects > 2.5)
turnover['interaction_1'] = np.where(rule_1, 1, 0)

rule_2 = (
    (satisfaction >= .465)
    & (projects <= 2.5)
    & (turnover['last_evaluation'] <= .575)
)
turnover['interaction_2'] = np.where(rule_2, 1, 0)

# Column name has no underscore, unlike the two above; kept unchanged
rule_3 = (
    (satisfaction >= .465)
    & (turnover['time_spend_company'] <= 4.5)
    & (turnover['average_montly_hours'] <= 290.5)
)
turnover['interaction3'] = np.where(rule_3, 1, 0)

In [13]:
 #Defining input and target variables
# Inputs are every column except `left`; `left` is the target
x = turnover.drop(columns='left')
y = turnover['left']

# Hold out 20% of the rows for testing, stratified on the target.
# The fixed random_state makes the split -- and every number derived from
# it further down -- identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, stratify=y, random_state=42
)

In [18]:
# Collect one row of feature importances per random-forest run.
# A separate list name keeps `importance` from meaning a list in one place
# and a DataFrame in another.
importance_runs = []

# Running iterations
for i in range(10):

    # Draw a stratified 80% subsample of the training data. Seeding with the
    # iteration number gives each run a different subsample while keeping
    # the whole loop reproducible.
    x_training, x_testing, y_training, y_testing = train_test_split(
        x_train, y_train, test_size=.2, stratify=y_train, random_state=i
    )

    # Build the random forest model (seeded for the same reason)
    rf_md = RandomForestClassifier(
        n_estimators=500, max_depth=3, random_state=i
    ).fit(x_training, y_training)

    # Extract this run's importances
    importance_runs.append(rf_md.feature_importances_)

# One row per run, one column per feature the forests were trained on
importance = pd.DataFrame(importance_runs, columns=x_train.columns)

In [19]:
# Average each feature's importance over all runs; the result is a
# one-column frame (column label 0) indexed by feature name
mean_by_feature = importance.mean(axis=0)
importance = pd.DataFrame(mean_by_feature)
importance

Unnamed: 0,0
satisfaction_level,0.223513
last_evaluation,0.049576
number_project,0.123091
average_montly_hours,0.079975
time_spend_company,0.099491
Work_accident,0.012846
promotion_last_5years,0.000977
sales_IT,5.9e-05
sales_RandD,0.000409
sales_accounting,4.1e-05


In [21]:
# Move the feature names out of the index into a 'Feature' column next to
# their averaged 'Importance' values
feature_names = importance.index
mean_scores = importance[0].values
importance = pd.DataFrame({'Feature': feature_names, 'Importance': mean_scores})

In [23]:
# Rank the features from most to least important and show the table
importance = importance.sort_values('Importance', ascending=False)
importance

Unnamed: 0,Feature,Importance
22,interaction3,0.266992
0,satisfaction_level,0.223513
2,number_project,0.123091
20,interaction_1,0.121258
4,time_spend_company,0.099491
3,average_montly_hours,0.079975
1,last_evaluation,0.049576
5,Work_accident,0.012846
18,salary_low,0.008479
17,salary_high,0.007288
