In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
import precision_recall_cutoff

## Where the data lives: S3 bucket name and the key of the csv file inside it
bucket_name = 'data-448'
file_key = 'In_Class_Assignments/turnover.csv'

## Connecting to S3 and pointing at the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetching the object and taking the 'Body' entry of the response,
## which is what gets handed to pandas below
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Loading the csv into a data-frame and showing the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Relative frequency of each value of the target (left):
## counts per class divided by the number of rows
turnover['left'].value_counts().div(len(turnover))

0    0.761917
1    0.238083
Name: left, dtype: float64

In [3]:
## Changing sales (the department column) to dummy variables.
## One of the departments is itself called "sales", so after this step the frame
## holds a 0/1 indicator column named 'sales'. Without the dtype guard, running
## the cell a second time would one-hot encode that indicator instead of the
## original text column. The guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(turnover['sales']):
    department_dummies = pd.get_dummies(turnover['sales'])
    turnover = pd.concat([turnover.drop(columns = ['sales']), department_dummies], axis = 1)

## Changing salary to dummy variables.
## The text column 'salary' is kept in the frame, so on a re-run the dummies
## would be appended again as duplicate columns; only the ones that are not
## already present are added.
salary_dummies = pd.get_dummies(turnover['salary'])
new_salary_columns = [col for col in salary_dummies.columns if col not in turnover.columns]
turnover = pd.concat([turnover, salary_dummies[new_salary_columns]], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,...,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
## Rescaling number_project and average_montly_hours to the 0-1 range.
## Note that the second new column is named 'average_montly_company_0_1'
## even though it is built from average_montly_hours.
## NOTE(review): the scaler is fitted on the full data-frame, i.e. before the
## train/test split that happens in a later cell.
scaler = MinMaxScaler()
columns_to_scale = ['number_project', 'average_montly_hours']
scaled_column_names = ['number_project_0_1', 'average_montly_company_0_1']
turnover[scaled_column_names] = scaler.fit_transform(turnover[columns_to_scale])

## Box-Cox transformation of time_spend_company; boxcox returns the
## transformed values together with the lambda it selected
transformed_time_spend, best_lambda = boxcox(turnover['time_spend_company'])
turnover['time_spend_company_z'] = transformed_time_spend

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,...,product_mng,sales,support,technical,high,low,medium,number_project_0_1,average_montly_company_0_1,time_spend_company_z
0,0.38,0.53,2,157,3,0,1,0,low,0,...,0,1,0,0,0,1,0,0.0,0.285047,0.804651
1,0.8,0.86,5,262,6,0,1,0,medium,0,...,0,1,0,0,0,0,1,0.6,0.775701,1.098118
2,0.11,0.88,7,272,4,0,1,0,medium,0,...,0,1,0,0,0,0,1,1.0,0.82243,0.941381
3,0.72,0.87,5,223,5,0,1,0,low,0,...,0,1,0,0,0,1,0,0.6,0.593458,1.03233
4,0.37,0.52,2,159,3,0,1,0,low,0,...,0,1,0,0,0,1,0,0.0,0.294393,0.804651


In [5]:
## Defining the input and target variables.
## Dropped from the inputs: the raw versions of the columns that were
## re-scaled / transformed above, the text column 'salary' (its dummies are
## used instead) and the target 'left'.
X = turnover.drop(columns = ['number_project', 'average_montly_hours', 'time_spend_company', 'left', 'salary'])

## 'Unnamed: 0' appears to be the row number that was saved with the csv
## (0, 1, 2, ... in the preview), not an employee attribute. Keeping it as a
## predictor would let the models learn from the order of the rows in the file,
## so it is removed. errors = 'ignore' keeps the cell working if the column is
## not present.
X = X.drop(columns = ['Unnamed: 0'], errors = 'ignore')
Y = turnover['left']

## Splitting the data (80% train - 20% test), stratified on the target so both
## parts keep the same proportion of left; random_state makes the split, and
## therefore every metric reported below, reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

In [6]:
## Random forest: 500 trees of depth 3.
## random_state fixes the randomness of the forest so that re-running the
## cell gives the same model and the same report.
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42).fit(X_train, Y_train)

## Predicting on test: second column of predict_proba, i.e. the estimated
## likelihood of left = 1
RF_pred = RF.predict_proba(X_test)[:, 1]

## Predicting the labels from the likelihoods
## NOTE(review): the helper receives Y_test, so the cutoff is presumably chosen
## using the test labels, which would make the report below optimistic - confirm
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_pred)

## Computing the classification report
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      2286
           1       0.97      0.68      0.80       714

    accuracy                           0.92      3000
   macro avg       0.94      0.84      0.88      3000
weighted avg       0.92      0.92      0.91      3000



In [7]:
## Adaboost: 500 boosted trees of depth 3 with a small learning rate.
## The weak learner is passed positionally because its keyword was renamed
## from base_estimator to estimator in scikit-learn 1.2 (and the old name was
## later removed); the positional form works with both.
## random_state makes the fitted model, and the report below, reproducible.
Ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3, random_state = 42), n_estimators = 500, learning_rate = 0.01, random_state = 42).fit(X_train, Y_train)

## Predicting on test: second column of predict_proba, i.e. the estimated
## likelihood of left = 1
Ada_pred = Ada.predict_proba(X_test)[:, 1]

## Predicting the labels from the likelihoods
## NOTE(review): the helper receives Y_test, so the cutoff is presumably chosen
## using the test labels, which would make the report below optimistic - confirm
Ada_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, Ada_pred)

## Computing the classification report
print(classification_report(Y_test, Ada_labels))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2286
           1       0.98      0.94      0.96       714

    accuracy                           0.98      3000
   macro avg       0.98      0.97      0.97      3000
weighted avg       0.98      0.98      0.98      3000



In [8]:
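## Based on the classification reports above, AdaBoost has the higher f1-score
## for both classes and a much higher recall for employees who left
## (0.94 vs 0.68 for the random forest), so I would use the AdaBoost model to predict left.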