In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR

from cost_function import cost_function

#reading in the data
train= pd.read_csv('turnover_train.csv')
val= pd.read_csv('turnover_val.csv')
test= pd.read_csv('turnover_test.csv')

In [3]:
#changing to dummy variables
train= pd.concat([train.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(train[['sales', 'salary']])], axis=1)
val= pd.concat([val.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(val[['sales', 'salary']])], axis=1)
test= pd.concat([test.drop(columns=['sales', 'salary'], axis=1), pd.get_dummies(test[['sales', 'salary']])], axis=1)


#engineering features from the decisoon tree model
train['interaction_1']=np.where(((train['satisfaction_level'] >= .115) &
                                   (train['satisfaction_level'] >= .465) &
                                   (train['number_project'] > 2.5)), 1, 0)

train['interaction_2']=np.where(((train['satisfaction_level'] >= .465) &
                                   (train['number_project'] <=2.5) &
                                   (train['last_evaluation'] <= .575)), 1, 0)
        
train['interaction3']=np.where(((train['satisfaction_level'] >= .465) &
                                    (train['time_spend_company'] <=4.5) &
                                    (train['average_montly_hours'] <=290.5)), 1,0)


#engineering features from the decisoon tree model
val['interaction_1']=np.where(((val['satisfaction_level'] >= .115) &
                                   (val['satisfaction_level'] >= .465) &
                                   (val['number_project'] > 2.5)), 1, 0)

val['interaction_2']=np.where(((val['satisfaction_level'] >= .465) &
                                   (val['number_project'] <=2.5) &
                                   (val['last_evaluation'] <= .575)), 1, 0)
        
val['interaction3']=np.where(((val['satisfaction_level'] >= .465) &
                                    (val['time_spend_company'] <=4.5) &
                                    (val['average_montly_hours'] <=290.5)), 1,0)



#engineering features from the decisoon tree model
test['interaction_1']=np.where(((test['satisfaction_level'] >= .115) &
                                   (test['satisfaction_level'] >= .465) &
                                   (test['number_project'] > 2.5)), 1, 0)

test['interaction_2']=np.where(((test['satisfaction_level'] >= .465) &
                                   (test['number_project'] <=2.5) &
                                   (test['last_evaluation'] <= .575)), 1, 0)
        
test['interaction3']=np.where(((test['satisfaction_level'] >= .465) &
                                    (test['time_spend_company'] <=4.5) &
                                    (test['average_montly_hours'] <=290.5)), 1,0)

Random Forest with Optuna

In [4]:
x= train[['interaction3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
y= train['left']

         
         
class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
    ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
            min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
            max_depth = trial.suggest_int('max_depth', 2, 10)
            )
        score= list()
         
        #Running cross validation
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
       
        for train_idx, valid_idx in skf.split(x, y):
            x_train, x_valid = x.iloc[train_idx], x.iloc[valid_idx]
            y_train , y_valid = y.iloc[train_idx] , y.iloc[valid_idx]
         
            rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
         
            preds_valid = rf_md.predict_proba(x_valid)[:,1]
            score = cost_function(y_valid, preds_valid)
            score.append(score[0])
         
        return np.mean(score)

In [5]:
SEED= 42
N_TRIALS= 20

study= optuna.create_study(direction= 'maximize')
study.optimize(Objective(SEED), n_trials= N_TRIALS)

[32m[I 2023-03-25 00:54:43,352][0m A new study created in memory with name: no-name-95b65af6-db65-4ba7-9bcc-0fc7667d441e[0m
[32m[I 2023-03-25 00:54:55,351][0m Trial 0 finished with value: 196333.55000000002 and parameters: {'n_estimators': 1160, 'min_samples_split': 25, 'min_samples_leaf': 21, 'max_depth': 6}. Best is trial 0 with value: 196333.55000000002.[0m
[32m[I 2023-03-25 00:54:57,370][0m Trial 1 finished with value: -30666.466666666664 and parameters: {'n_estimators': 247, 'min_samples_split': 24, 'min_samples_leaf': 18, 'max_depth': 2}. Best is trial 0 with value: 196333.55000000002.[0m
[32m[I 2023-03-25 00:55:06,226][0m Trial 2 finished with value: 202333.50333333333 and parameters: {'n_estimators': 831, 'min_samples_split': 23, 'min_samples_leaf': 5, 'max_depth': 7}. Best is trial 2 with value: 202333.50333333333.[0m
[32m[I 2023-03-25 00:55:22,513][0m Trial 3 finished with value: 199333.5266666667 and parameters: {'n_estimators': 1393, 'min_samples_split': 21, '

In [6]:
study.best_trial.params

{'n_estimators': 1992,
 'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_depth': 8}

In [11]:
rf_md= RandomForestClassifier(**study.best_trial.params).fit(x,y)

#predicting on val and test 
x_val= val[['interaction3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
x_test= test[['interaction3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]

y_val= val['left']
y_test= test['left']

rf_val_pred= rf_md.predict_proba(x_val)[:,1]
rf_test_pred= rf_md.predict_proba(x_test)[:,1]

#identify optimal cutoff value
opt_cutoff= cost_function(y_val, rf_val_pred)[1]

#chaning likelyhoods to labels
rf_label= np.where(rf_test_pred < opt_cutoff, 0,1)

cm= confusion_matrix(y_test, rf_label)

print(cm)
print('Cost of RF model:', -1500*cm[1,0] - 1000*cm[0,1] + 500*cm[1,1])

[[1125   18]
 [  28  329]]
Cost of RF model: 104500
