In [1]:
pip install optuna

Collecting optuna
  Downloading optuna-3.0.6-py3-none-any.whl (348 kB)
     |████████████████████████████████| 348 kB 33.1 MB/s            
[?25hCollecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
     |████████████████████████████████| 81 kB 13.7 MB/s            
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
     |████████████████████████████████| 210 kB 77.0 MB/s            
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
     |████████████████████████████████| 75 kB 1.0 MB/s             
[?25hCollecting importlib-resources
  Downloading importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting stevedore>=2.0.1
  Downloading stevedore-3.5.2-py3-none-any.whl (50 kB)
     |████████████████████████████████| 50 kB 3.6 MB/s            
[?25hCollecting pbr!=2.1

In [5]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC

import optuna

from cost_function import cost_function

# Reading the data
train = pd.read_csv('turnover_train.csv')
validation = pd.read_csv('turnover_val.csv')
test = pd.read_csv('turnover_test.csv')

In [6]:
## Changing sales and salary to dummies
train = pd.concat([train.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(train[['sales', 'salary']])], axis = 1)
validation = pd.concat([validation.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(validation[['sales', 'salary']])], axis = 1)
test = pd.concat([test.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(test[['sales', 'salary']])], axis = 1)

In [7]:
train['interaction_1'] = np.where(((train['satisfaction_level'] >= 0.115) & 
                                      (train['satisfaction_level'] <= 0.465) & 
                                      (train['number_project'] > 2.5)), 1, 0)

train['interaction_2'] = np.where(((train['satisfaction_level'] <= 0.465) & 
                                      (train['number_project'] <= 2.5) & 
                                      (train['last_evaluation'] <= 0.575)), 1, 0)

train['interaction_3'] = np.where(((train['satisfaction_level'] > 0.465) & 
                                      (train['time_spend_company'] <= 4.5) & 
                                      (train['average_montly_hours'] <= 290.5)), 1, 0)

validation['interaction_1'] = np.where(((validation['satisfaction_level'] >= 0.115) & 
                                      (validation['satisfaction_level'] <= 0.465) & 
                                      (validation['number_project'] > 2.5)), 1, 0)

validation['interaction_2'] = np.where(((validation['satisfaction_level'] <= 0.465) & 
                                      (validation['number_project'] <= 2.5) & 
                                      (validation['last_evaluation'] <= 0.575)), 1, 0)

validation['interaction_3'] = np.where(((validation['satisfaction_level'] > 0.465) & 
                                      (validation['time_spend_company'] <= 4.5) & 
                                      (validation['average_montly_hours'] <= 290.5)), 1, 0)

test['interaction_1'] = np.where(((test['satisfaction_level'] >= 0.115) & 
                                      (test['satisfaction_level'] <= 0.465) & 
                                      (test['number_project'] > 2.5)), 1, 0)

test['interaction_2'] = np.where(((test['satisfaction_level'] <= 0.465) & 
                                      (test['number_project'] <= 2.5) & 
                                      (test['last_evaluation'] <= 0.575)), 1, 0)

test['interaction_3'] = np.where(((test['satisfaction_level'] > 0.465) & 
                                      (test['time_spend_company'] <= 4.5) & 
                                      (test['average_montly_hours'] <= 290.5)), 1, 0)

# Random Forest with Optuna

In [8]:
X = train[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company',
           'number_project']]
Y = train['left']

class Objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10)
                      )
        
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(X, Y):
            
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
            
            RF_md = RandomForestClassifier(**params).fit(X_train, Y_train)
            
            pred_valid = RF_md.predict_proba(X_valid)[:, 1]
            score = cost_function(Y_valid, pred_valid)
            scores.append(score[0])
            
        return np.mean(scores)

In [10]:
SEED = 42
N_TRIALS = 20

study = optuna.create_study(direction = 'maximize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-24 17:43:35,953][0m A new study created in memory with name: no-name-328d44d8-32d6-4a08-8ecc-aafef7bfb764[0m
[32m[I 2023-03-24 17:43:48,881][0m Trial 0 finished with value: 222166.66666666666 and parameters: {'n_estimators': 1319, 'min_samples_split': 30, 'min_samples_leaf': 11, 'max_depth': 4}. Best is trial 0 with value: 222166.66666666666.[0m
[32m[I 2023-03-24 17:44:05,280][0m Trial 1 finished with value: 221166.66666666666 and parameters: {'n_estimators': 1815, 'min_samples_split': 13, 'min_samples_leaf': 14, 'max_depth': 4}. Best is trial 0 with value: 222166.66666666666.[0m
[32m[I 2023-03-24 17:44:20,019][0m Trial 2 finished with value: 293166.6666666667 and parameters: {'n_estimators': 1221, 'min_samples_split': 8, 'min_samples_leaf': 9, 'max_depth': 10}. Best is trial 2 with value: 293166.6666666667.[0m
[32m[I 2023-03-24 17:44:39,213][0m Trial 3 finished with value: 288166.6666666667 and parameters: {'n_estimators': 1774, 'min_samples_split': 23, 'm

In [11]:
study.best_trial.params

{'n_estimators': 1400,
 'min_samples_split': 6,
 'min_samples_leaf': 5,
 'max_depth': 10}

In [14]:
from sklearn.metrics import confusion_matrix

## Building the random forest model
RF_md = RandomForestClassifier(**study.best_trial.params).fit(X, Y)

## Predicting on validation and test
X_val = validation[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company',
           'number_project']]
X_test = test[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company',
           'number_project']]

Y_val = validation['left']
Y_test = test['left']

RF_val_pred = RF_md.predict_proba(X_val)[:, 1]
RF_test_pred = RF_md.predict_proba(X_test)[:, 1]

## Identify the optimal cutoff
opt_cutoff = cost_function(Y_val, RF_val_pred)[1]

## Changing the likelihoods into labels
RF_label = np.where(RF_test_pred < opt_cutoff, 0, 1)

conf_mat = confusion_matrix(Y_test, RF_label)
print(conf_mat)
print('The cost of the RF is:', -1500 * conf_mat[1, 0] - 1000 * conf_mat[0, 1] + 500 * conf_mat[1, 1])

[[1125   18]
 [  28  329]]
The cost of the RF is: 104500
