In [8]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, make_scorer

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key_train = 'In_Class_Assignments/turnover_train.csv'
file_key_val = 'In_Class_Assignments/turnover_val.csv'
file_key_test = 'In_Class_Assignments/turnover_test.csv'

bucket_object_train = bucket.Object(file_key_train)
file_object_train = bucket_object_train.get()
file_content_stream_train = file_object_train.get('Body')

bucket_object_val = bucket.Object(file_key_val)
file_object_val = bucket_object_val.get()
file_content_stream_val = file_object_val.get('Body')

bucket_object_test = bucket.Object(file_key_test)
file_object_test = bucket_object_test.get()
file_content_stream_test = file_object_test.get('Body')

## Reading the csv file
train = pd.read_csv(file_content_stream_train)
validation = pd.read_csv(file_content_stream_val)
test = pd.read_csv(file_content_stream_test)

In [9]:
## Changing sales to dummy variables
train = pd.concat([train.drop(columns = ['sales'], axis = 1), pd.get_dummies(train['sales'])], axis = 1)
validation = pd.concat([validation.drop(columns = ['sales'], axis = 1), pd.get_dummies(validation['sales'])], axis = 1)
test = pd.concat([test.drop(columns = ['sales'], axis = 1), pd.get_dummies(test['sales'])], axis = 1)

## Changing salary to dummy variables
train = pd.concat([train, pd.get_dummies(train['salary'])], axis = 1)
validation = pd.concat([validation, pd.get_dummies(validation['salary'])], axis = 1)
test = pd.concat([test, pd.get_dummies(test['salary'])], axis = 1)

In [10]:
## Creating interactions/features from the decision tree
train['interaction_1'] = np.where((train['satisfaction_level'] <= 0.465) & (train['number_project'] <= 2.5) & (train['last_evaluation'] <= 0.575), 1, 0)
train['interaction_2'] = np.where((train['satisfaction_level'] <= 0.465) & (train['number_project'] >= 2.5) & (train['satisfaction_level'] >= 0.115), 1, 0)
train['interaction_3'] = np.where((train['satisfaction_level'] >= 0.465) & (train['time_spend_company'] <= 4.5) & (train['average_montly_hours'] <= 290.5), 1, 0)

validation['interaction_1'] = np.where((validation['satisfaction_level'] <= 0.465) & (validation['number_project'] <= 2.5) & (validation['last_evaluation'] <= 0.575), 1, 0)
validation['interaction_2'] = np.where((validation['satisfaction_level'] <= 0.465) & (validation['number_project'] >= 2.5) & (validation['satisfaction_level'] >= 0.115), 1, 0)
validation['interaction_3'] = np.where((validation['satisfaction_level'] >= 0.465) & (validation['time_spend_company'] <= 4.5) & (validation['average_montly_hours'] <= 290.5), 1, 0)

test['interaction_1'] = np.where((test['satisfaction_level'] <= 0.465) & (test['number_project'] <= 2.5) & (test['last_evaluation'] <= 0.575), 1, 0)
test['interaction_2'] = np.where((test['satisfaction_level'] <= 0.465) & (test['number_project'] >= 2.5) & (test['satisfaction_level'] >= 0.115), 1, 0)
test['interaction_3'] = np.where((test['satisfaction_level'] >= 0.465) & (test['time_spend_company'] <= 4.5) & (test['average_montly_hours'] <= 290.5), 1, 0)

In [11]:
## Defining the input and target variables
X_train = train.drop(columns = ['left', 'salary'], axis = 1)
Y_train = train['left']

X_val = validation
Y_val = validation['left']

X_test = test.drop(columns = ['left', 'salary'], axis = 1)
Y_test = test['left']

## Changing the scale
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## Spliting the data into train, validation, and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [18]:
def cost_function(Y_true, Y_pred):
    
    '''
    This a customize scoring function that takes two arguments:
    Y_true: true labels
    Y_pred: likelihoods from the model   
    '''
    
    ## Defining cutoff values in a data-frame
    results = pd.DataFrame({'cutoffs': np.round(np.linspace(0.05, 0.95, num = 40, endpoint = True), 2)})
    results['cost'] = np.nan
    
    for i in range(0, results.shape[0]):
        
        ## Changing likelihoods to labels
        Y_pred_lab = np.where(Y_pred < results['cutoffs'][i], 0, 1)
        
        ## Computing confusion matrix and scoring based on description
        X = confusion_matrix(Y_pred_lab, Y_true)
        results['cost'][i] = -1500 * X[1, 0] - 1000 * X[0, 1] + 500 * X[1, 1]
        
    ## Sorting results 
    results = results.sort_values(by = 'cost', ascending = False).reset_index(drop = True)
    
    return results['cost'][0]

my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

In [25]:
## Defining hyper-paramerters for RF
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

## Performing grid search with 3 folds
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

## Extracting the best model 
RF_md = RF_grid_search.best_estimator_

## Predicting on test
RF_pred = RF_md.predict_proba(X_test)[:, 1]

In [26]:
RF_pred

array([0.01615787, 0.00631489, 0.61448619, ..., 0.00841033, 0.62595868,
       0.02921706])

In [27]:
## Defining the hyper-parameters for svm
SVM_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'],
                  'C': [0.01, 0.1, 1, 10],
                  'gamma': [0.001, 0.01, 0.1, 1]}

## Performing grid serach with 3 folds
SVM_grid_search = GridSearchCV(SVC(), SVM_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

## Extracting the best model 
svm_md = SVM_grid_search.best_estimator_

## ## Predicting on test
svm_pred = svm_md.predict_proba(X_test)[:, 1]

KeyboardInterrupt: 