In [116]:
# Importing all the required libraries

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn import tree
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf)
import os
from sklearn.pipeline import make_pipeline
os.getcwd()
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as s
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [117]:
# Loading the datasets
train = pd.read_csv('aug_train.csv')

In [118]:
#removing 'enrollee_id' from data since it is not required in model
train.drop(['enrollee_id','city'],axis=1,inplace=True)

### Separating independent and dependent variables

In [119]:
X = train.drop(columns=['target'])

y = train['target']

In [120]:
# Encoding categorical variables
X_cats = (OneHotEncoder(sparse=False,handle_unknown='ignore')
                   .fit_transform(X[['gender','relevent_experience',
                           'enrolled_university','education_level',
                           'major_discipline','company_type',
                           'last_new_job','experience','company_size']]))
X_cats = pd.DataFrame(X_cats)

In [121]:
# Merging encoded categorical variables with numeric variables
X_numerical = X.drop(columns=['gender','relevent_experience',
                                  'enrolled_university','education_level',
                                 'major_discipline','company_type',
                                  'last_new_job','experience','company_size'])
col_names = X_numerical.columns
X_numerical = pd.DataFrame(X_numerical, columns=col_names)
X = X_numerical.join(X_cats)

In [122]:
# Since the Target has "0" far more than "1" we will just fill NA values with 0 for this problem
X.fillna(0, inplace=True)

In [123]:
# Using SMOTE for handling class imbalance
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 0)
X.columns = X.columns.astype('str')

X_smote, y_smote = smote.fit_resample(X,y)


In [124]:
# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_smote,
                                                    y_smote,
                                                    test_size=0.2,
                                                    random_state=42)

In [125]:
# Creating a function to test and compare various algorithms
def model_fit(x_train, y_train, test_data):
        
    #RandomForest
    alg = RandomForestClassifier()
    alg.fit(x_train, y_train)
    y_pred = alg.predict(test_data)  
    print('Random Forest Model 3')
    print('AUC On Test Set - {}'.format(roc_auc_score(y_pred, y_test)))
    conf_mat = confusion_matrix(y_pred, y_test)
    print('Confusion Matrix :\n',conf_mat)
    sensitivity1 = conf_mat[0,0]/(conf_mat[0,0]+conf_mat[0,1])
    print('Sensitivity : ', sensitivity1 )
    specificity1 = conf_mat[1,1]/(conf_mat[1,0]+conf_mat[1,1])
    print('Specificity : ', specificity1)
    f1_score2 = f1_score(y_test,y_pred)
    print('f1_score : ', f1_score2)
    

In [126]:
model_fit(X_train, y_train,X_test)

Random Forest Model 3
AUC On Test Set - 0.8553908205571831
Confusion Matrix :
 [[2530  490]
 [ 347 2386]]
Sensitivity :  0.8377483443708609
Specificity :  0.8730332967435053
f1_score :  0.8507755393118204


#### From the above output, we can conclude that RandomForest is the best algorithm to go forward

In [127]:
# Tuning Hyperparameters
from sklearn.model_selection import RandomizedSearchCV

forest  = RandomForestClassifier(random_state = 42)

params = {
        'n_estimators' : [100, 300, 500, 800, 1200],
        'max_depth' : [5, 8, 15, 25, 30],
        'min_samples_split' : [2, 5, 10, 15, 100],
        'min_samples_leaf' : [1, 2, 5, 10] 
        }

gridF = RandomizedSearchCV(forest, params, cv = 5, verbose = 1)

In [128]:
clf_grid = gridF.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [129]:
clf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 15,
 'min_samples_leaf': 2,
 'max_depth': 25}

In [130]:
# Running model on entire data 
model = RandomForestClassifier(n_estimators=800,
                               min_samples_split=10,
                               min_samples_leaf=2, 
                               max_depth=30)
model.fit(X_smote, y_smote)



RandomForestClassifier(max_depth=30, min_samples_leaf=2, min_samples_split=10,
                       n_estimators=800)

In [131]:
import pickle

# Save the model to a file
with open("model_hr.pkl", "wb") as f:
    pickle.dump(model, f)