In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


import warnings
warnings.filterwarnings('ignore')

## EDA

In [2]:
df=pd.read_csv('../HR_Analytics/files/aug_train.csv')
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [3]:
df.shape

(19158, 14)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [5]:
# This data set consists mainly of categorical columns. Some of these need to be transformed to numerical data
# The target variable is a categorical column, hence this will be a binary clustering problem. 

In [6]:
df.describe()

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
count,19158.0,19158.0,19158.0,19158.0
mean,16875.358179,0.828848,65.366896,0.249348
std,9616.292592,0.123362,60.058462,0.432647
min,1.0,0.448,1.0,0.0
25%,8554.25,0.74,23.0,0.0
50%,16982.5,0.903,47.0,0.0
75%,25169.75,0.92,88.0,0.0
max,33380.0,0.949,336.0,1.0


In [7]:
# There are a lot of missing values, which need to be filled
nulls = pd.DataFrame(df.isna().sum()*100/len(df), columns=['percentage'])
nulls.sort_values('percentage', ascending = False).head(80)

Unnamed: 0,percentage
company_type,32.049274
company_size,30.994885
gender,23.53064
major_discipline,14.683161
education_level,2.401086
last_new_job,2.207955
enrolled_university,2.014824
experience,0.339284
enrollee_id,0.0
city,0.0


In [8]:
# There is a hudge inbalance in the target variable inside each column. 
# Most of candidates dont want to change their job after finishing the training
df.groupby('target').count()

Unnamed: 0_level_0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0.0,14381,14381,14381,11262,14381,14118,14025,12117,14339,10853,10625,14112,14381
1.0,4777,4777,4777,3388,4777,4654,4673,4228,4754,2367,2393,4623,4777


## Data Cleaning

In [9]:
df.set_index('enrollee_id')

Unnamed: 0_level_0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [10]:
# Replacing NaNs with the mode of the columns
df['education_level'] = df['education_level'].fillna(df['education_level'].mode()[0])
df['company_type'] = df['company_type'].fillna(df['company_type'].mode()[0])
df['company_size'] = df['company_size'].fillna(df['company_size'].mode()[0])
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
df['major_discipline'] = df['major_discipline'].fillna(df['major_discipline'].mode()[0])
df['enrolled_university'] = df['enrolled_university'].fillna(df['enrolled_university'].mode()[0])
df['last_new_job'] = df['last_new_job'].fillna(df['last_new_job'].mode()[0])
df['experience'] = df['experience'].fillna(df['experience'].mode()[0])

In [11]:
# Removing the NaNs by filing them with a new category.
#df['relevent_experience'].unique() 

# Function to impute null value with new category
#def impute_nan_create_category(df,ColName):
     #df[ColName] = np.where(df[ColName].isnull(),"unknown",df[ColName])

# Call function to create new category for variables
#for Columns in ['education_level','company_type','company_size','gender','major_discipline','enrolled_university']:
    #impute_nan_create_category(df,Columns)

# Display result
#df[['education_level','company_type','company_size','gender','major_discipline','enrolled_university']].head(10)


In [12]:
# Removing the highest and lowest values of 'company_size' and 'last_new_job' 
df['last_new_job'].replace(['>4','never'],['4','0'],inplace=True)
df['last_new_job'].fillna(df['last_new_job'].value_counts().index[0],inplace=True)
df['last_new_job'] = [float(i) for i in df['last_new_job']]

df['experience'].replace(['>20','<1'],['20','1'],inplace=True)
df['experience'].fillna(df['experience'].value_counts().index[0],inplace=True)
df['experience'] = [float(i) for i in df['experience']]

In [13]:
df.isnull().values.any()

False

In [14]:
# Convert values of company_size to small, medium, large, very large
df['company_size'].replace(['<10','10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+',],
                             ['Small','Small','Small','Medium','Medium','Large','Large','Large'],inplace=True)

In [15]:
df['education_level'].unique()

array(['Graduate', 'Masters', 'High School', 'Phd', 'Primary School'],
      dtype=object)

In [16]:
education_level_ordinal=[]
for i in df['education_level']:
    if i =='Primary School':
        education_level_ordinal.append(0)
    elif i == 'High School':
        education_level_ordinal.append(1)
    elif i == 'Graduate':
        education_level_ordinal.append(2)
    elif i == 'Masters':
        education_level_ordinal.append(3)
    elif i == 'Phd':
        education_level_ordinal.append(4)
    else:
        education_level_ordinal.append(5)

df['education_level']=education_level_ordinal

df['education_level']

0        2
1        2
2        2
3        2
4        3
        ..
19153    2
19154    2
19155    2
19156    1
19157    0
Name: education_level, Length: 19158, dtype: int64

In [17]:
df['company_size'].unique()

array(['Small', 'Large', 'Medium'], dtype=object)

In [18]:
company_size_ordinal=[]
for i in df['company_size']:
    if i =='Small':
        company_size_ordinal.append(0)
    elif i == 'Medium':
        company_size_ordinal.append(1)
    elif i == 'Large':
        company_size_ordinal.append(3)
    else:
        company_size_ordinal.append(4)

df['company_size']=company_size_ordinal

df['company_size']

# Encoding

#Ordinal to education_level
#Ordinal to company_size
#Ordinal to 
#Ordinal

0        0
1        0
2        0
3        0
4        0
        ..
19153    0
19154    0
19155    0
19156    1
19157    0
Name: company_size, Length: 19158, dtype: int64

In [19]:
model_df = df.copy()

In [20]:
X = df.drop('target', axis=1)
y = df['target']

In [21]:
# Dealing with numerical and categorical data
X_num = X.select_dtypes(include = np.number)
X_cat = X.select_dtypes(include = np.object)
print (X.shape, X_num.shape, X_cat.shape)

(19158, 13) (19158, 7) (19158, 6)


In [22]:
# One Hot/Label Encoding (categorical)
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_cat)

OneHotEncoder(drop='first')

In [23]:
encoded = encoder.transform(X_cat).toarray()
encoded.shape

(19158, 137)

In [24]:
# Concat DataFrames
X = np.concatenate([X_num, encoded], axis=1)
X.shape

(19158, 144)


## Training the Benchmark Model/Logistic regression

In [25]:
#Splitting the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [69]:
# using standard scaler
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler() # StandardScaler and MinMax scaler works row wise,
# so make sure you do the train-test split first (optional)
# (train-test split comes first, 
#or else your trainning set will carry information from your test set)
transformer.fit(X_train)

X_train_standardized = transformer.transform(X_train)
X_test_standardized = transformer.transform(X_test)
#pd.DataFrame(x_standardized)



In [70]:
logreg = LogisticRegression(random_state=42, max_iter=10000) # max_iter 500??????
logreg.fit(X_train_standardized, y_train)

LogisticRegression(max_iter=10000, random_state=42)

In [71]:
y_pred_logreg = logreg.predict(X_test_standardized)
confusion_matrix(y_test, y_pred_logreg)

array([[3022, 1278],
       [ 595,  853]])

In [72]:
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

         0.0       0.84      0.70      0.76      4300
         1.0       0.40      0.59      0.48      1448

    accuracy                           0.67      5748
   macro avg       0.62      0.65      0.62      5748
weighted avg       0.73      0.67      0.69      5748



## Evaluate the Model

In [57]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_sm, y_sm = smote.fit_resample(np.array(X_train), y_train)
y_sm.value_counts()

0.0    10081
1.0    10081
Name: target, dtype: int64

In [58]:
logreg = LogisticRegression(random_state=42, max_iter=10000) # max_iter 500??????
logreg.fit(X_sm, y_sm)

LogisticRegression(max_iter=10000, random_state=42)

In [59]:
y_pred_logreg = logreg.predict(X_test)
confusion_matrix(y_test, y_pred_logreg)

array([[3313,  987],
       [ 546,  902]])

In [60]:
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

         0.0       0.86      0.77      0.81      4300
         1.0       0.48      0.62      0.54      1448

    accuracy                           0.73      5748
   macro avg       0.67      0.70      0.68      5748
weighted avg       0.76      0.73      0.74      5748



In [61]:
# Downsampling - TomekLinks
from imblearn.under_sampling import TomekLinks

# watch out not to inflate your metrics, ideally:
# you do the train-test split first and fit_resample only on the training set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X_train), y_train)

y_tl.value_counts()

0.0    8677
1.0    3329
Name: target, dtype: int64

In [62]:
classification = LogisticRegression(random_state=42, max_iter=10000) # max_iter 500??????
classification.fit(X_tl, y_tl)

LogisticRegression(max_iter=10000, random_state=42)

In [63]:
predictions = classification.predict(X_test)
confusion_matrix(y_test, predictions)

array([[3971,  329],
       [ 999,  449]])

In [64]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.80      0.92      0.86      4300
         1.0       0.58      0.31      0.40      1448

    accuracy                           0.77      5748
   macro avg       0.69      0.62      0.63      5748
weighted avg       0.74      0.77      0.74      5748



In [39]:
tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_resample(np.array(X_train), y_train)
print('1st TkLinks:')
print(y_tl.value_counts())

smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X_tl, y_tl)
print('SMOTE:')
print(y_sm.value_counts())

tl = TomekLinks('all')
X_tl, y_tl = tl.fit_resample(X_sm, y_sm)
print('2nd TkLinks:')
print(y_tl.value_counts())

1st TkLinks:
0.0    8677
1.0    3329
Name: target, dtype: int64
SMOTE:
0.0    8677
1.0    8677
Name: target, dtype: int64
2nd TkLinks:
0.0    8137
1.0    8137
Name: target, dtype: int64


In [40]:
classification = LogisticRegression(random_state=42, max_iter=500) # max_iter 500??????
classification.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=42)

In [41]:
predictions = classification.predict(X_test)
confusion_matrix(y_test, predictions)

array([[4054,  246],
       [1104,  344]])

In [42]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.79      0.94      0.86      4300
         1.0       0.58      0.24      0.34      1448

    accuracy                           0.77      5748
   macro avg       0.68      0.59      0.60      5748
weighted avg       0.73      0.77      0.73      5748



## Classification Models

In [43]:
# Logistic Regression

In [44]:
# KNN

from sklearn.neighbors import KNeighborsClassifier
knnc = KNeighborsClassifier(n_neighbors=4)
knnc.fit(X_train,y_train)
y_pred_knnc = knnc.predict(X_test)

print(classification_report(y_test, y_pred_knnc))

              precision    recall  f1-score   support

         0.0       0.75      0.94      0.83      4300
         1.0       0.29      0.07      0.11      1448

    accuracy                           0.72      5748
   macro avg       0.52      0.51      0.47      5748
weighted avg       0.63      0.72      0.65      5748



In [89]:
# Random Forest

rfc = RandomForestClassifier()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y) 
X_train_standardized = transformer.transform(X_train)
X_test_standardized = transformer.transform(X_test)
X_sm, y_sm = smote.fit_resample(X_train_standardized, y_train)
rfc.fit(X_sm, y_sm)

# gritsearch unit 7
y_pred_rfc = rfc.predict(X_test_standardized)

print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

         0.0       0.84      0.85      0.85      2877
         1.0       0.54      0.51      0.52       955

    accuracy                           0.77      3832
   macro avg       0.69      0.68      0.68      3832
weighted avg       0.76      0.77      0.77      3832



In [46]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth = 3, random_state=0)

dtc.fit(X_train,y_train)
y_pred_dtc = dtc.predict(X_test)
#metrics.accuracy_score(y_test, y_pred_dtc)

print(classification_report(y_test, y_pred_dtc))

              precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      4300
         1.0       0.56      0.41      0.48      1448

    accuracy                           0.77      5748
   macro avg       0.69      0.65      0.66      5748
weighted avg       0.75      0.77      0.76      5748



In [47]:
from xgboost import XGBRegressor as XGBR

xgbr=XGBR()
xgbr.fit(X_train, y_train)

y_pred_xgbr=xgbr.predict(X_test)

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/lydia/Library/Python/3.9/lib/python/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/lydia/Library/Python/3.9/lib/python/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)"]


In [None]:
pip install xgboost

In [86]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC() # Linear Kernel
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y) 
X_train_standardized = transformer.transform(X_train)
X_test_standardized = transformer.transform(X_test)
X_sm, y_sm = smote.fit_resample(X_train_standardized, y_train)

#Train the model using the training sets
clf.fit(X_sm, y_sm)

#Predict the response for test dataset
y_pred_clf = clf.predict(X_test_standardized)

In [83]:
X_test_standardized.shape

(5748, 144)

In [87]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_clf))

Accuracy: 0.7126826722338204


In [88]:
print(classification_report(y_test, y_pred_clf))

              precision    recall  f1-score   support

         0.0       0.89      0.70      0.79      2877
         1.0       0.45      0.74      0.56       955

    accuracy                           0.71      3832
   macro avg       0.67      0.72      0.67      3832
weighted avg       0.78      0.71      0.73      3832



In [None]:
models=[logreg, knnc, rfc, dtc, clf]
model_names=['logreg', 'knnc', 'rfc', 'dtc', 'clf']
preds=[y_pred_logreg, y_pred_knnc, y_pred_rfc, y_pred_dtc, y_pred_clf]