In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn, sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from typing import Tuple
from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from joblib import dump, load

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("candidate_data.csv")
print(df.info())
print(df.isna().any())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   motivation             1000 non-null   int64 
 1   working_independently  1000 non-null   int64 
 2   teamplayer             1000 non-null   int64 
 3   productivity           1000 non-null   object
 4   solution_oriented      1000 non-null   int64 
 5   age                    1000 non-null   int64 
 6   willingness_to_learn   1000 non-null   object
 7   temporal_availability  1000 non-null   object
 8   hired                  1000 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 70.4+ KB
None
motivation               False
working_independently    False
teamplayer               False
productivity             False
solution_oriented        False
age                      False
willingness_to_learn     False
temporal_availability    False
hired                    False

In [3]:
df["productivity"] = df["productivity"].astype("category")
df["willingness_to_learn"] = df["willingness_to_learn"].astype("category")
df["temporal_availability"] = df["temporal_availability"].astype("category")
df = pd.get_dummies(df,dtype=int,drop_first=True)

In [4]:
df

Unnamed: 0,motivation,working_independently,teamplayer,solution_oriented,age,hired,productivity_productive,productivity_unproductive,willingness_to_learn_ready,willingness_to_learn_very_ready,temporal_availability_low,temporal_availability_medium
0,7,0,1,0,59,1,0,1,0,1,0,1
1,4,0,1,0,55,0,0,1,1,0,0,0
2,8,1,0,1,34,1,0,0,0,0,0,0
3,5,1,1,0,47,0,1,0,0,0,0,1
4,7,0,1,0,37,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
995,10,1,1,1,21,1,1,0,1,0,1,0
996,10,1,1,1,50,1,1,0,1,0,1,0
997,8,0,0,1,33,1,0,0,1,0,1,0
998,2,1,0,0,21,1,1,0,0,1,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   motivation                       1000 non-null   int64
 1   working_independently            1000 non-null   int64
 2   teamplayer                       1000 non-null   int64
 3   solution_oriented                1000 non-null   int64
 4   age                              1000 non-null   int64
 5   hired                            1000 non-null   int64
 6   productivity_productive          1000 non-null   int64
 7   productivity_unproductive        1000 non-null   int64
 8   willingness_to_learn_ready       1000 non-null   int64
 9   willingness_to_learn_very_ready  1000 non-null   int64
 10  temporal_availability_low        1000 non-null   int64
 11  temporal_availability_medium     1000 non-null   int64
dtypes: int64(12)
memory usage: 93.9 KB


In [6]:
train_df, test_df = sklearn.model_selection.train_test_split(df, test_size=0.2, random_state=0)

In [7]:
# find best model with cross validation

models = []
best_model = None
best_model_score = 0

y_train = train_df["hired"]
y_test = test_df["hired"]
X_train = train_df.drop(columns=["hired"])
X_test = test_df.drop(columns=["hired"])
smote = SMOTE(random_state=0)
X_upsampled, y_upsampled = smote.fit_resample(X_train, y_train)

for n_estimators in range(1, 51):
    for i in range (1,6):
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=10,random_state=i)
        # train best model
        model.fit(X_upsampled, y_upsampled)
        y_pred = model.predict(X_test)
        bac = balanced_accuracy_score(y_test,y_pred)
        if bac>best_model_score:
            print("best n_estimator so far: ",n_estimators)
            best_model_score = bac
            best_model = model
        
print(best_model_score)

best n_estimator so far:  1
best n_estimator so far:  1
best n_estimator so far:  2
best n_estimator so far:  2
best n_estimator so far:  3
best n_estimator so far:  4
best n_estimator so far:  5
best n_estimator so far:  6
best n_estimator so far:  7
best n_estimator so far:  8
0.964781746031746


In [8]:
# save the model
dump(best_model, 'randomForest.joblib')

# ---------------------

# # load the model
# loaded_model = load('adaboost_model.joblib')

# # test the loaded model
# print(loaded_model.predict(X_test))

['randomForest.joblib']