In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("Employee.csv")
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [3]:
df.describe()

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot
count,4653.0,4653.0,4653.0,4653.0,4653.0
mean,2015.06297,2.698259,29.393295,2.905652,0.343864
std,1.863377,0.561435,4.826087,1.55824,0.475047
min,2012.0,1.0,22.0,0.0,0.0
25%,2013.0,3.0,26.0,2.0,0.0
50%,2015.0,3.0,28.0,3.0,0.0
75%,2017.0,3.0,32.0,4.0,1.0
max,2018.0,3.0,41.0,7.0,1.0


In [4]:
df.columns = df.columns.str.lower()
df.columns

Index(['education', 'joiningyear', 'city', 'paymenttier', 'age', 'gender',
       'everbenched', 'experienceincurrentdomain', 'leaveornot'],
      dtype='object')

Now let's change how we measure the years in the company by changing the joining year by the years in the company.

In [5]:
df["yearsinthecompany"] = 2024 - df["joiningyear"]
df["yearsinthecompany"] 

0        7
1       11
2       10
3        8
4        7
        ..
4648    11
4649    11
4650     6
4651    12
4652     9
Name: yearsinthecompany, Length: 4653, dtype: int64

In [6]:
del df["joiningyear"]

In [7]:
df.columns

Index(['education', 'city', 'paymenttier', 'age', 'gender', 'everbenched',
       'experienceincurrentdomain', 'leaveornot', 'yearsinthecompany'],
      dtype='object')

In [8]:
df.everbenched

0        No
1        No
2        No
3        No
4       Yes
       ... 
4648     No
4649     No
4650     No
4651    Yes
4652    Yes
Name: everbenched, Length: 4653, dtype: object

In [9]:
df.paymenttier.value_counts()

paymenttier
3    3492
2     918
1     243
Name: count, dtype: int64

Now let's map the payment tier as it is categorical value.

In [10]:
payemnet_tier_map = {
    1 :"first_tier",
    2 : "second_tier",
    3 : "third_tier"
}
df["paymenttier"] = df["paymenttier"].map(payemnet_tier_map)
df

Unnamed: 0,education,city,paymenttier,age,gender,everbenched,experienceincurrentdomain,leaveornot,yearsinthecompany
0,Bachelors,Bangalore,third_tier,34,Male,No,0,0,7
1,Bachelors,Pune,first_tier,28,Female,No,3,1,11
2,Bachelors,New Delhi,third_tier,38,Female,No,2,0,10
3,Masters,Bangalore,third_tier,27,Male,No,5,1,8
4,Masters,Pune,third_tier,24,Male,Yes,2,1,7
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,Bangalore,third_tier,26,Female,No,4,0,11
4649,Masters,Pune,second_tier,37,Male,No,2,1,11
4650,Masters,New Delhi,third_tier,27,Male,No,5,1,6
4651,Bachelors,Bangalore,third_tier,30,Male,Yes,2,0,12


In [11]:
df.dtypes

education                    object
city                         object
paymenttier                  object
age                           int64
gender                       object
everbenched                  object
experienceincurrentdomain     int64
leaveornot                    int64
yearsinthecompany             int64
dtype: object

In [12]:
categorical_columns = df.dtypes[df.dtypes==object].index
for column in categorical_columns:
    df[column] = df[column].str.lower().str.replace(" ","_")

df    

Unnamed: 0,education,city,paymenttier,age,gender,everbenched,experienceincurrentdomain,leaveornot,yearsinthecompany
0,bachelors,bangalore,third_tier,34,male,no,0,0,7
1,bachelors,pune,first_tier,28,female,no,3,1,11
2,bachelors,new_delhi,third_tier,38,female,no,2,0,10
3,masters,bangalore,third_tier,27,male,no,5,1,8
4,masters,pune,third_tier,24,male,yes,2,1,7
...,...,...,...,...,...,...,...,...,...
4648,bachelors,bangalore,third_tier,26,female,no,4,0,11
4649,masters,pune,second_tier,37,male,no,2,1,11
4650,masters,new_delhi,third_tier,27,male,no,5,1,6
4651,bachelors,bangalore,third_tier,30,male,yes,2,0,12


In [13]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df,test_size=0.2,random_state=1)
df_train, df_val = train_test_split(df_full_train,test_size=0.25,random_state=1)
print(len(df_train),len(df_test),len(df_val))

2791 931 931


In [14]:
df_full_train =df_full_train.reset_index(drop=True)
df_train =df_train.reset_index(drop=True)
df_val =df_val.reset_index(drop=True)
df_test =df_test.reset_index(drop=True)

In [15]:
y_train = df_train["leaveornot"]
y_val = df_val["leaveornot"]
y_test = df_test["leaveornot"]
y_full_train = df_full_train["leaveornot"]

del df_train["leaveornot"]
del df_val["leaveornot"]
del df_test["leaveornot"]
del df_full_train["leaveornot"]

In [16]:
df.isnull().sum()

education                    0
city                         0
paymenttier                  0
age                          0
gender                       0
everbenched                  0
experienceincurrentdomain    0
leaveornot                   0
yearsinthecompany            0
dtype: int64

In [17]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
encoded_df_train = dv.fit_transform(df_train.to_dict(orient="records"))
encoded_df_val = dv.transform(df_val.to_dict(orient="records"))

In [18]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=7,min_samples_leaf=3)
dt.fit(encoded_df_train,y_train)
y_predict = dt.predict(encoded_df_val)

In [19]:
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_val,y_predict)
roc_auc

0.8209991381303736

In [65]:
min_sample_leafes = range(1,20)
max_depths = range(1,20)
scores =[]
for mleafs in min_sample_leafes:
    for depth in max_depths:
        dt = DecisionTreeClassifier(max_depth=depth,min_samples_leaf=mleafs)
        dt.fit(encoded_df_train,y_train)
        y_predict = dt.predict(encoded_df_val)
        roc_auc = roc_auc_score(y_val,y_predict)
        scores.append((mleafs,depth,roc_auc))

scores_df = pd.DataFrame(scores,columns=["min_sample_leafes","max_depth","roc_auc_score"])
scores_df        

Unnamed: 0,min_sample_leafes,max_depth,roc_auc_score
0,1,1,0.619666
1,1,2,0.718497
2,1,3,0.716589
3,1,4,0.744295
4,1,5,0.799894
...,...,...,...
356,19,15,0.803933
357,19,16,0.803933
358,19,17,0.803933
359,19,18,0.803933


In [66]:
scores_df.sort_values(by="roc_auc_score",ascending=False)

Unnamed: 0,min_sample_leafes,max_depth,roc_auc_score
6,1,7,0.827715
25,2,7,0.823503
178,10,8,0.822200
180,10,10,0.821346
189,10,19,0.821346
...,...,...,...
57,4,1,0.619666
190,11,1,0.619666
209,12,1,0.619666
76,5,1,0.619666


 The best parameters are 7 for max_depth and min_leaf_samples 1

In [20]:
dt = DecisionTreeClassifier(max_depth=7,min_samples_leaf=1)
dt.fit(encoded_df_train,y_train)
y_predict = dt.predict(encoded_df_val)
roc_auc = roc_auc_score(y_val,y_predict)
roc_auc

0.8277154148535872

In [21]:
from sklearn.ensemble import RandomForestClassifier
rf_scores =[]
for n in range(10,201,10):
    for depth in range(1,16):
        rf = RandomForestClassifier(n_estimators=n,max_depth=depth,random_state=1)
        rf.fit(encoded_df_train,y_train)
        y_predict = rf.predict(encoded_df_val)
        roc_auc = roc_auc_score(y_val,y_predict)
        rf_scores.append((n,depth,roc_auc))
rf_scores_df = pd.DataFrame(rf_scores,columns=["n_estimators","max_depth","roc_auc_score"])   
rf_scores_df.sort_values(by="roc_auc_score",ascending=False)     

Unnamed: 0,n_estimators,max_depth,roc_auc_score
262,180,8,0.829134
53,40,9,0.828396
292,200,8,0.828338
99,70,10,0.828338
142,100,8,0.828338
...,...,...,...
285,200,1,0.542108
225,160,1,0.542108
240,170,1,0.542108
150,110,1,0.542108


So for the best model in random forest we use n_estimators 180 , max_depth = 8

In [22]:
rf = RandomForestClassifier(n_estimators=180,max_depth=8,random_state=1)
rf.fit(encoded_df_train,y_train)
y_predict = rf.predict(encoded_df_val)
rf_roc_auc = roc_auc_score(y_val,y_predict)
rf_roc_auc

0.8291343465556748

Now let's try xgboost

In [23]:
import xgboost as xgb

feature_names=list(dv.get_feature_names_out())
d_train_xgb = xgb.DMatrix(encoded_df_train,label=y_train,feature_names=feature_names)
d_val_xgb = xgb.DMatrix(encoded_df_val,label=y_val,feature_names=feature_names)

  if is_sparse(data):


In [24]:
xgb_scores=[]
for eta in [0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5]:
    for depth in range(1,16):
        x_params = {
            "eta" : eta,
            "max_depth" : depth,
            "min_child_weight":1,
            "objective":"binary:logistic",
            "eval_metric":"auc",
            "seed":1,
            "verbosity":1
        }
        xgb_model = xgb.train(x_params,d_train_xgb,num_boost_round=20)
        y_pred_xgb = xgb_model.predict(d_val_xgb)
        roc_xgb = roc_auc_score(y_val,y_pred_xgb)
        xgb_scores.append((eta,depth,roc_xgb))
xgb_scores = pd.DataFrame(xgb_scores,columns=["eta","max_depth","xgb_scores"])   
xgb_scores.sort_values(by="xgb_scores",ascending=False)  

Unnamed: 0,eta,max_depth,xgb_scores
66,0.30,7,0.893780
95,0.40,6,0.893470
110,0.45,6,0.891031
50,0.25,6,0.890679
81,0.35,7,0.889959
...,...,...,...
45,0.25,1,0.810883
60,0.30,1,0.810150
30,0.20,1,0.801003
15,0.15,1,0.791630


so the best xgb model is max_depth 7 and eta 0.3 

In [25]:
x_params = {
            "eta" : 0.3,
            "max_depth" : 7,
            "min_child_weight":1,
            "objective":"binary:logistic",
            "eval_metric":"auc",
            "seed":1,
            "verbosity":1
        }
chosen_xgb_model = xgb.train(x_params,d_train_xgb,num_boost_round=20)
y_pred_xgb = chosen_xgb_model.predict(d_val_xgb)
roc_xgb = roc_auc_score(y_val,y_pred_xgb)
roc_xgb

0.893779823842257

Comparing this model to the previous models of Decision Tree roc => 0.828  \
  and Random forest of roc => 0.829   
  it has a better roc score.

Now let's train it on the full train data set

In [26]:
full_dv = DictVectorizer(sparse=False)
encoded_df_full_train = full_dv.fit_transform(df_full_train.to_dict(orient="records"))
encoded_df_test = full_dv.transform(df_test.to_dict(orient="records"))

In [27]:
full_feature_names=list(full_dv.get_feature_names_out())
d_full_train_xgb = xgb.DMatrix(encoded_df_full_train,label=y_full_train,feature_names=full_feature_names)
d_test_xgb = xgb.DMatrix(encoded_df_test,label=y_test,feature_names=full_feature_names)

  if is_sparse(data):


In [28]:
x_params = {
            "eta" : 0.3,
            "max_depth" : 7,
            "min_child_weight":1,
            "objective":"binary:logistic",
            "eval_metric":"auc",
            "seed":1,
            "verbosity":1
        }
chosen_xgb_model = xgb.train(x_params,d_full_train_xgb,num_boost_round=20)
y_pred_xgb = chosen_xgb_model.predict(d_test_xgb)
roc_xgb = roc_auc_score(y_test,y_pred_xgb)
roc_xgb

0.866513330365534

The Total ROC score after training the model over the full data set has decreased a little but still has an accepted ROC

Now let's export the model using pickle

In [29]:
import pickle

pickle.dump(chosen_xgb_model,open("emloyeeleaveclassifier.pkl","wb"))

In [30]:
pickle.dump(full_dv,open("dictvictorizer.pkl","wb"))