### Import libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings("ignore")

### Load dataset

In [5]:
import os

# Location of the raw churn CSV. The original machine-specific path is kept as
# the fallback so existing runs behave exactly as before; set the
# CHURN_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DEFAULT_DATA_PATH = r"C:\Users\ritik\Downloads\employee_churn_dataset.csv"
DATA_PATH = os.environ.get("CHURN_DATA_PATH", DEFAULT_DATA_PATH)

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Employee ID,Age,Gender,Education Level,Marital Status,Tenure,Job Role,Department,Salary,Work Location,...,Training Hours,Promotions,Overtime Hours,Satisfaction Level,Work-Life Balance,Average Monthly Hours Worked,Absenteeism,Distance from Home,Manager Feedback Score,Churn
0,E00001,50,Male,Bachelor's,Married,5,Analyst,Marketing,93422,Remote,...,35,0,16,0.03,Poor,155,14,15,7.9,0
1,E00002,36,Male,Bachelor's,Married,4,Manager,Sales,44589,On-site,...,1,0,12,0.39,Average,162,2,8,5.2,0
2,E00003,29,Male,Bachelor's,Married,3,Sales,IT,56768,Hybrid,...,41,0,2,0.62,Poor,243,6,21,6.3,0
3,E00004,42,Male,Bachelor's,Single,12,Analyst,Sales,79009,On-site,...,33,0,8,0.73,Good,291,11,25,7.4,0
4,E00005,40,Other,Bachelor's,Married,1,Sales,HR,81982,On-site,...,30,0,13,0.52,Average,272,3,1,4.1,0


### Drop unnecessary columns

In [6]:
# Columns removed from the working frame before modelling.
columns_to_drop = [
    "Employee ID",
    "Job Role",
    "Education Level",
    "Gender",
    "Marital Status",
    "Work Location",
    "Distance from Home",
]

# `columns=` already targets the column axis, so no separate axis argument is
# required. Note that `df` is rebound here: re-running this cell without
# reloading the data raises a KeyError because the columns are already gone.
df = df.drop(columns=columns_to_drop)
df


Unnamed: 0,Age,Tenure,Department,Salary,Performance Rating,Projects Completed,Training Hours,Promotions,Overtime Hours,Satisfaction Level,Work-Life Balance,Average Monthly Hours Worked,Absenteeism,Manager Feedback Score,Churn
0,50,5,Marketing,93422,5,37,35,0,16,0.03,Poor,155,14,7.9,0
1,36,4,Sales,44589,2,43,1,0,12,0.39,Average,162,2,5.2,0
2,29,3,IT,56768,1,38,41,0,2,0.62,Poor,243,6,6.3,0
3,42,12,Sales,79009,1,39,33,0,8,0.73,Good,291,11,7.4,0
4,40,1,HR,81982,3,49,30,0,13,0.52,Average,272,3,4.1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,26,9,IT,35754,4,26,69,0,22,0.34,Poor,217,18,1.5,0
9996,41,5,IT,82506,1,2,76,1,20,0.96,Average,157,3,7.6,0
9997,35,4,HR,86380,1,48,72,0,32,0.60,Poor,241,18,5.3,0
9998,53,10,IT,90993,4,43,47,0,19,0.19,Excellent,265,4,1.4,0


### Check null values

In [7]:
# Per-column count of missing values (`isna` is the same check as `isnull`).
df.isna().sum()

Age                             0
Tenure                          0
Department                      0
Salary                          0
Performance Rating              0
Projects Completed              0
Training Hours                  0
Promotions                      0
Overtime Hours                  0
Satisfaction Level              0
Work-Life Balance               0
Average Monthly Hours Worked    0
Absenteeism                     0
Manager Feedback Score          0
Churn                           0
dtype: int64

### Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Replace the text "Department" column with integer codes and keep the fitted
# encoder, keyed by column name, so the same mapping can be reapplied later.
# This overwrites df["Department"] in place: re-running the cell without
# reloading the data would fit the encoder on the integer codes instead of the
# original labels and overwrite encoders.pkl with that mapping.
encoders = {}
le = LabelEncoder()
df["Department"] = le.fit_transform(df["Department"])
encoders["Department"] = le

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes (the bare open() call left the handle to the garbage
# collector). `pickle` is already imported in the notebook's import cell.
with open("encoders.pkl", "wb") as encoder_file:
    pickle.dump(encoders, encoder_file)




In [9]:
# Names of the model input columns. The same eight columns, in the same order,
# are selected into X further down the notebook.
FINAL_FEATURES = [
    "Age",
    "Tenure",
    "Salary",
    "Overtime Hours",
    "Satisfaction Level",
    "Promotions",
    "Manager Feedback Score",
    "Department",
]

# Quick sanity check on how many features were listed.
feature_count = len(FINAL_FEATURES)
print(feature_count)


8


In [10]:
# Preview the first rows of the frame after the Department column was encoded.
df.head()

Unnamed: 0,Age,Tenure,Department,Salary,Performance Rating,Projects Completed,Training Hours,Promotions,Overtime Hours,Satisfaction Level,Work-Life Balance,Average Monthly Hours Worked,Absenteeism,Manager Feedback Score,Churn
0,50,5,2,93422,5,37,35,0,16,0.03,Poor,155,14,7.9,0
1,36,4,3,44589,2,43,1,0,12,0.39,Average,162,2,5.2,0
2,29,3,1,56768,1,38,41,0,2,0.62,Poor,243,6,6.3,0
3,42,12,3,79009,1,39,33,0,8,0.73,Good,291,11,7.4,0
4,40,1,0,81982,3,49,30,0,13,0.52,Average,272,3,4.1,0


In [11]:
# Feature matrix and target. The column list is taken from FINAL_FEATURES
# (defined in an earlier cell with exactly these eight names in this order)
# instead of being typed out a second time, so the two cannot drift apart.
X = df[FINAL_FEATURES]
y = df["Churn"]


### Train - Test split

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Show the container type of the training features followed by its first rows,
# one item per line.
print(type(x_train), x_train.head(), sep="\n")



<class 'pandas.core.frame.DataFrame'>
      Age  Tenure  Salary  Overtime Hours  Satisfaction Level  Promotions  \
9254   31      11   83033              38                0.35           0   
1561   34       5   86685              11                0.32           0   
1670   50      11  115667              37                0.40           0   
6087   22       3  124943               3                0.48           0   
6669   49      14  119995              16                0.23           0   

      Manager Feedback Score  Department  
9254                     5.7           1  
1561                     2.0           3  
1670                     8.3           3  
6087                     2.6           3  
6669                     7.0           3  


In [14]:
# List the feature column names (and their order) present in the training split.
print(x_train.columns)

Index(['Age', 'Tenure', 'Salary', 'Overtime Hours', 'Satisfaction Level',
       'Promotions', 'Manager Feedback Score', 'Department'],
      dtype='object')


### Scaling Features

In [15]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

print("Scaler trained on:", scaler.n_features_in_)

# Write through a context manager so the file handle is closed as soon as the
# dump finishes. The file name (spelled "scalar.pkl") is left unchanged because
# code outside this notebook may load it by that name. `pickle` is already
# imported in the notebook's import cell.
# NOTE(review): the model cells visible in this notebook fit and predict on the
# unscaled x_train / x_test, so X_train_scaled / X_test_scaled are not consumed
# here. Confirm that whatever loads scalar.pkl does not scale inputs before
# passing them to a model trained on unscaled features.
with open("scalar.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)




Scaler trained on: 8


### Train model


In [16]:
from xgboost import XGBClassifier
import joblib

# Baseline classifier: default hyper-parameters apart from the positive-class
# weight and the evaluation metric. It is fitted on the unscaled training split.
baseline_settings = {"scale_pos_weight": 3, "eval_metric": "logloss"}
model = XGBClassifier(**baseline_settings)
model.fit(x_train, y_train)

# Persist the fitted model; the dump call's return value is the cell output.
joblib.dump(model, "xgboost_churn_model.pkl")


['xgboost_churn_model.pkl']

In [26]:
from sklearn.metrics import classification_report

# Score the baseline model on the held-out split and print the per-class
# precision / recall / F1 table.
y_pred = model.predict(x_test)
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.80      0.82      0.81      1604
           1       0.19      0.17      0.18       396

    accuracy                           0.69      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.68      0.69      0.69      2000



In [17]:
# Candidate values sampled by the randomized hyper-parameter search,
# keyed by XGBClassifier argument name.
xgboost_params = {
    "learning_rate": [0.1, 0.01],
    "max_depth": [5, 8, 12, 20, 30],
    "n_estimators": [100, 200, 300],
    "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4],
}




In [18]:
# (display name, estimator, search space) triples consumed by the search loop.
# The estimator carries the same class weighting and evaluation metric as the
# models fitted elsewhere in this notebook (scale_pos_weight=3,
# eval_metric="logloss"), so the hyper-parameters are tuned for the same
# configuration that is trained afterwards rather than for an unweighted model.
randomcv_model = [
    (
        "XGboost",
        XGBClassifier(scale_pos_weight=3, eval_metric="logloss"),
        xgboost_params,
    )
]


In [19]:
from sklearn.model_selection import RandomizedSearchCV

# Run a randomized hyper-parameter search for every (name, estimator, space)
# entry in randomcv_model and collect the best parameter set per model name.
model_param = {}
for name, estimator, param_grid in randomcv_model:
    # The search object keeps its original name `random` so any later cell that
    # refers to it continues to work.
    random = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid,
        n_iter=100,
        cv=3,
        # Without `scoring` a classifier is ranked by accuracy, which on this
        # roughly 80/20 target favours candidates that rarely predict churn.
        # Rank candidates by F1 of the positive (churn) class instead.
        scoring="f1",
        # Fix the sampling of candidates so repeated runs try the same
        # combinations (same seed as the train/test split).
        random_state=42,
        verbose=2,
        n_jobs=1,
    )
    random.fit(x_train, y_train)
    model_param[name] = random.best_params_

for model_name, best_params in model_param.items():
    print(f"-------------- Best params for {model_name} ----------------")
    print(best_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=5, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=5, n_estimators=300; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=5, n_estimators=300; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=1, learning_rate=0.01, max_depth=12, n_estimators=200; total time=   0.2s
[CV] END colsample_bytree=1, learning_rate=0.01, max_depth=12, n_estimators=200; total time=   0.2s
[CV] END colsample_bytree=1, learning_rate=0.01, max_depth=12, n_estimators=200; total time=   0.2s
[CV] END colsample_bytree=0.5, learni

In [20]:

import joblib

# Hyper-parameters for the second model, written out explicitly in this cell,
# plus the same class weighting and metric used for the baseline.
tuned_settings = {
    "n_estimators": 200,
    "max_depth": 30,
    "learning_rate": 0.01,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 3,
    "eval_metric": "logloss",
}
new_model = XGBClassifier(**tuned_settings)
new_model.fit(x_train, y_train)

# NOTE(review): this writes to the same file name as the baseline model cell,
# so the earlier artifact is replaced by this one.
joblib.dump(new_model, "xgboost_churn_model.pkl")
 


['xgboost_churn_model.pkl']

In [21]:
# Predict on the held-out split with the second model. This rebinds `y_pred`,
# replacing the baseline model's predictions stored under the same name.
y_pred = new_model.predict(x_test)

In [24]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for whatever predictions `y_pred`
# currently holds, measured against the held-out labels.
tuned_report = classification_report(y_test, y_pred)
print(tuned_report)

              precision    recall  f1-score   support

           0       0.80      0.97      0.88      1604
           1       0.20      0.04      0.06       396

    accuracy                           0.78      2000
   macro avg       0.50      0.50      0.47      2000
weighted avg       0.68      0.78      0.71      2000

