In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
df = pd.read_csv('employee.csv')

In [5]:
df.head()

Unnamed: 0,Employee_ID,Department,Gender,Age,Job_Title,Hire_Date,Years_At_Company,Education_Level,Performance_Score,Monthly_Salary,Work_Hours_Per_Week,Projects_Handled,Overtime_Hours,Sick_Days,Remote_Work_Frequency,Team_Size,Training_Hours,Promotions,Employee_Satisfaction_Score,Resigned
0,1,IT,Male,55,Specialist,2022-01-19 08:03:05.556036,2,High School,5,6750.0,33,32,22,2,0,14,66,0,2.63,False
1,2,Finance,Male,29,Developer,2024-04-18 08:03:05.556036,0,High School,5,7500.0,34,34,13,14,100,12,61,2,1.72,False
2,3,Finance,Male,55,Specialist,2015-10-26 08:03:05.556036,8,High School,3,5850.0,37,27,6,3,50,10,1,0,3.17,False
3,4,Customer Support,Female,48,Analyst,2016-10-22 08:03:05.556036,7,Bachelor,2,4800.0,52,10,28,12,100,10,0,1,1.86,False
4,5,Engineering,Female,36,Analyst,2021-07-23 08:03:05.556036,3,Bachelor,2,4800.0,38,11,29,13,100,15,9,1,1.25,False


In [6]:
df.drop(columns=['Employee_ID','Hire_Date'],inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Department                   100000 non-null  object 
 1   Gender                       100000 non-null  object 
 2   Age                          100000 non-null  int64  
 3   Job_Title                    100000 non-null  object 
 4   Years_At_Company             100000 non-null  int64  
 5   Education_Level              100000 non-null  object 
 6   Performance_Score            100000 non-null  int64  
 7   Monthly_Salary               100000 non-null  float64
 8   Work_Hours_Per_Week          100000 non-null  int64  
 9   Projects_Handled             100000 non-null  int64  
 10  Overtime_Hours               100000 non-null  int64  
 11  Sick_Days                    100000 non-null  int64  
 12  Remote_Work_Frequency        100000 non-null  int64  
 13  

In [8]:
df.describe()

Unnamed: 0,Age,Years_At_Company,Performance_Score,Monthly_Salary,Work_Hours_Per_Week,Projects_Handled,Overtime_Hours,Sick_Days,Remote_Work_Frequency,Team_Size,Training_Hours,Promotions,Employee_Satisfaction_Score
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.02941,4.47607,2.99543,6403.211,44.95695,24.43117,14.51493,7.00855,50.0905,10.01356,49.50606,0.99972,2.999088
std,11.244121,2.869336,1.414726,1372.508717,8.942003,14.469584,8.664026,4.331591,35.351157,5.495405,28.890383,0.815872,1.150719
min,22.0,0.0,1.0,3850.0,30.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,31.0,2.0,2.0,5250.0,37.0,12.0,7.0,3.0,25.0,5.0,25.0,0.0,2.01
50%,41.0,4.0,3.0,6500.0,45.0,24.0,15.0,7.0,50.0,10.0,49.0,1.0,3.0
75%,51.0,7.0,4.0,7500.0,53.0,37.0,22.0,11.0,75.0,15.0,75.0,2.0,3.99
max,60.0,10.0,5.0,9000.0,60.0,49.0,29.0,14.0,100.0,19.0,99.0,2.0,5.0


In [9]:
encoder = LabelEncoder()

columns_to_encode = ['Department','Gender','Job_Title','Resigned']

for col in columns_to_encode:
    df[col] = encoder.fit_transform(df[col])


In [10]:
df['Education_Level'].value_counts()

Education_Level
Bachelor       50041
High School    30004
Master         14904
PhD             5051
Name: count, dtype: int64

In [11]:
education_mapping = {
    'High School':0,
    'Bachelor':1,
    'Master':2,
    'PhD':3
}

df['Education_Level'] = df['Education_Level'].map(education_mapping)

In [12]:
df.head()

Unnamed: 0,Department,Gender,Age,Job_Title,Years_At_Company,Education_Level,Performance_Score,Monthly_Salary,Work_Hours_Per_Week,Projects_Handled,Overtime_Hours,Sick_Days,Remote_Work_Frequency,Team_Size,Training_Hours,Promotions,Employee_Satisfaction_Score,Resigned
0,4,1,55,5,2,0,5,6750.0,33,32,22,2,0,14,66,0,2.63,0
1,2,1,29,2,0,0,5,7500.0,34,34,13,14,100,12,61,2,1.72,0
2,2,1,55,5,8,0,3,5850.0,37,27,6,3,50,10,1,0,3.17,0
3,0,0,48,0,7,1,2,4800.0,52,10,28,12,100,10,0,1,1.86,0
4,1,0,36,0,3,1,2,4800.0,38,11,29,13,100,15,9,1,1.25,0


In [13]:
y = df['Resigned']
x = df.drop(columns=['Resigned'])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.3,random_state=42)

In [15]:
rf = RandomForestClassifier()
model = rf.fit(X_train,y_train)

y_pred = model.predict(X_test)

In [16]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     26969
           1       0.00      0.00      0.00      3031

    accuracy                           0.90     30000
   macro avg       0.45      0.50      0.47     30000
weighted avg       0.81      0.90      0.85     30000



In [17]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(), 
    'Naive Bayes': GaussianNB(),       
    'MLP Neural Network': MLPClassifier() 
}

In [18]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.90
Decision Tree Accuracy: 0.80
Random Forest Accuracy: 0.90
SVM Accuracy: 0.90
KNN Accuracy: 0.89
Gradient Boosting Accuracy: 0.90
XGBoost Accuracy: 0.90
AdaBoost Accuracy: 0.90
Naive Bayes Accuracy: 0.90
MLP Neural Network Accuracy: 0.90
