In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv(r"resampled_data.csv")

df

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,41,2,1102,2,1,2,1,2,0,3,...,11,3,1,8.000000,1,6,4.000000,0.000000,5.000000,1
1,49,1,279,1,8,1,1,3,1,2,...,23,4,4,10.000000,3,10,7.000000,1.000000,7.000000,0
2,37,2,1373,1,2,2,4,4,1,2,...,15,3,2,7.000000,3,0,0.000000,0.000000,0.000000,1
3,33,1,1392,1,3,4,1,4,0,3,...,11,3,3,8.000000,3,8,7.000000,3.000000,0.000000,0
4,27,2,591,1,2,1,3,1,1,3,...,12,3,4,6.000000,3,2,2.000000,2.000000,2.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2029,33,1,699,1,5,1,2,2,0,2,...,11,3,3,7.932959,3,3,1.973184,0.986592,2.959776,1
2030,24,2,1409,1,13,3,3,1,0,3,...,14,3,3,3.984309,3,2,1.492154,0.000000,1.492154,1
2031,21,2,514,1,2,3,2,3,0,2,...,13,3,3,3.052710,1,2,2.161955,0.080978,1.838045,1
2032,22,1,295,0,18,1,0,3,0,1,...,11,3,3,0.818280,3,0,0.000000,0.000000,0.000000,1


In [4]:
X = df.drop('Attrition',axis=1)
y = df["Attrition"]
X_train ,X_test , y_train ,y_test = train_test_split(X , y ,random_state=30, shuffle=True ,test_size=0.20,stratify=y)

In [15]:
from sklearn.preprocessing import StandardScaler
SC=StandardScaler()
X_train = SC.fit_transform(X_train)
X_test = SC.transform(X_test)

### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [16]:
param_grid = {
    "penalty":['l1','l2','elasticnet'],
    "C":[0.1,1,10],
    "multi_class":['ovr','multinomial']
}
model = LogisticRegression()
grid_search = GridSearchCV(model ,param_grid ,cv=5,scoring="f1") 

In [17]:
# fit the model on training data
grid_search.fit(X_train, y_train)

In [18]:
# get best Hyperparameter and best model
best_params_grid = grid_search.best_params_
best_model_grid = grid_search.best_estimator_

#evaluate the best model on the test data
y_pred_grid = best_model_grid.predict(X_test)
accuracy_grid = accuracy_score(y_test , y_pred_grid)
print("The best Parameter (GridSearchCV) : ",best_params_grid)
print("BEST f1 score = ",grid_search.best_score_)
print("Test Accuracy : ",accuracy_grid)

The best Parameter (GridSearchCV) :  {'C': 0.1, 'multi_class': 'ovr', 'penalty': 'l2'}
BEST f1 score =  0.7346425214744192
Test Accuracy :  0.828009828009828


## Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor,plot_tree
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

In [20]:
y_pred = classifier.predict(X_test)

In [21]:
# Hyperparameter Tuning
parameters={'criterion':['gini','entropy'],
            'splitter':['best','random'],
            'max_depth':[2,3,4,5,6,7,8,9,10],
            'max_features':['sqrt','log2','auto']}

from sklearn.model_selection import GridSearchCV
GS_DT=GridSearchCV(classifier,param_grid=parameters,cv=5,scoring="f1")
GS_DT.fit(X_train, y_train)

In [22]:
best_params_grid =GS_DT.best_params_
best_model_grid = GS_DT.best_estimator_

y_pred_grid_DT = GS_DT.predict(X_test)
accuracy_grid_DT = accuracy_score(y_test , y_pred_grid_DT)
print("The best Parameter (GridSearchCV) : ",best_params_grid)
print("BEST f1 score = ",GS_DT.best_score_)
print("Test Accuracy : ",accuracy_grid_DT)


The best Parameter (GridSearchCV) :  {'criterion': 'gini', 'max_depth': 9, 'max_features': 'sqrt', 'splitter': 'best'}
BEST f1 score =  0.7211395320849043
Test Accuracy :  0.7371007371007371


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
RF = RandomForestClassifier(random_state=42)
parameter = {
    'criterion': ['gini', 'entropy'] ,
    'max_depth':[2,3,4,5,6,7,8,9,10],
    'max_features':[4,6,8,10,12,14,16,20,18]}
GS=GridSearchCV(RF,param_grid=parameter,cv=5,scoring="f1")
GS.fit(X_train,y_train)

In [None]:
best_params_grid =GS.best_params_
best_model_grid = GS.best_estimator_

y_pred_grid = GS.predict(X_test)
accuracy_grid = accuracy_score(y_test , y_pred_grid)
print("The best Parameter (GridSearchCV) : ",best_params_grid)
print("BEST f1 score = ",GS.best_score_)
print("Test Accuracy : ",accuracy_grid)


## SVM (Support Vector Machines)

In [24]:
from sklearn.svm import SVC
svm = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],       # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001], # Kernel coefficient
    'kernel': ['linear', 'rbf'] # Kernel type
}
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
grid_search_SVM = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='f1')

In [25]:
# Fit the model
grid_search_SVM.fit(X_train, y_train)

In [28]:
print("Best Parameters:", grid_search_SVM.best_params_)
print("Best Cross-Validation Score:", grid_search_SVM.best_score_)

y_pred_SVM = grid_search_SVM.predict(X_test)
accuracy_grid_SVM = accuracy_score(y_test , y_pred_SVM)
print("Test Accuracy : ",accuracy_grid_SVM)


Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best Cross-Validation Score: 0.8572289994583102
Test Accuracy :  0.8845208845208845
