In [1]:
#Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import random
random.seed(42)
np.random.seed(42)
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from scipy.stats import uniform, randint


from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report,roc_auc_score,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Loading the processed dataset

In [2]:
df = pd.read_csv('processed_data.csv')
df

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,1,3,2,5,13,2,10,3,4,...,4,10,2,2,10,7,0.000000,8,0,3
1,47,1,3,2,5,13,2,14,4,4,...,4,20,2,3,7,7,0.693147,7,0,3
2,40,1,5,1,5,13,1,5,4,4,...,3,20,2,3,18,11,0.693147,10,0,4
3,41,1,0,0,2,8,2,10,4,2,...,2,23,2,2,20,6,2.397895,6,0,3
4,60,1,3,2,5,13,2,16,4,1,...,4,10,1,3,2,2,1.098612,2,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,27,0,4,0,5,13,1,3,1,4,...,2,6,3,3,6,5,0.000000,4,0,4
1196,37,1,5,2,4,15,2,10,2,4,...,1,4,2,3,1,0,0.000000,0,0,3
1197,50,1,4,1,4,15,2,28,1,4,...,3,20,3,3,20,8,1.386294,8,0,3
1198,34,0,4,2,0,1,2,9,3,4,...,2,9,3,4,8,7,2.079442,7,0,3


In [3]:
df.columns

Index(['Age', 'Gender', 'EducationBackground', 'MaritalStatus',
       'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency',
       'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
       'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel',
       'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime',
       'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
       'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
       'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
       'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'PerformanceRating'],
      dtype='object')

# splitting dataset into train and test

In [4]:
x = df.drop(columns = ['PerformanceRating'])

In [5]:
y = df['PerformanceRating']

In [6]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 42, stratify = y)

In [7]:
x_train.shape

(960, 26)

In [8]:
y_train.shape

(960,)

In [9]:
x_test.shape

(240, 26)

In [10]:
y_test.shape

(240,)

In [11]:
df.loc[x_test.index,'EmpDepartment']

811     4
1149    4
662     5
542     1
858     4
       ..
623     3
580     5
147     4
798     5
208     3
Name: EmpDepartment, Length: 240, dtype: int64

# Scaling

In [12]:
scaler = StandardScaler()

In [13]:
#scaling only continous columns
continous_columns = ['Age','DistanceFromHome','EmpHourlyRate','EmpLastSalaryHikePercent','TotalWorkExperienceInYears','TrainingTimesLastYear',
                        'ExperienceYearsAtThisCompany','ExperienceYearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

In [14]:
#scaling on splitted data to avoid data leakage
x_train[continous_columns] = scaler.fit_transform(x_train[continous_columns])


In [15]:
x_test[continous_columns] = scaler.transform(x_test[continous_columns])


# Balancing

In [16]:
sm = SMOTE()

In [17]:
x_sm,y_sm = sm.fit_resample(x_train,y_train)

In [18]:
from collections import Counter
print(Counter(y_train))
print(Counter(y_sm))

Counter({3: 699, 2: 155, 4: 106})
Counter({3: 699, 2: 699, 4: 699})


In [19]:
x_sm.shape #balanced data

(2097, 26)

In [20]:
x_test.shape  #unbalanced to reflect real-world problem

(240, 26)

In [21]:
y_sm.shape

(2097,)

In [22]:
y_test.shape

(240,)

# Models

# 1. Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
LR = LogisticRegression(multi_class = 'multinomial', solver ='lbfgs', max_iter = 1000, random_state = 42)

In [25]:
LR.fit(x_sm,y_sm)

###### Prediction

In [26]:
LR_y_train = LR.predict(x_sm)
LR_y_train

array([3, 3, 3, ..., 4, 4, 4], dtype=int64)

In [27]:
LR_y_pred = LR.predict(x_test)
LR_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 2,
       2, 3, 2, 4, 4, 3, 2, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2, 2, 3, 3, 3, 2, 4,
       3, 3, 3, 4, 4, 3, 3, 3, 4, 3, 3, 2, 4, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 4, 3, 4, 2, 4, 2, 4, 3, 2, 3, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 4,
       3, 4, 2, 2, 3, 4, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 4,
       4, 3, 2, 4, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

###### Evaluation

In [28]:
LR_acc_train = accuracy_score(y_sm,LR_y_train)
LR_acc_train

0.8822126847877921

In [29]:
print(classification_report(y_sm,LR_y_train))

              precision    recall  f1-score   support

           2       0.89      0.91      0.90       699
           3       0.87      0.82      0.84       699
           4       0.89      0.91      0.90       699

    accuracy                           0.88      2097
   macro avg       0.88      0.88      0.88      2097
weighted avg       0.88      0.88      0.88      2097



In [30]:
LR_acc_test = accuracy_score(y_test,LR_y_pred)
print('ACCURACY SCORE:',LR_acc_test)
LR_pre = precision_score(y_test,LR_y_pred,average = 'weighted')
print('PRECISION SCORE:',LR_pre)
LR_recall = recall_score(y_test,LR_y_pred, average = 'weighted')
print('RECALL SCORE:',LR_recall)
LR_f1 = f1_score(y_test,LR_y_pred, average ='weighted')
print('F1 SCORE:',LR_f1)
print('-----')
print('LR CLASSIFICATION REPORT:')
print(classification_report(y_test,LR_y_pred))

ACCURACY SCORE: 0.7791666666666667
PRECISION SCORE: 0.8073805756479807
RECALL SCORE: 0.7791666666666667
F1 SCORE: 0.7885665968999302
-----
LR CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.62      0.67      0.64        39
           3       0.90      0.81      0.85       175
           4       0.47      0.73      0.58        26

    accuracy                           0.78       240
   macro avg       0.66      0.74      0.69       240
weighted avg       0.81      0.78      0.79       240



In [31]:
pd.crosstab(y_test,LR_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,26,11,2
3,14,142,19
4,2,5,19


In [32]:
LR_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [LR_acc_test,LR_pre,LR_recall,LR_f1]})
LR_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.779167
1,Precision,0.807381
2,Recall,0.779167
3,F1 Score,0.788567


In [33]:
print("LOGISTIC REGRESSION METRICS:")
print(tabulate(LR_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

LOGISTIC REGRESSION METRICS:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.779167 |
+-----------+----------+
| Precision | 0.807381 |
+-----------+----------+
| Recall    | 0.779167 |
+-----------+----------+
| F1 Score  | 0.788567 |
+-----------+----------+


### HyperParameter Tuning in LR

In [34]:
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs', 'saga'],  # Optimization algorithms
    'max_iter': [100, 200],  # Maximum iterations
    'multi_class': ['ovr', 'multinomial'],  # Multi-class strategy
    'penalty': ['l2'],  # Regularization penalty
}


In [35]:
grid_search = GridSearchCV(estimator=LR, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)


In [36]:
grid_search.fit(x_sm,y_sm)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
grid_search.best_params_

{'C': 0.1,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'penalty': 'l2',
 'solver': 'lbfgs'}

In [38]:
LR1 = LogisticRegression(C= 0.1,
 max_iter = 100,
 multi_class = 'multinomial',
 penalty = 'l2',
 solver = 'saga')

In [39]:
LR1.fit(x_sm,y_sm)



In [40]:
LR1_y_pred = LR1.predict(x_test)
LR1_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2,
       2, 3, 2, 4, 4, 3, 2, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 2, 3, 3, 3, 3, 2, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2, 2, 3, 3, 3, 2, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 4, 3, 3, 2, 4, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 4, 3, 4, 2, 4, 2, 4, 3, 2, 3, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 4,
       3, 4, 2, 2, 3, 4, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 3, 4, 3, 3, 2, 2, 2, 3, 3, 4, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [41]:
LR1_acc_test = accuracy_score(y_test,LR1_y_pred)
print('ACCURACY SCORE:',LR1_acc_test)
LR1_pre = precision_score(y_test,LR1_y_pred,average = 'weighted')
print('PRECISION SCORE:',LR1_pre)
LR1_recall = recall_score(y_test,LR1_y_pred, average = 'weighted')
print('RECALL SCORE:',LR1_recall)
LR1_f1 = f1_score(y_test,LR1_y_pred, average ='weighted')
print('F1 SCORE:',LR1_f1)
print('-----')
print('LR CLASSIFICATION REPORT:')
print(classification_report(y_test,LR1_y_pred))

ACCURACY SCORE: 0.7708333333333334
PRECISION SCORE: 0.7979665472802799
RECALL SCORE: 0.7708333333333334
F1 SCORE: 0.7799111916794844
-----
LR CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.58      0.64      0.61        39
           3       0.89      0.81      0.85       175
           4       0.49      0.73      0.58        26

    accuracy                           0.77       240
   macro avg       0.65      0.73      0.68       240
weighted avg       0.80      0.77      0.78       240



In [42]:
pd.crosstab(y_test,LR1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,25,12,2
3,16,141,18
4,2,5,19


In [43]:
LR1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [LR1_acc_test,LR1_pre,LR1_recall,LR1_f1]})
LR1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.770833
1,Precision,0.797967
2,Recall,0.770833
3,F1 Score,0.779911


In [44]:
print("LOGISTIC REGRESSION METRICS AFTER HYPERPARAMATERTUNING:")
print(tabulate(LR1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

LOGISTIC REGRESSION METRICS AFTER HYPERPARAMATERTUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.770833 |
+-----------+----------+
| Precision | 0.797967 |
+-----------+----------+
| Recall    | 0.770833 |
+-----------+----------+
| F1 Score  | 0.779911 |
+-----------+----------+


- Logistic Regression showed decent performance before and after tuning, but it still falls short when compared to more complex models. Although hyperparameter tuning slightly improved precision and recall, it's still less effective in capturing the complexity of employee performance.

# 2. SVM

In [45]:
from sklearn.svm import SVC
sv = SVC(kernel = 'linear', decision_function_shape='ovr',random_state=42)

In [46]:
sv.fit(x_sm,y_sm)

###### Prediction

In [47]:
SV_y_train = sv.predict(x_sm)
SV_y_train

array([3, 3, 3, ..., 4, 4, 4], dtype=int64)

In [48]:
SV_y_pred = sv.predict(x_test)
SV_y_pred

array([3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 2,
       2, 3, 2, 4, 4, 3, 2, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 2, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2, 2, 2, 3, 4, 2, 4,
       3, 3, 3, 4, 4, 3, 3, 3, 4, 3, 3, 2, 3, 3, 4, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 4, 3, 4, 3, 4, 3, 4, 3, 2, 3, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 4,
       3, 4, 3, 2, 3, 4, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 4,
       4, 3, 2, 4, 3, 3, 2, 2, 2, 3, 3, 4, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

###### Evaluation

In [49]:
SV_acc_train = accuracy_score(y_sm,SV_y_train)
SV_acc_train

0.8845970433953266

In [50]:
SV_acc_test = accuracy_score(y_test,SV_y_pred)
print('ACCURACY SCORE:',SV_acc_test)
SV_pre = precision_score(y_test,SV_y_pred, average='weighted')
print('PRECISION SCORE:',SV_pre)
SV_recall = recall_score(y_test,SV_y_pred, average = 'weighted')
print('RECALL SCORE:',SV_recall)
SV_f1 = f1_score(y_test,SV_y_pred, average = 'weighted')
print('F1 SCORE:',SV_f1)
print('-----')
print('SVM CLASSIFICATION REPORT:')
print(classification_report(y_test,SV_y_pred))

ACCURACY SCORE: 0.7708333333333334
PRECISION SCORE: 0.8052973394224102
RECALL SCORE: 0.7708333333333334
F1 SCORE: 0.7821045598172717
-----
SVM CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.63      0.67      0.65        39
           3       0.90      0.80      0.85       175
           4       0.44      0.73      0.55        26

    accuracy                           0.77       240
   macro avg       0.66      0.73      0.68       240
weighted avg       0.81      0.77      0.78       240



In [51]:
pd.crosstab(y_test,SV_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,26,10,3
3,14,140,21
4,1,6,19


In [52]:
SV_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [SV_acc_test,SV_pre,SV_recall,SV_f1]})
SV_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.770833
1,Precision,0.805297
2,Recall,0.770833
3,F1 Score,0.782105


In [53]:
print("SVM METRICS:")
print(tabulate(SV_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

SVM METRICS:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.770833 |
+-----------+----------+
| Precision | 0.805297 |
+-----------+----------+
| Recall    | 0.770833 |
+-----------+----------+
| F1 Score  | 0.782105 |
+-----------+----------+


### HyperParamater Tuning In SVM:

In [54]:
param_dist = {
    'C': uniform(0.1, 10),  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [3, 4, 5],  # Degree of the polynomial kernel function (only for poly)
    'class_weight': ['balanced', None],  # Class weights for handling imbalance
    'probability': [True],  # Enable probability estimates
}


In [55]:
random_search = RandomizedSearchCV(estimator=sv, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, verbose=1)


In [56]:
random_search.fit(x_sm,y_sm)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [57]:
random_search.best_params_

{'C': 7.044504561307088,
 'class_weight': None,
 'degree': 4,
 'gamma': 'auto',
 'kernel': 'rbf',
 'probability': True}

In [58]:
sv1 = SVC(C = 6.474299014982066,
class_weight = None,
 degree = 3,
 gamma = 'auto',
 kernel = 'rbf',
 probability = True)

In [59]:
sv1.fit(x_sm,y_sm)

In [60]:
SV1_y_pred = sv1.predict(x_test)
SV1_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 2, 4, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 4,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 4, 3, 2, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 4,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3,
       4, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3],
      dtype=int64)

In [61]:
SV1_acc_test = accuracy_score(y_test,SV1_y_pred)
print('ACCURACY SCORE:',SV1_acc_test)
SV1_pre = precision_score(y_test,SV1_y_pred, average='weighted')
print('PRECISION SCORE:',SV1_pre)
SV1_recall = recall_score(y_test,SV1_y_pred, average = 'weighted')
print('RECALL SCORE:',SV1_recall)
SV1_f1 = f1_score(y_test,SV1_y_pred, average = 'weighted')
print('F1 SCORE:',SV1_f1)
print('-----')
print('SVM CLASSIFICATION REPORT:')
print(classification_report(y_test,SV1_y_pred))

ACCURACY SCORE: 0.8
PRECISION SCORE: 0.7865755208333333
RECALL SCORE: 0.8
F1 SCORE: 0.7883429724325353
-----
SVM CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.63      0.49      0.55        39
           3       0.84      0.92      0.88       175
           4       0.67      0.46      0.55        26

    accuracy                           0.80       240
   macro avg       0.71      0.62      0.66       240
weighted avg       0.79      0.80      0.79       240



In [62]:
pd.crosstab(y_test,SV1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,19,19,1
3,9,161,5
4,2,12,12


In [63]:
SV1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [SV1_acc_test,SV1_pre,SV1_recall,SV1_f1]})
SV1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8
1,Precision,0.786576
2,Recall,0.8
3,F1 Score,0.788343


In [64]:
print("SVM METRICS AFTER HYPERPARAMETER TUNING:")
print(tabulate(SV1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

SVM METRICS AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8      |
+-----------+----------+
| Precision | 0.786576 |
+-----------+----------+
| Recall    | 0.8      |
+-----------+----------+
| F1 Score  | 0.788343 |
+-----------+----------+


- Hyperparameter tuning improved the SVM model's accuracy, recall, and precision slightly. However, SVM still struggles to outperform ensemble methods, making it moderately useful for this task.

# 3. Decision Tree

In [65]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)

In [66]:
dt.fit(x_sm,y_sm)

###### Prediction

In [67]:
dt_y_train= dt.predict(x_sm)
dt_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [68]:
dt_y_pred = dt.predict(x_test)
dt_y_pred

array([3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2,
       3, 3, 3, 4, 4, 3, 3, 3, 3, 4, 2, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 2, 3, 2, 2, 3, 3, 4, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 3, 4, 3, 2, 3, 4, 3, 3, 3, 3,
       4, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3,
       4, 3, 2, 4, 2, 3, 4, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3],
      dtype=int64)

###### Evaluation

In [69]:
dt_acc_train = accuracy_score(y_sm,dt_y_train)
dt_acc_train

1.0

In [70]:
print(classification_report(y_sm,dt_y_train))

              precision    recall  f1-score   support

           2       1.00      1.00      1.00       699
           3       1.00      1.00      1.00       699
           4       1.00      1.00      1.00       699

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



In [71]:
dt_acc_test = accuracy_score(y_test,dt_y_pred)
print('ACCURACY SCORE:',dt_acc_test)
dt_pre = precision_score(y_test,dt_y_pred, average = 'weighted')
print('PRECISION SCORE:',dt_pre)
dt_recall = recall_score(y_test,dt_y_pred,average = 'weighted')
print('RECALL SCORE:',dt_recall)
dt_f1 = f1_score(y_test,dt_y_pred, average = 'weighted')
print('F1 SCORE:',dt_f1)
print('----')
print('DECISION TREE CLASSIFICATION REPORT:')
print(classification_report(y_test,dt_y_pred))

ACCURACY SCORE: 0.8625
PRECISION SCORE: 0.8640158045977011
RECALL SCORE: 0.8625
F1 SCORE: 0.8626482808022922
----
DECISION TREE CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.75      0.69      0.72        39
           3       0.91      0.91      0.91       175
           4       0.70      0.81      0.75        26

    accuracy                           0.86       240
   macro avg       0.79      0.80      0.79       240
weighted avg       0.86      0.86      0.86       240



In [72]:
pd.crosstab(y_test,dt_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,27,11,1
3,8,159,8
4,1,4,21


In [73]:
dt_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [dt_acc_test,dt_pre,dt_recall,dt_f1]})
dt_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8625
1,Precision,0.864016
2,Recall,0.8625
3,F1 Score,0.862648


In [74]:
print("DECISION TREE METRICS:")
print(tabulate(dt_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

DECISION TREE METRICS:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8625   |
+-----------+----------+
| Precision | 0.864016 |
+-----------+----------+
| Recall    | 0.8625   |
+-----------+----------+
| F1 Score  | 0.862648 |
+-----------+----------+


### Hyperparameter tuning in dt

In [75]:
from sklearn.model_selection import GridSearchCV

In [76]:
params = {
    "criterion":("gini", "entropy"),
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4],    
    "min_samples_leaf":list(range(1, 20))
}

In [77]:
grid = GridSearchCV(dt, params, scoring='accuracy', cv = 5, verbose = 3, n_jobs=-1)

In [78]:
grid.fit(x_sm, y_sm)

Fitting 5 folds for each of 4332 candidates, totalling 21660 fits


In [79]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'splitter': 'best'}

In [80]:
dt1 = DecisionTreeClassifier(criterion= 'entropy',
 max_depth= 12,
 min_samples_leaf= 1,
 min_samples_split= 4,
 splitter= 'best')

In [81]:
dt1.fit(x_sm,y_sm)

In [82]:
dt1_y_pred = dt1.predict(x_test)
dt1_y_pred

array([3, 2, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2,
       3, 3, 4, 2, 4, 3, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 4, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3,
       3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 2, 3, 2, 2, 3, 3, 4, 3, 3,
       3, 4, 3, 2, 3, 4, 3, 3, 3, 2, 2, 2, 3, 4, 3, 2, 3, 4, 3, 3, 3, 3,
       4, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 2, 3,
       4, 3, 2, 4, 3, 3, 4, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3],
      dtype=int64)

In [83]:
dt1_acc_test = accuracy_score(y_test,dt1_y_pred)
print('ACCURACY SCORE:',dt1_acc_test)
dt1_pre = precision_score(y_test,dt1_y_pred, average = 'weighted')
print('PRECISION SCORE:',dt1_pre)
dt1_recall = recall_score(y_test,dt1_y_pred,average = 'weighted')
print('RECALL SCORE:',dt1_recall)
dt1_f1 = f1_score(y_test,dt1_y_pred, average = 'weighted')
print('F1 SCORE:',dt1_f1)
print('----')
print('DECISION TREE CLASSIFICATION REPORT:')
print(classification_report(y_test,dt1_y_pred))

ACCURACY SCORE: 0.8625
PRECISION SCORE: 0.8703057907005276
RECALL SCORE: 0.8625
F1 SCORE: 0.865079229993499
----
DECISION TREE CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.79      0.77      0.78        39
           3       0.92      0.89      0.91       175
           4       0.64      0.81      0.71        26

    accuracy                           0.86       240
   macro avg       0.78      0.82      0.80       240
weighted avg       0.87      0.86      0.87       240



In [84]:
pd.crosstab(y_test,dt1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,30,9,0
3,7,156,12
4,1,4,21


In [85]:
dt1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [dt1_acc_test,dt1_pre,dt1_recall,dt1_f1]})
dt1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.8625
1,Precision,0.870306
2,Recall,0.8625
3,F1 Score,0.865079


In [86]:
print("DECISION TREE METRICS AFTER HYPERPARAMETER TUNING:")
print(tabulate(dt_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

DECISION TREE METRICS AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.8625   |
+-----------+----------+
| Precision | 0.864016 |
+-----------+----------+
| Recall    | 0.8625   |
+-----------+----------+
| F1 Score  | 0.862648 |
+-----------+----------+


- The Decision Tree performed consistently before and after hyperparameter tuning. While the results were stable, the model didn’t show significant improvement, but its interpretability and stable performance make it useful for predicting employee performance.

# 4. Random Forest

In [87]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100)

In [88]:
rf.fit(x_sm,y_sm)

###### Prediction

In [89]:
rf_y_train= rf.predict(x_sm)
rf_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [90]:
rf_y_pred = rf.predict(x_test)
rf_y_pred

array([3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 2, 4, 3, 2, 3, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

###### Evaluation

In [91]:
rf_acc_train = accuracy_score(y_sm,rf_y_train)
rf_acc_train

1.0

In [92]:
print(classification_report(y_sm,rf_y_train))

              precision    recall  f1-score   support

           2       1.00      1.00      1.00       699
           3       1.00      1.00      1.00       699
           4       1.00      1.00      1.00       699

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



In [93]:
rf_acc_test = accuracy_score(y_test,rf_y_pred)
print('ACCURACY SCORE:',rf_acc_test)
rf_pre = precision_score(y_test,rf_y_pred, average = 'weighted')
print('PRECISION SCORE:',rf_pre)
rf_recall = recall_score(y_test,rf_y_pred, average = 'weighted')
print('RECALL SCORE:',rf_recall)
rf_f1 = f1_score(y_test,rf_y_pred, average = 'weighted')
print('F1 SCORE:',rf_f1)
print('------')
print('RANDOM FOREST CLASSIFICATION REPORT:')
print(classification_report(y_test ,rf_y_pred))

ACCURACY SCORE: 0.9333333333333333
PRECISION SCORE: 0.9327388790168278
RECALL SCORE: 0.9333333333333333
F1 SCORE: 0.9325736175312447
------
RANDOM FOREST CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.89      0.87      0.88        39
           3       0.94      0.97      0.95       175
           4       0.91      0.81      0.86        26

    accuracy                           0.93       240
   macro avg       0.92      0.88      0.90       240
weighted avg       0.93      0.93      0.93       240



In [94]:
pd.crosstab(y_test,rf_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,34,5,0
3,4,169,2
4,0,5,21


In [95]:
rf_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [rf_acc_test,rf_pre,rf_recall,rf_f1]})
rf_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.933333
1,Precision,0.932739
2,Recall,0.933333
3,F1 Score,0.932574


In [96]:
print("RANDOM FOREST METRICS")
print(tabulate(rf_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

RANDOM FOREST METRICS
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.933333 |
+-----------+----------+
| Precision | 0.932739 |
+-----------+----------+
| Recall    | 0.933333 |
+-----------+----------+
| F1 Score  | 0.932574 |
+-----------+----------+


### Hyperparameter Tuning in Random Forest

In [97]:
n_estimators = [50,100,200]
max_features = ['auto', 'sqrt']
max_depth = [5,10,None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False] 

In [98]:
paramgrid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}

In [99]:
gridsearch = GridSearchCV(rf, paramgrid, scoring='accuracy', cv = 3, verbose = 3, n_jobs=-1)

In [100]:
gridsearch.fit(x_sm, y_sm)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


  warn(


In [101]:
gridsearch.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [102]:
rf1 = RandomForestClassifier(bootstrap = False,
 max_depth = None,
 max_features = 'auto',
 min_samples_leaf = 1,
 min_samples_split = 2,
 n_estimators = 50)

In [103]:
rf1.fit(x_sm,y_sm)

  warn(


In [104]:
rf1_y_pred = rf1.predict(x_test)
rf1_y_pred

array([3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [105]:
rf1_acc_test = accuracy_score(y_test,rf1_y_pred)
print('ACCURACY SCORE:',rf1_acc_test)
rf1_pre = precision_score(y_test,rf1_y_pred, average = 'weighted')
print('PRECISION SCORE:',rf1_pre)
rf1_recall = recall_score(y_test,rf1_y_pred,average = 'weighted')
print('RECALL SCORE:',rf1_recall)
rf1_f1 = f1_score(y_test,rf1_y_pred, average = 'weighted')
print('F1 SCORE:',rf1_f1)
print('----')
print('DECISION TREE CLASSIFICATION REPORT:')
print(classification_report(y_test,rf1_y_pred))

ACCURACY SCORE: 0.9166666666666666
PRECISION SCORE: 0.9154987373737373
RECALL SCORE: 0.9166666666666666
F1 SCORE: 0.9149705882352941
----
DECISION TREE CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.86      0.79      0.83        39
           3       0.93      0.97      0.95       175
           4       0.91      0.77      0.83        26

    accuracy                           0.92       240
   macro avg       0.90      0.84      0.87       240
weighted avg       0.92      0.92      0.91       240



In [106]:
pd.crosstab(y_test,rf1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,31,8,0
3,4,169,2
4,1,5,20


In [107]:
rf1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [rf1_acc_test,rf1_pre,rf1_recall,rf1_f1]})
rf1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.916667
1,Precision,0.915499
2,Recall,0.916667
3,F1 Score,0.914971


In [108]:
print("RANDOM FOREST METRICS AFTER HYPERPARAMETER TUNING")
print(tabulate(rf1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

RANDOM FOREST METRICS AFTER HYPERPARAMETER TUNING
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.916667 |
+-----------+----------+
| Precision | 0.915499 |
+-----------+----------+
| Recall    | 0.916667 |
+-----------+----------+
| F1 Score  | 0.914971 |
+-----------+----------+


- Random Forest showed excellent performance both before and after hyperparameter tuning, though the accuracy slightly decreased after tuning. Despite this, it remains one of the top performers for this task due to its strong generalization and ability to handle complex patterns. Highly recommended for employee performance prediction.

# 5 . KNN

In [109]:
from sklearn.neighbors import KNeighborsClassifier

In [110]:
knn = KNeighborsClassifier(n_neighbors = 5)

In [111]:
knn.fit(x_sm,y_sm)

###### Prediction

In [112]:
knn_y_train = knn.predict(x_sm)
knn_y_train

array([3, 2, 4, ..., 4, 4, 4], dtype=int64)

In [113]:
knn_y_pred = knn.predict(x_test)
knn_y_pred

array([3, 3, 3, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, 3, 3, 2, 4, 3, 4, 3, 2, 2,
       3, 3, 2, 4, 2, 3, 2, 3, 4, 4, 3, 3, 3, 2, 3, 3, 4, 3, 3, 3, 3, 4,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 3, 3, 3, 3, 2, 3, 2, 4, 4, 3, 2, 2,
       3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 4, 3, 3, 2, 4,
       3, 3, 2, 2, 4, 4, 3, 4, 4, 3, 4, 2, 3, 4, 2, 2, 2, 3, 2, 2, 3, 3,
       4, 3, 4, 4, 3, 4, 4, 3, 3, 3, 3, 4, 2, 2, 3, 3, 2, 4, 3, 4, 3, 3,
       3, 4, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 4, 2, 3, 2, 3, 4, 3, 4,
       2, 3, 2, 2, 3, 4, 3, 3, 3, 2, 4, 4, 3, 4, 4, 2, 4, 4, 2, 2, 3, 3,
       3, 2, 2, 2, 3, 3, 4, 3, 3, 2, 4, 3, 4, 2, 3, 4, 3, 3, 3, 2, 3, 3,
       4, 2, 2, 3, 3, 3, 3, 2, 3, 3, 3, 4, 3, 3, 2, 3, 3, 2, 2, 2, 2, 3,
       4, 2, 3, 4, 2, 3, 4, 3, 4, 3, 3, 4, 2, 3, 2, 3, 3, 4, 3, 2],
      dtype=int64)

###### Evaluation

In [114]:
knn_acc_train  = accuracy_score(y_sm,knn_y_train)
knn_acc_train

0.9012875536480687

In [115]:
print(classification_report(y_sm,knn_y_train))

              precision    recall  f1-score   support

           2       0.87      1.00      0.93       699
           3       1.00      0.71      0.83       699
           4       0.87      1.00      0.93       699

    accuracy                           0.90      2097
   macro avg       0.91      0.90      0.90      2097
weighted avg       0.91      0.90      0.90      2097



In [116]:
knn_acc_test = accuracy_score(y_test,knn_y_pred)
print('ACCURACY SCORE:',knn_acc_test)
knn_pre = precision_score(y_test,knn_y_pred, average ='weighted')
print('PRECISION SCORE:',knn_pre)
knn_recall = recall_score(y_test,knn_y_pred, average ='weighted')
print('RECALL SCORE:',knn_recall)
knn_f1 = f1_score(y_test,knn_y_pred, average = 'weighted')
print('F1 SCORE:',knn_f1)
print('------')
print('KNN CLASSIFICATION REPORT:')
print(classification_report(y_test,knn_y_pred))

ACCURACY SCORE: 0.6291666666666667
PRECISION SCORE: 0.7511863104066248
RECALL SCORE: 0.6291666666666667
F1 SCORE: 0.658408283322279
------
KNN CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.40      0.64      0.50        39
           3       0.89      0.61      0.73       175
           4       0.33      0.73      0.45        26

    accuracy                           0.63       240
   macro avg       0.54      0.66      0.56       240
weighted avg       0.75      0.63      0.66       240



In [117]:
pd.crosstab(y_test,knn_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,25,10,4
3,33,107,35
4,4,3,19


In [118]:
knn_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [knn_acc_test,knn_pre,knn_recall,knn_f1]})
knn_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.629167
1,Precision,0.751186
2,Recall,0.629167
3,F1 Score,0.658408


In [119]:
print("KNN EVALUATION METRIC:")
print(tabulate(knn_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

KNN EVALUATION METRIC:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.629167 |
+-----------+----------+
| Precision | 0.751186 |
+-----------+----------+
| Recall    | 0.629167 |
+-----------+----------+
| F1 Score  | 0.658408 |
+-----------+----------+


### HyperParameter Tuning in KNN

In [120]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(x_sm, y_sm)

print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [121]:
knn1 = KNeighborsClassifier(n_neighbors = 3, metric ='manhattan',weights ='distance')

In [122]:
knn1.fit(x_sm,y_sm)

In [123]:
knn1_y_pred = knn1.predict(x_test)
knn1_y_pred

array([3, 3, 3, 3, 3, 4, 2, 3, 4, 3, 4, 3, 4, 3, 3, 3, 3, 3, 4, 3, 3, 2,
       3, 3, 2, 4, 3, 3, 3, 3, 3, 4, 3, 3, 2, 2, 3, 3, 4, 3, 3, 3, 3, 4,
       3, 4, 2, 3, 4, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 2, 4, 4, 3, 3, 3,
       2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 4, 3, 2, 2, 3, 2, 3, 4, 3, 3, 3, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 4, 3, 4, 2, 3, 4, 2, 3, 3, 3, 2, 2, 3, 4,
       2, 3, 4, 4, 2, 4, 4, 3, 3, 3, 4, 4, 2, 2, 3, 3, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 4, 3, 2, 2, 2, 4, 3, 3, 2, 3, 2, 3, 3, 3, 4,
       3, 3, 3, 2, 3, 3, 3, 3, 2, 2, 4, 4, 3, 4, 3, 2, 4, 2, 2, 2, 3, 4,
       3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 3, 4, 3, 3, 3, 3, 3, 3,
       2, 2, 2, 3, 3, 3, 3, 2, 3, 3, 3, 4, 3, 3, 2, 3, 3, 4, 2, 2, 2, 3,
       4, 2, 2, 3, 3, 3, 4, 3, 4, 3, 3, 4, 2, 3, 2, 3, 3, 4, 3, 2],
      dtype=int64)

In [124]:
knn1_acc_test = accuracy_score(y_test,knn1_y_pred)
print('ACCURACY SCORE:',knn1_acc_test)
knn1_pre = precision_score(y_test,knn1_y_pred, average = 'weighted')
print('PRECISION SCORE:',knn1_pre)
knn1_recall = recall_score(y_test,knn1_y_pred,average = 'weighted')
print('RECALL SCORE:',knn1_recall)
knn1_f1 = f1_score(y_test,knn1_y_pred, average = 'weighted')
print('F1 SCORE:',knn1_f1)
print('----')
print('KNN CLASSIFICATION REPORT:')
print(classification_report(y_test,knn1_y_pred))

ACCURACY SCORE: 0.675
PRECISION SCORE: 0.7454780312308634
RECALL SCORE: 0.675
F1 SCORE: 0.6988582402817418
----
KNN CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.48      0.64      0.55        39
           3       0.87      0.71      0.78       175
           4       0.28      0.50      0.36        26

    accuracy                           0.68       240
   macro avg       0.55      0.62      0.56       240
weighted avg       0.75      0.68      0.70       240



In [125]:
pd.crosstab(y_test,knn1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,25,10,4
3,22,124,29
4,5,8,13


In [126]:
knn1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [knn1_acc_test,knn1_pre,knn1_recall,knn1_f1]})
knn1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.675
1,Precision,0.745478
2,Recall,0.675
3,F1 Score,0.698858


In [127]:
print("KNN EVALUATION METRIC AFTER HYPERPARAMETER:")
print(tabulate(knn1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

KNN EVALUATION METRIC AFTER HYPERPARAMETER:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.675    |
+-----------+----------+
| Precision | 0.745478 |
+-----------+----------+
| Recall    | 0.675    |
+-----------+----------+
| F1 Score  | 0.698858 |
+-----------+----------+


- Even after hyperparameter tuning, KNN failed to provide a significant improvement. Its low accuracy and recall make it not useful for predicting employee performance, and it is not recommended for this task.

# 6. Gradient Boosting Classifier

In [128]:
from sklearn.ensemble import GradientBoostingClassifier

In [129]:
gbc = GradientBoostingClassifier()

In [130]:
gbc.fit(x_sm,y_sm)

###### Prediction

In [131]:
gbc_y_train = gbc.predict(x_sm)
gbc_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [132]:
gbc_y_pred = gbc.predict(x_test)
gbc_y_pred

array([3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3],
      dtype=int64)

###### Evaluation

In [133]:
gbc_acc_train  = accuracy_score(y_sm,gbc_y_train)
gbc_acc_train

0.9918931807343825

In [134]:
print(classification_report(y_sm,gbc_y_train))

              precision    recall  f1-score   support

           2       0.99      1.00      0.99       699
           3       0.99      0.98      0.99       699
           4       1.00      0.99      0.99       699

    accuracy                           0.99      2097
   macro avg       0.99      0.99      0.99      2097
weighted avg       0.99      0.99      0.99      2097



In [135]:
gbc_acc_test = accuracy_score(y_test,gbc_y_pred)
print('ACCURACY SCORE:', gbc_acc_test)
gbc_pre = precision_score(y_test,gbc_y_pred, average = 'weighted')
print('PRECISION SCORE:',gbc_pre)
gbc_recall = recall_score(y_test,gbc_y_pred, average ='weighted')
print('RECALL SCORE:',gbc_recall)
gbc_f1 = f1_score(y_test,gbc_y_pred, average = 'weighted')
print('F1 SCORE:',gbc_f1)
print('------')
print('KNN CLASSIFICATION REPORT:')
print(classification_report(y_test,gbc_y_pred))

ACCURACY SCORE: 0.925
PRECISION SCORE: 0.9242414061474834
RECALL SCORE: 0.925
F1 SCORE: 0.9236948879689861
------
KNN CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.89      0.85      0.87        39
           3       0.93      0.97      0.95       175
           4       0.91      0.77      0.83        26

    accuracy                           0.93       240
   macro avg       0.91      0.86      0.88       240
weighted avg       0.92      0.93      0.92       240



In [136]:
pd.crosstab(y_test,gbc_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,33,6,0
3,4,169,2
4,0,6,20


In [137]:
gbc_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [gbc_acc_test,gbc_pre,gbc_recall,gbc_f1]})
gbc_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.925
1,Precision,0.924241
2,Recall,0.925
3,F1 Score,0.923695


In [138]:
print("GBC EVALUATION METRIC:")
print(tabulate(gbc_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

GBC EVALUATION METRIC:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.925    |
+-----------+----------+
| Precision | 0.924241 |
+-----------+----------+
| Recall    | 0.925    |
+-----------+----------+
| F1 Score  | 0.923695 |
+-----------+----------+


### HyperParameter in GradientBoosting

In [139]:
param_Grid = {
    'n_estimators' : [50,100,150],
    'learning_rate' : [0.01,0.1,0.2],
    'max_depth' : [3,5,7],
    'subsample' : [0.8,1.0]
}

In [140]:
grid_Search = GridSearchCV(GradientBoostingClassifier(random_state=42), param_Grid, cv = 3)

In [141]:
grid_Search.fit(x_sm,y_sm)

In [142]:
grid_Search.best_params_

{'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}

In [143]:
gbc1 = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 7,n_estimators = 150, subsample = 0.8)

In [144]:
gbc1.fit(x_sm,y_sm)

In [145]:
gbc1_y_pred = gbc1.predict(x_test)
gbc1_y_pred

array([3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 4, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3],
      dtype=int64)

In [146]:
gbc1_acc_test = accuracy_score(y_test,gbc1_y_pred)
print('ACCURACY SCORE:', gbc1_acc_test)
gbc1_pre = precision_score(y_test,gbc1_y_pred, average = 'weighted')
print('PRECISION SCORE:',gbc1_pre)
gbc1_recall = recall_score(y_test,gbc1_y_pred, average ='weighted')
print('RECALL SCORE:',gbc1_recall)
gbc1_f1 = f1_score(y_test,gbc1_y_pred, average = 'weighted')
print('F1 SCORE:',gbc1_f1)
print('------')
print('GBC CLASSIFICATION REPORT:')
print(classification_report(y_test,gbc1_y_pred))

ACCURACY SCORE: 0.925
PRECISION SCORE: 0.9241816131529077
RECALL SCORE: 0.925
F1 SCORE: 0.9238224986623862
------
GBC CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.89      0.82      0.85        39
           3       0.93      0.97      0.95       175
           4       0.91      0.81      0.86        26

    accuracy                           0.93       240
   macro avg       0.91      0.86      0.89       240
weighted avg       0.92      0.93      0.92       240



In [147]:
pd.crosstab(y_test,gbc1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,32,7,0
3,4,169,2
4,0,5,21


In [148]:
gbc1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [gbc1_acc_test,gbc1_pre,gbc1_recall,gbc1_f1]})
gbc1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.925
1,Precision,0.924182
2,Recall,0.925
3,F1 Score,0.923822


In [149]:
print("GBC EVALUATION METRIC AFTER HYPERPARAMETER TUNING:")
print(tabulate(gbc1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

GBC EVALUATION METRIC AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.925    |
+-----------+----------+
| Precision | 0.924182 |
+-----------+----------+
| Recall    | 0.925    |
+-----------+----------+
| F1 Score  | 0.923822 |
+-----------+----------+


- Gradient Boosting remained stable after tuning, providing strong performance. Its ability to capture complex patterns makes it highly useful for employee performance prediction, and it’s a top model to consider for your task.

# 7. AdaBoost Classifier

In [150]:
from sklearn.ensemble import AdaBoostClassifier

In [151]:
ada = AdaBoostClassifier(n_estimators = 50)

In [152]:
ada.fit(x_sm,y_sm)

###### Prediction

In [153]:
ada_y_train = ada.predict(x_sm)
ada_y_train

array([3, 2, 3, ..., 4, 3, 4], dtype=int64)

In [154]:
ada_y_pred = ada.predict(x_test)
ada_y_pred

array([3, 3, 3, 2, 4, 3, 2, 3, 3, 3, 3, 2, 4, 4, 4, 3, 3, 3, 4, 3, 4, 2,
       2, 3, 3, 2, 4, 3, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 3, 3, 4, 2,
       3, 4, 4, 3, 3, 4, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 4, 3, 3, 4, 2, 2, 3, 3, 2, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 4, 4, 4, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       4, 3, 4, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 4, 4, 3, 3,
       3, 3, 3, 2, 2, 4, 3, 4, 3, 2, 2, 2, 3, 3, 3, 2, 4, 4, 3, 4, 3, 3,
       3, 4, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, 3, 2, 4, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 4,
       4, 3, 2, 4, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 4, 3, 3],
      dtype=int64)

###### Evaluation`

In [155]:
ada_acc_train  = accuracy_score(y_sm,ada_y_train)
ada_acc_train

0.8707677634716261

In [156]:
print(classification_report(y_sm,ada_y_train))

              precision    recall  f1-score   support

           2       0.96      0.96      0.96       699
           3       0.81      0.79      0.80       699
           4       0.84      0.86      0.85       699

    accuracy                           0.87      2097
   macro avg       0.87      0.87      0.87      2097
weighted avg       0.87      0.87      0.87      2097



In [157]:
ada_acc_test = accuracy_score(y_test,ada_y_pred)
print('ACCURACY SCORE:', ada_acc_test)
ada_pre = precision_score(y_test,ada_y_pred, average = 'weighted')
print('PRECISION SCORE:',ada_pre)
ada_recall = recall_score(y_test,ada_y_pred, average ='weighted')
print('RECALL SCORE:',ada_recall)
ada_f1 = f1_score(y_test,ada_y_pred, average = 'weighted')
print('F1 SCORE:',ada_f1)
print('------')
print('KNN CLASSIFICATION REPORT:')
print(classification_report(y_test,ada_y_pred))

ACCURACY SCORE: 0.8083333333333333
PRECISION SCORE: 0.8546160130718954
RECALL SCORE: 0.8083333333333333
F1 SCORE: 0.8229640463634367
------
KNN CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.85      0.85      0.85        39
           3       0.92      0.81      0.86       175
           4       0.42      0.77      0.54        26

    accuracy                           0.81       240
   macro avg       0.73      0.81      0.75       240
weighted avg       0.85      0.81      0.82       240



In [158]:
pd.crosstab(y_test,ada_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,33,6,0
3,6,141,28
4,0,6,20


In [159]:
ada_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [ada_acc_test,ada_pre,ada_recall,ada_f1]})
ada_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.808333
1,Precision,0.854616
2,Recall,0.808333
3,F1 Score,0.822964


In [160]:
print("ADA EVALUATION METRIC:")
print(tabulate(ada_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

ADA EVALUATION METRIC:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.808333 |
+-----------+----------+
| Precision | 0.854616 |
+-----------+----------+
| Recall    | 0.808333 |
+-----------+----------+
| F1 Score  | 0.822964 |
+-----------+----------+


### HyperParameter Tuning in Ada boost

In [161]:
from sklearn.model_selection import RandomizedSearchCV

In [162]:
param_dist = {
    "n_estimators": np.arange(50, 501, 50),  # Number of estimators from 50 to 500
    "learning_rate": np.linspace(0.01, 1.0, 10),  # Learning rate from 0.01 to 1.0
    "base_estimator__max_depth": [1, 2, 3, 4, 5],  # Max depth of base tree
    "base_estimator__min_samples_split": [2, 5, 10],  # Minimum samples to split
    "base_estimator__min_samples_leaf": [1, 2, 5],  # Minimum samples at a leaf
}

In [163]:
base = DecisionTreeClassifier()
ada = AdaBoostClassifier(base_estimator=base, random_state=42)

In [164]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=ada,
    param_distributions=param_dist,
    n_iter=50,  # Number of random combinations to try
    scoring="accuracy",  # Use appropriate scoring metric
    cv=5,  # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

In [165]:
random_search.fit(x_sm, y_sm)

Fitting 5 folds for each of 50 candidates, totalling 250 fits




In [166]:
random_search.best_params_

{'n_estimators': 350,
 'learning_rate': 0.89,
 'base_estimator__min_samples_split': 2,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__max_depth': 5}

In [167]:
base_estimator = DecisionTreeClassifier(
    min_samples_split=5,
    min_samples_leaf=2,
    max_depth=5,
    random_state=42)


In [168]:
ada1 =AdaBoostClassifier(
    base_estimator=base_estimator,
    n_estimators=400,
    learning_rate=0.89,
    random_state=42
)



In [169]:
ada1.fit(x_sm,y_sm)



In [170]:
ada1_y_pred = ada1.predict(x_test)
ada1_y_pred

array([3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 3, 4, 3, 2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3,
       3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [171]:
ada1_acc_test = accuracy_score(y_test,ada1_y_pred)
print('ACCURACY SCORE:', ada1_acc_test)
ada1_pre = precision_score(y_test,ada1_y_pred, average = 'weighted')
print('PRECISION SCORE:',ada1_pre)
ada1_recall = recall_score(y_test,ada1_y_pred, average ='weighted')
print('RECALL SCORE:',ada1_recall)
ada1_f1 = f1_score(y_test,ada1_y_pred, average = 'weighted')
print('F1 SCORE:',ada1_f1)
print('------')
print('ADA CLASSIFICATION REPORT:')
print(classification_report(y_test,ada1_y_pred))

ACCURACY SCORE: 0.9
PRECISION SCORE: 0.8995803140096617
RECALL SCORE: 0.9
F1 SCORE: 0.8976106737717492
------
ADA CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.81      0.74      0.77        39
           3       0.91      0.96      0.94       175
           4       0.95      0.73      0.83        26

    accuracy                           0.90       240
   macro avg       0.89      0.81      0.85       240
weighted avg       0.90      0.90      0.90       240



In [172]:
pd.crosstab(y_test,ada1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,29,10,0
3,6,168,1
4,1,6,19


In [173]:
ada1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [ada1_acc_test,ada1_pre,ada1_recall,ada1_f1]})
ada1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.9
1,Precision,0.89958
2,Recall,0.9
3,F1 Score,0.897611


In [174]:
print("ADA EVALUATION METRIC AFTER HYPERPARAMETER TUNING:")
print(tabulate(ada1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

ADA EVALUATION METRIC AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.9      |
+-----------+----------+
| Precision | 0.89958  |
+-----------+----------+
| Recall    | 0.9      |
+-----------+----------+
| F1 Score  | 0.897611 |
+-----------+----------+


- AdaBoost showed significant improvement after hyperparameter tuning. Its high accuracy and precision after tuning make it a useful model for predicting employee performance, though it still doesn’t quite match the performance of Random Forest or Gradient Boosting.

# 8. Extra Trees Classifier

In [175]:
from sklearn.ensemble import ExtraTreesClassifier

In [176]:
et = ExtraTreesClassifier(n_estimators = 100)

In [177]:
et.fit(x_sm,y_sm)

###### Prediction

In [178]:
et_y_train = et.predict(x_sm)
et_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [179]:
et_y_pred = et.predict(x_test)
et_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

###### Evaluation

In [180]:
et_acc_train  = accuracy_score(y_sm,et_y_train)
et_acc_train

1.0

In [181]:
print(classification_report(y_sm,et_y_train))

              precision    recall  f1-score   support

           2       1.00      1.00      1.00       699
           3       1.00      1.00      1.00       699
           4       1.00      1.00      1.00       699

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



In [182]:
et_acc_test = accuracy_score(y_test,et_y_pred)
print('ACCURACY SCORE:', et_acc_test)
et_pre = precision_score(y_test,et_y_pred, average = 'weighted')
print('PRECISION SCORE:',et_pre)
et_recall = recall_score(y_test,et_y_pred, average ='weighted')
print('RECALL SCORE:',et_recall)
et_f1 = f1_score(y_test,et_y_pred, average = 'weighted')
print('F1 SCORE:',et_f1)
print('------')
print('ET CLASSIFICATION REPORT:')
print(classification_report(y_test,et_y_pred))

ACCURACY SCORE: 0.8416666666666667
PRECISION SCORE: 0.8389960247132378
RECALL SCORE: 0.8416666666666667
F1 SCORE: 0.8385466870357822
------
ET CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.65      0.62      0.63        39
           3       0.88      0.92      0.90       175
           4       0.85      0.65      0.74        26

    accuracy                           0.84       240
   macro avg       0.79      0.73      0.76       240
weighted avg       0.84      0.84      0.84       240



### HyperParameter Tuning in ETC

In [183]:
param_dist = {
    "n_estimators": np.arange(50, 501, 50),  # Number of estimators from 50 to 500
    "max_depth": [None, 10, 20, 30, 40, 50],  # Depth of trees
    "min_samples_split": [2, 5, 10],  # Minimum samples to split a node
    "min_samples_leaf": [1, 2, 5],  # Minimum samples at leaf
    "max_features": ["sqrt", "log2", None],  # Number of features for split
}

In [184]:
etc = ExtraTreesClassifier(random_state=42)

In [185]:
random_search = RandomizedSearchCV(
    estimator=etc,
    param_distributions=param_dist,
    n_iter=50,  # Number of random combinations to try
    scoring="accuracy",  # Scoring metric
    cv=5,  # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all cores
)

In [186]:
random_search.fit(x_sm, y_sm)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [187]:
random_search.best_params_

{'n_estimators': 50,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': None,
 'max_depth': 10}

In [188]:
etc1 = ExtraTreesClassifier(n_estimators = 450,
 min_samples_split = 2,
 min_samples_leaf = 1,
 max_features = 'sqrt',
 max_depth = 30)

In [189]:
etc1.fit(x_sm,y_sm)

In [190]:
etc1_y_pred = etc1.predict(x_test)
etc1_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 2, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3,
       4, 3, 3, 4, 3, 3, 2, 2, 4, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [191]:
etc1_acc_test = accuracy_score(y_test,etc1_y_pred)
print('ACCURACY SCORE:', etc1_acc_test)
etc1_pre = precision_score(y_test,etc1_y_pred, average = 'weighted')
print('PRECISION SCORE:',etc1_pre)
etc1_recall = recall_score(y_test,etc1_y_pred, average ='weighted')
print('RECALL SCORE:',etc1_recall)
etc1_f1 = f1_score(y_test,etc1_y_pred, average = 'weighted')
print('F1 SCORE:',etc1_f1)
print('------')
print('ET CLASSIFICATION REPORT:')
print(classification_report(y_test,etc1_y_pred))

ACCURACY SCORE: 0.8416666666666667
PRECISION SCORE: 0.838521351021351
RECALL SCORE: 0.8416666666666667
F1 SCORE: 0.8388714603049133
------
ET CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.68      0.64      0.66        39
           3       0.88      0.91      0.90       175
           4       0.81      0.65      0.72        26

    accuracy                           0.84       240
   macro avg       0.79      0.74      0.76       240
weighted avg       0.84      0.84      0.84       240



In [192]:
pd.crosstab(y_test,etc1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,25,14,0
3,11,160,4
4,1,8,17


In [193]:
etc1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [etc1_acc_test,etc1_pre,etc1_recall,etc1_f1]})
etc1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.841667
1,Precision,0.838521
2,Recall,0.841667
3,F1 Score,0.838871


In [194]:
print("ETC EVALUATION METRIC AFTER HYPERPARAMETER TUNING:")
print(tabulate(etc1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

ETC EVALUATION METRIC AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.841667 |
+-----------+----------+
| Precision | 0.838521 |
+-----------+----------+
| Recall    | 0.841667 |
+-----------+----------+
| F1 Score  | 0.838871 |
+-----------+----------+


- Extra Trees performed similarly before and after hyperparameter tuning. While it was stable and reasonable, it didn’t show a marked improvement. It is useful, though not the most optimal model for this task.

# 9.Neural Networks (MLP Classifier)

In [195]:
from sklearn.neural_network import MLPClassifier

In [196]:
mlp = MLPClassifier(hidden_layer_sizes =( 100,),max_iter = 500, activation ='relu', solver ='adam')

In [197]:
mlp.fit(x_sm,y_sm)

###### Prediction

In [198]:
mlp_y_train = mlp.predict(x_sm)
mlp_y_train

array([3, 2, 3, ..., 4, 4, 4], dtype=int64)

In [199]:
mlp_y_pred = mlp.predict(x_test)
mlp_y_pred

array([3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 4, 4, 3, 2, 3, 3, 4, 4, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 4, 3, 2, 2, 3, 3, 3, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 2, 2, 3, 2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 2],
      dtype=int64)

###### Evaluation

In [200]:
mlp_acc_train  = accuracy_score(y_sm,mlp_y_train)
mlp_acc_train

1.0

In [201]:
print(classification_report(y_sm,mlp_y_train))

              precision    recall  f1-score   support

           2       1.00      1.00      1.00       699
           3       1.00      1.00      1.00       699
           4       1.00      1.00      1.00       699

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



In [202]:
mlp_acc_test = accuracy_score(y_test,mlp_y_pred)
print('ACCURACY SCORE:', mlp_acc_test)
mlp_pre = precision_score(y_test,mlp_y_pred, average='weighted')
print('PRECISION SCORE:',mlp_pre)
mlp_recall = recall_score(y_test,mlp_y_pred, average ='weighted')
print('RECALL SCORE:',mlp_recall)
mlp_f1 = f1_score(y_test,mlp_y_pred, average = 'weighted')
print('F1 SCORE:',mlp_f1)
print('------')
print('MLP CLASSIFICATION REPORT:')
print(classification_report(y_test,mlp_y_pred))

ACCURACY SCORE: 0.825
PRECISION SCORE: 0.8232299286351472
RECALL SCORE: 0.825
F1 SCORE: 0.824048520923521
------
MLP CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.66      0.64      0.65        39
           3       0.88      0.89      0.89       175
           4       0.68      0.65      0.67        26

    accuracy                           0.82       240
   macro avg       0.74      0.73      0.73       240
weighted avg       0.82      0.82      0.82       240



### HyperParameter Tuning in MLP Classifier

In [203]:
from scipy.stats import uniform

# Define the hyperparameters to sample from
param_dist = {
    'hidden_layer_sizes': [(100,), (50, 50), (200, 100)],
    'activation': ['logistic', 'relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate_init': uniform(0.001, 0.1),  # Uniform distribution for learning rate
    'max_iter': [1000, 2000],
}


In [204]:
random_search = RandomizedSearchCV(mlp, param_distributions=param_dist, n_iter=100, cv=3, n_jobs=-1, verbose=2)


In [205]:
random_search.fit(x_sm,y_sm)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [206]:
random_search.best_params_

{'activation': 'tanh',
 'hidden_layer_sizes': (100,),
 'learning_rate_init': 0.03703110064316854,
 'max_iter': 2000,
 'solver': 'sgd'}

In [207]:
mlp1 = MLPClassifier(hidden_layer_sizes =(200,100),max_iter = 1000, activation ='tanh', solver ='adam', learning_rate_init = 0.010339203116553185)

In [208]:
mlp1.fit(x_sm,y_sm)

In [209]:
mlp1_y_pred = mlp1.predict(x_test)
mlp1_y_pred

array([3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2,
       3, 3, 3, 4, 4, 3, 2, 3, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 4, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 3, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 2, 3, 3, 3, 2, 2, 2, 3,
       4, 3, 2, 4, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2],
      dtype=int64)

In [210]:
mlp1_acc_test = accuracy_score(y_test,mlp1_y_pred)
print('ACCURACY SCORE:', mlp1_acc_test)
mlp1_pre = precision_score(y_test,mlp1_y_pred, average='weighted')
print('PRECISION SCORE:',mlp1_pre)
mlp1_recall = recall_score(y_test,mlp1_y_pred, average ='weighted')
print('RECALL SCORE:',mlp1_recall)
mlp1_f1 = f1_score(y_test,mlp1_y_pred, average = 'weighted')
print('F1 SCORE:',mlp1_f1)
print('------')
print('MLP CLASSIFICATION REPORT:')
print(classification_report(y_test,mlp1_y_pred))

ACCURACY SCORE: 0.8541666666666666
PRECISION SCORE: 0.8510017603726574
RECALL SCORE: 0.8541666666666666
F1 SCORE: 0.851802931166823
------
MLP CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.79      0.69      0.74        39
           3       0.90      0.93      0.91       175
           4       0.64      0.62      0.63        26

    accuracy                           0.85       240
   macro avg       0.78      0.74      0.76       240
weighted avg       0.85      0.85      0.85       240



In [211]:
pd.crosstab(y_test,mlp1_y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,27,11,1
3,5,162,8
4,2,8,16


In [212]:
mlp1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [mlp1_acc_test,mlp1_pre,mlp1_recall,mlp1_f1]})
mlp1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.854167
1,Precision,0.851002
2,Recall,0.854167
3,F1 Score,0.851803


In [213]:
print("MLP Classifier EVALUATION METRIC AFTER HYPERPARAMETER TUNING:")
print(tabulate(mlp1_Metrics,headers ='keys',tablefmt= 'grid',showindex = False))

MLP Classifier EVALUATION METRIC AFTER HYPERPARAMETER TUNING:
+-----------+----------+
| METRIC    |    VALUE |
| Accuracy  | 0.854167 |
+-----------+----------+
| Precision | 0.851002 |
+-----------+----------+
| Recall    | 0.854167 |
+-----------+----------+
| F1 Score  | 0.851803 |
+-----------+----------+


- MLP improved after hyperparameter tuning but still didn’t reach the performance level of Random Forest or Gradient Boosting. It can model complex relationships but requires more tuning to perform well consistently. Moderately useful for this task.

# 10. CatBoost

In [214]:
import catboost
from catboost import CatBoostClassifier

In [215]:
cat = CatBoostClassifier(iterations = 100, depth = 10,learning_rate = 0.1, loss_function='MultiClass')

In [216]:
cat.fit(x_sm,y_sm)

0:	learn: 1.0069639	total: 229ms	remaining: 22.7s
1:	learn: 0.9321858	total: 308ms	remaining: 15.1s
2:	learn: 0.8426889	total: 318ms	remaining: 10.3s
3:	learn: 0.7756554	total: 356ms	remaining: 8.55s
4:	learn: 0.7261398	total: 434ms	remaining: 8.24s
5:	learn: 0.6772671	total: 510ms	remaining: 7.99s
6:	learn: 0.6315057	total: 592ms	remaining: 7.87s
7:	learn: 0.6002023	total: 671ms	remaining: 7.72s
8:	learn: 0.5630311	total: 710ms	remaining: 7.18s
9:	learn: 0.5297188	total: 793ms	remaining: 7.14s
10:	learn: 0.5009954	total: 872ms	remaining: 7.05s
11:	learn: 0.4770948	total: 946ms	remaining: 6.93s
12:	learn: 0.4529819	total: 1.02s	remaining: 6.83s
13:	learn: 0.4308393	total: 1.1s	remaining: 6.75s
14:	learn: 0.4118319	total: 1.17s	remaining: 6.65s
15:	learn: 0.3944733	total: 1.24s	remaining: 6.53s
16:	learn: 0.3776617	total: 1.32s	remaining: 6.46s
17:	learn: 0.3614513	total: 1.4s	remaining: 6.36s
18:	learn: 0.3493163	total: 1.47s	remaining: 6.27s
19:	learn: 0.3362212	total: 1.55s	remaining

<catboost.core.CatBoostClassifier at 0x1f0dde8cbd0>

###### Prediction

In [217]:
cat_y_train = cat.predict(x_sm)
cat_y_train

array([[3],
       [2],
       [3],
       ...,
       [4],
       [4],
       [4]], dtype=int64)

In [218]:
cat_y_pred = cat.predict(x_test)
cat_y_pred

array([[3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [2],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [3],
       [2],
       [4],
       [3],
       [2],
       [3],
       [3],
       [4],
       [3],
       [3],
       [3],
       [4],
       [3],
       [3],
       [4],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [4],
       [4],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [4],
       [2],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [4],
       [4],
       [3],
       [3],
       [2],
       [2],
       [3],
       [3],
       [4],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [2],
       [3],
    

###### Evaluation

In [219]:
cat_acc_train  = accuracy_score(y_sm,cat_y_train)
cat_acc_train

0.994754411063424

In [220]:
print(classification_report(y_sm,cat_y_train))

              precision    recall  f1-score   support

           2       0.99      1.00      0.99       699
           3       1.00      0.99      0.99       699
           4       1.00      1.00      1.00       699

    accuracy                           0.99      2097
   macro avg       0.99      0.99      0.99      2097
weighted avg       0.99      0.99      0.99      2097



In [221]:
cat_acc_test = accuracy_score(y_test,cat_y_pred)
print('ACCURACY SCORE:', cat_acc_test)
cat_pre = precision_score(y_test,cat_y_pred, average = 'weighted')
print('PRECISION SCORE:',cat_pre)
cat_recall = recall_score(y_test,cat_y_pred, average ='weighted')
print('RECALL SCORE:',cat_recall)
cat_f1 = f1_score(y_test,cat_y_pred, average = 'weighted')
print('F1 SCORE:',cat_f1)
print('------')
print('CAT CLASSIFICATION REPORT:')
print(classification_report(y_test,cat_y_pred))

ACCURACY SCORE: 0.9125
PRECISION SCORE: 0.9157143659515486
RECALL SCORE: 0.9125
F1 SCORE: 0.9132502093760924
------
CAT CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.77      0.87      0.82        39
           3       0.95      0.94      0.94       175
           4       0.91      0.81      0.86        26

    accuracy                           0.91       240
   macro avg       0.88      0.87      0.87       240
weighted avg       0.92      0.91      0.91       240



### HyperParameter Tuning in CAT

In [222]:
from scipy.stats import uniform, randint


param_dist = {
    'iterations': randint(100,500),  # Number of boosting iterations (trees)
    'depth': randint(4, 8),           # Depth of trees
    'learning_rate': uniform(0.01, 0.1),  # Learning rate
    'l2_leaf_reg': uniform(1, 5),     # L2 regularization
    'bagging_temperature': uniform(0, 0.5),  # Bagging temperature (randomness)
    'random_strength': uniform(0, 1),  # Randomness strength for splits
    'max_bin': randint(100, 150),      # Maximum number of bins for discretization
}


In [223]:
catboost = CatBoostClassifier(random_state=42, verbose=0)


In [224]:
random_search = RandomizedSearchCV(catboost, param_distributions=param_dist, 
                                   n_iter=50, cv=3, n_jobs=-1, verbose=2)


In [225]:
random_search.fit(x_sm,y_sm)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [226]:
random_search.best_params_

{'bagging_temperature': 0.21589464104345274,
 'depth': 7,
 'iterations': 483,
 'l2_leaf_reg': 4.379875290695939,
 'learning_rate': 0.09684466378369215,
 'max_bin': 130,
 'random_strength': 0.3007418991720453}

In [227]:
cat1 = CatBoostClassifier(max_bin = 119,bagging_temperature =  0.21163672536389072,iterations = 460, depth = 7,learning_rate = 0.07184799167523831, l2_leaf_reg = 4.096709951477855, random_strength = 0.8114774294373885)

In [228]:
cat1.fit(x_sm,y_sm)

0:	learn: 1.0225575	total: 6.53ms	remaining: 3s
1:	learn: 0.9406048	total: 13.5ms	remaining: 3.09s
2:	learn: 0.8765958	total: 20.8ms	remaining: 3.17s
3:	learn: 0.8187020	total: 24.8ms	remaining: 2.83s
4:	learn: 0.7694398	total: 31.5ms	remaining: 2.86s
5:	learn: 0.7269608	total: 37.5ms	remaining: 2.84s
6:	learn: 0.6893333	total: 44.8ms	remaining: 2.9s
7:	learn: 0.6548076	total: 51.9ms	remaining: 2.93s
8:	learn: 0.6218393	total: 57.7ms	remaining: 2.89s
9:	learn: 0.5933171	total: 63.5ms	remaining: 2.86s
10:	learn: 0.5672741	total: 69.7ms	remaining: 2.85s
11:	learn: 0.5450344	total: 76.4ms	remaining: 2.85s
12:	learn: 0.5236447	total: 82.7ms	remaining: 2.84s
13:	learn: 0.5020506	total: 86.9ms	remaining: 2.77s
14:	learn: 0.4817611	total: 94.2ms	remaining: 2.79s
15:	learn: 0.4657535	total: 101ms	remaining: 2.8s
16:	learn: 0.4506257	total: 107ms	remaining: 2.79s
17:	learn: 0.4326326	total: 113ms	remaining: 2.77s
18:	learn: 0.4189895	total: 119ms	remaining: 2.76s
19:	learn: 0.4045135	total: 126

184:	learn: 0.0760485	total: 1.02s	remaining: 1.52s
185:	learn: 0.0757683	total: 1.03s	remaining: 1.51s
186:	learn: 0.0754505	total: 1.03s	remaining: 1.5s
187:	learn: 0.0753784	total: 1.04s	remaining: 1.5s
188:	learn: 0.0748015	total: 1.04s	remaining: 1.5s
189:	learn: 0.0743541	total: 1.05s	remaining: 1.49s
190:	learn: 0.0738059	total: 1.05s	remaining: 1.48s
191:	learn: 0.0733320	total: 1.06s	remaining: 1.48s
192:	learn: 0.0731518	total: 1.06s	remaining: 1.47s
193:	learn: 0.0730808	total: 1.07s	remaining: 1.47s
194:	learn: 0.0726525	total: 1.08s	remaining: 1.46s
195:	learn: 0.0723183	total: 1.08s	remaining: 1.46s
196:	learn: 0.0717524	total: 1.09s	remaining: 1.45s
197:	learn: 0.0715455	total: 1.09s	remaining: 1.45s
198:	learn: 0.0710782	total: 1.1s	remaining: 1.44s
199:	learn: 0.0705246	total: 1.1s	remaining: 1.43s
200:	learn: 0.0701496	total: 1.11s	remaining: 1.43s
201:	learn: 0.0697860	total: 1.11s	remaining: 1.42s
202:	learn: 0.0697274	total: 1.12s	remaining: 1.42s
203:	learn: 0.069

349:	learn: 0.0401307	total: 1.85s	remaining: 582ms
350:	learn: 0.0401127	total: 1.86s	remaining: 577ms
351:	learn: 0.0399580	total: 1.86s	remaining: 572ms
352:	learn: 0.0398531	total: 1.87s	remaining: 566ms
353:	learn: 0.0398179	total: 1.87s	remaining: 561ms
354:	learn: 0.0397178	total: 1.88s	remaining: 556ms
355:	learn: 0.0397008	total: 1.88s	remaining: 550ms
356:	learn: 0.0396017	total: 1.89s	remaining: 545ms
357:	learn: 0.0394540	total: 1.89s	remaining: 540ms
358:	learn: 0.0392898	total: 1.9s	remaining: 535ms
359:	learn: 0.0391445	total: 1.9s	remaining: 529ms
360:	learn: 0.0389864	total: 1.91s	remaining: 524ms
361:	learn: 0.0388939	total: 1.91s	remaining: 518ms
362:	learn: 0.0387883	total: 1.92s	remaining: 513ms
363:	learn: 0.0387344	total: 1.92s	remaining: 508ms
364:	learn: 0.0387178	total: 1.93s	remaining: 502ms
365:	learn: 0.0387017	total: 1.93s	remaining: 497ms
366:	learn: 0.0385403	total: 1.94s	remaining: 491ms
367:	learn: 0.0384820	total: 1.94s	remaining: 486ms
368:	learn: 0.

<catboost.core.CatBoostClassifier at 0x1f0de0256d0>

In [229]:
cat1_y_pred = cat1.predict(x_test)
cat1_y_pred

array([[3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [2],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [3],
       [3],
       [4],
       [3],
       [3],
       [3],
       [3],
       [4],
       [3],
       [3],
       [3],
       [4],
       [3],
       [3],
       [4],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [4],
       [4],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
       [4],
       [2],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [4],
       [4],
       [3],
       [3],
       [3],
       [2],
       [3],
       [3],
       [4],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [2],
       [3],
    

In [230]:
cat1_y_pred.shape

(240, 1)

In [231]:
y_test.shape

(240,)

In [232]:
cat1_acc_test = accuracy_score(y_test,cat1_y_pred)
print('ACCURACY SCORE:', cat1_acc_test)
cat1_pre = precision_score(y_test,cat1_y_pred, average = 'weighted')
print('PRECISION SCORE:',cat1_pre)
cat1_recall = recall_score(y_test,cat1_y_pred, average ='weighted')
print('RECALL SCORE:',cat1_recall)
cat1_f1 = f1_score(y_test,cat1_y_pred, average = 'weighted')
print('F1 SCORE:',cat1_f1)
print('------')
print('CAT CLASSIFICATION REPORT:')
print(classification_report(y_test,cat1_y_pred))

ACCURACY SCORE: 0.9125
PRECISION SCORE: 0.9119762241054613
RECALL SCORE: 0.9125
F1 SCORE: 0.9120700757575758
------
CAT CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.82      0.82      0.82        39
           3       0.94      0.95      0.94       175
           4       0.88      0.81      0.84        26

    accuracy                           0.91       240
   macro avg       0.88      0.86      0.87       240
weighted avg       0.91      0.91      0.91       240



In [233]:
cat1_Metrics =pd.DataFrame({'METRIC': ['Accuracy','Precision','Recall','F1 Score'],'VALUE' : [cat1_acc_test,cat1_pre,cat1_recall,cat1_f1]})
cat1_Metrics

Unnamed: 0,METRIC,VALUE
0,Accuracy,0.9125
1,Precision,0.911976
2,Recall,0.9125
3,F1 Score,0.91207


- CatBoost performed consistently well, even before hyperparameter tuning, and showed only slight improvement after tuning. It is highly useful for predicting employee performance due to its ability to handle categorical features and produce stable results.

# 11. Linear Discriminant Analysis(LDA)

In [234]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [235]:
lda = LinearDiscriminantAnalysis()

In [236]:
lda.fit(x_sm,y_sm)

###### Prediction

In [237]:
lda_y_train = lda.predict(x_sm)
lda_y_train

array([3, 3, 3, ..., 4, 4, 4], dtype=int64)

In [238]:
lda_y_pred = lda.predict(x_test)
lda_y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 2,
       2, 3, 2, 4, 4, 3, 2, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 2,
       3, 4, 4, 3, 4, 3, 3, 2, 3, 4, 2, 3, 3, 3, 3, 3, 2, 4, 4, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 2, 2, 3, 3, 3, 2, 4,
       3, 3, 2, 2, 4, 3, 3, 3, 3, 3, 3, 2, 4, 3, 4, 3, 3, 3, 2, 2, 2, 4,
       4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 4, 3, 2, 2, 3, 2, 4, 3, 4, 3, 3,
       3, 4, 3, 4, 2, 4, 2, 4, 3, 2, 2, 2, 3, 4, 3, 2, 3, 4, 2, 3, 3, 4,
       3, 4, 2, 2, 3, 4, 3, 3, 3, 2, 4, 3, 3, 4, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 2, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 4, 3, 3, 3, 3, 4, 2, 2, 2, 2, 4,
       4, 3, 2, 4, 3, 3, 2, 2, 2, 3, 3, 4, 3, 3, 3, 2, 3, 4, 3, 2],
      dtype=int64)

###### Evaluation

In [239]:
lda_acc_train  = accuracy_score(y_sm,lda_y_train)
lda_acc_train

0.8688602765855985

In [240]:
print(classification_report(y_sm,lda_y_train))

              precision    recall  f1-score   support

           2       0.86      0.93      0.89       699
           3       0.89      0.77      0.82       699
           4       0.86      0.91      0.89       699

    accuracy                           0.87      2097
   macro avg       0.87      0.87      0.87      2097
weighted avg       0.87      0.87      0.87      2097



In [241]:
lda_acc_test = accuracy_score(y_test,lda_y_pred)
print('ACCURACY SCORE:', lda_acc_test)
lda_pre = precision_score(y_test,lda_y_pred, average = 'weighted')
print('PRECISION SCORE:',lda_pre)
lda_recall = recall_score(y_test,lda_y_pred, average ='weighted')
print('RECALL SCORE:',lda_recall)
lda_f1 = f1_score(y_test,lda_y_pred, average = 'weighted')
print('F1 SCORE:',lda_f1)
print('------')
print('LDA CLASSIFICATION REPORT:')
print(classification_report(y_test,lda_y_pred))

ACCURACY SCORE: 0.7333333333333333
PRECISION SCORE: 0.792591972359816
RECALL SCORE: 0.7333333333333333
F1 SCORE: 0.7493256797699129
------
LDA CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.55      0.72      0.62        39
           3       0.90      0.73      0.81       175
           4       0.43      0.77      0.55        26

    accuracy                           0.73       240
   macro avg       0.63      0.74      0.66       240
weighted avg       0.79      0.73      0.75       240



- LDA's performance was subpar compared to other models, and it didn't improve much after hyperparameter tuning. Its linear nature makes it unsuitable for complex relationships in the data, so it is not recommended for employee performance prediction.

# 12.  Quadratic Discriminant Analysis(QDA)

In [242]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [243]:
qda = QuadraticDiscriminantAnalysis()

In [244]:
qda.fit(x_sm,y_sm)

###### Prediction

In [245]:
qda_y_train = qda.predict(x_sm)
qda_y_train

array([3, 3, 3, ..., 4, 4, 4], dtype=int64)

In [246]:
qda_y_pred = qda.predict(x_test)
qda_y_pred

array([3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 4, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3,
       3, 4, 4, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 4,
       3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 4,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 4, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3,
       4, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3],
      dtype=int64)

###### Evaluation

In [247]:
qda_acc_train  = accuracy_score(y_sm,qda_y_train)
qda_acc_train

0.9246542680019075

In [248]:
print(classification_report(y_sm,qda_y_train))

              precision    recall  f1-score   support

           2       0.97      0.91      0.94       699
           3       0.86      0.93      0.89       699
           4       0.95      0.94      0.94       699

    accuracy                           0.92      2097
   macro avg       0.93      0.92      0.93      2097
weighted avg       0.93      0.92      0.93      2097



In [249]:
qda_acc_test = accuracy_score(y_test,qda_y_pred)
print('ACCURACY SCORE:', qda_acc_test)
qda_pre = precision_score(y_test,qda_y_pred, average = 'weighted')
print('PRECISION SCORE:',qda_pre)
qda_recall = recall_score(y_test,qda_y_pred, average ='weighted')
print('RECALL SCORE:',qda_recall)
qda_f1 = f1_score(y_test,qda_y_pred, average = 'weighted')
print('F1 SCORE:',qda_f1)
print('------')
print('QDA CLASSIFICATION REPORT:')
print(classification_report(y_test,qda_y_pred))

ACCURACY SCORE: 0.8041666666666667
PRECISION SCORE: 0.8045608974358974
RECALL SCORE: 0.8041666666666667
F1 SCORE: 0.7916347054931302
------
QDA CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           2       0.85      0.44      0.58        39
           3       0.83      0.93      0.88       175
           4       0.56      0.54      0.55        26

    accuracy                           0.80       240
   macro avg       0.75      0.63      0.67       240
weighted avg       0.80      0.80      0.79       240




- QDA showed similar results to LDA, with accuracy and other metrics falling short compared to other models. It struggled to capture the complexity of employee performance data, making it not useful for this task.

# Conclusion

- 1. **Random Forest** : The top performer, offering strong generalization and accuracy.

- 2. **Gradient Boosting** : A close contender with consistently high performance.

In [250]:
#Saving best trained model using joblib

In [251]:
joblib.dump(rf,"random_forest_model.pkl")

['random_forest_model.pkl']

In [252]:
joblib.dump(gbc1,"gradient_boosting_model.pkl")

['gradient_boosting_model.pkl']

In [253]:
#saving predictions

In [254]:
# Create a DataFrame to store the predictions of all models
predictions_data = pd.DataFrame({
    'True Labels': y_test,  # True labels for comparison
    'Random Forest' : rf_y_pred,
    'Gradient Boosting': gbc1_y_pred
})

In [255]:
predictions_data.to_csv('Predictions_data.csv',index = False)

In [256]:
joblib.dump(predictions_data,'Predictions.pkl')

['Predictions.pkl']