Pham Nhat Duc - 164630

In [1]:
%pip install numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder


1. Load and Preprocess the Data

In [3]:
data = pd.read_csv('cleaned_data.csv')

print(data.shape)
# Preview the first 10 rows
data.head(10)

(12259, 10)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157.0,3.0,0,1,0,sales,low
1,0.8,0.86,5,262.0,6.0,0,1,0,sales,medium
2,0.11,0.88,7,272.0,4.0,0,1,0,sales,medium
3,0.72,0.87,5,223.0,5.0,0,1,0,sales,low
4,0.37,0.52,2,200.511732,3.380048,0,1,0,sales,low
5,0.41,0.5,2,200.511732,3.380048,0,1,0,sales,low
6,0.1,0.77,6,247.0,4.0,0,1,0,sales,low
7,0.92,0.85,5,259.0,5.0,0,1,0,sales,low
8,0.89,1.0,5,224.0,5.0,0,1,0,sales,low
9,0.42,0.53,2,142.0,3.0,0,1,0,sales,low


In [4]:
# Convert categorical variables to numerical
salary_encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
data[['salary']] = salary_encoder.fit_transform(data[['salary']])
department_encoder = OneHotEncoder(sparse_output=False)
data['department'] = department_encoder.fit_transform(data[['department']])

# Ensure all columns are numerical
print(data.dtypes)

# Define features (X) and target (y)
X = data.drop(columns='left')
Y = data['left']
data

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours     float64
time_spend_company       float64
work_accident              int64
left                       int64
promotion_last_5years      int64
department               float64
salary                   float64
dtype: object


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157.000000,3.000000,0,1,0,0.0,0.0
1,0.80,0.86,5,262.000000,6.000000,0,1,0,0.0,1.0
2,0.11,0.88,7,272.000000,4.000000,0,1,0,0.0,1.0
3,0.72,0.87,5,223.000000,5.000000,0,1,0,0.0,0.0
4,0.37,0.52,2,200.511732,3.380048,0,1,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
12254,0.40,0.47,2,128.000000,3.000000,0,1,0,0.0,1.0
12255,0.43,0.46,2,157.000000,3.000000,0,1,0,0.0,1.0
12256,0.89,0.88,5,228.000000,5.000000,1,1,0,0.0,0.0
12257,0.76,0.83,6,293.000000,6.000000,0,1,0,0.0,0.0


2. Train-Test Split

Initialize classifier with a `random_state`

In [5]:
classifier = SGDClassifier(random_state=42)

Confusion matrix, precision, and F1 score of test set for each case

In [6]:
for test_size in [0.15, 0.25, 0.35]:
    print(f"\nEvaluating for {int((1 - test_size)*100)}/{int(test_size*100)} split:")
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

    # Train the classifier on the entire training set
    classifier.fit(X_train, Y_train)

    # Predict on the test set
    Y_test_pred = classifier.predict(X_test)

    # Compute metrics for the test set
    cm = confusion_matrix(Y_test, Y_test_pred)
    precision = precision_score(Y_test, Y_test_pred)
    f1 = f1_score(Y_test, Y_test_pred)

    # Display results
    print(f"Confusion Matrix:\n{cm}")
    print(f"Precision (Test Set): {precision:.4f}")
    print(f"F1 Score (Test Set): {f1:.4f}")


Evaluating for 85/15 split:
Confusion Matrix:
[[1529    0]
 [ 310    0]]
Precision (Test Set): 0.0000
F1 Score (Test Set): 0.0000

Evaluating for 75/25 split:
Confusion Matrix:
[[1344 1190]
 [ 217  314]]
Precision (Test Set): 0.2088
F1 Score (Test Set): 0.3086

Evaluating for 65/35 split:
Confusion Matrix:
[[3536   10]
 [ 745    0]]
Precision (Test Set): 0.0000
F1 Score (Test Set): 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


3. K-Fold Cross-Validation with 5 folds

In [7]:
# Convert X and Y to numpy arrays for KFold
X_arr = np.array(X)
Y_arr = np.array(Y)

fold = 1
precisions = []
f1_scores = []
KF = KFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in KF.split(X):

    X_train, X_test = X_arr[train_index], X_arr[test_index]
    Y_train, Y_test = Y_arr[train_index], Y_arr[test_index]

    # Train the classifier on the entire training set
    classifier.fit(X_train, Y_train)

    # Predict on the test set
    Y_test_pred = classifier.predict(X_test)

    # Compute metrics for the test set
    cm = confusion_matrix(Y_test, Y_test_pred)
    precision = precision_score(Y_test, Y_test_pred)
    f1 = f1_score(Y_test, Y_test_pred)
    precisions.append(precision)
    f1_scores.append(f1)

    # Display results
    print(f"\nFold {fold}:")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Precision (Test Set): {precision:.4f}")
    print(f"F1 Score (Test Set): {f1:.4f}")
    fold += 1

# Display average metrics across all folds
avg_precision = np.mean(precisions)
avg_f1 = np.mean(f1_scores)
print("\nAverage metrics across all folds:")
print(f"\nAverage Precision (KFold): {avg_precision:.4f}")
print(f"Average F1 Score (KFold): {avg_f1:.4f}")


Fold 1:
Confusion Matrix:
[[1779  245]
 [ 252  176]]
Precision (Test Set): 0.4181
F1 Score (Test Set): 0.4146

Fold 2:
Confusion Matrix:
[[2034    2]
 [ 416    0]]
Precision (Test Set): 0.0000
F1 Score (Test Set): 0.0000

Fold 3:
Confusion Matrix:
[[ 387 1620]
 [  29  416]]
Precision (Test Set): 0.2043
F1 Score (Test Set): 0.3353

Fold 4:
Confusion Matrix:
[[2034   11]
 [ 407    0]]
Precision (Test Set): 0.0000
F1 Score (Test Set): 0.0000

Fold 5:
Confusion Matrix:
[[1145  887]
 [ 148  271]]
Precision (Test Set): 0.2340
F1 Score (Test Set): 0.3437

Average metrics across all folds:

Average Precision (KFold): 0.1713
Average F1 Score (KFold): 0.2187
