Pham Nhat Duc - 164630

In [None]:
%pip install numpy pandas scikit-learn

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder

1. Load and Preprocess the Data

In [2]:
data = pd.read_csv('cleaned_data.csv')

print(data.shape)
# Preview the first 10 rows
data

(12259, 10)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157.000000,3.000000,0,1,0,sales,low
1,0.80,0.86,5,262.000000,6.000000,0,1,0,sales,medium
2,0.11,0.88,7,272.000000,4.000000,0,1,0,sales,medium
3,0.72,0.87,5,223.000000,5.000000,0,1,0,sales,low
4,0.37,0.52,2,200.511732,3.380048,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
12254,0.40,0.47,2,128.000000,3.000000,0,1,0,sales,medium
12255,0.43,0.46,2,157.000000,3.000000,0,1,0,sales,medium
12256,0.89,0.88,5,228.000000,5.000000,1,1,0,support,low
12257,0.76,0.83,6,293.000000,6.000000,0,1,0,support,low


In [3]:
# Convert categorical variables to numerical
lbl_encoder = LabelEncoder()
data['salary'] = lbl_encoder.fit_transform(data['salary'])
data['department'] = lbl_encoder.fit_transform(data['department'])
print(data.dtypes)

# Define features X and target Y
X = data.drop(columns='left')
Y = data['left']

data

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours     float64
time_spend_company       float64
work_accident              int64
left                       int64
promotion_last_5years      int64
department                 int64
salary                     int64
dtype: object


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157.000000,3.000000,0,1,0,7,1
1,0.80,0.86,5,262.000000,6.000000,0,1,0,7,2
2,0.11,0.88,7,272.000000,4.000000,0,1,0,7,2
3,0.72,0.87,5,223.000000,5.000000,0,1,0,7,1
4,0.37,0.52,2,200.511732,3.380048,0,1,0,7,1
...,...,...,...,...,...,...,...,...,...,...
12254,0.40,0.47,2,128.000000,3.000000,0,1,0,7,2
12255,0.43,0.46,2,157.000000,3.000000,0,1,0,7,2
12256,0.89,0.88,5,228.000000,5.000000,1,1,0,8,1
12257,0.76,0.83,6,293.000000,6.000000,0,1,0,8,1


2. Train-Test Split

Confusion matrix, precision, and F1 score of test set for each case

In [4]:
for test_size in [0.15, 0.25, 0.35]:
    # Improves measurement of the case 65/35 split
    class_weight = None if test_size == 0.15 else 'balanced'
    classifier = SGDClassifier(class_weight=class_weight, random_state=42)

    print(f"Evaluating for {int((1 - test_size)*100)}/{int(test_size*100)} split:")
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

    # Train the classifier on the entire training set
    classifier.fit(X_train, Y_train)

    # Predict on the test set
    Y_test_pred = classifier.predict(X_test)

    # Compute metrics for the test set
    cm = confusion_matrix(Y_test, Y_test_pred)
    precision = precision_score(Y_test, Y_test_pred)
    f1 = f1_score(Y_test, Y_test_pred)

    # Display results
    print(f"Confusion Matrix:\n{cm}")
    print(f"Precision on test set: {precision:.4f}")
    print(f"F1 Score on test set: {f1:.4f}")
    print()

Evaluating for 85/15 split:
Confusion Matrix:
[[1468   61]
 [ 262   48]]
Precision on test set: 0.4404
F1 Score on test set: 0.2291

Evaluating for 75/25 split:
Confusion Matrix:
[[2273  261]
 [ 414  117]]
Precision on test set: 0.3095
F1 Score on test set: 0.2574

Evaluating for 65/35 split:
Confusion Matrix:
[[2241 1305]
 [ 119  626]]
Precision on test set: 0.3242
F1 Score on test set: 0.4679



|Split|Precision|F1 Score|
| - | - | - |
|85/15 | 0.4404 | 0.2291 |
|75/25 | 0.3095 | 0.2574 |
|65/35 | 0.3242 | 0.4679 |

3. K-Fold Cross-Validation with 5 folds

In [5]:
# Convert X and Y to numpy arrays for KFold
X_arr = np.array(X)
Y_arr = np.array(Y)

fold = 1
precisions = []
f1_scores = []
classifier = SGDClassifier(class_weight='balanced', random_state=42)
KF = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in KF.split(X, Y):
    X_train, X_test = X_arr[train_index], X_arr[test_index]
    Y_train, Y_test = Y_arr[train_index], Y_arr[test_index]

    # Train the classifier on the training set
    classifier.fit(X_train, Y_train)

    # Predict on the test set
    Y_test_pred = classifier.predict(X_test)

    # Compute metrics for the test set
    cm = confusion_matrix(Y_test, Y_test_pred)
    precision = precision_score(Y_test, Y_test_pred)
    f1 = f1_score(Y_test, Y_test_pred)
    precisions.append(precision)
    f1_scores.append(f1)

    # Display results
    print(f"\nFold {fold}:")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Precision (Test set): {precision:.4f}")
    print(f"F1 Score (Test set): {f1:.4f}")
    fold += 1

# Display average metrics across all folds
avg_precision = np.mean(precisions)
avg_f1 = np.mean(f1_scores)
print("\nAverage metrics across all folds:")
print(f"\nAverage Precision (K-Fold): {avg_precision:.4f}")
print(f"Average F1 Score (K-Fold): {avg_f1:.4f}")


Fold 1:
Confusion Matrix:
[[1974   55]
 [ 423    0]]
Precision (Test set): 0.0000
F1 Score (Test set): 0.0000

Fold 2:
Confusion Matrix:
[[1661  368]
 [ 186  237]]
Precision (Test set): 0.3917
F1 Score (Test set): 0.4611

Fold 3:
Confusion Matrix:
[[1794  235]
 [ 321  102]]
Precision (Test set): 0.3027
F1 Score (Test set): 0.2684

Fold 4:
Confusion Matrix:
[[1829  200]
 [ 373   50]]
Precision (Test set): 0.2000
F1 Score (Test set): 0.1486

Fold 5:
Confusion Matrix:
[[1360  668]
 [  67  356]]
Precision (Test set): 0.3477
F1 Score (Test set): 0.4921

Average metrics across all folds:

Average Precision (K-Fold): 0.2484
Average F1 Score (K-Fold): 0.2740


|Fold|Precision|F1 Score|
| - | - | - |
| 1 | 0.0000 | 0.0000 |
| 2 | 0.3917 | 0.4611 |
| 3 | 0.3027 | 0.2684 |
| 4 | 0.2000 | 0.1486 |
| 5 | 0.3477 | 0.4921 |
|Avg| 0.2484 | 0.2740 |