In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

In [3]:
combined_data = pd.read_csv('../data/cleaned/combined_data.csv')

In [34]:
# Separate features (X) and target variable (y)
X = combined_data.drop('attrition', axis=1)
y = combined_data['attrition']

# Identify categorical columns to be encoded
columns_to_encode = ['business_travel', 'department', 'education_field', 'gender', 'job_role', 'marital_status', 'over_time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the transformer for categorical columns (OneHotEncoder)
categorical_transformer = OneHotEncoder(drop='first')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, columns_to_encode)
    ])

# Apply one-hot encoding to training and test sets
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Base KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Combine preprocessing with a KNN model in a pipeline
base_model = Pipeline(steps=[
    ('classifier', knn_model)  
])

# Fit the base model on the training data
base_model.fit(X_train_encoded, y_train)

# Evaluate the performance of the base model
print("Accuracy (Unbalanced):", accuracy_score(y_test, base_model.predict(X_test_encoded)))
print("\nClassification Report (Unbalanced):\n", classification_report(y_test, base_model.predict(X_test_encoded)))
print("\nConfusion Matrix (Unbalanced):\n", confusion_matrix(y_test, base_model.predict(X_test_encoded)))

# Balancing Techniques
for method_name, balancing_method in balancing_methods:
    print(f"\nPerformance after {method_name}:")
    
    # Apply balancing method to the training set
    X_train_resampled, y_train_resampled = balancing_method.fit_resample(X_train_encoded, y_train)

    # Create a new instance of the KNN model
    knn_model_resampled = KNeighborsClassifier(n_neighbors=5)
    
    # Combine preprocessing with the resampled KNN model in a pipeline
    model_resampled = Pipeline(steps=[
        ('classifier', knn_model_resampled)  
    ])

    # Fit the model on the resampled data
    model_resampled.fit(X_train_resampled, y_train_resampled)

    # Evaluate the performance
    print(f"Accuracy ({method_name}):", accuracy_score(y_test, model_resampled.predict(X_test_encoded)))
    print(f"Classification Report ({method_name}):\n", classification_report(y_test, model_resampled.predict(X_test_encoded)))
    print(f"Confusion Matrix ({method_name}):\n", confusion_matrix(y_test, model_resampled.predict(X_test_encoded)))

    # Calculate Cohen's Kappa
    kappa_resampled = cohen_kappa_score(y_test, model_resampled.predict(X_test_encoded))
    print(f"Cohen's Kappa ({method_name}): {kappa_resampled}")

Accuracy (Unbalanced): 0.8309352517985612

Classification Report (Unbalanced):
               precision    recall  f1-score   support

          No       0.84      0.98      0.90       226
         Yes       0.67      0.19      0.30        52

    accuracy                           0.83       278
   macro avg       0.75      0.59      0.60       278
weighted avg       0.81      0.83      0.79       278


Confusion Matrix (Unbalanced):
 [[221   5]
 [ 42  10]]

Performance after SMOTE:
Accuracy (SMOTE): 0.7194244604316546
Classification Report (SMOTE):
               precision    recall  f1-score   support

          No       0.86      0.78      0.82       226
         Yes       0.32      0.44      0.37        52

    accuracy                           0.72       278
   macro avg       0.59      0.61      0.60       278
weighted avg       0.76      0.72      0.74       278

Confusion Matrix (SMOTE):
 [[177  49]
 [ 29  23]]
Cohen's Kappa (SMOTE): 0.19641268900088937

Performance after Ran

### Cross validation

In [35]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
# Base KNN model
knn_model = KNeighborsClassifier(n_neighbors=6)

# Combine preprocessing with a KNN model in a pipeline
base_model = Pipeline(steps=[
    ('classifier', knn_model)  
])

# Perform cross-validation on the base model
cv_scores_base = cross_val_score(base_model, X_train_encoded, y_train, cv=StratifiedKFold(n_splits=5), scoring='accuracy')

# Evaluate the performance of the base model
print("Cross-Validation Accuracy (Unbalanced):", np.mean(cv_scores_base))

# Balancing Techniques
for method_name, balancing_method in balancing_methods:
    print(f"\nPerformance after {method_name} with cross-validation:")
    
    # Apply balancing method to the training set
    X_train_resampled, y_train_resampled = balancing_method.fit_resample(X_train_encoded, y_train)

    # Create a new instance of the KNN model
    knn_model_resampled = KNeighborsClassifier(n_neighbors=6)
    
    # Combine preprocessing with the resampled KNN model in a pipeline
    model_resampled = Pipeline(steps=[
        ('classifier', knn_model_resampled)  
    ])

    # Perform cross-validation on the resampled model
    cv_scores_resampled = cross_val_score(model_resampled, X_train_resampled, y_train_resampled, cv=StratifiedKFold(n_splits=5), scoring='accuracy')

    # Evaluate the performance
    print(f"Cross-Validation Accuracy ({method_name}):", np.mean(cv_scores_resampled))

Cross-Validation Accuracy (Unbalanced): 0.8422159716277363

Performance after SMOTE with cross-validation:
Cross-Validation Accuracy (SMOTE): 0.7714736660036323

Performance after RandomUnderSampler with cross-validation:
Cross-Validation Accuracy (RandomUnderSampler): 0.6075653923541248

Performance after NearMiss with cross-validation:
Cross-Validation Accuracy (NearMiss): 0.5424547283702212


### Best parametr 'n_neighbors'

In [27]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': range(1, 21)}  

# Create the grid search object
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search on the training data
grid_search.fit(X_train_encoded, y_train)

# Print the best parameters and corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'n_neighbors': 6}
Best Accuracy: 0.8485222779340426


After running the model with suggested best parametr 'n_neighbors': 6, no improvements were noticed. Prediction for the class 'No' were even worse. So I dicided to keep the model with 'n_neighbors': 5, which showed the best metrics

In [33]:
import pickle

# Save model
with open('../src/knn_model.pkl', 'wb') as file:
    pickle.dump(base_model, file)