In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
combined_data = pd.read_csv('../data/cleaned/combined_data.csv')

In [4]:
X = combined_data.drop('attrition', axis=1)
y = combined_data['attrition']

# Identify categorical columns to be encoded
columns_to_encode = ['business_travel', 'department', 'education_field', 'gender', 'job_role', 'marital_status', 'over_time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the transformer for categorical columns (OneHotEncoder)
categorical_transformer = OneHotEncoder(drop='first')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, columns_to_encode),
        # No numerical transformer for already scaled numerical columns
    ])

# Combine preprocessing with a KNN model in a pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))  # You can adjust the number of neighbors (k) as needed
])

# Fit the model on the training data
model.fit(X_train, y_train)

# Predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8309352517985612

Classification Report:
               precision    recall  f1-score   support

          No       0.84      0.98      0.90       226
         Yes       0.67      0.19      0.30        52

    accuracy                           0.83       278
   macro avg       0.75      0.59      0.60       278
weighted avg       0.81      0.83      0.79       278


Confusion Matrix:
 [[221   5]
 [ 42  10]]


In [6]:
from sklearn.metrics import cohen_kappa_score

# Calculate Cohen's Kappa for KNN model
kappa_knn = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa for KNN:", kappa_knn)

Cohen's Kappa for KNN: 0.23438415563107928


In [7]:
import pickle

# Save model
with open('../src/knn_model.pkl', 'wb') as file:
    pickle.dump(model, file)