In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
combined_data = pd.read_csv('../data/cleaned/combined_data.csv')

In [4]:
X = combined_data.drop('attrition', axis=1)
y = combined_data['attrition']

# Identify categorical columns to be encoded
columns_to_encode = ['business_travel', 'department', 'education_field', 'gender', 'job_role', 'marital_status', 'over_time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the transformer for categorical columns (OneHotEncoder)
categorical_transformer = OneHotEncoder(drop='first')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, columns_to_encode),
        # No numerical transformer for already scaled numerical columns
    ])

# Combine preprocessing with a KNN model in a pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))  # You can adjust the number of neighbors (k) as needed
])

# Fit the model on the training data
model.fit(X_train, y_train)

# Predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8309352517985612

Classification Report:
               precision    recall  f1-score   support

          No       0.84      0.98      0.90       226
         Yes       0.67      0.19      0.30        52

    accuracy                           0.83       278
   macro avg       0.75      0.59      0.60       278
weighted avg       0.81      0.83      0.79       278


Confusion Matrix:
 [[221   5]
 [ 42  10]]


In [5]:
from sklearn.metrics import cohen_kappa_score

# Calculate Cohen's Kappa for KNN model
kappa_knn = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa for KNN:", kappa_knn)

Cohen's Kappa for KNN: 0.23438415563107928


In [9]:
# Separate features (X) and target variable (y)
X = combined_data.drop('attrition', axis=1)
y = combined_data['attrition']

# Identify categorical columns to be encoded
columns_to_encode = ['business_travel', 'department', 'education_field', 'gender', 'job_role', 'marital_status', 'over_time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the transformer for categorical columns (OneHotEncoder)
categorical_transformer = OneHotEncoder(drop='first')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, columns_to_encode)
    ])

# Apply one-hot encoding to training and test sets
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Combine preprocessing with a KNN model in a pipeline
model = Pipeline(steps=[
    ('classifier', KNeighborsClassifier(n_neighbors=5))  # You can adjust the number of neighbors (k) as needed
])

# Fit the model on the training data
model.fit(X_train_encoded, y_train)

# Predictions on the test set
y_pred = model.predict(X_test_encoded)

# Evaluate the performance
print("Accuracy (Unbalanced):", accuracy_score(y_test, y_pred))
print("\nClassification Report (Unbalanced):\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix (Unbalanced):\n", confusion_matrix(y_test, y_pred))

# Balancing Techniques
balancing_methods = [
    ('SMOTE', SMOTE(random_state=42)),
    ('RandomUnderSampler', RandomUnderSampler(random_state=42)),
    ('NearMiss', NearMiss(version=1, n_neighbors=3))
]

for method_name, balancing_method in balancing_methods:
    print(f"\nPerformance after {method_name}:")
    
    # Apply balancing method to the training set
    X_train_resampled, y_train_resampled = balancing_method.fit_resample(X_train_encoded, y_train)

    # Fit the model on the resampled data
    model.fit(X_train_resampled, y_train_resampled)

    # Predictions on the test set after balancing
    y_pred_resampled = model.predict(X_test_encoded)

    # Evaluate the performance
    print(f"Accuracy ({method_name}):", accuracy_score(y_test, y_pred_resampled))
    print(f"Classification Report ({method_name}):\n", classification_report(y_test, y_pred_resampled))
    print(f"Confusion Matrix ({method_name}):\n", confusion_matrix(y_test, y_pred_resampled))

    # Calculate Cohen's Kappa
    kappa_resampled = cohen_kappa_score(y_test, y_pred_resampled)
    print(f"Cohen's Kappa ({method_name}): {kappa_resampled}")

Accuracy (Unbalanced): 0.8309352517985612

Classification Report (Unbalanced):
               precision    recall  f1-score   support

          No       0.84      0.98      0.90       226
         Yes       0.67      0.19      0.30        52

    accuracy                           0.83       278
   macro avg       0.75      0.59      0.60       278
weighted avg       0.81      0.83      0.79       278


Confusion Matrix (Unbalanced):
 [[221   5]
 [ 42  10]]

Performance after SMOTE:
Accuracy (SMOTE): 0.7194244604316546
Classification Report (SMOTE):
               precision    recall  f1-score   support

          No       0.86      0.78      0.82       226
         Yes       0.32      0.44      0.37        52

    accuracy                           0.72       278
   macro avg       0.59      0.61      0.60       278
weighted avg       0.76      0.72      0.74       278

Confusion Matrix (SMOTE):
 [[177  49]
 [ 29  23]]
Cohen's Kappa (SMOTE): 0.19641268900088937

Performance after Ran

In [11]:
import pickle

# Save model
with open('../src/knn_model.pkl', 'wb') as file:
    pickle.dump(model, file)