In [2]:
import pickle

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the cleaned dataset that every later cell works from.
combined_data = pd.read_csv(filepath_or_buffer='../data/cleaned/combined_data.csv')

### Model with original data

In [18]:
# Load the three previously saved base learners (KNN, Logistic Regression,
# Decision Tree), in that order, from their joblib files under ../src.
knn_model, logreg_model, dt_model = (
    joblib.load(model_path)
    for model_path in (
        '../src/knn_model.pkl',
        '../src/logreg_model.pkl',
        '../src/dt_model.pkl',
    )
)

In [24]:
# Baseline experiment: stack the three base learners on the original
# (unbalanced) training data and evaluate on a held-out test set.

# Categorical columns that are one-hot encoded for the model
columns_to_encode = [
    'business_travel',
    'department',
    'education_field',
    'gender',
    'job_role',
    'marital_status',
    'over_time',
]

# Target label, and the feature frame made of every other column
y1 = combined_data['attrition']
X1 = combined_data.drop(columns=['attrition'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): ColumnTransformer defaults to remainder='drop', so every column
# that is not listed in columns_to_encode is discarded and the model sees the
# encoded categorical columns only -- confirm that this is intended.
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, columns_to_encode)]
)

# Learn the encoding from the training rows only, then reuse it for the test rows
X_train_encoded1 = preprocessor.fit_transform(X_train1)
X_test_encoded1 = preprocessor.transform(X_test1)

# Base learners (name, estimator) and the logistic-regression meta-model on top of them
estimators = list(zip(('knn', 'logreg', 'dt'), (knn_model, logreg_model, dt_model)))
meta_model = LogisticRegression(random_state=42)

# Build and train the stacking ensemble
stacking_model_1 = StackingClassifier(estimators=estimators, final_estimator=meta_model)
stacking_model_1.fit(X_train_encoded1, y_train1)

# Score the held-out rows
y_pred_stacking1 = stacking_model_1.predict(X_test_encoded1)
kappa_stacking1 = cohen_kappa_score(y_test1, y_pred_stacking1)

# Report accuracy, per-class metrics, the confusion matrix and Cohen's kappa
print("Accuracy:", round(accuracy_score(y_test1, y_pred_stacking1),3))
print("\nClassification Report:\n", classification_report(y_test1, y_pred_stacking1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test1, y_pred_stacking1))
print("Cohen's Kappa for Stacking Model:", round(kappa_stacking1, 3))

Accuracy: 0.831

Classification Report:
               precision    recall  f1-score   support

          No       0.84      0.99      0.90       226
         Yes       0.73      0.15      0.25        52

    accuracy                           0.83       278
   macro avg       0.78      0.57      0.58       278
weighted avg       0.82      0.83      0.78       278


Confusion Matrix:
 [[223   3]
 [ 44   8]]
Cohen's Kappa for Stacking Model: 0.202


### Model after balancing

In [25]:
# Balancing experiment: rebalance the training set with three different
# resamplers and, for each one, train and evaluate a fresh stacking model.

# Target label, and the feature frame made of every other column
y = combined_data['attrition']
X = combined_data.drop(columns=['attrition'])

# Categorical columns that are one-hot encoded for the model
columns_to_encode = [
    'business_travel',
    'department',
    'education_field',
    'gender',
    'job_role',
    'marital_status',
    'over_time',
]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): ColumnTransformer defaults to remainder='drop', so every column
# that is not listed in columns_to_encode is discarded -- confirm that this is intended.
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, columns_to_encode)]
)

# Learn the encoding from the training rows only, then reuse it for the test rows
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Base learners that go into every stacking model
models_for_stacking = [
    ('KNN', knn_model),
    ('Logistic Regression', logreg_model),
    ('Decision Tree', dt_model),
]

# Resamplers to compare: one over-sampler and two under-samplers.
# NOTE(review): the features here are one-hot columns; SMOTE interpolates between
# samples, so its synthetic rows may hold non-binary values -- confirm this is acceptable.
balancing_methods = [
    ('SMOTE', SMOTE(random_state=42)),
    ('RandomUnderSampler', RandomUnderSampler(random_state=42)),
    ('NearMiss', NearMiss(version=1, n_neighbors=3)),
]

for method_name, balancing_method in balancing_methods:
    print(f"\nPerformance after {method_name}:")

    # Rebalance the training rows only; the test rows are left untouched
    X_train_resampled, y_train_resampled = balancing_method.fit_resample(X_train_encoded, y_train)

    # Give this run its own unfitted copies of the base learners
    current_models_for_stacking = [
        (label, clone(estimator)) for label, estimator in models_for_stacking
    ]

    # Stack them under a logistic-regression meta-model and train on the resampled rows
    current_stacking_model = StackingClassifier(
        estimators=current_models_for_stacking,
        final_estimator=LogisticRegression(random_state=42),
    )
    current_stacking_model.fit(X_train_resampled, y_train_resampled)

    # Score the original (not resampled) test rows
    y_pred_resampled = current_stacking_model.predict(X_test_encoded)

    # Report accuracy, per-class metrics and the confusion matrix
    print(f"Accuracy ({method_name}):", round(accuracy_score(y_test, y_pred_resampled), 3))
    print(f"Classification Report ({method_name}):\n", classification_report(y_test, y_pred_resampled))
    print(f"Confusion Matrix ({method_name}):\n", confusion_matrix(y_test, y_pred_resampled))

    # Cohen's kappa, rounded to three decimals
    kappa_resampled = round(cohen_kappa_score(y_test, y_pred_resampled), 3)
    print(f"Cohen's Kappa ({method_name}): {kappa_resampled}")


Performance after SMOTE:
Accuracy (SMOTE): 0.759
Classification Report (SMOTE):
               precision    recall  f1-score   support

          No       0.87      0.83      0.85       226
         Yes       0.38      0.46      0.42        52

    accuracy                           0.76       278
   macro avg       0.63      0.64      0.63       278
weighted avg       0.78      0.76      0.77       278

Confusion Matrix (SMOTE):
 [[187  39]
 [ 28  24]]
Cohen's Kappa (SMOTE): 0.267

Performance after RandomUnderSampler:
Accuracy (RandomUnderSampler): 0.712
Classification Report (RandomUnderSampler):
               precision    recall  f1-score   support

          No       0.89      0.74      0.81       226
         Yes       0.34      0.60      0.44        52

    accuracy                           0.71       278
   macro avg       0.62      0.67      0.62       278
weighted avg       0.79      0.71      0.74       278

Confusion Matrix (RandomUnderSampler):
 [[167  59]
 [ 21  31]]
C

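### Model trained with unbalanced data shows the best accuracy

Among the results shown above, the model trained on the unbalanced data has the highest accuracy (0.831, versus 0.759 with SMOTE and 0.712 with RandomUnderSampler), so it is the one used for the predictions below. Note that its recall for the `Yes` class (0.15) and its Cohen's kappa (0.202) are lower than those of the SMOTE model (0.46 and 0.267).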

In [30]:
# Add the baseline stacking model's prediction for every row of the dataset
# (training and test rows alike) as an 'attrition_pred' column.
#
# The cell is safe to re-run: a prediction column left over from a previous run
# is removed first, so the frame always ends with 'attrition' followed by
# 'attrition_pred'. Without this step the order of those two columns depended
# on how many times the cell had been executed.
combined_data = combined_data.drop(columns=['attrition_pred'], errors='ignore')

# Move the true label to the end of the frame
attrition_column = combined_data.pop('attrition')
combined_data['attrition'] = attrition_column

# Predictions on the entire dataset.
# NOTE(review): `X` and `preprocessor` are the objects left by the balancing cell,
# not the ones stacking_model_1 was fitted with; both cells build them from the
# same columns with the same split parameters and seed.
combined_data['attrition_pred'] = stacking_model_1.predict(preprocessor.transform(X))

# Display the updated DataFrame
combined_data.head()

Unnamed: 0,age,daily_rate,distance_from_home,hourly_rate,monthly_income,monthly_rate,num_companies_worked,percent_salary_hike,years_in_current_role,years_since_last_promotion,...,job_role,job_satisfaction,marital_status,over_time,relationship_satisfaction,stock_option_level,training_times_last_year,work_life_balance,attrition_pred,attrition
0,0.547619,0.716332,0.0,0.914286,0.26323,0.698016,0.888889,0.0,0.266667,0.0,...,Sales Executive,4,Single,Yes,1,0,0,1,No,Yes
1,0.738095,0.126791,0.25,0.442857,0.217651,0.915991,0.111111,0.857143,0.466667,0.090909,...,Research Scientist,2,Married,No,4,1,3,3,No,No
2,0.452381,0.910458,0.035714,0.885714,0.057093,0.012007,0.666667,0.285714,0.0,0.0,...,Laboratory Technician,3,Single,Yes,2,0,3,3,Yes,Yes
3,0.357143,0.924069,0.071429,0.371429,0.100349,0.845796,0.111111,0.0,0.466667,0.272727,...,Research Scientist,3,Married,Yes,3,0,3,3,No,No
4,0.214286,0.350287,0.035714,0.142857,0.129872,0.583688,1.0,0.071429,0.133333,0.181818,...,Laboratory Technician,2,Married,No,4,1,3,3,No,No


In [26]:
# Save the fitted baseline stacking model so it can be reloaded without retraining.
# `pickle` comes from the notebook's import cell, together with the other dependencies.
# NOTE(review): the base models are read with joblib.load while this file is written
# with pickle -- keep the format unless every reader of stacking_model.pkl is updated too.
with open('../src/stacking_model.pkl', 'wb') as file:
    pickle.dump(stacking_model_1, file)

In [31]:
# Write the dataset, including the 'attrition_pred' column, to CSV without the row index.
combined_data.to_csv(path_or_buf='../data/cleaned/data_with_pred.csv', index=False)