In [12]:
import pandas as pd
from sklearn.ensemble import StackingClassifier
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import cohen_kappa_score

In [9]:
# Read the cleaned, combined dataset; the path is relative to the notebook's working directory.
combined_data = pd.read_csv(filepath_or_buffer='../data/cleaned/combined_data.csv')

In [10]:
# Restore the three previously saved base models (KNN, Logistic Regression,
# Decision Tree) in that order. joblib.load unpickles the files, so they must
# come from a trusted source.
knn_model, logreg_model, dt_model = map(
    joblib.load,
    (
        '../src/knn_model.pkl',
        '../src/logreg_model.pkl',
        '../src/dt_model.pkl',
    ),
)

In [11]:
# Separate the feature columns from the 'attrition' target.
X = combined_data.drop('attrition', axis=1)
y = combined_data['attrition']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base learners of the stack, taken from the previously loaded models.
# No separate encoder is applied in this cell: the feature frame is handed to
# the base learners exactly as loaded from the CSV.
# NOTE(review): this presumably relies on each loaded model carrying its own
# preprocessing (e.g. being a Pipeline) -- confirm against the code that
# produced the .pkl files.
estimators = [
    ('knn', knn_model),
    ('logreg', logreg_model),
    ('dt', dt_model)
]

# Meta-model that learns how to combine the base learners' predictions
# (any classifier could be used here).
meta_model = LogisticRegression(random_state=42)

# Create the stacking model
stacking_model = StackingClassifier(estimators=estimators, final_estimator=meta_model)

# Fit the stacking model on the training split. Per the scikit-learn docs, with
# the default `cv` the base estimators are cloned and refit on X_train, so the
# fitted state stored in the pickles is not what ends up inside the stack.
stacking_model.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred_stacking = stacking_model.predict(X_test)

# Evaluate the performance of the stacking model
print("Accuracy:", accuracy_score(y_test, y_pred_stacking))
print("\nClassification Report:\n", classification_report(y_test, y_pred_stacking))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_stacking))

Accuracy: 0.8309352517985612

Classification Report:
               precision    recall  f1-score   support

          No       0.84      0.99      0.90       226
         Yes       0.73      0.15      0.25        52

    accuracy                           0.83       278
   macro avg       0.78      0.57      0.58       278
weighted avg       0.82      0.83      0.78       278


Confusion Matrix:
 [[223   3]
 [ 44   8]]


In [13]:
# Cohen's kappa between the true test labels and the stacking model's predictions.
kappa_stacking = cohen_kappa_score(y1=y_test, y2=y_pred_stacking)
print("Cohen's Kappa for Stacking Model:", kappa_stacking)

Cohen's Kappa for Stacking Model: 0.20183262064752594
