In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# load data
train_df = pd.read_csv("Train_Dataset.csv")
test_df = pd.read_csv("Test_Dataset.csv")

# separate features and target
target_col = "Attrition"
id_col = "EmployeeID"

# drop rows where target is NaN
train_df_clean = train_df[train_df[target_col].notna()].copy()

X = train_df_clean.drop(columns=[target_col, id_col])
y = train_df_clean[target_col]

# identify column types
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

print(f"Dataset shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

# Advanced preprocessing with feature engineering
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop='if_binary'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Create multiple models for stacking
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)

lgbm_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

et_model = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

gb_model = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=4,
    subsample=0.8,
    random_state=42
)

# Create stacking ensemble
print("\n" + "="*60)
print("Training Stacking Ensemble with 5 Base Models")
print("="*60)

base_models = [
    ('xgb', Pipeline([('preprocess', preprocessor), ('model', xgb_model)])),
    ('lgbm', Pipeline([('preprocess', preprocessor), ('model', lgbm_model)])),
    ('rf', Pipeline([('preprocess', preprocessor), ('model', rf_model)])),
    ('et', Pipeline([('preprocess', preprocessor), ('model', et_model)])),
    ('gb', Pipeline([('preprocess', preprocessor), ('model', gb_model)]))
]

# Meta-learner
meta_learner = LogisticRegression(
    C=1.0,
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)

stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Train stacking model
print("\nTraining on training set...")
stacking_model.fit(X_train, y_train)

# Evaluate on validation set
y_pred_val = stacking_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)

print("\n" + "="*60)
print(f"Validation Accuracy: {val_accuracy:.6f}")
print("="*60)
print("\nClassification Report:")
print(classification_report(y_val, y_pred_val))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_val))

# Train on full dataset
print("\n" + "="*60)
print("Training on full dataset for final predictions...")
print("="*60)
stacking_model.fit(X, y)

# Cross-validation on full data
print("\nPerforming 10-fold cross-validation on full data...")
cv_scores = cross_val_score(stacking_model, X, y, cv=10, scoring='accuracy', n_jobs=-1, verbose=1)
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.6f})")
print(f"Min CV accuracy: {cv_scores.min():.6f}")
print(f"Max CV accuracy: {cv_scores.max():.6f}")

# predict on test set
X_test = test_df.drop(columns=[id_col])
test_preds = stacking_model.predict(X_test)

# create submission file
submission = pd.DataFrame({
    id_col: test_df[id_col],
    target_col: test_preds
})
submission.to_csv("submission_test.csv", index=False)
print("\n" + "="*60)
print("Submission file created: submission_test.csv")
print(f"Total predictions: {len(submission)}")
print(f"Prediction distribution:\n{pd.Series(test_preds).value_counts()}")
print("="*60)

Dataset shape: (5180, 20)
Target distribution:
Attrition
0.0    0.721042
1.0    0.278958
Name: proportion, dtype: float64

Training Stacking Ensemble with 5 Base Models

Training on training set...

Validation Accuracy: 0.984556

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       560
         1.0       0.98      0.97      0.97       217

    accuracy                           0.98       777
   macro avg       0.98      0.98      0.98       777
weighted avg       0.98      0.98      0.98       777


Confusion Matrix:
[[555   5]
 [  7 210]]

Training on full dataset for final predictions...

Performing 10-fold cross-validation on full data...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.6min finished



Cross-validation scores: [0.98648649 0.99034749 0.99227799 0.98455598 0.98262548 0.98841699
 0.98841699 0.996139   0.99420849 0.98841699]
Mean CV accuracy: 0.989189 (+/- 0.007950)
Min CV accuracy: 0.982625
Max CV accuracy: 0.996139

Submission file created: submission_test.csv
Total predictions: 2630
Prediction distribution:
0.0    1895
1.0     735
Name: count, dtype: int64
