In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Paths to the input CSV files (bare file names, resolved against the working directory)
train_path = "Train_Dataset.csv"
test_path = "Test_Dataset.csv"

# Load data
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

target_col = "Attrition"
id_col = "EmployeeID"

# Remove rows with missing target values; they cannot be used for training
train_df = train_df[train_df[target_col].notna()].copy()

# Encode the target whenever it is not already numeric. Testing for a
# non-numeric dtype (instead of only `object`) also catches pandas string
# and categorical dtypes, which would otherwise reach the models unencoded.
if not pd.api.types.is_numeric_dtype(train_df[target_col]):
    le = LabelEncoder()
    train_df[target_col] = le.fit_transform(train_df[target_col])

# Combine both frames so feature engineering is applied to them in one pass;
# `is_train` records where each row came from so they can be separated again.
train_df['is_train'] = 1
test_df['is_train'] = 0
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

print(f"Original features: {combined_df.shape[1]}")
print(f"Dataset size: {len(combined_df)}")

# ============================================
# ADVANCED FEATURE ENGINEERING
# ============================================

def create_advanced_features(df):
    """Return a copy of ``df`` with engineered attrition features added.

    Each feature group is only created when all of its source columns are
    present, so frames that lack some columns simply skip those features.
    The input frame is not modified.

    The income rank, quantile and median based features are computed over
    whatever rows are passed in, so their values depend on the full frame.

    Args:
        df: DataFrame holding the raw employee attributes.

    Returns:
        A new DataFrame with the derived columns added.
    """
    out = df.copy()

    def has(*names):
        # True only when every requested source column exists.
        return all(name in out.columns for name in names)

    def per(numerator, denominator):
        # Ratio of two columns; the +1 keeps a zero-valued denominator
        # from dividing by zero.
        return out[numerator] / (out[denominator] + 1)

    def flag(mask):
        # Turn a boolean mask into a 0/1 integer column.
        return mask.astype(int)

    # ---- Salary ----
    if has('MonthlyIncome', 'Age'):
        out['IncomePerAge'] = per('MonthlyIncome', 'Age')

    if has('MonthlyIncome', 'YearsAtCompany'):
        out['IncomePerYear'] = per('MonthlyIncome', 'YearsAtCompany')

    if has('HourlyRate', 'MonthlyRate'):
        out['TotalRate'] = out['HourlyRate'] + out['MonthlyRate']
        out['RateRatio'] = per('HourlyRate', 'MonthlyRate')

    if has('DailyRate', 'MonthlyRate'):
        out['DailyMonthlyRatio'] = per('DailyRate', 'MonthlyRate')

    # ---- Experience ----
    if has('TotalWorkingYears', 'Age'):
        out['ExperienceRatio'] = per('TotalWorkingYears', 'Age')
        # 18 is the assumed age at which working life starts.
        out['YearsNotWorking'] = out['Age'] - out['TotalWorkingYears'] - 18

    if has('YearsAtCompany', 'TotalWorkingYears'):
        out['CompanyTenureRatio'] = per('YearsAtCompany', 'TotalWorkingYears')

    if has('YearsInCurrentRole', 'YearsAtCompany'):
        out['RoleTenureRatio'] = per('YearsInCurrentRole', 'YearsAtCompany')

    if has('YearsSinceLastPromotion', 'YearsAtCompany'):
        out['PromotionRatio'] = per('YearsSinceLastPromotion', 'YearsAtCompany')
        out['YearsWithoutPromotion'] = out['YearsSinceLastPromotion']

    if has('YearsWithCurrManager', 'YearsAtCompany'):
        out['ManagerTenureRatio'] = per('YearsWithCurrManager', 'YearsAtCompany')

    # ---- Career mobility ----
    if has('NumCompaniesWorked', 'TotalWorkingYears'):
        out['AvgYearsPerCompany'] = per('TotalWorkingYears', 'NumCompaniesWorked')
        out['JobHoppingScore'] = per('NumCompaniesWorked', 'TotalWorkingYears')

    # ---- Satisfaction summary (needs at least two of the three scores) ----
    core_satisfaction = [
        col
        for col in ('JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction')
        if has(col)
    ]
    if len(core_satisfaction) >= 2:
        scores = out[core_satisfaction]
        lowest = scores.min(axis=1)
        highest = scores.max(axis=1)
        out['AvgSatisfaction'] = scores.mean(axis=1)
        out['MinSatisfaction'] = lowest
        out['MaxSatisfaction'] = highest
        out['SatisfactionRange'] = highest - lowest
        out['SatisfactionStd'] = scores.std(axis=1)

    # ---- Work-life balance ----
    if has('DistanceFromHome', 'WorkLifeBalance'):
        out['CommuteBalanceScore'] = out['DistanceFromHome'] * out['WorkLifeBalance']
        out['CommuteStress'] = per('DistanceFromHome', 'WorkLifeBalance')

    if has('OverTime'):
        out['OverTime_Binary'] = flag(out['OverTime'] == 'Yes')

    # ---- Career progression ----
    if has('JobLevel', 'YearsAtCompany'):
        out['JobLevelPerYear'] = per('JobLevel', 'YearsAtCompany')

    if has('JobLevel', 'Age'):
        out['JobLevelPerAge'] = per('JobLevel', 'Age')

    # ---- Performance ----
    if has('PerformanceRating', 'YearsSinceLastPromotion'):
        out['PerformancePromotionScore'] = per('PerformanceRating', 'YearsSinceLastPromotion')

    # ---- Training ----
    if has('TrainingTimesLastYear', 'YearsAtCompany'):
        out['TrainingIntensity'] = per('TrainingTimesLastYear', 'YearsAtCompany')

    # ---- Stagnation indicators ----
    if has('YearsSinceLastPromotion'):
        out['LongTimeNoPromotion'] = flag(out['YearsSinceLastPromotion'] > 5)
        out['RecentPromotion'] = flag(out['YearsSinceLastPromotion'] <= 1)

    if has('YearsInCurrentRole'):
        out['LongTimeInRole'] = flag(out['YearsInCurrentRole'] > 7)
        out['NewInRole'] = flag(out['YearsInCurrentRole'] <= 1)

    # ---- Pairwise interactions ----
    if has('JobInvolvement', 'JobSatisfaction'):
        out['JobEngagement'] = out['JobInvolvement'] * out['JobSatisfaction']

    if has('WorkLifeBalance', 'JobSatisfaction'):
        out['OverallWellbeing'] = out['WorkLifeBalance'] * out['JobSatisfaction']

    if has('EnvironmentSatisfaction', 'WorkLifeBalance'):
        out['WorkEnvironmentScore'] = out['EnvironmentSatisfaction'] * out['WorkLifeBalance']

    # ---- Polynomial / log transforms of key metrics ----
    if has('MonthlyIncome'):
        out['MonthlyIncome_Squared'] = out['MonthlyIncome'] ** 2
        out['MonthlyIncome_Log'] = np.log1p(out['MonthlyIncome'])

    if has('Age'):
        out['Age_Squared'] = out['Age'] ** 2

    if has('DistanceFromHome'):
        out['Distance_Squared'] = out['DistanceFromHome'] ** 2

    # ---- Age bands ----
    if has('Age'):
        age = out['Age']
        out['Age_Young'] = flag(age < 30)
        out['Age_MidCareer'] = flag((age >= 30) & (age < 45))
        out['Age_Senior'] = flag(age >= 45)
        out['Age_Cube'] = age ** 3

    # ---- Income position within the supplied rows ----
    if has('MonthlyIncome'):
        income = out['MonthlyIncome']
        out['Income_Percentile'] = income.rank(pct=True)
        out['HighEarner'] = flag(income > income.quantile(0.75))
        out['LowEarner'] = flag(income < income.quantile(0.25))

    # ---- Comprehensive satisfaction (needs at least three of five scores) ----
    wellbeing_cols = [
        col
        for col in ('JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction',
                    'WorkLifeBalance', 'JobInvolvement')
        if has(col)
    ]
    if len(wellbeing_cols) >= 3:
        out['ComprehensiveSatisfaction'] = out[wellbeing_cols].mean(axis=1)
        out['LowOverallSatisfaction'] = flag(out['ComprehensiveSatisfaction'] < 2.5)

    # ---- Manager relationship longevity ----
    if has('YearsWithCurrManager', 'YearsInCurrentRole'):
        out['ManagerRoleAlignment'] = (out['YearsWithCurrManager'] - out['YearsInCurrentRole']).abs()
        out['SameManagerRole'] = flag(out['YearsWithCurrManager'] == out['YearsInCurrentRole'])

    # ---- Career acceleration / deceleration ----
    if has('YearsAtCompany', 'YearsSinceLastPromotion', 'JobLevel'):
        out['PromotionVelocity'] = per('JobLevel', 'YearsAtCompany')
        out['CareerStuck'] = flag((out['YearsSinceLastPromotion'] > 4) & (out['JobLevel'] <= 2))

    # ---- Training engagement ----
    if has('TrainingTimesLastYear'):
        sessions = out['TrainingTimesLastYear']
        out['NoTraining'] = flag(sessions == 0)
        out['HighTraining'] = flag(sessions >= 4)
        out['Training_Squared'] = sessions ** 2

    # ---- Overtime combined with below-median income ----
    if has('OverTime', 'MonthlyIncome'):
        below_median = out['MonthlyIncome'] < out['MonthlyIncome'].median()
        out['OvertimeIncomeLow'] = flag((out['OverTime'] == 'Yes') & below_median)

    # ---- Distance categories ----
    if has('DistanceFromHome'):
        distance = out['DistanceFromHome']
        out['Distance_VeryClose'] = flag(distance <= 5)
        out['Distance_VeryFar'] = flag(distance >= 20)
        out['Distance_Log'] = np.log1p(distance)

    # ---- Experience gaps and ratios ----
    if has('TotalWorkingYears', 'YearsAtCompany', 'NumCompaniesWorked'):
        out['ExperienceGap'] = out['TotalWorkingYears'] - out['YearsAtCompany']
        out['CompanyLoyalty'] = per('YearsAtCompany', 'NumCompaniesWorked')
        out['FrequentJobChanger'] = flag(out['NumCompaniesWorked'] > 5)

    # ---- Stock options vs. job level ----
    if has('StockOptionLevel', 'JobLevel'):
        out['StockJobAlignment'] = out['StockOptionLevel'] * out['JobLevel']
        out['NoStockHighLevel'] = flag((out['StockOptionLevel'] == 0) & (out['JobLevel'] >= 3))

    # ---- Performance vs. satisfaction mismatch ----
    if has('PerformanceRating', 'JobSatisfaction'):
        out['PerformanceSatisfactionGap'] = (out['PerformanceRating'] - out['JobSatisfaction']).abs()
        out['HighPerfLowSat'] = flag((out['PerformanceRating'] >= 3) & (out['JobSatisfaction'] <= 2))

    # ---- Role tenure vs. promotion recency ----
    if has('YearsInCurrentRole', 'YearsSinceLastPromotion'):
        out['RolePromotionGap'] = out['YearsInCurrentRole'] - out['YearsSinceLastPromotion']
        out['LongRoleNoPromotion'] = flag(
            (out['YearsInCurrentRole'] > 5) & (out['YearsSinceLastPromotion'] > 5)
        )

    # ---- Education and income ----
    if has('Education', 'MonthlyIncome'):
        out['IncomePerEducation'] = per('MonthlyIncome', 'Education')

    # ---- Low involvement combined with overtime ----
    if has('JobInvolvement', 'OverTime'):
        out['LowInvolvementOvertime'] = flag(
            (out['JobInvolvement'] <= 2) & (out['OverTime'] == 'Yes')
        )

    # ---- Implied age at career start ----
    if has('Age', 'TotalWorkingYears'):
        start_age = out['Age'] - out['TotalWorkingYears']
        out['LateCareerStart'] = flag(start_age > 25)
        out['EarlyCareerStart'] = flag(start_age < 20)

    # ---- Multi-way interactions ----
    if has('MonthlyIncome', 'JobLevel', 'YearsAtCompany'):
        out['IncomeJobYears'] = (out['MonthlyIncome'] * out['JobLevel']) / (out['YearsAtCompany'] + 1)

    if has('DistanceFromHome', 'OverTime', 'WorkLifeBalance'):
        overtime_flag = flag(out['OverTime'] == 'Yes')
        out['CommuteOvertimeBalance'] = (
            (out['DistanceFromHome'] * overtime_flag) / (out['WorkLifeBalance'] + 1)
        )

    # ---- Job hopping relative to age ----
    if has('NumCompaniesWorked', 'Age'):
        out['JobHoppingPerAge'] = per('NumCompaniesWorked', 'Age')
        out['StableCareer'] = flag((out['NumCompaniesWorked'] <= 2) & (out['Age'] >= 35))

    return out

# Apply feature engineering to the combined train+test frame
print("\nApplying advanced feature engineering...")
combined_df = create_advanced_features(combined_df)
print(f"Features after engineering: {combined_df.shape[1]}")

# Split back to train and test using the `is_train` marker
train_engineered = combined_df[combined_df['is_train'] == 1].drop(columns=['is_train']).reset_index(drop=True)
test_engineered = combined_df[combined_df['is_train'] == 0].drop(columns=['is_train']).reset_index(drop=True)

# Split features and target
X = train_engineered.drop(columns=[target_col])
y = train_engineered[target_col]
# If the test rows carried no target, the earlier concat filled them with NaN
# and upcast integer labels to float; restore integer labels when lossless.
if pd.api.types.is_float_dtype(y) and y.notna().all() and (y % 1 == 0).all():
    y = y.astype(int)
X_test = test_engineered.drop(columns=[target_col], errors='ignore')

# Identify feature types. "number" matches every numeric width, so the 0/1
# indicator columns built with `.astype(int)` are kept even where that cast
# produces int32 rather than int64 (they would otherwise be dropped silently
# by `remainder="drop"` below).
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Remove ID column so it is never used as a predictor
if id_col in numeric_features:
    numeric_features.remove(id_col)
if id_col in categorical_features:
    categorical_features.remove(id_col)

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Preprocessing: median-impute + standardize numerics,
# mode-impute + one-hot encode categoricals; any other column is dropped.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
], remainder="drop")

# Split for validation on the *raw* rows first, then fit the preprocessor on
# the training part only, so imputation/scaling/encoding statistics do not
# leak information from the validation rows into the validation score.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# Transform data for the final models: refit on every labelled row and apply
# the same fitted preprocessor to the test set.
X_transformed = preprocessor.fit_transform(X)
X_test_transformed = preprocessor.transform(X_test)

_RULE = "=" * 70
print("\n" + _RULE)
print("BUILDING OPTIMIZED SUPER ENSEMBLE WITH 6 MODELS")
print(_RULE)

# Hyperparameters shared by the XGBoost and LightGBM boosters.
_gbdt_shared = {
    'n_estimators': 1500,
    'learning_rate': 0.02,
    'max_depth': 8,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'reg_alpha': 0.2,
    'reg_lambda': 1.5,
    'random_state': 42,
    'n_jobs': -1,
}

# RandomForest and ExtraTrees are configured identically.
_forest_shared = {
    'n_estimators': 1200,
    'max_depth': 25,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'bootstrap': True,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced_subsample',
}

# Ensemble members keyed by display name. Insertion order matters: it is the
# order in which models are trained and in which their votes are stacked.
models = {
    'XGBoost': XGBClassifier(
        min_child_weight=2,
        gamma=0.15,
        scale_pos_weight=3,  # Handle class imbalance
        eval_metric='logloss',
        **_gbdt_shared,
    ),
    'LightGBM': LGBMClassifier(
        num_leaves=40,
        min_child_samples=25,
        is_unbalance=True,  # Handle class imbalance
        verbose=-1,
        **_gbdt_shared,
    ),
    'CatBoost': CatBoostClassifier(
        iterations=1500,
        learning_rate=0.02,
        depth=8,
        l2_leaf_reg=2,
        border_count=128,
        random_strength=0.5,
        auto_class_weights='Balanced',  # Handle class imbalance
        random_state=42,
        verbose=0,
    ),
    'RandomForest': RandomForestClassifier(**_forest_shared),
    'ExtraTrees': ExtraTreesClassifier(**_forest_shared),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=800,
        learning_rate=0.03,
        max_depth=7,
        min_samples_split=3,
        min_samples_leaf=2,
        subsample=0.85,
        max_features='sqrt',
        random_state=42,
    ),
}

# Fit every model on the training split and score it on the hold-out split
print("\nTraining individual models...")
individual_preds = {}
val_accuracies = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    # Keep the hold-out predictions; they are reused for the ensemble score
    individual_preds[name] = model.predict(X_val)
    val_accuracies[name] = accuracy_score(y_val, individual_preds[name])
    print(f"{name} Validation Accuracy: {val_accuracies[name]:.6f}")

# Ensemble prediction using voting
print("\n" + "="*70)
print("CREATING WEIGHTED VOTING ENSEMBLE")
print("="*70)

# Refit the same estimators on all labelled rows, then predict the test set
print("\nRetraining all models on full data...")
final_preds = {}
for name, model in models.items():
    model.fit(X_transformed, y)
    final_preds[name] = model.predict(X_test_transformed)
    print(f"{name} trained")

# Each model's vote weight is its validation accuracy, normalized to sum to 1
model_names = list(models)
weights = np.array([val_accuracies[name] for name in model_names])
weights = weights / weights.sum()

print("\nModel Weights:")
for name, weight in zip(model_names, weights):
    print(f"  {name}: {weight:.4f} (Val Acc: {val_accuracies[name]:.6f})")


def _weighted_majority_vote(preds_by_model):
    """Combine per-model label predictions into 0/1 labels by weighted vote.

    Stacks the predictions in ``models`` order, takes their weighted average
    with ``weights`` and labels a row 1 when that average exceeds 0.5.
    """
    stacked = np.array([preds_by_model[name] for name in model_names])
    vote_share = np.average(stacked, axis=0, weights=weights)
    return (vote_share > 0.5).astype(int)


# Final test predictions from the models refit on all labelled rows
final_predictions = _weighted_majority_vote(final_preds)

# Ensemble validation accuracy, from the hold-out predictions stored earlier
ensemble_val_pred = _weighted_majority_vote(individual_preds)
ensemble_val_acc = accuracy_score(y_val, ensemble_val_pred)

print("\n" + "="*70)
print(f"ENSEMBLE VALIDATION ACCURACY: {ensemble_val_acc:.6f}")
print("="*70)

# Create submission: one row per test ID with its predicted label
submission = pd.DataFrame({
    id_col: test_df[id_col],
    target_col: final_predictions,
})

submission.to_csv("submission_test2_optimized.csv", index=False)
print(f"\nSubmission saved: submission_test2_optimized.csv ({len(submission)} rows)")
print(f"Prediction distribution:\n{pd.Series(final_predictions).value_counts()}")
print("\n" + "="*70)


Original features: 23
Dataset size: 7810

Applying advanced feature engineering...
Features after engineering: 39

Numeric features: 30
Categorical features: 6

BUILDING OPTIMIZED SUPER ENSEMBLE WITH 6 MODELS

Training individual models...

Training XGBoost...
XGBoost Validation Accuracy: 0.980695

Training LightGBM...
LightGBM Validation Accuracy: 0.980695

Training CatBoost...
CatBoost Validation Accuracy: 0.989704

Training RandomForest...
RandomForest Validation Accuracy: 0.985843

Training ExtraTrees...
ExtraTrees Validation Accuracy: 0.985843

Training GradientBoosting...
GradientBoosting Validation Accuracy: 0.990991

CREATING WEIGHTED VOTING ENSEMBLE

Retraining all models on full data...
XGBoost trained
LightGBM trained
CatBoost trained
RandomForest trained
ExtraTrees trained
GradientBoosting trained

Model Weights:
  XGBoost: 0.1658 (Val Acc: 0.980695)
  LightGBM: 0.1658 (Val Acc: 0.980695)
  CatBoost: 0.1674 (Val Acc: 0.989704)
  RandomForest: 0.1667 (Val Acc: 0.985843)
  Ex