# In [None]:
"""
CHAMPIONSHIP MODEL - Insurance Agent NILL Prediction
Data Storm v6.0 - First Place Solution (with Optuna HPO & SMOTE Augmentation)

Key enhancements:
1. Stratified time-series cross-validation with gap
2. Feature importance-based selection with stability analysis (Now with RFECV)
3. CatBoost integration with custom loss function
4. Agent-specific dynamic thresholding (Conceptually improved, OOF-optimized fixed threshold)
5. Recursive feature elimination with stability scores (RFECV implemented)
6. Optuna for Hyperparameter Optimization (with Pruning)
7. SMOTE Data Augmentation for minority class
8. Enhanced Feature Engineering (Lags, Rolling Windows, Interactions, Time Since Event)
9. LightGBM added to ensemble
"""
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, f1_score as f1_score_metric
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import optuna
from optuna.pruners import MedianPruner


from imblearn.over_sampling import SMOTE

import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# One seed shared by NumPy and by everything below that accepts RANDOM_STATE.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# Only Optuna warnings and errors are shown; per-trial INFO output is suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- DIRECTORY SETUP ---
# Paths are resolved relative to this file. `__file__` is undefined in
# interactive sessions, so the current working directory is used there.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

data_dir = os.path.join(script_dir, 'dataset')
output_dir = os.path.join(script_dir, 'outputs')

# Create the output folder on first run and say so.
output_dir_missing = not os.path.exists(output_dir)
if output_dir_missing:
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")

# Start-up banner followed by the wall-clock start of the run.
banner_rule = "=" * 100
print(
    banner_rule,
    "CHAMPIONSHIP KAGGLE SOLUTION - ADVANCED ENSEMBLE WITH OPTUNA HPO & SMOTE - ENHANCED",
    banner_rule,
    sep="\n",
)
start_time_script = time.time()
started_at_text = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Starting at: {started_at_text}")

# --- STEP 1: LOAD DATA ---
# Reads the three competition CSVs from `data_dir`, validates that the test set
# and the submission template line up, and removes exact duplicate rows.
print("\nStep 1: Loading data...")
try:
    train_df = pd.read_csv(os.path.join(data_dir, 'train_storming_round.csv'))
    test_df = pd.read_csv(os.path.join(data_dir, 'test_storming_round.csv'))
    submission_template = pd.read_csv(os.path.join(data_dir, 'sample_submission_storming_round.csv'))
except FileNotFoundError as e:
    print(f"ERROR: Dataset file not found. Please ensure 'train_storming_round.csv', 'test_storming_round.csv', and 'sample_submission_storming_round.csv' are in the '{data_dir}' directory.")
    print(e)
    # `exit()` is a helper injected by the `site` module (absent under `python -S`)
    # and it terminates with status 0; a missing dataset must end the run with a
    # non-zero status so callers/schedulers can see the failure.
    raise SystemExit(1)

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Submission template shape: {submission_template.shape}")

print("Performing data integrity checks...")
# Explicit check instead of `assert`: assertions are stripped when Python runs with -O.
if len(test_df) != len(submission_template):
    raise ValueError("Test and submission template row counts don't match!")

dupes_train = train_df.duplicated().sum()
if dupes_train > 0:
    print(f"WARNING: Found {dupes_train} duplicate rows in training data. Removing...")
    train_df = train_df.drop_duplicates(keep='first').reset_index(drop=True)

dupes_test = test_df.duplicated().sum()
if dupes_test > 0:
    print(f"WARNING: Found {dupes_test} duplicate rows in test data. Removing...")
    test_df = test_df.drop_duplicates(keep='first').reset_index(drop=True)
    # Dropping test rows can leave fewer rows than the template expects; this is
    # only reported here, not repaired.
    if len(test_df) != len(submission_template):
        print("WARNING: Test data size changed after deduplication. Submission template might not align.")

# --- STEP 2: ENHANCED PREPROCESSING ---
# Parses the date columns and derives the training label:
# target_column = 1 when the SAME agent's next recorded month has
# new_policy_count > 0, otherwise 0.
print("\nStep 2: Enhanced preprocessing with domain expertise...")
date_columns = ['agent_join_month', 'first_policy_sold_month', 'year_month']
for df_loop in [train_df, test_df]:
    for col in date_columns:
        if col in df_loop.columns:
            # Unparseable values become NaT instead of raising.
            df_loop[col] = pd.to_datetime(df_loop[col], errors='coerce')

# Chronological order inside each agent is required for the shift below.
train_df = train_df.sort_values(['agent_code', 'year_month'])
unique_agents = train_df['agent_code'].unique()
print(f"Processing target for {len(unique_agents)} unique agents...")

# Vectorised look-ahead: one grouped shift replaces a per-agent / per-row loop
# that re-scanned the whole frame (boolean mask on row_id) for every single row.
# The last month of each agent has no successor (NaN), and `NaN > 0` is False,
# so it is labelled 0 before being dropped just below.
next_month_sales = train_df.groupby('agent_code')['new_policy_count'].shift(-1)
train_df['target_column'] = (next_month_sales > 0).astype(int)

# Each agent's latest month has no observable outcome, so it cannot be a training row.
last_month_indices = train_df.groupby('agent_code')['year_month'].idxmax()
train_df = train_df.drop(last_month_indices)
print(f"Processed training data shape after target creation: {train_df.shape}")
print(f"Target distribution:\n{train_df['target_column'].value_counts(normalize=True)}")

# --- STEP 3: ADVANCED FEATURE ENGINEERING ---
# Announces the stage on stdout; the transformation itself is implemented in
# comprehensive_feature_engineering, defined right after this line.
print("\nStep 3: Advanced feature engineering with agent profiling...")

def comprehensive_feature_engineering(df_input, is_train=True, train_reference_df=None):
    """Return a copy of ``df_input`` enriched with engineered features.

    The frame is sorted by ``agent_code`` / ``year_month`` and re-indexed, then the
    following feature groups are added when their source columns are present:
    calendar parts of the date columns, tenure in months, per-agent lags and
    rolling statistics, difference features, short-window activity ratios,
    log/sqrt transforms, historical NILL rate / average policies, months since
    the last sale, per-agent profile means and a few interaction terms.
    Remaining NaNs in numeric columns are finally replaced by 0.

    Reads the module-level ``date_columns`` list.

    Args:
        df_input: Agent-month frame to transform; it is not modified.
        is_train: When True, history features are computed from ``df_input``'s own
            ``new_policy_count``; when False they are looked up in
            ``train_reference_df``.
        train_reference_df: Frame used for test-time history features and for the
            agent profile means. When None, profiles fall back to ``df_input`` in
            train mode and to zeros otherwise.

    Returns:
        The sorted, feature-enriched DataFrame.
    """
    df = df_input.copy()
    df = df.sort_values(['agent_code', 'year_month']).reset_index(drop=True)  # CRITICAL for shift/rolling

    # --- Calendar features for every parsed date column ---
    for col in date_columns:
        if col in df.columns and pd.api.types.is_datetime64_any_dtype(df[col]):
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_quarter'] = df[col].dt.quarter
            df[f'{col}_dayofweek'] = df[col].dt.dayofweek
            df[f'{col}_dayofyear'] = df[col].dt.dayofyear
            if hasattr(df[col].dt, 'isocalendar'):
                week_of_year = df[col].dt.isocalendar().week
            else:
                week_of_year = df[col].dt.weekofyear
            # NaT produces a missing week and `astype(int)` raises on missing
            # values; use float (NaN) in that case and let the final fill handle it.
            if week_of_year.isna().any():
                df[f'{col}_weekofyear'] = week_of_year.astype(float)
            else:
                df[f'{col}_weekofyear'] = week_of_year.astype(int)
            # Cyclical encoding so that December and January end up close together.
            df[f'{col}_month_sin'] = np.sin(2 * np.pi * df[f'{col}_month']/12)
            df[f'{col}_month_cos'] = np.cos(2 * np.pi * df[f'{col}_month']/12)

    # --- Tenure features (whole months between two dates) ---
    if all(c in df.columns for c in ['year_month', 'agent_join_month']) and \
       pd.api.types.is_datetime64_any_dtype(df['year_month']) and \
       pd.api.types.is_datetime64_any_dtype(df['agent_join_month']):
        df['months_with_company'] = ((df['year_month'].dt.year - df['agent_join_month'].dt.year) * 12 + \
                                    (df['year_month'].dt.month - df['agent_join_month'].dt.month)).fillna(0)
    if all(c in df.columns for c in ['first_policy_sold_month', 'agent_join_month']) and \
       pd.api.types.is_datetime64_any_dtype(df['first_policy_sold_month']) and \
       pd.api.types.is_datetime64_any_dtype(df['agent_join_month']):
        df['months_to_first_sale'] = ((df['first_policy_sold_month'].dt.year - df['agent_join_month'].dt.year) * 12 + \
                                     (df['first_policy_sold_month'].dt.month - df['agent_join_month'].dt.month)).fillna(-1)
    if all(c in df.columns for c in ['year_month', 'first_policy_sold_month']) and \
       pd.api.types.is_datetime64_any_dtype(df['year_month']) and \
       pd.api.types.is_datetime64_any_dtype(df['first_policy_sold_month']):
        df['months_since_first_sale'] = ((df['year_month'].dt.year - df['first_policy_sold_month'].dt.year) * 12 + \
                                        (df['year_month'].dt.month - df['first_policy_sold_month'].dt.month)).fillna(-1)

    # --- Lagged and rolling features (computed per agent on the sorted frame) ---
    key_numeric_features = ['new_policy_count',  # current-month sales; present in the training frame
                            'unique_proposals', 'unique_quotations', 'unique_customers',
                            'ANBP_value', 'net_income']
    lag_periods = [1, 2, 3, 6]
    rolling_windows = [3, 6, 12]
    rolling_stats = ('mean', 'median', 'std', 'sum')
    agent_keys = df['agent_code']

    for col in key_numeric_features:
        if col in df.columns:
            # Lagged values of the same agent; missing history becomes 0.
            for k in lag_periods:
                df[f'{col}_lag_{k}'] = df.groupby('agent_code')[col].shift(k).fillna(0)

            # Rolling statistics over the agent's PREVIOUS months.
            for w in rolling_windows:
                roller = df.groupby('agent_code')[col].rolling(window=w, min_periods=1)
                for stat_name in rolling_stats:
                    stat = getattr(roller, stat_name)().reset_index(level=0, drop=True)
                    # The one-step shift must stay inside each agent: a plain
                    # `.shift(1)` on the flattened series hands the first row of
                    # every agent the last value of the previous agent.
                    df[f'{col}_roll_{stat_name}_{w}m'] = stat.groupby(agent_keys).shift(1).fillna(0)

    # --- Difference features: current value vs. last month / recent average ---
    for col in key_numeric_features:
        if col in df.columns:
            if f'{col}_lag_1' in df.columns:
                df[f'{col}_diff_lag1'] = df[col].fillna(0) - df[f'{col}_lag_1'].fillna(0)
            if f'{col}_roll_mean_3m' in df.columns:  # 3m rolling mean is already shifted
                df[f'{col}_diff_roll_mean_3m'] = df[col].fillna(0) - df[f'{col}_roll_mean_3m'].fillna(0)

    # --- Short-window activity ratios (e.g. last 7 days / last 15 days) ---
    activity_cols_periods = {
        'unique_proposals': ['7_days', '15_days', '21_days'],
        'unique_quotations': ['7_days', '15_days', '21_days'],
        'unique_customers': ['7_days', '15_days', '21_days']
    }
    for base_col, periods in activity_cols_periods.items():
        for i in range(len(periods) - 1):
            col1_name_suffix = f'_last_{periods[i]}'
            col2_name_suffix = f'_last_{periods[i+1]}'
            col1 = base_col + col1_name_suffix if not base_col.endswith(col1_name_suffix) else base_col
            col2 = base_col + col2_name_suffix if not base_col.endswith(col2_name_suffix) else base_col
            if col1 in df.columns and col2 in df.columns:
                # Denominator is floored at 1e-6 to avoid division by zero.
                df[f'{base_col}_trend_{periods[i]}_{periods[i+1]}'] = df[col1].fillna(0) / np.maximum(df[col2].fillna(0), 1e-6)

    # --- Variance-stabilising transforms ---
    for col_to_transform in ['unique_proposals', 'unique_quotations', 'unique_customers', 'ANBP_value', 'net_income', 'agent_age']:
        if col_to_transform in df.columns:
            df[f'log_{col_to_transform}'] = np.log1p(df[col_to_transform].fillna(0))
            df[f'sqrt_{col_to_transform}'] = np.sqrt(np.maximum(0, df[col_to_transform].fillna(0)))

    # --- Historical features (NILL rate, average policies) ---
    if is_train:
        if 'new_policy_count' in df.columns:
             # Expanding statistics over the agent's earlier months only (shifted by one).
             df['hist_nill_rate_calc'] = df.groupby('agent_code')['new_policy_count'].transform(lambda x: x.expanding().apply(lambda y: (y==0).mean() if len(y)>1 else 0.5).shift(1)).fillna(0.5)
             df['hist_avg_policies_calc'] = df.groupby('agent_code')['new_policy_count'].transform(lambda x: x.expanding().mean().shift(1)).fillna(0)
        else:  # training frame without sales column
            df['hist_nill_rate_calc'] = 0.5
            df['hist_avg_policies_calc'] = 0
    else:  # test data: look the statistics up in the reference frame
        if train_reference_df is not None and 'new_policy_count' in train_reference_df.columns:
            agent_hist_stats = train_reference_df.groupby('agent_code').agg(
                hist_nill_rate_ref=('new_policy_count', lambda x: (x==0).mean()),
                hist_avg_policies_ref=('new_policy_count', 'mean')
            ).reset_index()
            df = pd.merge(df, agent_hist_stats, on='agent_code', how='left')

            # Agents unseen in the reference frame receive the global averages.
            overall_train_nill_rate = train_reference_df['new_policy_count'].eq(0).mean()
            overall_train_avg_policies = train_reference_df['new_policy_count'].mean()

            df['hist_nill_rate_calc'] = df['hist_nill_rate_ref'].fillna(overall_train_nill_rate)
            df['hist_avg_policies_calc'] = df['hist_avg_policies_ref'].fillna(overall_train_avg_policies)
            df.drop(columns=['hist_nill_rate_ref', 'hist_avg_policies_ref'], inplace=True, errors='ignore')
        else:  # no usable reference frame
            df['hist_nill_rate_calc'] = 0.5
            df['hist_avg_policies_calc'] = 0

    # --- Months since last sale ---
    temp_calc_df_mssf = None
    if is_train and 'new_policy_count' in df.columns:
        temp_calc_df_mssf = df.copy()  # same index and order as df
    elif not is_train and train_reference_df is not None and 'new_policy_count' in train_reference_df.columns:
        temp_calc_df_mssf = train_reference_df.copy().sort_values(['agent_code', 'year_month']).reset_index(drop=True)

    if temp_calc_df_mssf is not None:
        temp_calc_df_mssf['had_sale_mssf'] = (temp_calc_df_mssf['new_policy_count'] > 0).astype(int)
        temp_calc_df_mssf['month_idx_asc_mssf'] = temp_calc_df_mssf.groupby('agent_code').cumcount()
        temp_calc_df_mssf['sale_month_idx_mssf'] = temp_calc_df_mssf['month_idx_asc_mssf'].where(temp_calc_df_mssf['had_sale_mssf'] == 1)
        temp_calc_df_mssf['last_sale_month_idx_ffill_mssf'] = temp_calc_df_mssf.groupby('agent_code')['sale_month_idx_mssf'].ffill()
        temp_calc_df_mssf['current_months_since_last_sale_mssf'] = temp_calc_df_mssf['month_idx_asc_mssf'] - temp_calc_df_mssf['last_sale_month_idx_ffill_mssf']
        # Shift by one so the value describes the state at the START of the month;
        # 999 marks "no earlier sale known" (including each agent's first row).
        temp_calc_df_mssf['months_since_last_sale_feat'] = temp_calc_df_mssf.groupby('agent_code')['current_months_since_last_sale_mssf'].shift(1).fillna(999)

        if is_train:
            df['months_since_last_sale_feat'] = temp_calc_df_mssf['months_since_last_sale_feat']
        else:
            # NOTE(review): the lookup matches on (agent_code, year_month); test rows
            # whose month does not occur in the reference frame all receive 999 —
            # confirm this is the intended behaviour for future-month test data.
            df = pd.merge(df, temp_calc_df_mssf[['agent_code', 'year_month', 'months_since_last_sale_feat']],
                          on=['agent_code', 'year_month'], how='left')
            df['months_since_last_sale_feat'] = df['months_since_last_sale_feat'].fillna(999)
    else:
        df['months_since_last_sale_feat'] = 999

    # --- Agent profile features (per-agent means taken from the reference frame) ---
    profile_ref_df = train_reference_df if train_reference_df is not None else (df if is_train else None)
    if profile_ref_df is not None:
        agent_profiles_agg = {}
        if 'unique_proposals' in profile_ref_df.columns: agent_profiles_agg['unique_proposals_mean_profile'] = ('unique_proposals', 'mean')
        if 'unique_quotations' in profile_ref_df.columns: agent_profiles_agg['unique_quotations_mean_profile'] = ('unique_quotations', 'mean')
        if 'agent_age' in profile_ref_df.columns: agent_profiles_agg['agent_age_mean_profile'] = ('agent_age', 'mean')

        if agent_profiles_agg:
            agent_profiles = profile_ref_df.groupby('agent_code', as_index=False).agg(**agent_profiles_agg)
            df = pd.merge(df, agent_profiles, on='agent_code', how='left')
            for col_name in agent_profiles.columns:
                if col_name != 'agent_code' and col_name in df.columns:
                    # Agents missing from the reference frame get the overall mean.
                    original_feature_name = col_name.replace('_mean_profile', '')
                    fill_val = profile_ref_df[original_feature_name].mean() if original_feature_name in profile_ref_df else 0
                    df[col_name] = df[col_name].fillna(fill_val)
    # Guarantee the profile columns exist even when they could not be computed.
    for prof_col in ['unique_proposals_mean_profile', 'unique_quotations_mean_profile', 'agent_age_mean_profile']:
        if prof_col not in df.columns: df[prof_col] = 0

    # --- Interaction features ---
    if 'agent_age' in df.columns and 'hist_avg_policies_calc' in df.columns:
        df['inter_age_x_hist_avg_policies'] = df['agent_age'].fillna(0) * df['hist_avg_policies_calc'].fillna(0)
    if 'months_with_company' in df.columns and 'unique_proposals_mean_profile' in df.columns:
        df['inter_exp_x_prop_profile'] = df['months_with_company'].fillna(0) * df['unique_proposals_mean_profile'].fillna(0)
    if 'months_since_last_sale_feat' in df.columns and 'hist_nill_rate_calc' in df.columns:
        # The 999 sentinel is capped at 50 so it does not dominate the product.
        df['inter_msls_x_nill_rate'] = df['months_since_last_sale_feat'].replace(999, 50).fillna(50) * df['hist_nill_rate_calc'].fillna(0.5)

    # --- Final clean-up: coerce object columns where possible, zero-fill numerics ---
    for col in df.columns:
        if df[col].dtype == 'object' and col not in ['agent_code', 'row_id']:
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                # Genuinely categorical column: fill gaps with the mode.
                if df[col].isnull().any():
                    df[col] = df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else "Unknown")
        if pd.api.types.is_numeric_dtype(df[col]):
            if df[col].isnull().any():
                df[col] = df[col].fillna(0)
    return df

# Snapshot of the labelled training frame (it still carries `new_policy_count`);
# passed to both calls below as the reference for history/profile statistics.
train_df_reference_for_fe = train_df.copy() # This has new_policy_count
# Train and test go through the same function; with is_train=False the
# history-based features are looked up in the reference frame instead of being
# computed from the frame's own rows.
train_df_fe = comprehensive_feature_engineering(train_df, is_train=True, train_reference_df=train_df_reference_for_fe)
test_df_fe = comprehensive_feature_engineering(test_df, is_train=False, train_reference_df=train_df_reference_for_fe)

# Quick sanity output: row/column counts after feature engineering.
print(f"Train data shape after FE: {train_df_fe.shape}")
print(f"Test data shape after FE: {test_df_fe.shape}")


# --- STEP 4: FEATURE SELECTION (with RFECV) ---
print("\nStep 4: Feature selection with RFECV...")
non_feature_cols = ['row_id', 'agent_code', 'year_month', 'target_column', 'new_policy_count',
                    'agent_join_month', 'first_policy_sold_month']

# Candidate features: engineered columns available in BOTH frames, excluding
# identifiers, raw dates, the target and the raw sales count.
potential_features = [
    name for name in train_df_fe.columns
    if name in test_df_fe.columns and name not in non_feature_cols
]

# Keep only candidates that are numeric, coercing non-numeric ones first
# (values that cannot be parsed become 0 in both frames).
numeric_potential_features = []
for candidate in potential_features:
    try:
        already_numeric = pd.api.types.is_numeric_dtype(train_df_fe[candidate])
        if not already_numeric:
            train_df_fe[candidate] = pd.to_numeric(train_df_fe[candidate], errors='coerce').fillna(0)
            # Every candidate is known to exist in the test frame (see the filter above).
            test_df_fe[candidate] = pd.to_numeric(test_df_fe[candidate], errors='coerce').fillna(0)
        if pd.api.types.is_numeric_dtype(train_df_fe[candidate]):
            numeric_potential_features.append(candidate)
    except Exception as e_conv:
        print(f"Could not process column {candidate} for numeric check: {e_conv}")
        continue
potential_features = numeric_potential_features

# Last resort so the rest of the pipeline still has one column to work with.
if len(potential_features) == 0:
    print("ERROR: No potential numeric features found. Adding dummy.")
    train_df_fe['dummy_numeric_feat'] = np.random.rand(len(train_df_fe))
    test_df_fe['dummy_numeric_feat'] = np.random.rand(len(test_df_fe))
    potential_features = ['dummy_numeric_feat']

X_temp_fs = train_df_fe[potential_features].copy()
y_temp_fs = train_df_fe['target_column'].copy()

# Make the matrix finite for RFECV: +/-inf -> NaN, NaN -> column median,
# and 0 for columns whose median is itself NaN (entirely missing columns).
X_temp_fs = X_temp_fs.replace([np.inf, -np.inf], np.nan)
for feature_name in X_temp_fs.columns:
    column_values = X_temp_fs[feature_name]
    if column_values.isnull().any():
        X_temp_fs[feature_name] = column_values.fillna(column_values.median())
X_temp_fs = X_temp_fs.fillna(0)


def _top_features_by_random_forest(X, y, top_n=75):
    """Return the names of the ``top_n`` columns of ``X`` ranked by RandomForest importance.

    Used as the fallback whenever RFECV fails or selects nothing.
    """
    forest = RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE, n_jobs=-1,
                                    max_depth=8, min_samples_leaf=5)
    forest.fit(X, y)
    ranked = pd.Series(forest.feature_importances_, index=X.columns).sort_values(ascending=False)
    return list(ranked.head(min(len(ranked), top_n)).index)


final_features = []
# NOTE(review): train_df_fe is ordered by agent_code first and year_month second,
# so the TimeSeriesSplit folds below follow agent order rather than calendar
# time — confirm this is the intended validation scheme.
global_tscv_for_fs = TimeSeriesSplit(n_splits=3)  # 3 splits keeps feature selection fast

if not X_temp_fs.empty and len(X_temp_fs) == len(y_temp_fs) and len(potential_features) > 1:
    try:
        print(f"Performing RFECV from {len(potential_features)} potential features...")
        estimator_for_rfecv = lgb.LGBMClassifier(random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced')

        # Standardise before elimination (tree models do not need it, kept for parity
        # with the scaled matrix used in later steps).
        scaler_rfecv = StandardScaler()
        X_temp_fs_scaled = scaler_rfecv.fit_transform(X_temp_fs)

        rfecv_selector = RFECV(estimator=estimator_for_rfecv,
                               step=0.1,  # drop 10% of the features per iteration
                               cv=global_tscv_for_fs,
                               scoring='roc_auc',
                               # never go below 20% of the candidates (candidate count capped at 150)
                               min_features_to_select=max(1, int(min(len(potential_features),150)*0.2)),
                               n_jobs=-1,
                               verbose=0)

        rfecv_selector.fit(X_temp_fs_scaled, y_temp_fs)
        final_features = X_temp_fs.columns[rfecv_selector.support_].tolist()

        if not final_features:
            print("WARNING: RFECV selected 0 features. Falling back to top N from RandomForest.")
            final_features = _top_features_by_random_forest(X_temp_fs, y_temp_fs)

    except Exception as e_fs:
        # Best effort: any RFECV failure degrades to the importance ranking.
        print(f"Error during RFECV: {e_fs}. Falling back to simpler feature selection.")
        final_features = _top_features_by_random_forest(X_temp_fs, y_temp_fs)
else:
    print("Skipping RFECV due to insufficient features/data. Using all potential numeric features or fallback.")
    final_features = potential_features if potential_features else X_temp_fs.columns.tolist()


if not final_features:
    print("CRITICAL WARNING: final_features list is empty. Using fallback.")
    fallback_numeric_cols = [col for col in train_df_fe.columns if pd.api.types.is_numeric_dtype(train_df_fe[col]) and col not in non_feature_cols and col in test_df_fe.columns]
    final_features = fallback_numeric_cols[:min(10, len(fallback_numeric_cols))]
    if not final_features:
        print("FATAL ERROR: No features available. Exiting.")
        # Non-zero status so the failure is visible to whatever launched the script
        # (`exit()` would report success and depends on the `site` module).
        raise SystemExit(1)

print(f"Selected {len(final_features)} features. Examples: {final_features[:min(5, len(final_features))]}")
# Persist the selection, one feature name per line.
selected_features_path = os.path.join(output_dir, 'selected_features_optuna_smote_enhanced.txt')
with open(selected_features_path, 'w') as f:
    for feature in final_features: f.write(f"{feature}\n")
print(f"Selected features saved to: {selected_features_path}")


# Final training matrix restricted to the selected features.
global_final_X_df = train_df_fe[final_features].copy()
global_final_y = train_df_fe['target_column'].copy()
# Same cleaning as applied to the RFECV input: +/-inf -> NaN first, because
# StandardScaler tolerates NaN but raises on infinite values.
global_final_X_df = global_final_X_df.replace([np.inf, -np.inf], np.nan)
for col in global_final_X_df.columns:  # impute remaining NaNs with the column median
    if global_final_X_df[col].isnull().any():
        global_final_X_df[col] = global_final_X_df[col].fillna(global_final_X_df[col].median())
global_final_X_df = global_final_X_df.fillna(0)  # safety net for all-NaN columns

# Scaler is fitted on the full training matrix and reused by every CV fold below.
global_final_scaler = StandardScaler()
global_final_X_scaled_np = global_final_scaler.fit_transform(global_final_X_df)
global_tscv = TimeSeriesSplit(n_splits=3)  # For HPO and OOF. Can increase to 5 if time allows.


# --- OPTUNA OBJECTIVE FUNCTION (with SMOTE, LGBM, Pruning) ---
def objective(trial):
    """Optuna objective: mean validation ROC AUC of a SMOTE-augmented soft-voting ensemble.

    Samples the SMOTE neighbourhood size, the hyperparameters of the four base
    models (RandomForest, XGBoost, CatBoost, LightGBM) and the voting weights,
    then evaluates the ensemble on every fold of ``global_tscv``. Reads the
    module-level ``global_final_X_scaled_np``, ``global_final_y`` and
    ``global_tscv``.

    Args:
        trial: The ``optuna.trial.Trial`` supplying the parameter suggestions.

    Returns:
        Mean ROC AUC over the folds, or 0.0 if fitting/scoring failed in any fold
        (the failure reason is stored in the trial's ``failure`` user attribute).

    Raises:
        optuna.TrialPruned: When the pruner stops the trial after a fold.
    """
    # SMOTE needs k_neighbors < size of the smallest class; cap the search range.
    smote_k_neighbors_max_candidate = 7
    if global_final_y.value_counts().min() > 1:
         smote_k_neighbors_max_candidate = min(smote_k_neighbors_max_candidate, global_final_y.value_counts().min() -1 )
    smote_k_neighbors_max_candidate = max(1, smote_k_neighbors_max_candidate)
    smote_k_neighbors = trial.suggest_int('smote_k_neighbors', 1, smote_k_neighbors_max_candidate)

    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 250, step=25)
    rf_max_depth = trial.suggest_int('rf_max_depth', 4, 15)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 2, 15)

    xgb_n_estimators = trial.suggest_int('xgb_n_estimators', 50, 250, step=25)
    xgb_max_depth = trial.suggest_int('xgb_max_depth', 3, 10)
    xgb_learning_rate = trial.suggest_float('xgb_learning_rate', 0.005, 0.15, log=True)
    xgb_scale_pos_weight = trial.suggest_float('xgb_scale_pos_weight', 0.5, 10.0)

    cat_iterations = trial.suggest_int('cat_iterations', 50, 250, step=25)
    cat_depth = trial.suggest_int('cat_depth', 4, 10)
    cat_learning_rate = trial.suggest_float('cat_learning_rate', 0.005, 0.15, log=True)
    cat_l2_leaf_reg = trial.suggest_float('cat_l2_leaf_reg', 0.5, 15.0, log=True)
    cat_class_weight_0 = trial.suggest_float('cat_class_weight_0_cb', 0.2, 8.0)

    lgbm_n_estimators = trial.suggest_int('lgbm_n_estimators', 50, 250, step=25)
    lgbm_max_depth = trial.suggest_int('lgbm_max_depth', 3, 10)
    lgbm_learning_rate = trial.suggest_float('lgbm_learning_rate', 0.005, 0.15, log=True)
    lgbm_num_leaves = trial.suggest_int('lgbm_num_leaves', 10, 150)
    lgbm_reg_alpha = trial.suggest_float('lgbm_reg_alpha', 1e-4, 10.0, log=True)
    lgbm_reg_lambda = trial.suggest_float('lgbm_reg_lambda', 1e-4, 10.0, log=True)
    lgbm_colsample_bytree = trial.suggest_float('lgbm_colsample_bytree', 0.5, 1.0)
    lgbm_scale_pos_weight = trial.suggest_float('lgbm_scale_pos_weight', 0.5, 10.0)

    # Soft-voting weights, one per base model in the order rf, xgb, cat, lgbm.
    w_rf = trial.suggest_float('w_rf', 0.05, 2.0)
    w_xgb = trial.suggest_float('w_xgb', 0.05, 2.0)
    w_cat = trial.suggest_float('w_cat', 0.05, 2.0)
    w_lgbm = trial.suggest_float('w_lgbm', 0.05, 2.0)
    ensemble_weights = [w_rf, w_xgb, w_cat, w_lgbm]

    fold_roc_auc_scores = []
    for fold, (train_idx, val_idx) in enumerate(global_tscv.split(global_final_X_scaled_np)):
        X_train_fold, X_val_fold = global_final_X_scaled_np[train_idx], global_final_X_scaled_np[val_idx]
        y_train_fold, y_val_fold = global_final_y.iloc[train_idx], global_final_y.iloc[val_idx]

        # SMOTE oversamples whichever class is smaller in this fold, so the
        # neighbour count must be clamped against THAT class, not against label 1.
        minority_class_count_fold = int(y_train_fold.value_counts().min())
        current_k_for_smote = smote_k_neighbors
        if minority_class_count_fold <= current_k_for_smote:
            current_k_for_smote = max(1, minority_class_count_fold - 1) if minority_class_count_fold > 1 else 1

        X_train_fold_aug, y_train_fold_aug = X_train_fold, y_train_fold
        if minority_class_count_fold > 0 and current_k_for_smote > 0:
            smote = SMOTE(random_state=RANDOM_STATE + fold, k_neighbors=current_k_for_smote)
            try:
                X_train_fold_aug, y_train_fold_aug = smote.fit_resample(X_train_fold, y_train_fold)
            except ValueError:
                # Deliberate best effort: train on the un-augmented fold when SMOTE
                # cannot resample it.
                pass

        rf = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, min_samples_leaf=rf_min_samples_leaf,
                                    random_state=RANDOM_STATE, class_weight='balanced_subsample', n_jobs=-1)
        xgb_m = xgb.XGBClassifier(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, learning_rate=xgb_learning_rate,
                                  random_state=RANDOM_STATE, scale_pos_weight=xgb_scale_pos_weight, use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
        cat_m = cb.CatBoostClassifier(iterations=cat_iterations, depth=cat_depth, learning_rate=cat_learning_rate, l2_leaf_reg=cat_l2_leaf_reg,
                                      random_seed=RANDOM_STATE, loss_function='Logloss', verbose=0, class_weights={0: cat_class_weight_0, 1: 1.0}, thread_count=-1)
        lgbm_m = lgb.LGBMClassifier(n_estimators=lgbm_n_estimators, max_depth=lgbm_max_depth, learning_rate=lgbm_learning_rate,
                                    num_leaves=lgbm_num_leaves, reg_alpha=lgbm_reg_alpha, reg_lambda=lgbm_reg_lambda,
                                    colsample_bytree=lgbm_colsample_bytree, random_state=RANDOM_STATE,
                                    scale_pos_weight=lgbm_scale_pos_weight, n_jobs=-1)

        ensemble = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb_m), ('cat', cat_m), ('lgbm', lgbm_m)], voting='soft', weights=ensemble_weights, n_jobs=-1)

        # Only model fitting/scoring is guarded. Pruning is handled after the
        # try-block: optuna.TrialPruned derives from Exception, so raising it
        # inside a broad `except Exception` would turn every pruned trial into a
        # "completed" trial with score 0.0.
        try:
            ensemble.fit(X_train_fold_aug, y_train_fold_aug)
            y_val_proba = ensemble.predict_proba(X_val_fold)[:, 1]
            current_fold_auc = roc_auc_score(y_val_fold, y_val_proba)
        except Exception as e_model_fit:
            # Keep the reason with the trial (it shows up in study.trials_dataframe()).
            trial.set_user_attr('failure', f"fold {fold}: {e_model_fit!r}")
            return 0.0  # Low score for failed trials

        fold_roc_auc_scores.append(current_fold_auc)

        # Report the intermediate score and let the pruner decide.
        trial.report(current_fold_auc, step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    avg_roc_auc = np.mean(fold_roc_auc_scores) if fold_roc_auc_scores else 0.0
    return avg_roc_auc

# --- STEP 5: HYPERPARAMETER OPTIMIZATION ---
print("\nStep 5: Hyperparameter Optimization with Optuna (SMOTE, LGBM, Pruning)...")
# MedianPruner settings: the first `n_startup_trials` trials are never pruned and
# `n_warmup_steps` intermediate steps (folds) pass before pruning can start,
# i.e. a trial can be pruned from its 2nd fold (step=1) onwards.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=1, interval_steps=1),
)
# Search budget: whichever of the two limits is reached first ends the study.
N_OPTUNA_TRIALS = 250  # ADJUST AS NEEDED (more trials = longer run)
optuna_timeout_hours = 1.0  # ADJUST AS NEEDED
study.optimize(
    objective,
    n_trials=N_OPTUNA_TRIALS,
    timeout=3600 * optuna_timeout_hours,
    n_jobs=1,  # single worker, kept for SMOTE safety
)

best_params = study.best_params
print(f"\nBest ROC AUC from Optuna: {study.best_value:.4f}")
print("Best hyperparameters found by Optuna:")
for param_name, param_value in best_params.items():
    print(f"  {param_name}: {param_value}")

# Persisting the trial history is optional: a failure is reported, not fatal.
try:
    study_trials_path = os.path.join(output_dir, 'optuna_study_trials_enhanced.csv')
    trials_table = study.trials_dataframe()
    trials_table.to_csv(study_trials_path, index=False)
    print(f"Optuna study trials saved to: {study_trials_path}")
except Exception as e_study_save:
    print(f"Could not save Optuna study trials: {e_study_save}")


# --- STEP 5.5: OPTIMIZE FIXED THRESHOLD USING OOF FROM BEST PARAMS ---
print("\nStep 5.5: Optimizing fixed threshold on OOF predictions...")
oof_true_labels_list = []
oof_probas_list = []

# Shorthand lookup: tuned(name, fallback) returns the Optuna value or the fallback.
tuned = best_params.get
smote_k_for_oof = tuned('smote_k_neighbors', 3)

# Base learners configured from the tuned hyperparameters.
best_rf_oof = RandomForestClassifier(
    n_estimators=tuned('rf_n_estimators', 100),
    max_depth=tuned('rf_max_depth', 8),
    min_samples_leaf=tuned('rf_min_samples_leaf', 5),
    random_state=RANDOM_STATE,
    class_weight='balanced_subsample',
    n_jobs=-1,
)
best_xgb_oof = xgb.XGBClassifier(
    n_estimators=tuned('xgb_n_estimators', 100),
    max_depth=tuned('xgb_max_depth', 5),
    learning_rate=tuned('xgb_learning_rate', 0.05),
    random_state=RANDOM_STATE,
    scale_pos_weight=tuned('xgb_scale_pos_weight', 1.0),
    use_label_encoder=False,
    eval_metric='logloss',
    n_jobs=-1,
)
best_cat_oof = cb.CatBoostClassifier(
    iterations=tuned('cat_iterations', 100),
    depth=tuned('cat_depth', 6),
    learning_rate=tuned('cat_learning_rate', 0.05),
    l2_leaf_reg=tuned('cat_l2_leaf_reg', 3.0),
    random_seed=RANDOM_STATE,
    loss_function='Logloss',
    verbose=0,
    class_weights={0: tuned('cat_class_weight_0_cb', 1.0), 1: 1.0},
    thread_count=-1,
)
best_lgbm_oof = lgb.LGBMClassifier(
    n_estimators=tuned('lgbm_n_estimators', 100),
    max_depth=tuned('lgbm_max_depth', 7),
    learning_rate=tuned('lgbm_learning_rate', 0.05),
    num_leaves=tuned('lgbm_num_leaves', 31),
    reg_alpha=tuned('lgbm_reg_alpha', 0.0),
    reg_lambda=tuned('lgbm_reg_lambda', 0.0),
    colsample_bytree=tuned('lgbm_colsample_bytree', 0.8),
    random_state=RANDOM_STATE,
    scale_pos_weight=tuned('lgbm_scale_pos_weight', 1.0),
    n_jobs=-1,
)

# Soft-voting weights, in the same order as the estimator list below.
best_ensemble_oof_weights = [tuned(weight_key, 1.0) for weight_key in ('w_rf', 'w_xgb', 'w_cat', 'w_lgbm')]
best_ensemble_for_oof = VotingClassifier(
    estimators=[
        ('rf', best_rf_oof),
        ('xgb', best_xgb_oof),
        ('cat', best_cat_oof),
        ('lgbm', best_lgbm_oof),
    ],
    voting='soft',
    weights=best_ensemble_oof_weights,
    n_jobs=-1,
)

all_val_indices = [] # To map OOF preds back to original train_df_fe if needed for error analysis
# Collect out-of-fold probabilities: for every CV split, oversample the training part
# with SMOTE, fit the tuned ensemble on it and score the untouched validation part.
for fold, (train_idx, val_idx) in enumerate(global_tscv.split(global_final_X_scaled_np)):
    X_train_fold, X_val_fold = global_final_X_scaled_np[train_idx], global_final_X_scaled_np[val_idx]
    y_train_fold, y_val_fold = global_final_y.iloc[train_idx], global_final_y.iloc[val_idx]
    all_val_indices.extend(val_idx)

    # SMOTE synthesises samples for the least frequent class, so k_neighbors must be
    # checked against that class's size. Counting label 1 is wrong here: in this
    # dataset label 1 is the majority (~90%). A single-class fold has nothing to oversample.
    _, fold_class_counts = np.unique(y_train_fold, return_counts=True)
    minority_class_count_fold = int(fold_class_counts.min()) if len(fold_class_counts) > 1 else 0
    current_k_for_smote_oof = smote_k_for_oof
    if minority_class_count_fold <= current_k_for_smote_oof:
        # SMOTE needs strictly more minority samples than neighbours.
        current_k_for_smote_oof = max(1, minority_class_count_fold - 1)

    # Default to the un-augmented fold; replaced only if SMOTE succeeds.
    X_train_fold_aug, y_train_fold_aug = X_train_fold, y_train_fold
    if minority_class_count_fold > 0 and current_k_for_smote_oof > 0:
        smote_oof = SMOTE(random_state=RANDOM_STATE + fold, k_neighbors=current_k_for_smote_oof)
        try:
            X_train_fold_aug, y_train_fold_aug = smote_oof.fit_resample(X_train_fold, y_train_fold)
        except ValueError as e_smote_oof:
            # Best-effort augmentation: report the failure instead of hiding it.
            print(f"SMOTE failed for OOF fold {fold}: {e_smote_oof}. Using original fold data.")

    best_ensemble_for_oof.fit(X_train_fold_aug, y_train_fold_aug)
    y_val_proba_oof = best_ensemble_for_oof.predict_proba(X_val_fold)[:, 1]

    oof_true_labels_list.extend(y_val_fold.tolist())
    oof_probas_list.extend(y_val_proba_oof.tolist())

oof_true_labels_np = np.array(oof_true_labels_list)
oof_probas_np = np.array(oof_probas_list)
optimized_fixed_threshold = 0.5 # Default

# Pick the probability cut-off that maximises F1 on the pooled OOF predictions.
# Every failure path keeps the 0.5 default and only prints a warning.
if len(oof_probas_np) == 0:
    print("Warning: OOF predictions not generated for threshold optimization. Using default 0.5.")
else:
    precisions_pr, recalls_pr, thresholds_pr = precision_recall_curve(oof_true_labels_np, oof_probas_np)
    if len(thresholds_pr) == 0:
        print("Warning: No thresholds found from precision_recall_curve. Using default 0.5.")
    else:
        # precision/recall carry one more entry than thresholds; drop the final
        # point so every array lines up with thresholds_pr.
        precision_at_thr = precisions_pr[:-1]
        recall_at_thr = recalls_pr[:-1]
        pr_sum = precision_at_thr + recall_at_thr

        # F1 = 2PR / (P + R); entries where P + R == 0 stay at 0 instead of dividing by zero.
        f1_scores_pr_calc = np.zeros_like(thresholds_pr)
        np.divide(2 * precision_at_thr * recall_at_thr, pr_sum, out=f1_scores_pr_calc, where=pr_sum > 0)

        if len(f1_scores_pr_calc) == 0:
            print("Warning: Could not calculate F1 scores for threshold optimization. Using default 0.5.")
        else:
            best_f1_idx = np.argmax(f1_scores_pr_calc)
            optimized_fixed_threshold = thresholds_pr[best_f1_idx]
            print(f"Optimized fixed threshold (max F1 on OOF): {optimized_fixed_threshold:.4f} (F1: {f1_scores_pr_calc[best_f1_idx]:.4f})")


# --- STEP 6: FINAL MODEL TRAINING ---
print("\nStep 6: Training final model on all data with optimized parameters & SMOTE...")
final_smote_k_neighbors_tuned = best_params.get('smote_k_neighbors', 3)

# SMOTE synthesises samples for the least frequent class, so k_neighbors must be
# checked against that class's size. Counting label 1 is wrong here: in this dataset
# label 1 is the majority (~90%). With a single class there is nothing to oversample.
_, final_class_counts = np.unique(global_final_y, return_counts=True)
final_minority_count_full = int(final_class_counts.min()) if len(final_class_counts) > 1 else 0
current_k_for_smote_final = final_smote_k_neighbors_tuned

# Default to the un-augmented data; replaced only if SMOTE succeeds.
X_train_final_aug, y_train_final_aug = global_final_X_scaled_np, global_final_y

if final_minority_count_full > 0:
    if final_minority_count_full <= current_k_for_smote_final:
        # SMOTE needs strictly more minority samples than neighbours.
        current_k_for_smote_final = max(1, final_minority_count_full - 1)
        print(f"Adjusting SMOTE k_neighbors for final training to {current_k_for_smote_final}")
    if current_k_for_smote_final > 0:
        smote_final = SMOTE(random_state=RANDOM_STATE, k_neighbors=current_k_for_smote_final)
        try:
            X_train_final_aug, y_train_final_aug = smote_final.fit_resample(global_final_X_scaled_np, global_final_y)
        except ValueError as e_smote_final:
            print(f"SMOTE failed for final model: {e_smote_final}. Using original data.")
    else:
        print("Warning: k_neighbors for SMOTE is <=0. Using original data.")
else:
    print("Warning: No minority samples. SMOTE not applied.")
print(f"Original full train shape: {global_final_X_scaled_np.shape}, Augmented full train shape: {X_train_final_aug.shape}")

final_rf = best_rf_oof # Reuse model instances from OOF thresholding step, already configured with best_params
final_xgb = best_xgb_oof
final_cat = best_cat_oof
final_lgbm = best_lgbm_oof
final_ensemble_weights = best_ensemble_oof_weights # Reuse weights

final_ensemble = VotingClassifier(
    estimators=[('rf', final_rf), ('xgb', final_xgb), ('cat', final_cat), ('lgbm', final_lgbm)],
    voting='soft', weights=final_ensemble_weights, n_jobs=-1)

final_ensemble.fit(X_train_final_aug, y_train_final_aug)

# Bundle everything needed to score new data: scaler, model, feature list,
# tuned hyperparameters and the OOF-optimised decision threshold.
model_components = {
    'scaler': global_final_scaler, 'ensemble_model': final_ensemble,
    'final_features': final_features, 'best_optuna_params': best_params,
    'optimized_threshold': optimized_fixed_threshold
}
model_path = os.path.join(output_dir, 'champion_model_bundle_optuna_smote_enhanced.pkl')
joblib.dump(model_components, model_path)
print(f"Final model bundle saved to: {model_path}")


# --- STEP 7: GENERATE TEST PREDICTIONS ---
print("\nStep 7: Generating optimized test predictions...")

# Rebuild the test matrix with exactly the training feature columns, in training order.
model_feature_columns = global_final_X_df.columns
X_test_final_df = pd.DataFrame(columns=model_feature_columns, index=test_df_fe.index)
for feature_name in model_feature_columns:
    if feature_name in test_df_fe.columns:
        column_values = test_df_fe[feature_name]
        if column_values.isnull().any():
            # Impute using TRAIN median
            column_values = column_values.fillna(global_final_X_df[feature_name].median())
    else:
        # Feature absent from the test frame: use a constant 0 column.
        column_values = 0
    X_test_final_df[feature_name] = column_values
X_test_final_df = X_test_final_df.fillna(0)  # Final safety net

# Scale with the scaler fitted on training data, then score with the final ensemble.
X_test_scaled_np = global_final_scaler.transform(X_test_final_df)
positive_class_column = 1
test_proba = final_ensemble.predict_proba(X_test_scaled_np)[:, positive_class_column]

test_proba_path = os.path.join(output_dir, 'test_probabilities_optuna_smote_enhanced.npy')
np.save(test_proba_path, test_proba)
print(f"Test probabilities saved to: {test_proba_path}")

# "Dynamic" thresholding: for now this is a placeholder that applies the single
# OOF-optimized threshold to every row.
print("Applying dynamic thresholds (using OOF-optimized fixed threshold as example)...")
# A genuinely dynamic scheme would vary the cut-off per instance/agent, e.g.:
# 1. Segment agents (tenure, past performance, ...) and tune one threshold per segment on CV OOF data.
# 2. Train a small model that predicts the best threshold for each test instance from its features.
dynamic_threshold_to_apply = optimized_fixed_threshold

# Recover row_id positionally from the raw test frame if feature engineering dropped it.
row_id_missing_from_features = 'row_id' not in test_df_fe.columns
if row_id_missing_from_features and 'row_id' in test_df.columns:
    positional_row_ids = test_df[['row_id']].reset_index(drop=True)
    test_df_fe = test_df_fe.reset_index().merge(positional_row_ids, left_index=True, right_index=True)


# Keep probability and thresholded label on the test frame for submission mapping.
test_df_fe['probability'] = test_proba
test_df_fe['dynamic_prediction'] = test_df_fe['probability'].ge(dynamic_threshold_to_apply).astype(int)

dynamic_submission = submission_template.copy()
can_map_by_row_id = 'row_id' in test_df_fe.columns and 'row_id' in dynamic_submission.columns
if not can_map_by_row_id:
    # Fallback if row_id mapping is problematic: assign predictions by position.
    print("Warning: row_id not available for precise mapping in dynamic submission. Using direct assignment.")
    dynamic_submission['target_column'] = test_df_fe['dynamic_prediction'].values[:len(dynamic_submission)]
else:
    # Joining on row_id stays correct even if the test frame was re-indexed or reordered.
    final_preds_df_dynamic = test_df_fe[['row_id', 'dynamic_prediction']].rename(
        columns={'dynamic_prediction': 'target_column'})
    template_without_target = dynamic_submission.drop(columns=['target_column'], errors='ignore')
    dynamic_submission = template_without_target.merge(final_preds_df_dynamic, on='row_id', how='left')
    # Template rows with no matching row_id get 0.
    dynamic_submission['target_column'] = dynamic_submission['target_column'].fillna(0).astype(int)

dynamic_submission_path = os.path.join(output_dir, 'dynamic_submission_optuna_smote_enhanced.csv')
dynamic_submission.to_csv(dynamic_submission_path, index=False)
print(f"Dynamic threshold submission saved to: {dynamic_submission_path}")


# Main submission: binarise the test probabilities with the OOF-optimized fixed threshold.
best_fixed_threshold = optimized_fixed_threshold
optimal_predictions = (test_proba >= best_fixed_threshold).astype(int)
optimal_submission = submission_template.copy()

row_id_mapping_possible = 'row_id' in test_df_fe.columns and 'row_id' in optimal_submission.columns
if not row_id_mapping_possible:
    # Without a shared row_id the predictions are assigned by position.
    print("Warning: row_id not available for precise mapping in optimal submission. Using direct assignment.")
    optimal_submission['target_column'] = optimal_predictions[:len(optimal_submission)]
else:
    # Attach predictions to the template by row_id; template rows without a match get 0.
    temp_preds_fixed = pd.DataFrame({'row_id': test_df_fe['row_id'], 'target_column': optimal_predictions})
    template_without_target = optimal_submission.drop(columns=['target_column'], errors='ignore')
    optimal_submission = template_without_target.merge(temp_preds_fixed, on='row_id', how='left')
    optimal_submission['target_column'] = optimal_submission['target_column'].fillna(0).astype(int)

optimal_submission_path = os.path.join(output_dir, 'submission_optuna_smote_enhanced.csv')  # Main submission
optimal_submission.to_csv(optimal_submission_path, index=False)
print(f"Optimal fixed threshold submission saved to: {optimal_submission_path}")


# --- STEP 8: FEATURE IMPORTANCE ANALYSIS ---
# Collect each fitted ensemble member's feature importances, write them to CSV with
# a cross-model average, and plot the top features.
print("\nStep 8: Feature importance analysis...")
if hasattr(final_ensemble, 'named_estimators_'):
    feature_names = global_final_X_df.columns.tolist()
    importances_data = {'Feature': feature_names}
    estimators_with_importance = {
        'RF': final_ensemble.named_estimators_.get('rf'),
        'XGB': final_ensemble.named_estimators_.get('xgb'),
        'CAT': final_ensemble.named_estimators_.get('cat'),
        'LGBM': final_ensemble.named_estimators_.get('lgbm')
    }
    for model_name, model_obj in estimators_with_importance.items():
        # Test for presence explicitly; an estimator's truthiness is not a reliable "exists" check.
        if model_obj is None or not hasattr(model_obj, 'feature_importances_'):
            continue
        model_importances = model_obj.feature_importances_  # read once; may be computed on access
        if len(model_importances) == len(feature_names):
            importances_data[f'{model_name}_Importance'] = model_importances
        else:
            print(f"Warning: Mismatch in feature count for {model_name} importances.")

    feature_importance_df = pd.DataFrame(importances_data)
    importance_cols = [col for col in feature_importance_df.columns if '_Importance' in col]
    if importance_cols:
        # The libraries report importances on different scales (LightGBM defaults to raw
        # split counts, scikit-learn forests to fractions summing to 1). Averaging raw
        # values lets the largest-scale model dominate, so each model is rescaled to
        # sum to 1 before averaging. Raw per-model columns are kept unchanged.
        importance_shares = feature_importance_df[importance_cols].astype(float)
        column_totals = importance_shares.sum(axis=0)
        safe_totals = column_totals.where(column_totals > 0, 1.0)  # avoid 0/0 for an all-zero model
        importance_shares = importance_shares.div(safe_totals, axis=1)
        feature_importance_df['Avg_Importance'] = importance_shares.mean(axis=1)
        feature_importance_df = feature_importance_df.sort_values('Avg_Importance', ascending=False).reset_index(drop=True)
        feature_importance_path = os.path.join(output_dir, 'feature_importance_optuna_smote_enhanced.csv')
        feature_importance_df.to_csv(feature_importance_path, index=False)
        print(f"Feature importance table saved to: {feature_importance_path}")

        top_n = min(30, len(feature_importance_df))
        plt.figure(figsize=(12, max(8, top_n // 1.5))) # Adjusted figure size
        sns.barplot(x='Avg_Importance', y='Feature', data=feature_importance_df.head(top_n), palette="viridis")
        plt.title(f'Top {top_n} Features by Average Importance (Enhanced Model)')
        plt.tight_layout()
        feature_plot_path = os.path.join(output_dir, 'top_features_optuna_smote_enhanced.png')
        plt.savefig(feature_plot_path)
        plt.close()
        print(f"Feature importance plot saved to: {feature_plot_path}")
    else:
        print("No feature importances could be extracted.")
else:
    print("Final ensemble model structure issue for feature importances.")


# --- STEP 9: ERROR ANALYSIS (Conceptual on OOF) ---
# Only the two print statements in this step execute; everything in between is a
# disabled template kept for reference.
print("\nStep 9: Error Analysis (Conceptual)...")
# This section provides a template for analyzing misclassifications from OOF predictions.
# It requires 'oof_true_labels_np', 'oof_probas_np', and 'all_val_indices' from Step 5.5.
# NOTE(review): the template looks rows up with `train_df_fe.iloc[all_val_indices]`, which presumes the
# CV matrix rows are in the same order as `train_df_fe` - confirm before enabling it.

# if len(oof_probas_np) > 0 and len(oof_true_labels_np) == len(oof_probas_np) and len(all_val_indices) == len(oof_true_labels_np):
#     oof_preds_for_analysis = (oof_probas_np >= optimized_fixed_threshold).astype(int)
#
#     # Get original indices from train_df_fe that correspond to the OOF predictions
#     # This assumes all_val_indices were collected in the same order as oof_true_labels_np/oof_probas_np
#     oof_original_indices = train_df_fe.iloc[all_val_indices].index
#
#     misclassified_mask = (oof_preds_for_analysis != oof_true_labels_np)
#     misclassified_original_indices = oof_original_indices[misclassified_mask]
#
#     if not misclassified_original_indices.empty:
#         print(f"Analyzing {len(misclassified_original_indices)} misclassified samples from OOF...")
#         misclassified_samples_df = train_df_fe.loc[misclassified_original_indices].copy()
#         misclassified_samples_df['oof_probability'] = oof_probas_np[misclassified_mask]
#         misclassified_samples_df['oof_prediction'] = oof_preds_for_analysis[misclassified_mask]
#         misclassified_samples_df['true_target_actual'] = oof_true_labels_np[misclassified_mask] # Renamed to avoid conflict if 'target_column' exists
#
#         fp_df = misclassified_samples_df[(misclassified_samples_df['true_target_actual'] == 0) & (misclassified_samples_df['oof_prediction'] == 1)]
#         print(f"False Positives ({len(fp_df)}):")
#         # print(fp_df[['agent_code', 'year_month', 'oof_probability'] + final_features[:3]].head())
#
#         fn_df = misclassified_samples_df[(misclassified_samples_df['true_target_actual'] == 1) & (misclassified_samples_df['oof_prediction'] == 0)]
#         print(f"False Negatives ({len(fn_df)}):")
#         # print(fn_df[['agent_code', 'year_month', 'oof_probability'] + final_features[:3]].head())
#
#         # Example: Save misclassified samples for external review
#         # misclassified_samples_df.to_csv(os.path.join(output_dir, 'misclassified_oof_samples.csv'), index=False)
#     else:
#         print("No misclassified samples found in OOF data with the optimized threshold.")
# else:
#     print("OOF data not available for error analysis.")
print("Error analysis section is conceptual. Uncomment and adapt as needed with OOF data.")


# --- SCRIPT COMPLETION ---
# Closing banner: wall-clock duration plus the locations of the produced artefacts.
end_time_script = time.time()
elapsed_time_script = end_time_script - start_time_script

banner_rule = "=" * 100
finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
elapsed_minutes = elapsed_time_script / 60

print("\n" + banner_rule)
print(f"CHAMPIONSHIP SOLUTION (ENHANCED) completed at: {finished_at}")
print(f"Total execution time: {elapsed_time_script:.2f} seconds ({elapsed_minutes:.2f} minutes)")
print(f"OPTIMAL SUBMISSION (ENHANCED): {optimal_submission_path}")
print(f"DYNAMIC SUBMISSION (ENHANCED): {dynamic_submission_path}")
print(f"All outputs saved in: {output_dir}")
print(banner_rule)

CHAMPIONSHIP KAGGLE SOLUTION - ADVANCED ENSEMBLE WITH OPTUNA HPO & SMOTE - ENHANCED
Starting at: 2025-05-07 22:04:32

Step 1: Loading data...
Train data shape: (15308, 23)
Test data shape: (914, 23)
Submission template shape: (914, 2)
Performing data integrity checks...

Step 2: Enhanced preprocessing with domain expertise...
Processing target for 905 unique agents...
Processed training data shape after target creation: (14403, 24)
Target distribution:
target_column
1    0.900437
0    0.099563
Name: proportion, dtype: float64

Step 3: Advanced feature engineering with agent profiling...
Train data shape after FE: (14403, 166)
Test data shape after FE: (914, 165)

Step 4: Feature selection with RFECV...
Performing RFECV from 159 potential features...
