In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from scipy.optimize import minimize_scalar
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# Core ML Libraries
from sklearn.model_selection import (StratifiedKFold, cross_val_score,
                                   train_test_split, RepeatedStratifiedKFold)
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Imbalanced Learning
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier

# Advanced Models
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                             GradientBoostingClassifier, VotingClassifier,
                             StackingClassifier, BaggingClassifier)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Evaluation
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

# Optimization
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

warnings.filterwarnings('ignore')
np.random.seed(42)

print("✅ All libraries imported successfully\n")

✅ All libraries imported successfully



In [None]:
%pip install catboost



In [None]:
%pip install scikit-optimize



In [None]:
print("📊 STEP 1: Ultra-Advanced Data Loading & Analysis")
print("-" * 50)

def comprehensive_data_analysis(train_df, test_df):
    """Perform deep data analysis for optimal preprocessing strategy"""

    print(f"📋 Training Data: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
    print(f"📋 Test Data: {test_df.shape[0]} rows, {test_df.shape[1]} columns")

    # Target distribution analysis
    # Check if 'age_group' is in train_df before accessing it
    if 'age_group' in train_df.columns:
        target_dist = train_df['age_group'].value_counts()
        imbalance_ratio = target_dist.min() / target_dist.max()
        print(f"\n🎯 Target Distribution:")
        # Use .get() with default 0 in case a category is missing after splitting
        print(f"   Adult (0): {target_dist.get(0, 0)} ({target_dist.get(0, 0)/len(train_df)*100:.1f}%)")
        print(f"   Senior (1): {target_dist.get(1, 0)} ({target_dist.get(1, 0)/len(train_df)*100:.1f}%)")
        print(f"   Imbalance Ratio: {imbalance_ratio:.3f}")
    else:
        print("\n🎯 Target Distribution: 'age_group' column not found in training data for analysis.")
        imbalance_ratio = None # Or a suitable default/indicator

    # Missing value analysis
    missing_train = train_df.isnull().sum()
    missing_test = test_df.isnull().sum()

    print(f"\n🔍 Missing Values Analysis:")
    all_cols = set(missing_train.index).union(set(missing_test.index))
    for col in all_cols:
        if col in missing_train and missing_train[col] > 0 or (col in missing_test and missing_test[col] > 0):
            train_pct = (missing_train[col] / len(train_df)) * 100 if col in missing_train else 0
            test_pct = (missing_test[col] / len(test_df)) * 100 if col in missing_test else 0
            print(f"   {col}: Train {train_pct:.1f}%, Test {test_pct:.1f}%")


    # Feature correlation with target
    correlations = {}
    if 'age_group' in train_df.columns:
        numeric_cols = train_df.select_dtypes(include=[np.number]).columns.drop('age_group', errors='ignore')
        for col in numeric_cols:
            if col in train_df.columns:
                # Ensure both columns are numeric before calculating correlation
                if pd.api.types.is_numeric_dtype(train_df[col]) and pd.api.types.is_numeric_dtype(train_df['age_group']):
                     corr = train_df[col].corr(train_df['age_group'])
                     correlations[col] = abs(corr)

        print(f"\n🔗 Top Correlations with Target:")
        sorted_corr = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
        for col, corr in sorted_corr[:5]:
            print(f"   {col}: {corr:.4f}")
    else:
         print("\n🔗 Feature Correlation with Target: 'age_group' column not found for correlation analysis.")


    return imbalance_ratio, correlations

# Load data
try:
    train_df_raw = pd.read_csv('Train_Data.csv')
    test_df_raw = pd.read_csv('Test_Data.csv')

    # Store test IDs
    test_ids = test_df_raw['SEQN'].copy()

    # Separate target variable from training data
    if 'age_group' in train_df_raw.columns:
        train_target = train_df_raw['age_group'].copy()
        train_features = train_df_raw.drop(['SEQN', 'age_group'], axis=1)
    else:
        print("❌ Error: 'age_group' column not found in Train_Data.csv")
        exit()

    # Remove identifier from test data
    test_features = test_df_raw.drop('SEQN', axis=1)


    imbalance_ratio, feature_correlations = comprehensive_data_analysis(train_df_raw, test_df_raw) # Pass raw data for initial analysis


except Exception as e:
    print(f"❌ Error: {e}")
    # exit() # Removing exit to allow further code execution if error is non-fatal

print("\n✅ Data loading and analysis complete\n")

📊 STEP 1: Ultra-Advanced Data Loading & Analysis
--------------------------------------------------
📋 Training Data: 1966 rows, 9 columns
📋 Test Data: 312 rows, 8 columns

🎯 Target Distribution:
   Adult (0): 1638 (83.3%)
   Senior (1): 314 (16.0%)
   Imbalance Ratio: 0.192

🔍 Missing Values Analysis:
   PAQ605: Train 0.7%, Test 0.3%
   age_group: Train 0.7%, Test 0.0%
   DIQ010: Train 0.9%, Test 0.3%
   BMXBMI: Train 0.9%, Test 0.3%
   LBXGLU: Train 0.7%, Test 0.3%
   LBXGLT: Train 0.6%, Test 0.6%
   LBXIN: Train 0.5%, Test 0.3%
   SEQN: Train 0.6%, Test 0.6%
   RIAGENDR: Train 0.9%, Test 0.6%

🔗 Top Correlations with Target:

✅ Data loading and analysis complete



In [None]:
print("🧬 STEP 2: Medical Domain Expert Feature Engineering")
print("-" * 50)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer # Import KNNImputer


class MedicalFeatureEngineer(BaseEstimator, TransformerMixin):
    """Advanced medical domain feature engineering"""

    def __init__(self):
        self.imputer = KNNImputer(n_neighbors=5) # Use KNNImputer
        self.scaler = None
        self.numeric_cols = None # Add attribute to store numeric column names

    def fit(self, X, y=None):
        # Identify numeric columns
        self.numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        # Ensure only numeric columns are selected and convert to numeric, coerce errors to NaN
        X_numeric = X[self.numeric_cols].apply(pd.to_numeric, errors='coerce')
        # Fit imputer only on the cleaned numeric columns
        self.imputer.fit(X_numeric)
        return self

    def transform(self, X):
        X = X.copy()

        # 1. Handle missing values with imputation
        if self.numeric_cols is None:
             raise RuntimeError("Call fit before calling transform.")

        # Ensure only numeric columns are selected for transformation and convert, coerce errors
        X_numeric = X[self.numeric_cols].apply(pd.to_numeric, errors='coerce')

        # print("\n--- Diagnostic: Data before Imputation ---")
        # display(X_numeric.describe(include='all'))
        # print("----------------------------------------\n")


        X[self.numeric_cols] = self.imputer.transform(X_numeric)


        # 2. MEDICAL EXPERT FEATURES
        print("🔬 Creating medical expert features...")

        # Ensure necessary columns exist before creating features
        required_medical_cols = ['LBXIN', 'LBXGLU', 'BMXBMI', 'LBXGLT', 'DIQ010', 'RIAGENDR', 'PAQ605']
        if not all(col in X.columns for col in required_medical_cols):
            print("Warning: Not all required medical columns present for feature creation.")
            # You might want to handle this more robustly depending on your needs,
            # e.g., by raising an error or skipping feature creation.
            # For now, we'll proceed with available columns.

        # Insulin Resistance Indices (Check if required cols exist)
        if all(col in X.columns for col in ['LBXIN', 'LBXGLU']):
            X['HOMA_IR'] = (X['LBXIN'] * X['LBXGLU']) / 405.0
            X['QUICKI'] = 1 / (np.log(X['LBXIN'] + 1e-6) + np.log(X['LBXGLU'] + 1e-6))
            X['McAuley_Index'] = np.exp(2.63 - 0.28 * np.log(X['LBXIN'] + 1e-6) - 0.31 * np.log(X['LBXGLU'] + 1e-6))

        # Metabolic Syndrome Components (Check if required cols exist)
        if all(col in X.columns for col in ['BMXBMI', 'LBXGLU', 'LBXGLT', 'DIQ010']):
            X['MetSyn_BMI'] = (X['BMXBMI'] >= 30).astype(int)
            X['MetSyn_Glucose'] = (X['LBXGLU'] >= 100).astype(int)
            X['MetSyn_GTT'] = (X['LBXGLT'] >= 140).astype(int)
            X['MetSyn_Score'] = X['MetSyn_BMI'] + X['MetSyn_Glucose'] + X['MetSyn_GTT'] + X['DIQ010']

        # Advanced Glucose Metabolism (Check if required cols exist)
        if all(col in X.columns for col in ['LBXGLU', 'LBXIN', 'LBXGLT']):
            X['Glucose_Insulin_Ratio'] = X['LBXGLU'] / (X['LBXIN'] + 1e-6)
            X['GTT_Fasting_Ratio'] = X['LBXGLT'] / (X['LBXGLU'] + 1e-6)
            X['Insulin_GTT_Interaction'] = X['LBXIN'] * X['LBXGLT']


        # BMI-related Features (Check if required cols exist)
        if all(col in X.columns for col in ['BMXBMI', 'LBXGLU', 'LBXIN']):
             X['BMI_Category'] = pd.cut(X['BMXBMI'],
                                  bins=[0, 18.5, 25, 30, 35, 100],
                                  labels=[0, 1, 2, 3, 4],
                                  right=True, # Include the right edge of the bin
                                  duplicates='drop' # Drop duplicate bin edges if any
                                 ).astype('Int64') # Use nullable integer type

             X['BMI_Glucose_Product'] = X['BMXBMI'] * X['LBXGLU']
             X['BMI_Insulin_Product'] = X['BMXBMI'] * X['LBXIN']


        # 3. STATISTICAL TRANSFORMATIONS
        print("📊 Creating statistical features...")

        # Log transformations for skewed features (Check if required cols exist)
        skewed_features = ['LBXIN', 'LBXGLT']
        for feat in skewed_features:
            if feat in X.columns:
                X[f'{feat}_log'] = np.log1p(X[feat])
                X[f'{feat}_sqrt'] = np.sqrt(X[feat] + 1e-6)

        # Polynomial features for key variables (Check if required cols exist)
        key_features = ['BMXBMI', 'LBXGLU', 'LBXIN']
        for feat in key_features:
            if feat in X.columns:
                X[f'{feat}_squared'] = X[feat] ** 2
                X[f'{feat}_cubed'] = X[feat] ** 3

        # 4. INTERACTION FEATURES
        print("🔄 Creating interaction features...")

        # All pairwise interactions of key features (Check if required cols exist)
        interaction_features = ['BMXBMI', 'LBXGLU', 'LBXIN', 'LBXGLT']
        for i, feat1 in enumerate(interaction_features):
            for feat2 in interaction_features[i+1:]:
                if feat1 in X.columns and feat2 in X.columns:
                    X[f'{feat1}_x_{feat2}'] = X[feat1] * X[feat2]
                    X[f'{feat1}_div_{feat2}'] = X[feat1] / (X[feat2] + 1e-6)

        # Gender interactions (Check if required cols exist)
        if 'RIAGENDR' in X.columns:
            for feat in ['BMXBMI', 'LBXGLU', 'LBXIN']:
                if feat in X.columns:
                    X[f'{feat}_x_Gender'] = X[feat] * X['RIAGENDR']

        # Activity interactions (Check if required cols exist)
        if 'PAQ605' in X.columns:
            for feat in ['BMXBMI', 'LBXGLU']:
                if feat in X.columns:
                    X[f'{feat}_x_Activity'] = X[feat] * X['PAQ605']

        # 5. CLUSTERING FEATURES
        print("🎯 Creating clustering features...")

        # Create metabolic profiles using clustering (Check if required cols exist)
        metabolic_features = ['BMXBMI', 'LBXGLU', 'LBXIN', 'LBXGLT']
        if all(feat in X.columns for feat in metabolic_features):
            kmeans = KMeans(n_clusters=5, random_state=42, n_init=10) # Add n_init to suppress warning
            X['Metabolic_Cluster'] = kmeans.fit_predict(X[metabolic_features])


        # 6. PERCENTILE FEATURES
        print("📈 Creating percentile features...")

        # Convert continuous features to percentile ranks (Check if required cols exist)
        for feat in ['BMXBMI', 'LBXGLU', 'LBXIN', 'LBXGLT']:
            if feat in X.columns:
                X[f'{feat}_percentile'] = stats.rankdata(X[feat]) / len(X)

        print(f"✅ Feature engineering complete. Features: {len(X.columns)}")
        return X

# Apply feature engineering
feature_engineer = MedicalFeatureEngineer()
print("🔧 Applying to training data...")
train_enhanced = feature_engineer.fit_transform(train_features)

print("🔧 Applying to test data...")
test_enhanced = feature_engineer.transform(test_features)

print(f"📊 Enhanced features: {len(train_enhanced.columns)} (was {len(train_features.columns)})")

🧬 STEP 2: Medical Domain Expert Feature Engineering
--------------------------------------------------
🔧 Applying to training data...
🔬 Creating medical expert features...
📊 Creating statistical features...
🔄 Creating interaction features...
🎯 Creating clustering features...
📈 Creating percentile features...
✅ Feature engineering complete. Features: 52
🔧 Applying to test data...
🔬 Creating medical expert features...
📊 Creating statistical features...
🔄 Creating interaction features...
🎯 Creating clustering features...
📈 Creating percentile features...
✅ Feature engineering complete. Features: 52
📊 Enhanced features: 52 (was 7)


In [None]:
# ADVANCED PREPROCESSING PIPELINE
# ==============================================================================
print("\n⚙️ STEP 3: Advanced Preprocessing Pipeline")
print("-" * 50)

def create_ultra_preprocessing_pipeline(X_train, y_train, X_test):
    """Ultra-advanced preprocessing for maximum performance"""

    # No need to drop 'age_group' from X_train here, it should already be separated
    # if 'age_group' in X_train.columns:
    #     X_train = X_train.drop('age_group', axis=1)


    print("🧹 Advanced outlier detection and removal...")

    # 1. ISOLATION FOREST FOR OUTLIER DETECTION
    from sklearn.ensemble import IsolationForest
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    # Ensure X_train is numeric for IsolationForest
    X_train_numeric = X_train.select_dtypes(include=[np.number])
    outlier_mask = iso_forest.fit_predict(X_train_numeric) == -1


    print(f"   🎯 Isolation Forest detected {outlier_mask.sum()} outliers")

    # 2. LOCAL OUTLIER FACTOR
    from sklearn.neighbors import LocalOutlierFactor
    lof = LocalOutlierFactor(contamination=0.05)
    # Ensure X_train is numeric for LocalOutlierFactor
    lof_outliers = lof.fit_predict(X_train_numeric) == -1


    print(f"   🎯 LOF detected {lof_outliers.sum()} outliers")

    # Combine outlier detection methods
    combined_outliers = outlier_mask | lof_outliers
    print(f"   🗑️ Removing {combined_outliers.sum()} total outliers ({combined_outliers.sum()/len(X_train)*100:.1f}%)")

    X_train_clean = X_train[~combined_outliers]
    y_train_clean = y_train[~combined_outliers] # Use y_train for cleaning

    # Handle missing values in the target variable after outlier removal
    nan_in_target_mask = y_train_clean.isna()
    if nan_in_target_mask.sum() > 0:
        print(f"   🗑️ Removing {nan_in_target_mask.sum()} samples with missing target values after outlier removal.")
        X_train_clean = X_train_clean[~nan_in_target_mask]
        y_train_clean = y_train_clean[~nan_in_target_mask]


    # Convert target variable to numerical labels
    y_train_clean_numeric = y_train_clean.map({'Adult': 0, 'Senior': 1})


    # 3. ADVANCED SCALING
    print("📏 Applying advanced scaling...")

    # Use PowerTransformer for making features more Gaussian
    pt = PowerTransformer(method='yeo-johnson')
    # Ensure data is numeric before scaling
    X_train_numeric_clean = X_train_clean.select_dtypes(include=[np.number])
    X_test_numeric = X_test.select_dtypes(include=[np.number])


    X_train_transformed = pt.fit_transform(X_train_numeric_clean)
    X_test_transformed = pt.transform(X_test_numeric)


    # Then apply RobustScaler
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train_transformed)
    X_test_scaled = scaler.transform(X_test_transformed)

    # Convert back to DataFrames, preserving original column names
    X_train_final = pd.DataFrame(X_train_scaled, columns=X_train_numeric_clean.columns, index=X_train_clean.index)
    X_test_final = pd.DataFrame(X_test_scaled, columns=X_test_numeric.columns, index=X_test.index)


    # 4. ADVANCED CLASS BALANCING
    print("⚖️ Advanced class balancing...")

    # Ensure only numeric columns are passed to SMOTEENN
    X_train_final_numeric = X_train_final.select_dtypes(include=[np.number])

    # Use SMOTEENN (SMOTE + Edited Nearest Neighbours)
    smoteenn = SMOTEENN(random_state=42)
    X_train_balanced, y_train_balanced = smoteenn.fit_resample(X_train_final_numeric, y_train_clean_numeric) # Use y_train_clean_numeric for balancing


    print(f"   📊 Balanced dataset: {len(X_train_balanced)} samples")
    print(f"   📊 Class distribution: {np.bincount(y_train_balanced)}")


    return X_train_balanced, y_train_balanced, X_test_final, (pt, scaler)

# Apply preprocessing
X_train_processed, y_train_processed, X_test_processed, preprocessors = create_ultra_preprocessing_pipeline(
    train_enhanced, train_target, test_enhanced # Pass train_target as y_train
)

print("\n✅ Preprocessing complete\n")


⚙️ STEP 3: Advanced Preprocessing Pipeline
--------------------------------------------------
🧹 Advanced outlier detection and removal...
   🎯 Isolation Forest detected 99 outliers
   🎯 LOF detected 99 outliers
   🗑️ Removing 141 total outliers (7.2%)
   🗑️ Removing 13 samples with missing target values after outlier removal.
📏 Applying advanced scaling...
⚖️ Advanced class balancing...
   📊 Balanced dataset: 2210 samples
   📊 Class distribution: [ 904 1306]

✅ Preprocessing complete



In [None]:
# INTELLIGENT FEATURE SELECTION
# ==============================================================================
print("\n🎯 STEP 4: Intelligent Feature Selection")
print("-" * 50)

def intelligent_feature_selection(X, y, n_features=25):
    """Multi-method feature selection for optimal performance"""

    print(f"🔍 Selecting top {n_features} features from {X.shape[1]} candidates...")

    # Method 1: Mutual Information
    mi_selector = SelectKBest(score_func=mutual_info_classif, k=n_features)
    mi_selector.fit(X, y)
    mi_features = X.columns[mi_selector.get_support()]

    # Method 2: F-statistics
    f_selector = SelectKBest(score_func=f_classif, k=n_features)
    f_selector.fit(X, y)
    f_features = X.columns[f_selector.get_support()]

    # Method 3: Random Forest Feature Importance
    rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_selector.fit(X, y)
    rf_importance = pd.Series(rf_selector.feature_importances_, index=X.columns)
    rf_features = rf_importance.nlargest(n_features).index

    # Combine selections (features appearing in at least 2 methods)
    feature_votes = {}
    for feature in X.columns:
        votes = 0
        if feature in mi_features: votes += 1
        if feature in f_features: votes += 1
        if feature in rf_features: votes += 1
        feature_votes[feature] = votes

    # Select features with at least 2 votes, then fill with highest single votes
    selected_features = []
    for feature, votes in sorted(feature_votes.items(), key=lambda x: x[1], reverse=True):
        if len(selected_features) < n_features:
            if votes >= 2 or len(selected_features) < n_features//2:
                selected_features.append(feature)

    print(f"✅ Selected {len(selected_features)} features using ensemble selection")
    return selected_features

# Select best features
selected_features = intelligent_feature_selection(X_train_processed, y_train_processed, n_features=30)
X_train_selected = X_train_processed[selected_features]
X_test_selected = X_test_processed[selected_features]

print(f"🎯 Final feature set: {len(selected_features)} features")


🎯 STEP 4: Intelligent Feature Selection
--------------------------------------------------
🔍 Selecting top 30 features from 52 candidates...
✅ Selected 30 features using ensemble selection
🎯 Final feature set: 30 features


In [None]:
# ULTRA-ADVANCED ENSEMBLE WITH MULTIPLE LEVELS
# ==============================================================================
print("\n🚀 STEP 5: Ultra-Advanced Multi-Level Ensemble")
print("-" * 50)

def create_ultimate_ensemble():
    """Create the ultimate ensemble for maximum F1 score"""

    print("🏗️ Building multi-level ensemble architecture...")

    # LEVEL 1: Diverse Base Models
    base_models = {
        'rf_balanced': BalancedRandomForestClassifier(
            n_estimators=300,
            max_depth=12,
            min_samples_split=3,
            min_samples_leaf=1,
            random_state=42,
            n_jobs=-1
        ),
        'xgb_optimized': xgb.XGBClassifier(
            n_estimators=400,
            max_depth=7,
            learning_rate=0.03,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            scale_pos_weight=3,  # Handle class imbalance
            random_state=42,
            eval_metric='logloss'
        ),
        'lgb_optimized': lgb.LGBMClassifier(
            n_estimators=400,
            max_depth=7,
            learning_rate=0.03,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            class_weight='balanced',
            random_state=42,
            verbose=-1
        ),
        'catboost': CatBoostClassifier(
            iterations=300,
            depth=8,
            learning_rate=0.05,
            l2_leaf_reg=3,
            class_weights=[1, 3],  # Handle imbalance
            random_seed=42,
            verbose=False
        ),
        'et_balanced': ExtraTreesClassifier(
            n_estimators=300,
            max_depth=12,
            min_samples_split=3,
            min_samples_leaf=1,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ),
        'gb_optimized': GradientBoostingClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42
        )
    }

    # LEVEL 2: Meta-learners with different strengths
    meta_models = {
        'lr_meta': LogisticRegression(
            C=0.1,
            class_weight='balanced',
            random_state=42
        ),
        'xgb_meta': xgb.XGBClassifier(
            n_estimators=100,
            max_depth=3,
            learning_rate=0.1,
            scale_pos_weight=3,
            random_state=42,
            eval_metric='logloss'
        )
    }

    # Create multiple stacking ensembles
    ensemble_1 = StackingClassifier(
        estimators=list(base_models.items())[:3],
        final_estimator=meta_models['lr_meta'],
        cv=5,
        stack_method='predict_proba',
        n_jobs=-1
    )

    ensemble_2 = StackingClassifier(
        estimators=list(base_models.items())[3:],
        final_estimator=meta_models['xgb_meta'],
        cv=5,
        stack_method='predict_proba',
        n_jobs=-1
    )

    # LEVEL 3: Final ensemble of ensembles
    final_ensemble = VotingClassifier(
        estimators=[
            ('stack1', ensemble_1),
            ('stack2', ensemble_2)
        ],
        voting='soft',
        n_jobs=-1
    )

    return final_ensemble, base_models

# Create the ultimate ensemble
print("🎯 Creating ultimate ensemble...")
ultimate_ensemble, base_models = create_ultimate_ensemble()



🚀 STEP 5: Ultra-Advanced Multi-Level Ensemble
--------------------------------------------------
🎯 Creating ultimate ensemble...
🏗️ Building multi-level ensemble architecture...


In [None]:
# THRESHOLD OPTIMIZATION FOR F1 SCORE
# ==============================================================================
print("\n🎚️ STEP 6: F1 Score Threshold Optimization")
print("-" * 50)

def optimize_f1_threshold(model, X, y, cv_folds=5):
    """Find optimal threshold for maximum F1 score"""

    print("🔍 Optimizing threshold for F1 score...")

    # Get cross-validation predictions
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    y_proba_cv = np.zeros(len(y))

    for train_idx, val_idx in skf.split(X, y):
        X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fold_train = y.iloc[train_idx] if hasattr(y, 'iloc') else y[train_idx]

        model.fit(X_fold_train, y_fold_train)
        y_proba_cv[val_idx] = model.predict_proba(X_fold_val)[:, 1]

    # Find optimal threshold
    thresholds = np.arange(0.1, 0.9, 0.01)
    best_f1 = 0
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred_thresh = (y_proba_cv >= threshold).astype(int)
        f1 = f1_score(y, y_pred_thresh)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    print(f"✅ Optimal threshold: {best_threshold:.3f}")
    print(f"✅ Optimal F1 score: {best_f1:.4f}")

    return best_threshold, best_f1



🎚️ STEP 6: F1 Score Threshold Optimization
--------------------------------------------------


In [26]:
# COMPREHENSIVE MODEL EVALUATION
# ==============================================================================
print("\n📊 STEP 7: Comprehensive Model Evaluation")
print("-" * 50)

def comprehensive_evaluation(model, X, y, cv_folds=5):
    """Comprehensive evaluation with multiple metrics"""

    print("📈 Running comprehensive evaluation...")

    # Repeated Stratified K-Fold for robust evaluation
    rskf = RepeatedStratifiedKFold(n_splits=cv_folds, n_repeats=3, random_state=42)

    f1_scores = []
    for fold, (train_idx, val_idx) in enumerate(rskf.split(X, y)):
        X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fold_train = y.iloc[train_idx] if hasattr(y, 'iloc') else y[train_idx]
        y_fold_val = y.iloc[val_idx] if hasattr(y, 'iloc') else y[val_idx]

        model.fit(X_fold_train, y_fold_train)
        y_pred = model.predict(X_fold_val)
        f1 = f1_score(y_fold_val, y_pred)
        f1_scores.append(f1)

        if fold < 5:  # Print first 5 folds
            print(f"   Fold {fold+1:2d}: F1 = {f1:.4f}")

    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)

    print(f"\n🎯 Final Results:")
    print(f"   Mean F1 Score: {mean_f1:.4f} ± {std_f1:.4f}")
    print(f"   Min F1 Score:  {np.min(f1_scores):.4f}")
    print(f"   Max F1 Score:  {np.max(f1_scores):.4f}")

    return mean_f1, std_f1

# Train and evaluate the ultimate ensemble
print("🚀 Training ultimate ensemble...")
mean_f1, std_f1 = comprehensive_evaluation(
    ultimate_ensemble,
    X_train_selected,
    pd.Series(y_train_processed) if not isinstance(y_train_processed, pd.Series) else y_train_processed
)

# Find optimal threshold
optimal_threshold, optimal_f1 = optimize_f1_threshold(
    ultimate_ensemble,
    X_train_selected,
    pd.Series(y_train_processed) if not isinstance(y_train_processed, pd.Series) else y_train_processed
)



📊 STEP 7: Comprehensive Model Evaluation
--------------------------------------------------
🚀 Training ultimate ensemble...
📈 Running comprehensive evaluation...
   Fold  1: F1 = 0.9416
   Fold  2: F1 = 0.9585
   Fold  3: F1 = 0.9699
   Fold  4: F1 = 0.9515
   Fold  5: F1 = 0.9623

🎯 Final Results:
   Mean F1 Score: 0.9575 ± 0.0095
   Min F1 Score:  0.9416
   Max F1 Score:  0.9719
🔍 Optimizing threshold for F1 score...
✅ Optimal threshold: 0.690
✅ Optimal F1 score: 0.9579


In [27]:
# FINAL PREDICTION WITH OPTIMIZED THRESHOLD
# ==============================================================================
print("\n🎯 STEP 8: Final Prediction Generation")
print("-" * 50)

print("🔥 Training final model on complete dataset...")
ultimate_ensemble.fit(X_train_selected, y_train_processed)

print("🔮 Generating predictions with optimized threshold...")
test_probabilities = ultimate_ensemble.predict_proba(X_test_selected)[:, 1]
test_predictions = (test_probabilities >= optimal_threshold).astype(int)

# Create submission
submission = pd.DataFrame({
    'SEQN': test_ids,
    'age_group': test_predictions
})

# Final validation
print(f"\n✅ SUBMISSION VALIDATION:")
print(f"   Shape: {submission.shape}")
print(f"   Unique values: {sorted(submission['age_group'].unique())}")
print(f"   Distribution: {submission['age_group'].value_counts().to_dict()}")
print(f"   Missing values: {submission.isnull().sum().sum()}")

# Save submission
submission.to_csv('submission.csv', index=False)



🎯 STEP 8: Final Prediction Generation
--------------------------------------------------
🔥 Training final model on complete dataset...
🔮 Generating predictions with optimized threshold...

✅ SUBMISSION VALIDATION:
   Shape: (312, 2)
   Unique values: [np.int64(0), np.int64(1)]
   Distribution: {0: 230, 1: 82}
   Missing values: 2
