# V17: Feature Reduction Optimization

An iterative refinement of the ensemble approach with optimized feature engineering and strategic model configuration adjustments.

**Key Features:**
- 27 base features (24 original + 3 medical)
- 48 external features from Diabetes dataset
- Memory optimization for large dataset handling
- Adjusted hyperparameters (3000 estimators CV, 1000 final)
- Feature selection reducing to 38 features
- Isotonic calibration for probability refinement

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.isotonic import IsotonicRegression
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

print("V17")

## 2. Load the Data

In [None]:
# Competition train/test/sample-submission files plus an additional diabetes dataset.
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
sub = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')
orig = pd.read_csv('/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv')

TARGET = 'diagnosed_diabetes'

# Every train column other than the row id and the label is a base feature.
BASE = [name for name in train.columns if name != 'id' and name != TARGET]
# Object-dtype columns are the categorical ones; the remaining base columns are numeric.
CATS = list(train.select_dtypes('object').columns)
NUMS = [name for name in BASE if name not in CATS]

print(f'{len(BASE)} Base Features.')
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

## 3. External Encoding

In [None]:
# For every base column, derive two lookup features from the external `orig` data:
#   orig_mean_<col>  - mean of TARGET over the `orig` rows that share the value
#   orig_count_<col> - number of `orig` rows that share the value
# Values that never occur in `orig` fall back to the global TARGET mean / a count of 0.
ORIG = []

# Loop-invariant fallback: compute it once instead of twice per column.
orig_target_mean = orig[TARGET].mean()

for col in BASE:
    grouped = orig.groupby(col)
    mean_map = grouped[TARGET].mean()
    count_map = grouped.size()

    new_mean = f"orig_mean_{col}"
    new_count = f"orig_count_{col}"

    # Apply the same mapping to train and test so both get identical encodings.
    for frame in (train, test):
        frame[new_mean] = frame[col].map(mean_map).fillna(orig_target_mean)
        frame[new_count] = frame[col].map(count_map).fillna(0)

    ORIG.append(new_mean)
    ORIG.append(new_count)

print(f'{len(ORIG)} External Features.')

## 4. Stable Feature Engineering

In [None]:
def _bp_category(df):
    """Return a blood-pressure category array for ``df`` (0, 1 or 2).

    2: systolic >= 140 or diastolic >= 90
    1: otherwise, systolic >= 120 or diastolic >= 80
    0: everything else (including rows where both readings are missing)

    The categories are resolved in priority order. A reading such as 150/85
    satisfies both the ">= 140" rule and the "80-89" rule; applying the rules
    as two sequential overwrites (category 2 first, category 1 second) would
    downgrade that row to 1, so the higher category must win.
    """
    systolic = df['systolic_bp']
    diastolic = df['diastolic_bp']
    high = (systolic >= 140) | (diastolic >= 90)
    elevated = (systolic >= 120) | (diastolic >= 80)
    # np.select picks the first condition that matches, so `high` takes precedence.
    return np.select([high, elevated], [2, 1], default=0)


# Identical feature construction for train and test.
for frame in (train, test):
    # BMI bucketed into four ordered bins; values outside (0, 100] become NaN.
    frame['bmi_cat'] = pd.cut(frame['bmi'], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3])
    frame['bp_cat'] = _bp_category(frame)
    frame['non_hdl'] = frame['cholesterol_total'] - frame['hdl_cholesterol']

NEW_FEATS = ['bmi_cat', 'bp_cat', 'non_hdl']
BASE.extend(NEW_FEATS)

print(f'{len(NEW_FEATS)} Stable FE Features.')

## 5. Memory Optimization

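Reduces memory footprint by downcasting numeric columns to narrower dtypes chosen from each column's min/max. Note that narrowing floats is lossy: values are rounded to the precision of the smaller type.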

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are converted to the smallest signed integer type whose
    range contains the column's min and max (bounds inclusive), and only when
    that type is actually narrower than the current one. 64-bit float columns
    are converted to float32 when their range fits.

    float16 is deliberately never used: it only has an 11-bit significand, so
    integers above 2048 (e.g. row counts) and fractional values such as target
    means would be silently rounded.

    Columns that are not plain NumPy integer/float columns (object, category,
    bool, datetime, pandas extension dtypes) are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, strings, ...) are not
        # np.dtype instances and cannot be safely cast with plain astype.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isna(c_min) or pd.isna(c_max):
                continue  # empty column: nothing to measure
            # Plain Python ints avoid mixed signed/unsigned comparison pitfalls.
            c_min, c_max = int(c_min), int(c_max)
            for target in int_targets:
                bounds = np.iinfo(target)
                if bounds.min <= c_min and c_max <= bounds.max:
                    # Never "downcast" to a type that is not actually smaller.
                    if np.dtype(target).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(target)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            all_missing = pd.isna(c_min) or pd.isna(c_max)
            if all_missing or (float32_info.min <= c_min and c_max <= float32_info.max):
                df[col] = df[col].astype(np.float32)
    return df

# Downcast both frames, then force a garbage-collection pass.
train, test = (reduce_mem_usage(frame) for frame in (train, test))
gc.collect()

print("Memory optimization completed")

## 6. Final Features & Preparation

In [None]:
FEATURES = BASE + ORIG
print(f'{len(FEATURES)} Total Features.')

# Explicit copy: the encoded columns below are written into X, and X must be
# an independent frame rather than a selection derived from `train`.
X = train[FEATURES].copy()
y = train[TARGET]

# Label-encode categorical columns. Each encoder is fitted on the union of the
# train and test values so that a category present in only one of them still
# receives a code.
ALL_CATS = CATS + ['bmi_cat', 'bp_cat']
for col in ALL_CATS:
    if col not in X.columns:
        continue
    # Cast once per frame; missing values become the literal string 'nan'.
    train_as_str = X[col].astype(str)
    test_as_str = test[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_as_str, test_as_str]))
    X[col] = le.transform(train_as_str)
    test[col] = le.transform(test_as_str)

# Taken after encoding so that X_test carries the encoded columns.
X_test = test[FEATURES]

## 7. 10-Fold Ensemble with Ultra-Heavy Regularization

3-model ensemble with adjusted hyperparameters:
- 3000 estimators during CV (reduced from 5000 for faster iteration)
- Ultra-heavy regularization (XGB/LGBM: L1=3.0, L2=3.0; CatBoost: l2_leaf_reg=8.0)
- Blend weights: 40% XGB, 35% LGBM, 25% CB

In [None]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Out-of-fold blend predictions for the training rows, and per-model test
# predictions averaged over the folds.
oof = np.zeros(len(X))
pred_xgb = np.zeros(len(X_test))
pred_lgb = np.zeros(len(X_test))
pred_cb = np.zeros(len(X_test))

print("\nTraining 10-Fold Ensemble with Ultra-Heavy Reg...\n")

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}/{skf.n_splits} → ", end="")

    X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    # XGBoost (Ultra-Heavy Reg)
    # early_stopping_rounds is an estimator parameter: passing it to fit() was
    # deprecated in XGBoost 1.6 and later removed.
    m1 = xgb.XGBClassifier(n_estimators=3000, max_depth=4, learning_rate=0.008,
                           subsample=0.7, colsample_bytree=0.6, reg_alpha=3.0, reg_lambda=3.0,
                           random_state=42, tree_method="hist", n_jobs=-1, verbosity=0,
                           early_stopping_rounds=200)
    m1.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=False)

    # LightGBM (Ultra-Heavy Reg)
    # subsample_freq=1 is required for `subsample` to have any effect; with the
    # default of 0 LightGBM performs no bagging at all.
    m2 = lgb.LGBMClassifier(n_estimators=3000, max_depth=4, learning_rate=0.008,
                            num_leaves=20, subsample=0.7, subsample_freq=1, colsample_bytree=0.6,
                            reg_alpha=3.0, reg_lambda=3.0, random_state=42, n_jobs=-1, verbose=-1)
    m2.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(200)])

    # CatBoost (Ultra-Heavy Reg)
    m3 = cb.CatBoostClassifier(iterations=3000, depth=4, learning_rate=0.008,
                               l2_leaf_reg=8.0, random_seed=42, verbose=False, early_stopping_rounds=200)
    m3.fit(X_trn, y_trn, eval_set=(X_val, y_val))

    # Blend: 40% XGB, 35% LGBM, 25% CatBoost
    val_pred = (m1.predict_proba(X_val)[:,1] * 0.4 + m2.predict_proba(X_val)[:,1] * 0.35 + m3.predict_proba(X_val)[:,1] * 0.25)
    oof[val_idx] = val_pred

    # Each fold contributes 1/n_splits of the final per-model test prediction.
    pred_xgb += m1.predict_proba(X_test)[:,1] / skf.n_splits
    pred_lgb += m2.predict_proba(X_test)[:,1] / skf.n_splits
    pred_cb  += m3.predict_proba(X_test)[:,1] / skf.n_splits

    fold_auc = roc_auc_score(y_val, val_pred)
    print(f"AUC = {fold_auc:.6f}")

print(f"\nFinal CV AUC: {roc_auc_score(y, oof):.6f}")

## 8. Feature Selection

In [None]:
# `m1` is the already-fitted XGBoost model left over from the last CV fold;
# its feature importances are thresholded at their median (threshold='median').
selector = SelectFromModel(m1, threshold='median', prefit=True)
X_selected, X_test_selected = (selector.transform(frame) for frame in (X, X_test))

n_kept, n_total = X_selected.shape[1], X.shape[1]
print(f"Selected {n_kept} features out of {n_total}")

## 9. Quick Re-train on Selected Features (XGB only for speed)

In [None]:
# Single XGBoost model refitted on all training rows, restricted to the selected features.
final_params = dict(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.01,
    subsample=0.7,
    colsample_bytree=0.6,
    reg_alpha=2.0,
    reg_lambda=2.0,
    random_state=42,
    tree_method="hist",
    n_jobs=-1,
    verbosity=0,
)
m_final = xgb.XGBClassifier(**final_params)
m_final.fit(X_selected, y)

# Column 1 of predict_proba holds the positive-class probability.
test_proba = m_final.predict_proba(X_test_selected)
final_pred = test_proba[:, 1]
print(f"Final model trained on {X_selected.shape[1]} features")

## 10. Isotonic Calibration

In [None]:
from sklearn.model_selection import cross_val_predict

# An isotonic calibrator learns a mapping from one specific model's scores to
# observed outcome rates, so it must be fitted on out-of-fold scores of the
# model whose predictions it will transform. `final_pred` comes from `m_final`
# (XGBoost on the selected features), not from the three-model blend stored in
# `oof`, so out-of-fold scores are produced here for `m_final` itself.
# cross_val_predict fits clones, leaving the fitted `m_final` untouched.
final_oof = cross_val_predict(m_final, X_selected, y, cv=skf, method='predict_proba')[:, 1]

calib = IsotonicRegression(out_of_bounds='clip')
calib.fit(final_oof, y)
final_pred = calib.transform(final_pred)
print("Isotonic calibration applied")

## 11. Submission

In [None]:
# Write the predictions into the sample-submission frame and save it.
sub[TARGET] = final_pred
sub.to_csv('submission.csv', index=False)

print("\nsubmission.csv saved!")
# Quick sanity summary of the predicted values.
for label, stat in (('Mean', final_pred.mean()),
                    ('Min', final_pred.min()),
                    ('Max', final_pred.max())):
    print(f'{label} predicted: {stat:.5f}')

print("\nFirst few predictions:")
sub.head()

## Summary

**V17 Optimizations:**
- 75 total features (24 original + 3 medical + 48 external)
- 3000 estimators for faster CV iteration
- Adjusted blend weights (40/35/25 instead of 50/35/15)
- Feature selection to 38 features
- 1000-estimator final model
- Memory optimization for dataset handling

V17 represents a middle-ground approach balancing computational efficiency with ensemble robustness.