# 02 — Feature Analysis → then Visualization (Task 1)
Compute feature importance first (mutual information, LightGBM gain, permutation importance), then visualize the top-ranked features split by target class.

In [None]:
from pathlib import Path
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Auto-detect project ROOT: the first of the working directory, its parent,
# or its grandparent that contains a 'src' directory. If none does, fall back
# to the parent directory (the notebook is expected to live in notebooks/).
CWD = Path.cwd()
_root_candidates = (CWD, CWD.parent, CWD.parent.parent)
ROOT = next((p for p in _root_candidates if (p / 'src').exists()), CWD.parent)
if not (ROOT / 'src').exists():
    # Best-effort fallback: keep going, but make the likely import failure explainable.
    print(f"WARNING: no 'src' directory found at or above {CWD}; imports from src may fail")
# Put ROOT first on sys.path so the 'src' package can be imported.
sys.path.insert(0, str(ROOT))
print(f'Using project ROOT: {ROOT}')

# Number of top-ranked features to keep for the plots.
TOP_K = 15
# Raw input CSV and the directory that receives generated reports/figures.
DATA_PATH = ROOT.joinpath('data', 'raw', 'Participant_Selection_Final.csv')
REPORT_DIR = ROOT.joinpath('artifacts', 'reports')
# Create the report directory (and any missing parents) if it is not there yet.
REPORT_DIR.mkdir(parents=True, exist_ok=True)
from src.data.preprocess import load, preprocess, drop_non_numeric_noise
from src.features.build_features import engineer_features
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
from lightgbm import LGBMClassifier

# Load the raw table, then run preprocessing followed by feature engineering on a copy.
df_raw = load(str(DATA_PATH))
df_clean = preprocess(df_raw.copy())
df = engineer_features(df_clean)

# Binary target: 1 where the raw label equals 'Yes', 0 otherwise.
# NOTE(review): y is taken from df_raw while X comes from the processed frame;
# this assumes preprocessing keeps every row, in the same order — confirm.
y = df_raw['Participant_Selected_For_AID'].eq('Yes').astype(int)

# Model inputs: only numeric and boolean columns survive.
X = drop_non_numeric_noise(df).select_dtypes(include=['number', 'bool'])

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42,
)
print('X:', X.shape, 'Train:', X_train.shape, 'Val:', X_val.shape, '| y+ rate:', y.mean().round(4))

## Importance — Mutual Information

In [None]:
# Mutual information between each training feature and the target (seeded for repeatability).
mi = mutual_info_classif(
    X_train,
    y_train,
    discrete_features='auto',
    random_state=42,
)
# Label the scores by feature name; any NaN score is replaced with 0.0.
mi_s = pd.Series(data=mi, index=X_train.columns, name='mutual_info')
mi_s = mi_s.fillna(0.0)

## Importance — LightGBM gain

In [None]:
# importance_type='gain' makes feature_importances_ report the total gain of the
# splits that use each feature. The LightGBM default ('split') reports split
# counts instead, which would not match the 'lgbm_gain' name used below.
lgbm = LGBMClassifier(n_estimators=600, learning_rate=0.05, num_leaves=31,
                      class_weight='balanced', random_state=42,
                      importance_type='gain')
lgbm.fit(X_train, y_train)
# Validation ROC AUC from the predicted probability of the positive class.
val_auc = roc_auc_score(y_val, lgbm.predict_proba(X_val)[:, 1])
print('LGBM Val AUC:', round(val_auc, 4))
gain = pd.Series(lgbm.feature_importances_, index=X_train.columns, name='lgbm_gain')

## Importance — Permutation (validation, AUC)

In [None]:
# Permutation importance of the fitted model on the validation split, scored by ROC AUC
# and averaged over 10 shuffles per feature.
perm = permutation_importance(
    lgbm,
    X_val,
    y_val,
    scoring='roc_auc',
    n_repeats=10,
    random_state=42,
)
perm_s = pd.Series(
    perm.importances_mean,
    index=X_val.columns,
    name='perm_importance',
)

## Combine & rank

In [None]:
# Put the three importance measures side by side, one column per method.
imp = pd.concat([mi_s, gain, perm_s], axis=1)

# Min-max scale each column to [0, 1]; a constant column becomes all zeros.
for method in imp.columns:
    scores = imp[method]
    lo, hi = scores.min(), scores.max()
    if hi == lo:
        imp[method] = 0.0
    else:
        imp[method] = (scores - lo) / (hi - lo)

# Combined score = row-wise mean of the scaled measures; rank best first.
imp['combined'] = imp.mean(axis=1)
imp_sorted = imp.sort_values(by='combined', ascending=False)
display(imp_sorted.head(25))
imp_sorted.to_csv(REPORT_DIR / 'task1_feature_importance.csv')

# Names of the TOP_K highest-ranked features, used by the plots below.
top_features = imp_sorted.index[:TOP_K].tolist()
print('Top features:', top_features)

## Visualize TOP_K — Histograms by target

In [None]:
def plot_numeric_by_target(df_all, target_raw, cols, out_name):
    """Plot overlaid 'No'/'Yes' histograms for each column in a 3-wide grid.

    The figure is saved to ``REPORT_DIR / out_name`` and then shown.

    Parameters
    ----------
    df_all : DataFrame holding the columns to plot.
    target_raw : Series of raw labels; rows equal to 'Yes' and 'No' form the
        two groups. Assumed to share ``df_all``'s index — confirm at call site.
    cols : iterable of column names in ``df_all``.
    out_name : file name of the saved figure inside ``REPORT_DIR``.
    """
    cols = list(cols)
    if not cols:
        # A grid with zero rows would give a zero-height figure; nothing to draw.
        print('No columns to plot; skipping', out_name)
        return

    grid_cols = 3
    grid_rows = int(np.ceil(len(cols) / grid_cols))

    # The class masks do not depend on the column, so build them once.
    is_yes = target_raw == 'Yes'
    is_no = target_raw == 'No'

    fig = plt.figure(figsize=(15, 4 * grid_rows))
    for pos, col in enumerate(cols, 1):
        ax = fig.add_subplot(grid_rows, grid_cols, pos)
        # Cast to float so boolean features are binned as 0/1.
        yes = df_all.loc[is_yes, col].dropna().to_numpy(dtype=float)
        no = df_all.loc[is_no, col].dropna().to_numpy(dtype=float)
        # Shared bin edges keep the two overlaid histograms directly comparable;
        # separate bins=30 calls would bin each class over its own range.
        edges = np.histogram_bin_edges(np.concatenate([no, yes]), bins=30)
        ax.hist(no, bins=edges, alpha=0.6, label='No')
        ax.hist(yes, bins=edges, alpha=0.6, label='Yes')
        ax.set_title(col)
        if pos == 1:
            # One legend is enough; colours are the same in every panel.
            ax.legend()

    fig.tight_layout()
    fig.savefig(REPORT_DIR / out_name, dpi=150, bbox_inches='tight')
    plt.show()

# Keep the top-ranked features that are present in X (at most TOP_K), in rank order.
plot_cols = [name for name in top_features if name in X.columns]
plot_cols = plot_cols[:TOP_K]
plot_numeric_by_target(
    df,
    df_raw['Participant_Selected_For_AID'],
    plot_cols,
    'top_features_hist_by_target.png',
)

## Visualize TOP_K — Boxplots by target

In [None]:
# One boxplot panel per selected feature, with one box per target class.
k = len(plot_cols)
ncols = 3
nrows = int(np.ceil(k / ncols))

# The class list and row masks are the same for every feature: compute them once.
target_labels = df_raw['Participant_Selected_For_AID']
classes = target_labels.dropna().unique()
class_masks = [target_labels == cls for cls in classes]

if k == 0:
    # A grid with zero rows would give a zero-height figure; nothing to draw.
    print('No columns to plot; skipping boxplots')
else:
    fig = plt.figure(figsize=(15, 4*nrows))
    for i, c in enumerate(plot_cols, 1):
        ax = fig.add_subplot(nrows, ncols, i)
        # Cast to float so boolean features are summarised as 0/1.
        groups = [df.loc[mask, c].dropna().to_numpy(dtype=float) for mask in class_masks]
        ax.boxplot(groups)
        # Set tick labels separately: boxplot's `labels` keyword is deprecated
        # since Matplotlib 3.9 (renamed `tick_labels`), and this works on both.
        ax.set_xticklabels(classes.astype(str))
        ax.set_title(c)
    fig.tight_layout()
    fig.savefig(REPORT_DIR / 'top_features_boxplots_by_target.png', dpi=150, bbox_inches='tight')
    plt.show()