# 01 — EDA (Relevant Columns Only)
This notebook runs EDA **only on relevant columns**, excluding identifiers, free-text, all-null/constant, and high-cardinality strings.

In [None]:
from pathlib import Path
import sys, os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Auto-detect the project ROOT: the first of the working directory, its
# parent, or its grandparent that contains a `src/` directory.
CWD = Path.cwd()
ROOT = next(
    (p for p in (CWD, CWD.parent, CWD.parent.parent) if (p / 'src').exists()),
    CWD.parent,  # fallback when none of the candidates contains `src/`
)
# Put ROOT first on sys.path so that `src.*` packages can be imported.
sys.path.insert(0, str(ROOT))
print(f'Using project ROOT: {ROOT}')

# Location of the input CSV and of the folder that receives the reports and
# figures written by this notebook (created if it does not exist yet).
DATA_PATH = ROOT.joinpath('data', 'raw', 'Participant_Selection_Final.csv')
REPORT_DIR = ROOT.joinpath('artifacts', 'reports')
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# Identifier columns that are always excluded from the EDA when present.
EXCLUDE_COLS = {
    'Participant_ID',
    'Household_ID',
    'NID',
    'Phone',
    'Mobile',
    'Phone_Number',
}
from src.data.preprocess import load, preprocess
from src.features.build_features import engineer_features

# Load the raw CSV, then run preprocessing and feature engineering.
# `preprocess` receives a copy of the raw frame, and `df_raw` itself is kept
# because later cells read the target columns from it.
df_raw = load(str(DATA_PATH))
df = preprocess(df_raw.copy())
df = engineer_features(df)
print('Raw shape:', df_raw.shape, '| After preprocess+engineer:', df.shape)
display(df.head())
print('\nInfo:')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() appended a stray "None" line to the output.
df.info()

## Determine relevant columns

In [None]:
# Collect every column that should be left out of the EDA. The four rules are
# independent; a column is dropped as soon as any one of them matches.
n = len(df)
drop = set()

for col in df.columns:
    values = df[col]
    is_object = values.dtype == 'object'
    is_categorical = str(values.dtype).startswith('category')
    distinct = values.nunique(dropna=True)

    # 1) explicit excludes
    if col in EXCLUDE_COLS:
        drop.add(col)

    # 2) near-unique ratio filter for object/category columns
    if (is_object or is_categorical) and distinct / max(1, n) > 0.9:
        drop.add(col)

    # 3) all-null or constant
    if values.isna().all() or distinct <= 1:
        drop.add(col)

    # 4) long free-text columns (heuristic: mean string length > 30 and > 50 unique)
    if is_object:
        text = values.dropna().astype(str)
        if len(text) > 0 and text.str.len().mean() > 30 and text.nunique() > 50:
            drop.add(col)

# Keep the surviving columns in their original order.
relevant_cols = [col for col in df.columns if col not in drop]
df_rel = df[relevant_cols].copy()

print('Dropped columns:', len(drop))
print('Relevant columns:', len(relevant_cols))

# Persist both lists so the selection can be inspected outside the notebook.
dropped_report = pd.Series(sorted(drop))
dropped_report.to_csv(REPORT_DIR / 'eda_dropped_columns.csv', index=False)
relevant_report = pd.Series(relevant_cols)
relevant_report.to_csv(REPORT_DIR / 'eda_relevant_columns.csv', index=False)
display(df_rel.head())

## Basic profile on relevant set

In [None]:
# Summary statistics for every relevant column (all dtypes), transposed so
# that each column of the data becomes one row of the report.
desc = df_rel.describe(include='all').transpose()
display(desc.head(30))
desc.to_csv(REPORT_DIR / 'eda_relevant_describe.csv')

# Fraction of missing values per column, largest first.
missing = df_rel.isna().mean()
missing = missing.sort_values(ascending=False)
display(missing.head(30))
missing.to_csv(REPORT_DIR / 'eda_relevant_missing.csv')

## Target balance (if present)

In [None]:
def plot_counts(series, title, fname):
    """Draw a bar chart of the value frequencies of `series` and save it.

    Missing values are counted as their own bar. The figure is written to
    `REPORT_DIR / fname` and then shown.

    Args:
        series: Values to count (anything offering `value_counts`).
        title: Title placed above the chart.
        fname: File name of the saved image inside `REPORT_DIR`.
    """
    counts = series.value_counts(dropna=False)
    positions = range(len(counts))
    tick_text = counts.index.astype(str)

    plt.figure(figsize=(6,4))
    plt.bar(positions, counts.values)
    plt.xticks(positions, tick_text, rotation=45, ha='right')
    plt.title(title)
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(REPORT_DIR / fname, dpi=150, bbox_inches='tight')
    plt.show()

# Plot the class balance of each target column that exists in the raw data.
for tgt_col, tgt_title, tgt_file in (
    ('Participant_Selected_For_AID', 'Task 1: Selection (raw)', 'rel_target_task1_counts.png'),
    ('Aid_Type_Recomended', 'Task 2: Aid Type (raw)', 'rel_target_task2_counts.png'),
):
    if tgt_col in df_raw.columns:
        plot_counts(df_raw[tgt_col], tgt_title, tgt_file)

## Numeric distributions (relevant)

In [None]:
# Histogram grid (three per row) for every numeric or boolean relevant column.
num_cols = df_rel.select_dtypes(include=['number','bool']).columns.tolist()
if num_cols:
    n = len(num_cols)
    ncols = 3
    nrows = int(np.ceil(n / ncols))
    plt.figure(figsize=(15, 4*nrows))
    for i, c in enumerate(num_cols, 1):
        ax = plt.subplot(nrows, ncols, i)
        # Cast to float so boolean columns are binned as 0.0/1.0 like any
        # other numeric column; NumPy does not support `-` on boolean values,
        # which histogram binning can hit when given a raw bool array.
        vals = df_rel[c].dropna().astype(float).values
        ax.hist(vals, bins=30)
        ax.set_title(c)
    plt.tight_layout()
    plt.savefig(REPORT_DIR / 'rel_numeric_histograms.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print('No numeric/boolean relevant columns found.')

## Categorical distributions (relevant, low/medium cardinality only)

In [None]:
# Keep only object/category columns with at most 30 distinct non-null values.
cat_cols = []
for col in df_rel.select_dtypes(include=['object','category']).columns:
    if df_rel[col].nunique(dropna=True) <= 30:
        cat_cols.append(col)

if not cat_cols:
    print('No low/medium-cardinality categoricals found.')
else:
    # One bar chart per column, laid out three per row.
    n = len(cat_cols)
    ncols = 3
    nrows = int(np.ceil(n/ncols))
    plt.figure(figsize=(15, 4*nrows))
    for position, col in enumerate(cat_cols, start=1):
        ax = plt.subplot(nrows, ncols, position)
        # Missing values get their own bar; at most 30 bars are drawn.
        counts = df_rel[col].value_counts(dropna=False).head(30)
        ax.bar(counts.index.astype(str), counts.values)
        ax.set_title(col)
        ax.tick_params(axis='x', rotation=90)
    plt.tight_layout()
    plt.savefig(REPORT_DIR / 'rel_categorical_bars.png', dpi=150, bbox_inches='tight')
    plt.show()

## Correlations among relevant numeric features

In [None]:
# Pairwise correlation of the numeric relevant columns, drawn as a heatmap.
num_cols = df_rel.select_dtypes(include=['number']).columns.tolist()
if len(num_cols) <= 1:
    print('Not enough numeric columns for correlation heatmap.')
else:
    corr = df_rel[num_cols].corr()
    tick_positions = range(len(num_cols))

    plt.figure(figsize=(12,10))
    heatmap = plt.imshow(corr, aspect='auto', interpolation='nearest')
    plt.colorbar(heatmap, fraction=0.046, pad=0.04)
    # Label both axes with the column names, in the same order as `corr`.
    plt.xticks(tick_positions, num_cols, rotation=90)
    plt.yticks(tick_positions, num_cols)
    plt.title('Correlation Heatmap (Relevant Numeric)')
    plt.tight_layout()
    plt.savefig(REPORT_DIR / 'rel_correlation_heatmap.png', dpi=150, bbox_inches='tight')
    plt.show()

## Feature vs Target (relevant)

In [None]:
# Target columns are read from the raw frame; only those present are plotted.
targets = [
    name for name in ('Participant_Selected_For_AID', 'Aid_Type_Recomended')
    if name in df_raw.columns
]

# The feature lists do not depend on the target, so they are computed once.
# Each grid shows at most the first 9 columns (3 x 3).
ncols = 3
num_cols = df_rel.select_dtypes(include=['number']).columns.tolist()[:9]
cat_cols = [c for c in df_rel.select_dtypes(include=['object','category']).columns
            if df_rel[c].nunique(dropna=True) <= 15][:9]

for tgt in targets:
    # NOTE(review): rows of df_rel are matched to df_raw by index label; this
    # assumes preprocessing/feature engineering keep df_raw's index - confirm.
    tgt_values = df_raw[tgt]

    # Compare against the ORIGINAL class values and use strings only as tick
    # labels. Comparing the raw column to stringified classes never matches
    # for numeric or boolean targets and leaves every group empty.
    class_values = tgt_values.dropna().unique()
    class_labels = [str(v) for v in class_values]

    # Numeric vs target (box plots); skipped when the target has no classes.
    if num_cols and len(class_values) > 0:
        n = len(num_cols)
        nrows = int(np.ceil(n / ncols))
        plt.figure(figsize=(15, 4*nrows))
        # One boolean row mask per class, reused for every column.
        class_masks = [tgt_values == v for v in class_values]
        for i, c in enumerate(num_cols, 1):
            ax = plt.subplot(nrows, ncols, i)
            groups = [df_rel.loc[mask, c].dropna().values for mask in class_masks]
            # Label the boxes after drawing: the `labels=` keyword of boxplot
            # is deprecated in Matplotlib 3.9 (renamed `tick_labels`).
            ax.boxplot(groups)
            ax.set_xticklabels(class_labels)
            ax.set_title(f'{c} by {tgt}')
            ax.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.savefig(REPORT_DIR / f'rel_box_{tgt}.png', dpi=150, bbox_inches='tight')
        plt.show()

    # Categorical vs target (stacked bars)
    if cat_cols:
        n = len(cat_cols)
        nrows = int(np.ceil(n / ncols))
        plt.figure(figsize=(15, 4*nrows))
        tgt_as_str = tgt_values.astype(str)
        for i, c in enumerate(cat_cols, 1):
            ax = plt.subplot(nrows, ncols, i)
            ct = pd.crosstab(df_rel[c].astype(str), tgt_as_str)
            # Running top of each stack; every class is drawn on the previous.
            bottoms = np.zeros(len(ct))
            for cls in ct.columns:
                ax.bar(ct.index, ct[cls].values, bottom=bottoms, label=str(cls))
                bottoms += ct[cls].values
            ax.set_title(f'{c} by {tgt}')
            ax.tick_params(axis='x', rotation=90)
        plt.tight_layout()
        # The legend is attached to the last subplot and placed outside it.
        plt.legend(title=tgt, bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig(REPORT_DIR / f'rel_stacked_{tgt}.png', dpi=150, bbox_inches='tight')
        plt.show()

## Save compact summary

In [None]:
import json

# Compact run summary: frame shapes before and after processing, and the
# number of columns kept for the EDA.
summary = dict(
    raw_shape=tuple(df_raw.shape),
    processed_shape=tuple(df.shape),
    relevant_cols=len(df_rel.columns),
)
summary_path = REPORT_DIR / 'rel_eda_summary.json'
with open(summary_path, 'w') as f:
    f.write(json.dumps(summary, indent=2))
print('Saved rel_eda_summary.json')