Statistical tests (one-way ANOVA and Kruskal–Wallis) to assess the discriminative power of numerical variables in predicting the Chronic Obstructive Pulmonary Disease (COPD) class

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import f_oneway, kruskal
import boto3
import io


In [None]:
# Location of the raw patient CSV inside S3.
bucket_name = "lneg-loka"
file_name = "patient_data_raw.csv"

# Download the object, decode its byte payload as UTF-8, and parse the
# resulting text into a DataFrame via an in-memory text buffer.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket_name, Key=file_name)
csv_string = obj['Body'].read().decode('utf-8')
csv_buffer = io.StringIO(csv_string)
df = pd.read_csv(csv_buffer)

In [None]:
def detect_continuous_columns(df, target_col, min_unique_values=10):
    """
    Return the names of the columns of `df` that look continuous.

    A column is kept when all of the following hold:
    - it is not `target_col`
    - pandas reports its dtype as numeric
    - it has at least `min_unique_values` distinct non-null values

    The returned list follows the order of `df.columns`.
    """
    def looks_continuous(name):
        # The target is never a candidate feature.
        if name == target_col:
            return False
        series = df[name]
        if not pd.api.types.is_numeric_dtype(series):
            return False
        # NaN is excluded from the distinct-value count.
        return series.nunique(dropna=True) >= min_unique_values

    return [name for name in df.columns if looks_continuous(name)]

In [None]:
def eta_squared_anova(groups):
    """
    Compute the eta-squared effect size for a one-way ANOVA:
    η² = SSB / SST

    Parameters
    - groups: sequence of 1-D array-likes, one per class, holding the
      observations of a single variable.

    Returns
    - SSB / SST, or 0.0 when the total sum of squares is zero (all
      observations identical, or every group empty).

    Raises
    - ValueError (from `np.concatenate`) when `groups` itself is empty.

    Empty groups are ignored: they hold no observations, and taking their
    mean would yield NaN and turn the whole result into NaN.
    """
    samples = [np.asarray(g) for g in groups]
    populated = [s for s in samples if len(s) > 0]
    if samples and not populated:
        # Only empty groups were supplied: there is no variance to explain.
        return 0.0

    # With an empty `groups` this raises ValueError, which callers may catch.
    all_data = np.concatenate(populated)
    grand_mean = np.mean(all_data)
    ss_total = np.sum((all_data - grand_mean) ** 2)
    if ss_total == 0:
        # Guard against division by zero when the data has no spread.
        return 0.0

    # Between-group sum of squares: each group mean's squared distance from
    # the grand mean, weighted by the group size.
    ss_between = np.sum(
        [len(s) * (np.mean(s) - grand_mean) ** 2 for s in populated]
    )
    return ss_between / ss_total

In [None]:
def evaluate_continuous_features(df, continuous_features, target_col):
    """
    Score each continuous feature by how well it separates the target classes.

    For every feature the non-null values are split by class of `target_col`
    (rows with a null target are ignored) and compared with a one-way ANOVA
    (`f_oneway`, plus the eta-squared effect size) and a Kruskal–Wallis test
    (`kruskal`).

    Parameters
    - df: DataFrame holding the features and the target column.
    - continuous_features: iterable of column names to evaluate.
    - target_col: name of the class column.

    Returns
    - DataFrame with columns `feature`, `anova_F`, `anova_p`, `eta_squared`,
      `kruskal_H`, `kruskal_p`, `n_samples`, sorted by `eta_squared` in
      descending order. The columns are present even when
      `continuous_features` is empty.

    Notes
    - Statistics are NaN when fewer than two classes have data for a feature,
      or when the corresponding test raises; in the latter case a
      RuntimeWarning naming the feature is emitted.
    - `n_samples` is the total number of rows in `df`, not the number of
      non-null observations of the feature.
    """
    import warnings  # local import keeps the function self-contained

    result_columns = ['feature', 'anova_F', 'anova_p', 'eta_squared',
                      'kruskal_H', 'kruskal_p', 'n_samples']
    n_rows = len(df)

    classes = df[target_col].dropna().unique()
    # The row mask of each class depends only on the target, so build the
    # masks once instead of once per feature.
    class_masks = [df[target_col] == c for c in classes]

    results = []
    for col in continuous_features:
        # Group data by class, skipping classes with no observations.
        per_class = [df.loc[mask, col].dropna().values for mask in class_masks]
        groups = [vals for vals in per_class if len(vals) > 0]

        f_stat = p_anova = eta_sq = np.nan
        h_stat = p_kruskal = np.nan

        # Both tests need at least two populated groups.
        if len(groups) > 1:
            # ANOVA
            try:
                f_stat, p_anova = f_oneway(*groups)
                eta_sq = eta_squared_anova(groups)
            except Exception as exc:
                # Best effort: keep going, but make the failure visible.
                f_stat = p_anova = eta_sq = np.nan
                warnings.warn(
                    "ANOVA failed for feature {!r}: {}".format(col, exc),
                    RuntimeWarning,
                    stacklevel=2,
                )

            # Kruskal–Wallis
            try:
                h_stat, p_kruskal = kruskal(*groups)
            except Exception as exc:
                h_stat = p_kruskal = np.nan
                warnings.warn(
                    "Kruskal-Wallis failed for feature {!r}: {}".format(col, exc),
                    RuntimeWarning,
                    stacklevel=2,
                )

        results.append({
            'feature': col,
            'anova_F': f_stat,
            'anova_p': p_anova,
            'eta_squared': eta_sq,
            'kruskal_H': h_stat,
            'kruskal_p': p_kruskal,
            'n_samples': n_rows,
        })

    # Explicit columns keep `sort_values` from raising KeyError when no
    # feature was evaluated.
    table = pd.DataFrame(results, columns=result_columns)
    return table.sort_values(by='eta_squared', ascending=False).reset_index(drop=True)


In [None]:
target_col = 'chronic_obstructive_pulmonary_disease'

# Remove the `patient_id` column before feature detection. errors='ignore'
# keeps this cell re-runnable once `df` has already lost the column.
df = df.drop(columns='patient_id', errors='ignore')

# Numeric columns with at least 5 distinct values are treated as continuous.
cats = detect_continuous_columns(df, target_col, min_unique_values=5)
res = evaluate_continuous_features(df, cats, target_col)
res