In [1]:
import io
import warnings

import boto3
import numpy as np
import pandas as pd
from scipy.stats import f_oneway, kruskal

Test the discriminative power of numerical variables for predicting Chronic Obstructive Pulmonary Disease (COPD).
The analysis is based on one-way ANOVA and the Kruskal-Wallis H test, which test the null hypotheses that the class-conditional means (ANOVA) and medians (Kruskal-Wallis) are equal across classes.

In [2]:
# Download the raw patient CSV from S3 and parse it into a DataFrame.
s3 = boto3.client('s3')
bucket_name = "lneg-loka"
file_name = "patient_data_raw/patient_data_raw.csv"
obj = s3.get_object(Bucket=bucket_name, Key=file_name)
# Read the whole object body into memory and decode it as UTF-8 text.
csv_string = obj['Body'].read().decode('utf-8')
# Wrap the decoded text in a file-like buffer so pandas can parse it.
df = pd.read_csv(io.StringIO(csv_string))

In [3]:
def detect_continuous_columns(df, target_col, min_unique_values=10):
    """
    Return the names of the columns of `df` that are treated as continuous.

    A column qualifies when all of the following hold:
    - it is not `target_col`
    - its dtype is numeric according to pandas
    - it has at least `min_unique_values` distinct non-missing values

    The returned list preserves the column order of `df`.
    """
    return [
        name
        for name in df.columns
        if name != target_col
        and pd.api.types.is_numeric_dtype(df[name])
        and df[name].nunique(dropna=True) >= min_unique_values
    ]

In [4]:
def eta_squared_anova(groups):
    """
    Eta-squared (η²) effect size for a one-way ANOVA.

    η² = SS_between / SS_total, where `groups` is a sequence of per-class
    value arrays. Returns 0.0 when the pooled data has no variance at all
    (SS_total == 0), which avoids a division by zero.
    """
    pooled = np.concatenate(groups)
    overall_mean = np.mean(pooled)
    total_ss = np.sum((pooled - overall_mean) ** 2)

    # Between-group sum of squares: each group's squared mean offset,
    # weighted by the group's size.
    sizes = np.array([len(g) for g in groups])
    means = np.array([np.mean(g) for g in groups])
    between_ss = np.sum(sizes * (means - overall_mean) ** 2)

    return 0.0 if total_ss == 0 else between_ss / total_ss

In [5]:
def evaluate_continuous_features(df, continuous_features, target_col):
    """
    Score each continuous feature by how well it separates the target classes.

    For every column in `continuous_features`, the non-missing values are
    split by the classes found in `df[target_col]` (missing targets are
    ignored) and two tests are run across those groups:
    - one-way ANOVA (`f_oneway`) plus the eta-squared effect size
    - Kruskal-Wallis H test (`kruskal`)

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    continuous_features : iterable of str
        Names of the columns to evaluate. May be empty.
    target_col : str
        Name of the class-label column.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns `feature`, `anova_F`, `anova_p`,
        `eta_squared`, `kruskal_H`, `kruskal_p` and `n_samples`, sorted by
        `eta_squared` in descending order. `n_samples` is the number of
        observations that actually entered the tests for that feature
        (non-missing feature value and non-missing target). Statistics are
        NaN when fewer than two classes have data or when a test fails; a
        failing test additionally emits a `RuntimeWarning`.
    """
    # Fixing the columns up front keeps the sort below valid even when
    # `continuous_features` is empty and no result rows are produced.
    result_columns = ['feature', 'anova_F', 'anova_p', 'eta_squared',
                      'kruskal_H', 'kruskal_p', 'n_samples']
    results = []
    classes = df[target_col].dropna().unique()
    for col in continuous_features:
        # Group data by class, skipping classes with no usable values.
        groups = []
        for c in classes:
            vals = df.loc[df[target_col] == c, col].dropna().values
            if len(vals) > 0:
                groups.append(vals)

        f_stat = p_anova = eta_sq = np.nan
        h_stat = p_kruskal = np.nan

        # Both tests need at least two non-empty groups to be meaningful.
        if len(groups) > 1:
            # ANOVA
            try:
                f_stat, p_anova = f_oneway(*groups)
                eta_sq = eta_squared_anova(groups)
            except Exception as exc:
                # Best effort: keep going with NaNs, but do not hide the failure.
                f_stat, p_anova, eta_sq = np.nan, np.nan, np.nan
                warnings.warn(
                    f"ANOVA failed for feature {col!r}: {exc}",
                    RuntimeWarning,
                    stacklevel=2,
                )

            # Kruskal–Wallis
            try:
                h_stat, p_kruskal = kruskal(*groups)
            except Exception as exc:
                h_stat, p_kruskal = np.nan, np.nan
                warnings.warn(
                    f"Kruskal-Wallis test failed for feature {col!r}: {exc}",
                    RuntimeWarning,
                    stacklevel=2,
                )

        results.append({
            'feature': col,
            'anova_F': f_stat,
            'anova_p': p_anova,
            'eta_squared': eta_sq,
            'kruskal_H': h_stat,
            'kruskal_p': p_kruskal,
            # Count the observations used, not the raw row count of `df`.
            'n_samples': sum(len(g) for g in groups)
        })
    return (
        pd.DataFrame(results, columns=result_columns)
        .sort_values(by='eta_squared', ascending=False)
        .reset_index(drop=True)
    )


In [6]:
target_col = 'chronic_obstructive_pulmonary_disease'
# Drop patient_id before feature detection (presumably a row identifier, so it
# carries no predictive signal — confirm against the data dictionary).
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second run the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns='patient_id', errors='ignore')
# NOTE: despite its name, `cats` holds the detected *continuous* columns.
# The threshold of 5 unique values overrides the function default of 10.
cats = detect_continuous_columns(df, target_col, 5)
res = evaluate_continuous_features(df, cats, target_col)
res

Unnamed: 0,feature,anova_F,anova_p,eta_squared,kruskal_H,kruskal_p,n_samples
0,last_lab_glucose,1.663291,0.172613,0.000499,5.042725,0.168695,10000
1,albumin_globulin_ratio,0.999311,0.39199,0.0003,3.012068,0.389768,10000
2,age,0.589052,0.622123,0.000177,1.715591,0.633473,10000
3,medication_count,0.562822,0.639514,0.000169,1.389691,0.707953,10000
4,days_hospitalized,0.491563,0.688132,0.000148,3.230547,0.357427,10000
5,bmi,0.209122,0.890137,6.3e-05,0.868448,0.833035,10000
6,alanine_aminotransferase,0.209119,0.89014,6.3e-05,0.86008,0.835049,10000
