Statistical tests to assess the discriminative power of categorical variables in predicting the Chronic Obstructive Pulmonary Disease (COPD) class

In [None]:
import argparse
import sys
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import boto3
import io


In [None]:
# Location of the raw patient CSV in S3.
bucket_name = "lneg-loka"
file_name = "patient_data_raw.csv"

# Fetch the object and decode its byte payload as UTF-8 text.
s3 = boto3.client('s3')
obj = s3.get_object(Key=file_name, Bucket=bucket_name)
csv_string = obj['Body'].read().decode('utf-8')

# Parse the in-memory CSV text into a DataFrame.
df = pd.read_csv(io.StringIO(csv_string))

In [None]:
def cramers_v(confusion_matrix):
    """
    Compute the bias-corrected Cramér's V for a contingency table.

    Uses the bias correction from Bergsma (2013), "A bias-correction for
    Cramér's V and Tschuprow's T".

    Parameters
    ----------
    confusion_matrix : pandas.DataFrame or 2-D array-like
        Table of observed counts (e.g. the output of ``pd.crosstab``).

    Returns
    -------
    float
        The corrected Cramér's V. ``0.0`` when the corrected table
        dimensions leave no room for association (e.g. a single row or
        column). ``nan`` when the table holds fewer than two observations,
        because the correction divides by ``n - 1``.

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` when a table with two or more
        observations is still unusable (e.g. negative counts, or an
        all-zero row/column that yields a zero expected frequency).

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, so Yates' continuity
    correction is applied whenever the table has one degree of freedom (2x2).
    """
    table = np.asarray(confusion_matrix)
    n = table.sum()
    # Guard *before* calling chi2_contingency: it raises on empty/all-zero
    # tables, and n == 1 would make the (n - 1) denominators below zero.
    if n <= 1:
        return np.nan
    chi2, _, _, _ = chi2_contingency(table)
    phi2 = chi2 / n
    r, k = table.shape
    # Bias correction: shrink phi^2 and the effective table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min((kcorr - 1), (rcorr - 1))
    # A non-positive denominator means one corrected dimension collapsed to
    # (at most) a single level, so no association can be expressed.
    if denom <= 0:
        return 0.0
    return float(np.sqrt(phi2corr / denom))

In [None]:
def detect_categorical_columns(df, target_col, max_categories=6):
    """Auto-detect categorical feature columns in ``df``.

    A column other than ``target_col`` is reported when it has:
       - object, string, category or boolean dtype, or
       - integer/float dtype with a small number of distinct non-null
         values (<= ``max_categories``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    target_col : hashable
        Column label to exclude from the result.
    max_categories : int, default 6
        Largest number of distinct non-null values for which a numeric
        column is still treated as categorical.

    Returns
    -------
    list
        Matching column labels, in the order they appear in ``df``.
    """
    types = pd.api.types
    cats = []
    for col in df.columns:
        if col == target_col:
            continue
        series = df[col]
        dtype = series.dtype
        # isinstance checks replace the deprecated is_categorical_dtype and
        # also cover pandas' dedicated string dtype, which is not `object`.
        if (
            types.is_object_dtype(series)
            or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
            or types.is_bool_dtype(dtype)
        ):
            cats.append(col)
            continue
        # numeric but small cardinality (likely categorical)
        if types.is_integer_dtype(series) or types.is_float_dtype(series):
            if series.nunique(dropna=True) <= max_categories:
                cats.append(col)
    return cats

In [None]:
_RESULT_COLUMNS = ['feature', 'chi2', 'p_value', 'dof', 'cramers_v', 'n_samples', 'note']


def _unscored_row(feature, n_samples, note):
    """Build a result row for a feature whose statistics could not be computed."""
    return {
        'feature': feature,
        'chi2': np.nan,
        'p_value': np.nan,
        'dof': np.nan,
        'cramers_v': np.nan,
        'n_samples': n_samples,
        'note': note
    }


def evaluate_features(df, categorical_features, target_col):
    """Run a chi-square test and compute Cramér's V for each feature vs. the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and ``target_col``.
    categorical_features : iterable
        Column labels to test against the target. May be empty.
    target_col : hashable
        Label of the target column.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``feature``, ``chi2``, ``p_value``,
        ``dof``, ``cramers_v``, ``n_samples`` and ``note``, sorted by
        ``cramers_v`` in descending order. Features that could not be scored
        carry NaN statistics and an explanatory ``note``. An empty input
        yields an empty frame with the same columns.
    """
    rows = []
    for col in categorical_features:
        # Drop rows with missing values in either column
        subset = df[[col, target_col]].dropna()
        n_samples = int(subset.shape[0])
        if n_samples == 0:
            rows.append(_unscored_row(col, 0, 'no data after dropna'))
            continue

        contingency = pd.crosstab(subset[col], subset[target_col])
        # chi2_contingency raises ValueError when an expected frequency is
        # zero; record that per feature instead of aborting the whole run.
        try:
            chi2, p, dof, expected = chi2_contingency(contingency)
        except ValueError as e:
            rows.append(_unscored_row(col, n_samples, f'chi2 error: {e}'))
            continue

        crv = cramers_v(contingency)
        # Check expected frequencies rule of thumb (many expected < 5 reduces reliability)
        low_expected = (expected < 5).sum()
        note = f'{low_expected} expected cells < 5 (caution)' if low_expected > 0 else ''

        rows.append({
            'feature': col,
            'chi2': float(chi2),
            'p_value': float(p),
            'dof': int(dof),
            'cramers_v': float(crv),
            'n_samples': n_samples,
            'note': note
        })
    # Passing the columns explicitly keeps sort_values from failing with a
    # KeyError when no features were supplied and `rows` is empty.
    res_df = pd.DataFrame(rows, columns=_RESULT_COLUMNS)
    return res_df.sort_values(by='cramers_v', ascending=False).reset_index(drop=True)

In [None]:
# Column holding the class label that every feature is tested against.
target_col = 'chronic_obstructive_pulmonary_disease'
# Remove the identifier column before feature detection. errors='ignore'
# keeps this cell re-runnable: on a second run `df` no longer has the column
# and a plain drop would raise KeyError.
df = df.drop(columns='patient_id', errors='ignore')
# Numeric columns with at most 5 distinct values are treated as categorical.
cats = detect_categorical_columns(df, target_col, max_categories=5)
res = evaluate_features(df, cats, target_col)
# Bare expression so the notebook displays the result table.
res