In [2]:
# Core libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2_contingency, ttest_ind
from sklearn.model_selection import train_test_split

In [3]:
# Load and inspect data
# Load CSV
df = pd.read_csv("adult23.csv")

# Keep only selected columns for heart disease modeling
cols = [
    #––– cardio & metabolic “ever told” ----------------------------------------------------
    'MIEV_A', 'CHDEV_A', 'ANGEV_A', 'STREV_A', 'HYPEV_A', 'CHLEV_A',
    #––– other “ever told” ---------------------------------------------------------------
    'DIBLAST1_A', 'BMICAT_A', 'SMKCIGST_A', 'ASEV_A', 'CANEV_A',
    'ARTHEV_A', 'COPDEV_A', 'DEPEV_A', 'ANXEV_A',
    #––– mental‑health & meds ------------------------------------------------------------
    'DEPFREQ_A', 'ANXFREQ_A', 'ANXMED_A',
    #––– insurance & plan‑payment --------------------------------------------------------
    'NOTCOV_A', 'PLN1PAY4_A', 'PLN1PAY5_A', 'PLN1PAY6_A',
    #––– demographics & SES --------------------------------------------------------------
    'PRDEDUC1_A', 'EDUCP_A', 'AGEP_A', 'SEX_A', 'RACEALLP_A',
    'EMPLASTWK_A', 'REGION', 'URBRRL'
]
df_filtered = df[cols]

In [4]:
# Drop rows with more than 50% missing values
thresh = len(df_filtered) * 0.5
df_filtered = df_filtered.dropna(axis=1, thresh=thresh)

In [5]:
# Handle survey data codes
ambiguous_codes = [7, 8, 9]
binary_map_cols = [
    'MIEV_A', 'CHDEV_A', 'ANGEV_A', 'STREV_A', 'HYPEV_A', 'CHLEV_A',
    'ASEV_A', 'CANEV_A', 'ARTHEV_A', 'COPDEV_A', 'DEPEV_A', 'ANXEV_A',
    'ANXMED_A', 'PLN1PAY4_A', 'PLN1PAY5_A', 'PLN1PAY6_A'
]

for col in binary_map_cols:
    df_filtered[col] = df_filtered[col].replace(ambiguous_codes, pd.NA)
    df_filtered[col] = df_filtered[col].map({1: 1, 2: 0})


ordinal_or_multiclass = [
    'DIBLAST1_A',   # diabetes
    'BMICAT_A',     # BMI
    'SMKCIGST_A',   # smoking status
    'DEPFREQ_A',    # depression frequency
    'ANXFREQ_A',    # anxiety frequency
    'NOTCOV_A',     # insurance coverage
    'PRDEDUC1_A',   # deductible
    'SEX_A',        # male or female
    'RACEALLP_A',   # race
    'EDUCP_A',      # education 
    'EMPLASTWK_A',  # employment
    'REGION',       # region
    'URBRRL'        # urban/rural
]

for col in ordinal_or_multiclass:
    df_filtered[col] = df_filtered[col].replace(ambiguous_codes, pd.NA)
    
for col in df_filtered.columns:
    df_filtered[col] = df_filtered[col].fillna(df_filtered[col].mode()[0])

  df_filtered[col] = df_filtered[col].fillna(df_filtered[col].mode()[0])
  df_filtered[col] = df_filtered[col].fillna(df_filtered[col].mode()[0])


In [6]:
# Build binary heart_disease target variable
df_filtered['heart_disease'] = (
    (df_filtered['CHDEV_A'] == 1) |
    (df_filtered['ANGEV_A'] == 1) |
    (df_filtered['MIEV_A']  == 1)
).astype(int)

In [7]:
df_filtered.to_csv("filtered_adult23.csv", index=False)

In [8]:
# Chi-square and T-tests
df_stats = df_filtered.copy()

cat_feats = [
    'MIEV_A', 'CHDEV_A', 'ANGEV_A', 'STREV_A', 'HYPEV_A', 'CHLEV_A',
    'DIBLAST1_A', 'BMICAT_A', 'SMKCIGST_A', 'ASEV_A', 'CANEV_A', 'ARTHEV_A',
    'COPDEV_A', 'DEPEV_A', 'ANXEV_A', 'DEPFREQ_A', 'ANXFREQ_A', 'ANXMED_A',
    'NOTCOV_A', 'PLN1PAY4_A', 'PLN1PAY5_A', 'PLN1PAY6_A',
    'SEX_A', 'RACEALLP_A', 'EDUCP_A', 'EMPLASTWK_A', 'REGION', 'URBRRL'
]

num_feats = ['PRDEDUC1_A', 'AGEP_A']  

df_stats = df_stats.dropna(subset=['heart_disease'])

chi2_results = []
for col in cat_feats:
    tbl = pd.crosstab(df_stats[col].fillna("Missing"), df_stats['heart_disease'])
    if tbl.shape[0] > 1: 
        chi2, p, dof, expected = chi2_contingency(tbl)
        chi2_results.append({
            'feature': col,
            'chi2_stat': chi2,
            'p_value': p
        })

chi2_df = pd.DataFrame(chi2_results).sort_values('p_value')

ttest_results = []
for col in num_feats:
    grp0 = df_stats[df_stats['heart_disease'] == 0][col].dropna()
    grp1 = df_stats[df_stats['heart_disease'] == 1][col].dropna()
    if len(grp0) > 1 and len(grp1) > 1:
        t_stat, p = ttest_ind(grp0, grp1, equal_var=False)
        ttest_results.append({
            'feature': col,
            't_stat': t_stat,
            'p_value': p
        })

t_df = pd.DataFrame(ttest_results).sort_values('p_value')

# Print results
print("Chi-Square Test: Categorical Features")
print(chi2_df.to_string(index=False))

print("\nT-Test: Numerical Features")
print(t_df.to_string(index=False))

Chi-Square Test: Categorical Features
    feature    chi2_stat       p_value
     MIEV_A 12781.472403  0.000000e+00
    CHDEV_A 21933.256244  0.000000e+00
    ANGEV_A  6479.342173  0.000000e+00
    HYPEV_A  1700.647180  0.000000e+00
    CHLEV_A  1444.059028  0.000000e+00
EMPLASTWK_A  1112.667506 5.830274e-244
   ARTHEV_A  1055.036214 1.957459e-231
   COPDEV_A   904.369983 1.101137e-198
    STREV_A   763.131015 5.601172e-168
    CANEV_A   499.531271 1.202144e-110
 SMKCIGST_A   358.715492  2.301735e-76
 DIBLAST1_A   362.510841  3.177716e-75
    EDUCP_A   142.744727  6.370807e-27
  DEPFREQ_A   127.106779  1.618202e-26
      SEX_A   101.406690  7.491034e-24
   NOTCOV_A    92.442908  6.929527e-22
     URBRRL    92.232085  7.262276e-20
 RACEALLP_A    82.768024  2.210085e-16
   BMICAT_A    58.952747  9.839059e-13
    DEPEV_A    49.069362  2.470693e-12
  ANXFREQ_A    57.230184  1.106972e-11
 PLN1PAY4_A    41.704993  1.061376e-10
     REGION    44.347145  1.273486e-09
   ANXMED_A    24.549946  

In [9]:
top_chi2_features = chi2_df.loc[chi2_df['p_value'] < 0.05, 'feature'].tolist()
print("Top statistically significant categorical features:", top_chi2_features)

Top statistically significant categorical features: ['MIEV_A', 'CHDEV_A', 'ANGEV_A', 'HYPEV_A', 'CHLEV_A', 'EMPLASTWK_A', 'ARTHEV_A', 'COPDEV_A', 'STREV_A', 'CANEV_A', 'SMKCIGST_A', 'DIBLAST1_A', 'EDUCP_A', 'DEPFREQ_A', 'SEX_A', 'NOTCOV_A', 'URBRRL', 'RACEALLP_A', 'BMICAT_A', 'DEPEV_A', 'ANXFREQ_A', 'PLN1PAY4_A', 'REGION', 'ANXMED_A', 'ANXEV_A', 'ASEV_A', 'PLN1PAY6_A', 'PLN1PAY5_A']


In [10]:
X = df_filtered[top_chi2_features + ['AGEP_A', 'PRDEDUC1_A']]
y = df_filtered['heart_disease']

In [11]:
# One-hot encoding
categorical_cols = [
    col for col in X.columns
    if X[col].nunique() > 2 and X[col].dtype in ['int64', 'float64']
]

column_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

X_encoded = column_transformer.fit_transform(X)

encoded_col_names = column_transformer.named_transformers_['onehot'].get_feature_names(categorical_cols)

passthrough_cols = [col for col in X.columns if col not in categorical_cols]
final_cols = list(encoded_col_names) + passthrough_cols

X_final = pd.DataFrame(X_encoded, columns=final_cols)

print(X_final.shape)

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [None]:
# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, stratify=y, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

Train shape: (23617, 105)
Test shape : (5905, 105)
