# Load Data

In [1]:
import os
import pandas as pd
import pyreadstat

# Base folder paths.
# The project root defaults to the original local directory, but it can be
# overridden with the NHANES_PROJECT_DIR environment variable so the notebook
# can be re-run on another machine without editing this cell.
base_folder = os.environ.get(
    "NHANES_PROJECT_DIR",
    "/Users/matthewmaslow/desktop/DS-596-Special Topics Medical Science/Final Project",
)
data_folder = os.path.join(base_folder, "data")       # the .xpt inputs are read from here
output_folder = os.path.join(base_folder, "DataCSV")  # the merged CSV is written here

# Define selected columns per dataset.
# Keys are .xpt file names inside data_folder; values are the columns to keep.
# Every list starts with "SEQN", the column the files are merged on.
selected_columns = {
    "mhDepressionScreener_nhanes.xpt": ["SEQN", "DPQ020", "DPQ060", "DPQ090"],
    "demographics_nhanes.xpt": ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH3", "DMDEDUC2", "DMDMARTZ", "INDFMPIR"],
    "income_nhanes.xpt": ["SEQN", "INDFMMPI", "INDFMMPC", "INQ300", "IND310"],
    "sleepDisorders_nhanes.xpt": ["SEQN", "SLD012", "SLD013"],
    "smokingBehavior_nhanes.xpt": ["SEQN", "SMQ020", "SMQ040", "SMD650"],
    "alcoholUse_nhanes.xpt": ["SEQN", "ALQ111", "ALQ121", "ALQ270"],
    "physicalActivity_nhanes.xpt": ["SEQN", "PAD800", "PAD820", "PAD680"]
}

# Load XPTs with fallback
def load_selected_columns(file_path, columns):
    """Read the requested columns from a SAS transport (.xpt) file.

    pyreadstat is tried first, reading only ``columns``. If that raises, the
    whole file is read with ``pandas.read_sas`` and then narrowed to
    ``columns``. A line naming the reader that succeeded is printed.

    Args:
        file_path: Path to the .xpt file.
        columns: List of column names to keep.

    Returns:
        DataFrame holding only ``columns``.

    Raises:
        KeyError: If the pandas fallback is used and the file lacks any of
            the requested columns.
        Exception: Whatever ``pandas.read_sas`` raises when the fallback
            read itself fails (e.g. ``FileNotFoundError`` for a bad path).
    """
    file_name = os.path.basename(file_path)
    try:
        df, _ = pyreadstat.read_xport(file_path, usecols=columns)
        print(f"Loaded with pyreadstat: {file_name}")
    except Exception as exc:
        # Keep the best-effort fallback, but report why pyreadstat failed so a
        # wrong path or a misspelled column is not hidden behind the retry.
        print(f"pyreadstat failed on {file_name}: {exc!r}")
        df = pd.read_sas(file_path, format='xport', encoding='latin1')
        missing = [col for col in columns if col not in df.columns]
        if missing:
            # Same exception type as plain df[columns], but it names the file.
            raise KeyError(
                f"{file_name} is missing requested columns: {missing}"
            ) from exc
        df = df[columns]
        print(f"Fallback to pandas: {file_name}")
    return df

# Merge selected variables.
# The first file listed in selected_columns becomes the base table; each later
# file is left-joined onto it by SEQN, so the base table fixes the set of rows.
merged_df = None
for file_name, cols in selected_columns.items():
    path = os.path.join(data_folder, file_name)
    df = load_selected_columns(path, cols)
    if merged_df is None:
        merged_df = df
    else:
        # validate="one_to_one" makes pandas raise MergeError when SEQN repeats
        # on either side, instead of silently duplicating rows in the result.
        merged_df = pd.merge(
            merged_df, df, on="SEQN", how="left", validate="one_to_one"
        )

# Save to DataCSV
output_path = os.path.join(output_folder, "nhanes_cleaned_merged.csv")
os.makedirs(output_folder, exist_ok=True)   # creates the folder if needed; no error when it already exists
merged_df.to_csv(output_path, index=False)  # index=False: the row index is not written as a column

# Report the destination and the (rows, columns) of the merged frame.
print(
    f"\nMerged file saved to: {output_path}",
    f"Final shape: {merged_df.shape}",
    sep="\n",
)

Loaded with pyreadstat: mhDepressionScreener_nhanes.xpt
Loaded with pyreadstat: demographics_nhanes.xpt
Loaded with pyreadstat: income_nhanes.xpt
Loaded with pyreadstat: sleepDisorders_nhanes.xpt
Loaded with pyreadstat: smokingBehavior_nhanes.xpt
Loaded with pyreadstat: alcoholUse_nhanes.xpt
Loaded with pyreadstat: physicalActivity_nhanes.xpt

Merged file saved to: /Users/matthewmaslow/desktop/DS-596-Special Topics Medical Science/Final Project/DataCSV/nhanes_cleaned_merged.csv
Final shape: (6337, 25)


In [2]:
# Reload the merged table from the CSV written to output_folder.
merged_csv_path = os.path.join(output_folder, "nhanes_cleaned_merged.csv")
df = pd.read_csv(merged_csv_path)

# Quick audit, printed in order: column dtypes, missing values per column,
# (rows, columns), and the first rows.
for summary in (df.dtypes, df.isnull().sum(), df.shape, df.head()):
    print(summary)

SEQN        float64
DPQ020      float64
DPQ060      float64
DPQ090      float64
RIAGENDR    float64
RIDAGEYR    float64
RIDRETH3    float64
DMDEDUC2    float64
DMDMARTZ    float64
INDFMPIR    float64
INDFMMPI    float64
INDFMMPC    float64
INQ300      float64
IND310      float64
SLD012      float64
SLD013      float64
SMQ020      float64
SMQ040      float64
SMD650      float64
ALQ111      float64
ALQ121      float64
ALQ270      float64
PAD800      float64
PAD820      float64
PAD680      float64
dtype: object
SEQN           0
DPQ020       819
DPQ060       827
DPQ090       831
RIAGENDR       0
RIDAGEYR       0
RIDRETH3       0
DMDEDUC2     273
DMDMARTZ     274
INDFMPIR     831
INDFMMPI    1286
INDFMMPC     520
INQ300       517
IND310      3476
SLD012        65
SLD013        67
SMQ020         2
SMQ040      3819
SMD650      5425
ALQ111       856
ALQ121      1415
ALQ270      3971
PAD800      1306
PAD820      3460
PAD680         6
dtype: int64
(6337, 25)
       SEQN  DPQ020  DPQ060  DPQ090  

# Data Preparation

In [3]:
# Keep only rows where all three DPQ columns are present; a row is dropped
# as soon as any one of them is missing.
df = df.dropna(
    axis="index",
    how="any",
    subset=["DPQ020", "DPQ060", "DPQ090"],
)

In [4]:
# Remove five columns from the frame. In the missing-value audit printed
# earlier, each of them had more than 3,400 nulls out of 6,337 rows
# (presumably the reason they are dropped rather than imputed).
df = df.drop(
    labels=["SMD650", "SMQ040", "ALQ270", "PAD820", "IND310"],
    axis="columns",
)

In [5]:
# Re-check missing values per column and the frame's (rows, columns).
print(df.isnull().sum(), df.shape, sep="\n")

SEQN           0
DPQ020         0
DPQ060         0
DPQ090         0
RIAGENDR       0
RIDAGEYR       0
RIDRETH3       0
DMDEDUC2     245
DMDMARTZ     245
INDFMPIR     669
INDFMMPI    1075
INDFMMPC     411
INQ300       408
SLD012        55
SLD013        56
SMQ020         2
ALQ111        25
ALQ121       584
PAD800      1063
PAD680         4
dtype: int64
(5506, 20)


In [6]:
from sklearn.impute import SimpleImputer

# Columns to impute: every numeric column except the SEQN key and the three
# DPQ columns (those four are left exactly as loaded).
excluded_from_imputation = {"SEQN", "DPQ020", "DPQ060", "DPQ090"}
num_cols = [
    col
    for col in df.select_dtypes(include="number").columns
    if col not in excluded_from_imputation
]

# Replace each remaining missing value with its column's median.
# NOTE(review): the median is applied to every listed column alike; if some of
# them hold category codes rather than measurements, confirm median is the
# intended strategy for those.
# NOTE(review): the imputer is fit on all rows of df; if a train/test split
# happens later, confirm that fitting before the split is intended.
imputer = SimpleImputer(strategy="median")
imputed_values = imputer.fit_transform(df[num_cols])
df[num_cols] = imputed_values

In [7]:
# Confirm the imputation left no missing values, and show (rows, columns).
print(df.isnull().sum(), df.shape, sep="\n")

SEQN        0
DPQ020      0
DPQ060      0
DPQ090      0
RIAGENDR    0
RIDAGEYR    0
RIDRETH3    0
DMDEDUC2    0
DMDMARTZ    0
INDFMPIR    0
INDFMMPI    0
INDFMMPC    0
INQ300      0
SLD012      0
SLD013      0
SMQ020      0
ALQ111      0
ALQ121      0
PAD800      0
PAD680      0
dtype: int64
(5506, 20)


# Model Training