# In [None]:
import os

import cudf
import numpy as np
import pandas as pd

# Function to load and convert a SAS file to cuDF DataFrame
def load_sas_to_cudf(file_path):
    """Load a SAS transport (XPORT) file into a cuDF DataFrame.

    cuDF does not provide a SAS reader, so the file is parsed on the host
    with ``pandas.read_sas`` and the result is then copied to the GPU with
    ``cudf.from_pandas``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.XPT`` file to read.

    Returns
    -------
    cudf.DataFrame
        The contents of the file as a cuDF DataFrame.
    """
    # Without an encoding pandas returns character columns as raw bytes;
    # decode them so they arrive as ordinary strings. latin-1 maps every
    # byte value, so decoding cannot fail on unexpected characters.
    host_df = pd.read_sas(file_path, format='xport', encoding='latin-1')
    return cudf.from_pandas(host_df)

# Load the medical conditions data
mcq = load_sas_to_cudf('MCQ_J.XPT')

# Define the relevant columns for autoimmune diseases
# NOTE(review): in the NHANES MCQ codebook MCQ160A-D appear to be arthritis,
# congestive heart failure, coronary heart disease and angina -- confirm
# that these are really the intended "autoimmune" indicators.
autoimmune_columns = ['MCQ160A', 'MCQ160B', 'MCQ160C', 'MCQ160D']

# Create the target variable: 1 when any of the columns equals 1, else 0.
# A vectorised comparison replaces the per-element lambda. Missing answers
# can compare as null, so they are explicitly mapped to False ("not 1")
# before the row-wise maximum is taken.
is_one = (mcq[autoimmune_columns] == 1).fillna(False)
mcq['autoimmune'] = is_one.astype('int64').max(axis=1)

# List of files to be loaded, built from the dataset base names: every file
# name is '<base name>_J.XPT'.
files_to_load = [
    base_name + '_J.XPT'
    for base_name in (
        'CBC', 'CRP', 'DBQ', 'DEMO', 'DPQ',
        'DR1TOT', 'DR2TOT', 'DS1IDS', 'FEM', 'FERTIN',
        'GHB', 'HEPA', 'HSCRP', 'HSQ', 'INMEHg',
        'LBX', 'MCQ', 'PAQ', 'SF', 'SSPUIDX',
        'THYRO', 'UASRCN', 'UM', 'VOC', 'WHQ',
    )
]

# Load and merge datasets: start from the target column and inner-join every
# file on SEQN, so only SEQN values present in all of the files are kept.
merged_df = mcq[['SEQN', 'autoimmune']].copy()
for file_name in files_to_load:
    df = load_sas_to_cudf(file_name)
    merged_df = merged_df.merge(df, on='SEQN', how='inner')

# Feature selection based on domain knowledge and variable list
# Example features (to be adjusted based on your PDF and domain knowledge)
selected_features = [
    'RIDAGEYR', 'RIAGENDR', 'BMXBMI', 'BPXDI1', 'BPXSY1', 'LBXTC', 'LBXGLU', 'LBDLDL',
    'LBDHDD', 'DBQ700', 'DPQ010', 'PAQ605', 'WHQ030', 'DR1TCAFF', 'DR2TCAFF', 'DSDSUPP',
    'LBXCRP', 'LBXHGB', 'LBXWBC', 'LBXPLTS', 'LBXIRN', 'LBDFER'
]

# Ensure selected features exist in the merged DataFrame
selected_features = [feature for feature in selected_features if feature in merged_df.columns]

# Handle missing values: drop only the rows that lack one of the modelled
# features or the target. A bare dropna() over every merged column would
# discard any row with a single missing value in any unrelated column of
# any file, so it must run after the feature list is known.
merged_df = merged_df.dropna(subset=selected_features + ['autoimmune'])

X = merged_df[selected_features]
y = merged_df['autoimmune']

# Display the resulting DataFrame structure
print("Features and target loaded:")
print(X.head())
print("Target distribution:")
print(y.value_counts())

# Optional: Save the prepared data for future use.
# The target is written through a one-column frame so that both files go
# through DataFrame.to_csv, the CSV writer cuDF documents; this does not
# rely on a Series-level to_csv being available. The column keeps the
# Series name ('autoimmune') as its header.
X.to_csv('prepared_features.csv', index=False)
y.to_frame().to_csv('prepared_target.csv', index=False)
