In [None]:
import pandas as pd

# Preprocessing pipeline for the imputed RAC table:
#   1. load the CSV (delimiter is sniffed) and normalise the header,
#   2. validate the columns that are used unconditionally,
#   3. normalise ID / Age and median-fill the PMI column,
#   4. one-hot encode the listed categorical columns,
#   5. write the result as a comma-separated CSV.
# All intermediate results stay in module-level names (df, pmi_col,
# numeric_cols, categorical_cols, df_encoded, bool_cols, output_path).

# sep=None lets the python engine detect the delimiter from the file itself.
df = pd.read_csv("/Users/martinnwadiugwu/Downloads/RAC_data_imputed_new_new.csv", sep=None, engine='python')
# Strip stray whitespace from the header so the name lookups below match.
df.columns = df.columns.str.strip()

# Check for required columns up front, before anything is modified, so a
# missing column produces a readable error instead of a bare KeyError.
if 'ID' not in df.columns:
    raise KeyError("⚠️ 'ID' column not found in the dataset.")
if 'Age' not in df.columns:
    raise KeyError("⚠️ 'Age' column not found in the dataset.")

# Format ID and Age
df['ID'] = df['ID'].astype(str)
# The literal '90+' is mapped to '90' before the float cast. String cells are
# stripped first so that ' 90+' / '90+ ' are recognised as well; non-string
# cells (numbers, NaN) are passed through unchanged.
df['Age'] = (
    df['Age']
    .map(lambda value: value.strip() if isinstance(value, str) else value)
    .replace('90+', '90')
    .astype(float)
)

# Identify PMI column: any column whose name starts with 'PMI'.
pmi_col_candidates = [col for col in df.columns if col.startswith('PMI')]
if not pmi_col_candidates:
    raise KeyError("⚠️ PMI column not found. Check column names.")
# If several columns match, the first one in file order is used.
pmi_col = pmi_col_candidates[0]

# Define numeric columns for imputation
numeric_cols = [pmi_col]
# Non-numeric entries become NaN (errors='coerce') ...
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
# ... and every NaN is then replaced by that column's median.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Define categorical columns to one-hot encode (exclude race_* which are already binary)
# NOTE(review): 'last_MMSE' and 'Years_Education' look numeric by name but are
# one-hot encoded like the other entries — confirm this is intended.
categorical_cols = ['Gender', 'Diagnosis', 'structure', 'APOE_Status', 'Braak', 'Pathology', 'last_MMSE', 'Years_Education', 'Highest_Education_Level']
# Keep only the names that actually exist in this file; absent ones are skipped silently.
categorical_cols = [col for col in categorical_cols if col in df.columns]

# Apply one-hot encoding; drop_first=False keeps one indicator per category level.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

# Convert all boolean columns (if any) to integers so the output holds 0/1
# rather than True/False.
bool_cols = df_encoded.select_dtypes(include='bool').columns
df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)

# Save cleaned file as comma-separated CSV (no index column).
output_path = "/Users/martinnwadiugwu/Downloads/RAC_data_onehot_new_new.csv"
df_encoded.to_csv(output_path, sep=",", index=False, encoding="utf-8", quotechar='"')

print(f"✅ Processing complete. Saved to: {output_path}")
