In [None]:
# CELL 1: Load raw 400MB dataset
import pandas as pd
import numpy as np

# Raw CSV (noted by the author as coming from Git LFS), read relative to the
# notebook's working directory.
df_raw = pd.read_csv("../data/DementiaPredictionDataset.csv")

# Unpack the shape once so the summary line reads naturally.
n_rows, n_cols = df_raw.shape
print(f"Raw data: {n_rows:,} rows × {n_cols} columns")

FileNotFoundError: [Errno 2] No such file or directory: 'data/dataset.xlsx'

In [None]:
# CELL 2: Allowed non-medical columns (from NACC dictionary + hackathon rules)
# Whitelist of columns the model is permitted to use; anything not listed here
# is discarded before cleaning starts.
allowed_non_medical = [
    'NACCID', 'VISCODE', 'VISITMO', 'VISITDAY', 'VISITYR',
    'BIRTHMO', 'BIRTHYR', 'SEX', 'HISPANIC', 'HISPOR', 'HISPORX',
    'RACE', 'RACEX', 'RACESEC', 'RACESECX', 'RACETER', 'RACETERX',
    'PRIMLANG', 'PRIMLANX', 'EDUC', 'MARISTAT', 'NACCLIVS',
    'INDEPEND', 'RESIDENC', 'HANDED',
    'TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK',
    'ALCOCCAS', 'ALCFREQ', 'ALCDRINK', 'ALCFREQYR',
    'HEIGHT', 'WEIGHT',
    'NACCADC', 'PACKET', 'FORMVER', 'NACCDAYS', 'NACCFDYS'
]

# 1 = dementia, 0 = no dementia.
# NOTE(review): the saved sample output of this notebook also shows the value 8
# in this column, so it is not strictly binary — confirm how 8 should be handled.
target = 'NACCIDEM'

# The target is mandatory. Without this check it would be silently filtered out
# below and the notebook would only fail later with an opaque KeyError.
if target not in df_raw.columns:
    raise KeyError(f"Target column '{target}' not found in the raw data")

# AUTO-FILTER: only keep whitelisted columns that actually exist in the file,
# and remember which ones were absent so the omission is visible.
raw_columns = set(df_raw.columns)
existing_cols = [col for col in allowed_non_medical + [target] if col in raw_columns]
skipped_cols = [col for col in allowed_non_medical if col not in raw_columns]
df = df_raw[existing_cols].copy()

print(f"Kept {len(existing_cols)} columns:")
print(existing_cols)
if skipped_cols:
    print(f"Skipped {len(skipped_cols)} allowed columns missing from the raw data: {skipped_cols}")

NameError: name 'df_raw' is not defined

In [None]:
# CELL 3: Replace special missing codes
# Generic codes treated as "missing". Integer and string spellings are both
# listed because a column may have been parsed as text.
missing_codes = [-4, -4.0, 88, 88.0, 999, 999.0, '88', '999', '-4']

# -4 is never a plausible real value, so it is replaced in every column.
universal_codes = [-4, -4.0, '-4']
# 88 / 999 are only replaced where they cannot be a genuine observation.
generic_codes = [code for code in missing_codes if code not in universal_codes]

# Measurement, day-count and identifier columns in which 88 or 999 can be a real
# value (e.g. a weight of 88, or 999 days since the first visit). A frame-wide
# replace would silently destroy those observations.
# NOTE(review): confirm this list against the NACC data dictionary.
literal_value_cols = {'WEIGHT', 'QUITSMOK', 'NACCDAYS', 'NACCFDYS', 'NACCADC'}

# Column-specific sentinels the generic list does not catch. The notebook's saved
# sample output shows WEIGHT = 888 with BMI = 79.17, which corresponds to
# HEIGHT = 88.8, so both are clearly "unknown" markers rather than measurements.
# NOTE(review): the QUITSMOK codes are assumed from the NACC dictionary — confirm.
column_codes = {
    'WEIGHT': [888, 999, '888', '999'],
    'HEIGHT': [88.8, '88.8'],
    'QUITSMOK': [888, 999, '888', '999'],
}

# Build the code list per column and replace column by column; working on a copy
# keeps the cell idempotent when it is re-run.
cleaned = df.copy()
for col in cleaned.columns:
    codes = list(universal_codes)
    if col not in literal_value_cols:
        codes.extend(generic_codes)
    codes.extend(column_codes.get(col, []))
    cleaned[col] = cleaned[col].replace(codes, np.nan)
df = cleaned

print("Missing codes → NaN")
print(f"Top missing:\n{df.isnull().sum().sort_values(ascending=False).head(10)}")

Replaced -4, 88, 999 → NaN
Missing values:
RACETERX    195160
RACESECX    194813
HISPORX     194669
RACEX       192851
RACETER     192713
dtype: int64


In [None]:
# CELL 4: Create AGE, BMI, SMOKING, ALCOHOL
# Each feature is only built when its source columns survived the column filter.

if 'VISITYR' in df.columns and 'BIRTHYR' in df.columns:
    df['AGE'] = df['VISITYR'] - df['BIRTHYR']
    if 'BIRTHMO' in df.columns and 'VISITMO' in df.columns:
        # Birthday not yet reached in the visit year → subtract one year.
        df['AGE'] = np.where(
            (df['BIRTHMO'] > df['VISITMO']) & (df['AGE'] > 0),
            df['AGE'] - 1, df['AGE']
        )
    print("AGE created")

if 'WEIGHT' in df.columns and 'HEIGHT' in df.columns:
    # Mask "unknown" sentinels and non-positive values before dividing. The saved
    # sample output shows WEIGHT = 888 / HEIGHT = 88.8 producing a BMI of 79.17,
    # and a zero height would otherwise yield inf.
    weight_lb = df['WEIGHT'].where((df['WEIGHT'] > 0) & (df['WEIGHT'] != 888))
    height_in = df['HEIGHT'].where((df['HEIGHT'] > 0) & (df['HEIGHT'] != 88.8))
    # 703 is the conversion factor for BMI from pounds and inches.
    df['BMI'] = 703 * weight_lb / (height_in ** 2)
    print("BMI created")

if 'TOBAC100' in df.columns:
    # Map only the explicit answers. The previous `(== 1).astype(int)` turned
    # missing/unknown responses into 0, i.e. recorded them as "never smoked".
    # Anything other than 0/1 now stays NaN.
    df['EVER_SMOKER'] = df['TOBAC100'].map({0: 0, 1: 1})
    print("EVER_SMOKER created")

if 'ALCFREQ' in df.columns:
    # Ordinal frequency scale; codes outside the map (e.g. not-applicable /
    # unknown) become NaN.
    # NOTE(review): the NACC dictionary is understood to code ALCFREQ as 0-4
    # (0 = less than once a month ... 4 = daily or almost daily). The previous map
    # {1: 0, 2: 1, 3: 2, 4: 3} had no entry for 0, so every 0 silently became NaN.
    # Confirm against the dictionary version used for this dataset.
    alcohol_map = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}
    df['ALCOHOL_FREQ'] = df['ALCFREQ'].map(alcohol_map)
    print("ALCOHOL_FREQ created")

if 'EDUC' in df.columns:
    # 99 cannot be a real number of years of education; treat it as unknown.
    # NOTE(review): 99 = unknown is assumed from the NACC dictionary — confirm.
    df['EDUC_YEARS'] = df['EDUC'].where(df['EDUC'] != 99)
    print("EDUC_YEARS created")

AGE created
BMI created
EVER_SMOKER created
ALCOHOL_FREQ created
EDUC_YEARS created


In [None]:
# CELL 5: Drop columns with >70% missing or text
# Columns above this share of missing values are considered unusable.
MISSING_THRESHOLD_PCT = 70

missing_pct = df.isnull().mean() * 100
high_missing = missing_pct[missing_pct > MISSING_THRESHOLD_PCT].index

text_cols = df.select_dtypes(include=['object']).columns

# The identifier and the target must survive regardless of dtype / missingness.
protected_cols = {'NACCID', target}

# A column can be both high-missing and text. dict.fromkeys de-duplicates while
# preserving order, so each column is listed (and counted) exactly once.
candidates = dict.fromkeys([*high_missing, *text_cols])
drop_cols = [col for col in candidates if col not in protected_cols]

df = df.drop(columns=drop_cols, errors='ignore')

print(f"Dropped {len(drop_cols)} columns")
print(f"Final shape: {df.shape}")

Dropped 17 high-missing/text columns
Final modeling shape: (195196, 32)


In [None]:
# CELL 6: Final dataset
# Put the target last and expose it under a friendlier name.
final_cols = [col for col in df.columns if col != target]
df_final = df[final_cols + [target]].rename(columns={target: 'DEMENTIA'})

# The target is documented as 1 = dementia / 0 = no dementia, but the saved sample
# output of this notebook contains a row with DEMENTIA = 8. Rows whose label is
# not 0 or 1 (including missing labels) cannot be used for binary modelling, so
# they are removed here and the count is reported rather than hidden.
has_binary_label = df_final['DEMENTIA'].isin([0, 1])
n_unlabelled = int((~has_binary_label).sum())
if n_unlabelled:
    print(f"Dropped {n_unlabelled:,} rows whose target is not 0/1")
df_final = df_final[has_binary_label].copy()
df_final['DEMENTIA'] = df_final['DEMENTIA'].astype(int)

print(f"FINAL CLEAN: {df_final.shape}")
df_final.head()

FINAL DATASET: (195196, 32)
Ready for EDA & Modeling!


Unnamed: 0,NACCID,VISITMO,VISITDAY,VISITYR,BIRTHMO,BIRTHYR,SEX,HISPANIC,RACE,PRIMLANG,...,WEIGHT,NACCADC,FORMVER,NACCDAYS,NACCFDYS,AGE,BMI,EVER_SMOKER,EDUC_YEARS,DEMENTIA
0,NACC002909,12,28,2022,5,1952,1,0,1,1,...,232.0,186,3.0,391.0,0.0,70,32.353898,0,16,0
1,NACC002909,1,23,2024,5,1952,1,0,1,1,...,220.0,186,3.0,391.0,391.0,71,30.680421,0,16,0
2,NACC003487,11,15,2023,12,1956,1,0,1,1,...,175.0,186,3.0,0.0,0.0,66,23.731674,0,16,0
3,NACC004352,10,5,2021,1,1958,2,1,1,2,...,888.0,186,3.0,0.0,0.0,63,79.166667,0,16,8
4,NACC004687,11,14,2022,2,1945,1,1,1,1,...,114.0,186,3.0,0.0,0.0,77,18.968521,0,12,0


In [None]:
# CELL 7: Save clean data
import os
os.makedirs("data", exist_ok=True)

df_final.to_csv("data/processed_clean_non_medical.csv", index=False)
print("SAVED: data/processed_clean_non_medical.csv")

# Git commit
!git add .
!git commit -m "feat: full data cleaning - non-medical only, handle -4/88/999, engineer features"
!git push

Saved: data/processed_clean_non_medical.csv
[main 5ff0215] feat: clean data - auto-detect columns, safe feature eng, non-medical only
 1 file changed, 30 insertions(+), 6 deletions(-)


To https://github.com/pavi06112005-afk/Model-X.git
   b6b957c..5ff0215  main -> main
