In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os


# Ensure the input and output folders exist before any cell reads or writes them.
for folder in ('data', 'outputs'):
    os.makedirs(folder, exist_ok=True)

In [4]:
# Read the cleaned dataset from disk and preview its first five rows.
data_path = 'data/cleaned_data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,NACCID,NACCADC,PACKET,FORMVER,VISITMO,VISITDAY,VISITYR,NACCVNUM,NACCAVST,NACCNVST,...,INLIVWTH,NACCFAM,NACCMOM,NACCFADM,NACCAM,NACCFFTD,NACCFM,NACCBMI,NACCUDSD,DEMENTED
0,NACC002909,186,I,3.0,12,28,2022,1,2,2,...,1.0,1.0,0.0,0,9.0,0,9.0,32.4,3,0
1,NACC002909,186,F,3.0,1,23,2024,2,2,2,...,1.0,1.0,0.0,0,9.0,0,9.0,30.7,3,0
2,NACC003487,186,I,3.0,11,15,2023,1,1,1,...,1.0,0.0,0.0,0,9.0,0,9.0,23.7,1,0
3,NACC004352,186,I,3.0,10,5,2021,1,1,1,...,,,,0,,0,,888.8,4,1
4,NACC004687,186,I,3.0,11,14,2022,1,1,1,...,0.0,9.0,0.0,0,9.0,0,9.0,19.0,1,0


Identify Target Variable

In [6]:
# Columns that may serve as the prediction target, in order of preference.
TARGET_CANDIDATES = ['DEMENTED', 'NORMCOG', 'NACCALZD', 'CDRGLOB', 'DEMENTIA_BINARY']

TARGET = None

print(" Searching for target variable...\n")

# Report every candidate present in the data; the first eligible one becomes TARGET.
for candidate in TARGET_CANDIDATES:
    if candidate not in df.columns:
        continue
    print(f"✓ Found: {candidate}")
    print(f"  Distribution: {df[candidate].value_counts().to_dict()}")
    # CDRGLOB is reported but never selected directly: it is only used, in
    # binarised form, by the fallback below. Selecting it here would make that
    # fallback unreachable.
    if TARGET is None and candidate != 'CDRGLOB':
        TARGET = candidate


# Fallback: derive a binary target from CDRGLOB when no other candidate exists.
if TARGET is None and 'CDRGLOB' in df.columns:
    print("⚙️  Creating binary target from CDRGLOB...")
    # A missing CDRGLOB compares False against the threshold and so becomes 0.
    df['DEMENTIA_BINARY'] = (df['CDRGLOB'] >= 0.5).astype(int)
    TARGET = 'DEMENTIA_BINARY'

# Fail here with a clear message instead of letting a later cell fail on TARGET=None.
if TARGET is None:
    raise ValueError(
        f"No target column found in the data; expected one of {TARGET_CANDIDATES}"
    )

 Searching for target variable...

✓ Found: DEMENTED
  Distribution: {0: 137606, 1: 57590}


Separate Features and Target

In [8]:
# Split the frame: the target column becomes y, every other column stays in X.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Report what was selected and the size of the feature matrix.
print(f" Target: {TARGET}")
print(f"   Features: {len(X.columns)}")
print(f"   Samples: {len(X):,}")

 Target: DEMENTED
   Features: 43
   Samples: 195,196


Create Age-Based Features

In [10]:
# Derive polynomial terms and threshold indicators from NACCAGE, when present.
if 'NACCAGE' not in X.columns:
    print("⚠️  NACCAGE not found, skipping age features")
else:
    age_values = X['NACCAGE']

    # Polynomial terms.
    X['age_squared'] = age_values.pow(2)
    X['age_cubed'] = age_values.pow(3)

    # 0/1 indicators for being at or above each age cut-off.
    for cutoff in (65, 75, 85):
        X[f'age_group_{cutoff}'] = age_values.ge(cutoff).astype(int)

    print("Age features created: 5")
    print("   - age_squared")
    print("   - age_cubed")
    print("   - age_group_65, age_group_75, age_group_85")

Age features created: 5
   - age_squared
   - age_cubed
   - age_group_65, age_group_75, age_group_85


Create Education Features

In [11]:

# Derive education indicators and age-by-education terms from EDUC, when present.
# NOTE(review): the thresholds apply to raw EDUC values; if the column still
# carries a numeric "unknown" sentinel, those rows would be flagged as
# high_education -- confirm against the data dictionary.
if 'EDUC' not in X.columns:
    print("⚠️  EDUC not found, skipping education features")
else:
    educ_years = X['EDUC']

    X['low_education'] = educ_years.lt(12).astype(int)    # fewer than 12 years
    X['high_education'] = educ_years.ge(16).astype(int)   # 16 years or more

    print("Education features created: 2")
    print("   - low_education (<12 years)")
    print("   - high_education (>=16 years)")

    # Age-by-education terms need NACCAGE as well.
    if 'NACCAGE' in X.columns:
        age_years = X['NACCAGE']
        X['age_edu_interaction'] = age_years * educ_years
        # The +1 keeps the denominator non-zero when EDUC is 0.
        X['age_edu_ratio'] = age_years / (educ_years + 1)
        print("\n Age-Education interactions: 2")
        print("   - age_edu_interaction")
        print("   - age_edu_ratio")

Education features created: 2
   - low_education (<12 years)
   - high_education (>=16 years)

 Age-Education interactions: 2
   - age_edu_interaction
   - age_edu_ratio


Create Social Isolation Score

In [14]:
# Build an additive score: each available indicator contributes 0 or 1 per row.
# `social_isolation` starts as the int 0 and becomes a Series once the first
# indicator is added; `factors` counts how many indicators were available.
social_isolation = 0
factors = 0

# Not married
if 'MARISTAT' in X.columns:
    marital_status = X['MARISTAT']
    # `!= 1` evaluates to True for NaN, which would score a missing marital
    # status as an isolation factor. Require a recorded value so that missing
    # data contributes 0, as it does for the `==` comparisons below.
    # NOTE(review): any recorded code other than 1 still counts, including an
    # "unknown" code if the dataset uses one -- confirm against the data dictionary.
    social_isolation += (marital_status.notna() & (marital_status != 1)).astype(int)
    factors += 1
    print("    Using MARISTAT (marital status)")

# Lives alone
if 'NACCLIVS' in X.columns:
    social_isolation += (X['NACCLIVS'] == 1).astype(int)
    factors += 1
    print("    Using NACCLIVS (living situation)")

# No live-in companion
if 'INLIVWTH' in X.columns:
    social_isolation += (X['INLIVWTH'] == 0).astype(int)
    factors += 1
    print("    Using INLIVWTH (informant lives with)")

# Only store the score if at least one indicator column was found.
if factors > 0 and isinstance(social_isolation, pd.Series):
    X['social_isolation_score'] = social_isolation
    print(f"\n Social isolation score created (based on {factors} factors)")
    print(f"   Distribution: {X['social_isolation_score'].value_counts().to_dict()}")
else:
    print("  Cannot create social isolation score (no relevant features)")

    Using MARISTAT (marital status)
    Using NACCLIVS (living situation)
    Using INLIVWTH (informant lives with)

 Social isolation score created (based on 3 factors)
   Distribution: {0: 100287, 3: 43724, 1: 32710, 2: 18475}


Create Family Risk Score

In [15]:
# Build an additive family-history score: each available indicator column
# contributes 1 for rows where it equals 1, and 0 otherwise (including missing).
family_risk = 0
factors = 0


if 'NACCMOM' in X.columns:
    family_risk += (X['NACCMOM'] == 1).astype(int)
    factors += 1
    print("   ✓ Using NACCMOM (mother's dementia)")

# Father's indicator. 'NACCDAD' is the counterpart of NACCMOM in the NACC data
# dictionary (NOTE(review): confirm against the dictionary version in use).
# The two spellings used previously are kept as fallbacks, so any dataset that
# matched before still matches. Only the first column found is used.
for father_col in ('NACCDAD', 'NACCDAG', 'NACCFAD'):
    if father_col in X.columns:
        family_risk += (X[father_col] == 1).astype(int)
        factors += 1
        print(f"   ✓ Using {father_col} (father's dementia)")
        break


if 'NACCFAM' in X.columns:
    family_risk += (X['NACCFAM'] == 1).astype(int)
    factors += 1
    print("   ✓ Using NACCFAM (family history)")

# Only store the features if at least one indicator column was found.
if factors > 0 and isinstance(family_risk, pd.Series):
    X['family_risk_score'] = family_risk
    # Binary flag: any positive indicator at all.
    X['has_family_history'] = (family_risk > 0).astype(int)
    print(f"\n Family risk features created (based on {factors} factors)")
    print(f"   family_risk_score distribution: {X['family_risk_score'].value_counts().to_dict()}")
    print(f"   has_family_history: {X['has_family_history'].value_counts().to_dict()}")
else:
    print("⚠️  Cannot create family risk score (no relevant features)")

   ✓ Using NACCMOM (mother's dementia)
   ✓ Using NACCFAM (family history)

 Family risk features created (based on 2 factors)
   family_risk_score distribution: {0: 85416, 2: 70779, 1: 39001}
   has_family_history: {1: 109780, 0: 85416}


Create Temporal Features

In [16]:
# Convert NACCDAYS into year/month units and a per-visit rate, when present.
if 'NACCDAYS' not in X.columns:
    print("  NACCDAYS not found, skipping temporal features")
else:
    days_in_study = X['NACCDAYS']

    X['years_in_study'] = days_in_study / 365.25
    X['months_in_study'] = days_in_study / 30.44
    print(" Temporal features created: 2")
    print("   - years_in_study")
    print("   - months_in_study")

    if 'NACCVNUM' in X.columns:
        # NOTE(review): the denominator is NACCVNUM + 1, which avoids division
        # by zero but is not the number of intervals between visits -- confirm
        # this is the intended definition.
        X['avg_days_between_visits'] = days_in_study / (X['NACCVNUM'] + 1)
        print("\n Visit frequency feature created:")
        print("   - avg_days_between_visits")

 Temporal features created: 2
   - years_in_study
   - months_in_study

 Visit frequency feature created:
   - avg_days_between_visits


Feature Engineering Summary

In [19]:
# Every column of the loaded frame except the target counts as an original
# feature; whatever X holds beyond that was added by the cells above.
original_features = df.shape[1] - 1
new_features_count = X.shape[1] - original_features

print(f"Original features: {original_features}")
print(f"New features created: {new_features_count}")
print(f"Total features: {len(X.columns)}")
print(f"Samples: {len(X):,}")


Original features: 43
New features created: 15
Total features: 58
Samples: 195,196


In [21]:
# Re-attach the target as the last column and write the result to CSV
# without the row index.
output_path = 'data/engineered_data.csv'
df_engineered = pd.concat(objs=[X, y], axis='columns')
df_engineered.to_csv(path_or_buf=output_path, index=False)