In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Seed shared by the stochastic steps further down; also seeds NumPy's
# legacy global generator.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

In [3]:
# Input CSV, given relative to the notebook's working directory.
data_path = 'data/engineered_data.csv'

# Read the whole file into a single DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

Identify and Separate Target

In [4]:
# Columns that may serve as the prediction target, in priority order.
# The first one found in `df` is used.
TARGET_CANDIDATES = ['DEMENTED', 'NORMCOG', 'DEMENTIA_BINARY', 'NACCALZD']

TARGET = next((name for name in TARGET_CANDIDATES if name in df.columns), None)

if TARGET is None:
    raise ValueError("No target variable found!")

print(f" Target: {TARGET}")
print(f"   Distribution: {df[TARGET].value_counts().to_dict()}")

# By this cell's own definition the remaining candidates are alternative
# target columns. Any of them left in X would hand the model (a proxy of)
# the label, so every candidate present is removed from the features,
# not just the selected one.
label_cols = [name for name in TARGET_CANDIDATES if name in df.columns]
unused_targets = [name for name in label_cols if name != TARGET]
if unused_targets:
    print(f"   Dropping unused target candidates from features: {unused_targets}")

# Keep only rows whose target is known.
valid_idx = df[TARGET].notna()

# `drop` returns a new frame, so X is an independent object and later cells
# can assign columns on it without writing into a slice of `df`.
X = df.loc[valid_idx].drop(columns=label_cols)
y = df.loc[valid_idx, TARGET]

 Target: DEMENTED
   Distribution: {0: 137606, 1: 57590}


 Handle Non-Numeric Columns

Label Encoding (scikit-learn `LabelEncoder`)

In [6]:
from sklearn.preprocessing import LabelEncoder


# Split the feature columns by dtype: numeric columns pass through untouched,
# everything else is integer-encoded below.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"Numeric columns: {len(numeric_cols)}")
print(f"Non-numeric columns: {len(non_numeric_cols)}")

# One fitted encoder per encoded column, keyed by column name, so the integer
# codes can be mapped back to the original labels (or re-applied to new data).
label_encoders = {}

if non_numeric_cols:
    print(f"\n Encoding {len(non_numeric_cols)} columns with LabelEncoder...")

    # Work on an explicit copy so the column assignments below never target
    # a slice of another frame.
    X = X.copy()

    for col in non_numeric_cols:
        print(f"   - Encoding {col}...", end=" ")

        # Mark missing values BEFORE stringifying. Calling astype(str) first
        # turns NaN into the literal text 'nan' (pandas 1.x/2.x object
        # columns), after which fillna has nothing left to fill.
        # astype(object) first so categorical columns accept the new label.
        values = X[col].astype(object)
        values = values.where(values.notna(), 'missing').astype(str)

        # Fresh encoder per column; each one is kept in label_encoders.
        le = LabelEncoder()
        X[col] = le.fit_transform(values)
        label_encoders[col] = le

        print("✓")

    print(f"\n    All columns encoded!")
    print(f"   New shape: {X.shape}")
else:
    print(" All columns are numeric")

print(f"\nFinal feature count: {X.shape[1]}")

Numeric columns: 58
Non-numeric columns: 0
 All columns are numeric

Final feature count: 58


Handle Missing Values

In [9]:
# Median imputer used by both passes below. It is created up front so the
# infinite-value branch can use it even when the missing-value branch is
# skipped (previously that combination raised NameError).
# NOTE(review): the imputer is fitted on all rows before the train/test split
# in the next section, so the medians include test rows — consider fitting on
# the training partition only.
imputer = SimpleImputer(strategy='median')

missing_count = X.isnull().sum().sum()
missing_pct = (missing_count / (X.shape[0] * X.shape[1])) * 100

print(f"Missing values: {missing_count:,} ({missing_pct:.2f}%)")

if missing_count > 0:
    print(f"\nTop 10 columns with missing data:")
    missing_by_col = X.isnull().sum().sort_values(ascending=False)
    print(missing_by_col[missing_by_col > 0].head(10))

    # Impute
    print(f"\n⚙️  Imputing with median strategy...")
    X_imputed = imputer.fit_transform(X)
    # fit_transform returns a bare array; restore the column names and index.
    X = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

    print(f"    Imputation complete!")
    print(f"   Remaining missing: {X.isnull().sum().sum()}")
else:
    print(" No missing values!")

# Handle infinite values
print(f"\n Checking for infinite values...")
inf_count = np.isinf(X).sum().sum()
if inf_count > 0:
    print(f"   ⚠️  Found {inf_count} infinite values, replacing with NaN...")
    X = X.replace([np.inf, -np.inf], np.nan)
    # Re-impute: the former infinities are now NaN and get the column median.
    X_imputed = imputer.fit_transform(X)
    X = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
    print(f"   Handled infinite values")
else:
    print(f"    No infinite values")

Missing values: 0 (0.00%)
 No missing values!

 Checking for infinite values...
    No infinite values


Train-Test Split

In [10]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of the target the same in both partitions.
# NOTE(review): rows are split independently of each other. The scaling
# output further down lists a NACCID column among the features; if that is a
# subject identifier with several rows per subject, the same subject can end
# up on both sides of the split — confirm whether a group-aware split is
# needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Build the whole report first, then emit it in one go.
split_report = [
    f" Data split complete!\n",
    f"Training set:",
    f"   X_train: {X_train.shape[0]:,} rows x {X_train.shape[1]:,} features",
    f"   y_train: {len(y_train):,} samples",
    f"   Distribution: {y_train.value_counts().to_dict()}",
    f"\nTest set:",
    f"   X_test: {X_test.shape[0]:,} rows x {X_test.shape[1]:,} features",
    f"   y_test: {len(y_test):,} samples",
    f"   Distribution: {y_test.value_counts().to_dict()}",
]
print("\n".join(split_report))

 Data split complete!

Training set:
   X_train: 156,156 rows x 58 features
   y_train: 156,156 samples
   Distribution: {0: 110084, 1: 46072}

Test set:
   X_test: 39,040 rows x 58 features
   y_test: 39,040 samples
   Distribution: {0: 27522, 1: 11518}


Feature Scaling (training set capped at 10,000 rows to limit memory use)

In [11]:
# Largest number of training rows to keep. Anything above this is reduced to
# a stratified subsample before scaling. Set to None to keep every row.
MAX_TRAIN_ROWS = 10000

# Report the actual footprint so the need for subsampling can be judged from
# a number rather than assumed.
memory_mb = X_train.memory_usage(deep=True).sum() / 1024**2
print(f"X_train memory: {memory_mb:.1f} MB")


# CRITICAL: Use sample if dataset is large (prevents crashes)
if MAX_TRAIN_ROWS is not None and len(X_train) > MAX_TRAIN_ROWS:
    print(f"\n Large dataset detected ({len(X_train):,} rows)")
    print(f"   Using sample to prevent kernel crash...")

    sample_size = MAX_TRAIN_ROWS

    # Draw the subsample stratified on the target so it keeps the class
    # proportions the train/test split established; a plain random draw does
    # not guarantee that. Only row positions are split, so no second copy of
    # the feature matrix is materialised.
    keep_pos, dropped_pos = train_test_split(
        np.arange(len(X_train)),
        train_size=sample_size,
        random_state=RANDOM_STATE,
        stratify=y_train,
    )
    sample_idx = X_train.index[keep_pos]

    # X_train and y_train are reduced together so they stay row-aligned for
    # the save step that pickles y_train next to the scaled features.
    X_train = X_train.iloc[keep_pos]
    y_train = y_train.iloc[keep_pos]

    print(f"    Reduced to {len(X_train):,} rows")
    print(f"    NOTE: Using sample for training!")

# Initialize scaler
scaler = StandardScaler()

try:
    print(f"\nScaling {X_train.shape[1]} features...")

    # Fit on the training rows only, then wrap the array back into a frame
    # with the original column names and index.
    X_train_scaled = scaler.fit_transform(X_train)
    X_train_scaled = pd.DataFrame(
        X_train_scaled,
        columns=X_train.columns,
        index=X_train.index
    )

    # The test rows reuse the statistics learned from the training rows.
    X_test_scaled = scaler.transform(X_test)
    X_test_scaled = pd.DataFrame(
        X_test_scaled,
        columns=X_test.columns,
        index=X_test.index
    )
    print(f"\n Scaling successful!")
    print(f"   Train scaled: {X_train_scaled.shape}")
    print(f"   Test scaled: {X_test_scaled.shape}")

    # Verify scaling: each scaled training column should have mean ~0, std ~1.
    print(f"\nVerification (first 3 features):")
    for col in X_train_scaled.columns[:3]:
        mean = X_train_scaled[col].mean()
        std = X_train_scaled[col].std()
        print(f"   {col}: mean={mean:.4f}, std={std:.4f}")

except MemoryError:
    # Print actionable advice, then re-raise so the failure stays visible.
    print("\n MEMORY ERROR OCCURRED!")
    print("\n   Solutions:")
    print("   1. Reduce MAX_TRAIN_ROWS to 5000")
    print("   2. Restart kernel: Kernel → Restart & Clear Output")
    print("   3. Close other applications")
    raise


 Large dataset detected (156,156 rows)
   Using sample to prevent kernel crash...
    Reduced to 10,000 rows
    NOTE: Using sample for training!

Scaling 58 features...

 Scaling successful!
   Train scaled: (10000, 58)
   Test scaled: (39040, 58)

Verification (first 3 features):
   NACCID: mean=-0.0000, std=1.0001
   NACCADC: mean=0.0000, std=1.0001
   PACKET: mean=0.0000, std=1.0001


Save Preprocessed Data

In [12]:
# (destination path, object to pickle, confirmation line) for every artifact,
# in the order they are written.
artifacts = [
    ('data/X_train_scaled.pkl', X_train_scaled, "    Saved: X_train_scaled.pkl"),
    ('data/X_test_scaled.pkl', X_test_scaled, "   Saved: X_test_scaled.pkl"),
    ('data/y_train.pkl', y_train, "    Saved: y_train.pkl"),
    ('data/y_test.pkl', y_test, "   Saved: y_test.pkl"),
    ('data/scaler.pkl', scaler, "    Saved: scaler.pkl"),
]

# Write each object to its own pickle file and confirm once the file is closed.
for out_path, payload, confirmation in artifacts:
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)
    print(confirmation)



    Saved: X_train_scaled.pkl
   Saved: X_test_scaled.pkl
    Saved: y_train.pkl
   Saved: y_test.pkl
    Saved: scaler.pkl
