# Assignment 3: Machine Learning for Huntington's Disease Prediction

---

**Objective:** Build and evaluate machine learning models to predict disease stage in Huntington's Disease patients using clinical, genetic, and molecular features.

**Dataset:** Huntington's Disease Dataset (48,536 patients, 13 clinical features)

**Target Variable:** Disease_Stage (5-class classification: Pre-symptomatic, Early Stage, Mid Stage, Late Stage, Advanced)

---

## Why This Matters

Accurate prediction of disease stage in Huntington's Disease enables:
- **Early Intervention:** Identify patients who would benefit from early treatment
- **Treatment Planning:** Tailor therapeutic strategies based on disease progression
- **Clinical Trials:** Stratify patients for more effective trial enrollment
- **Patient Counseling:** Provide evidence-based prognosis for personalized care

---

## Success Criteria

- High classification accuracy (>85%)
- Balanced precision and recall across all disease stages
- Interpretable models that align with clinical knowledge
- Robust generalization to unseen patient data

---

## 1. Introduction & Setup

### 1.1 Import Libraries

In [None]:
#import core libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

#utilities
import warnings

#silence every warning category for the rest of the session
#NOTE(review): this also hides any warning raised by later cells (library deprecations, solver messages) - confirm that is intended
warnings.filterwarnings(action='ignore')

#one fixed seed, reused by every randomised step, so repeated runs give the same results
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

#pandas display options: show every column, at most 100 rows, floats with 4 decimals
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.float_format': '{:.4f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#default matplotlib style with the seaborn "husl" colour palette
plt.style.use('default')
sns.set_palette("husl")

print("✓ Core libraries imported successfully")
print(f"✓ Random seed set to {RANDOM_STATE} for reproducibility")
print("\nNote: Additional libraries will be imported in relevant sections as needed")

In [None]:
#install required packages in jupyter kernel environment
#sys.executable is the interpreter running this kernel, so pip installs into that same environment
#the leading "!" is IPython shell syntax: this line only works inside a notebook, not as plain python
import sys
!{sys.executable} -m pip install scikit-learn xgboost shap lime --quiet

## 2. Load Data

In [None]:
#read the cleaned dataset produced in assignment 2
#(removing irrelevant columns, handling duplicates, etc was completed there, so no preprocessing is repeated here)
df = pd.read_csv('data/Huntington_Disease_Cleaned.csv')

#report the table size: rows are patients, columns are features
n_patients, n_columns = df.shape
print(f"Data loaded: {n_patients:,} patients, {n_columns} features")
print("Target variable: Disease_Stage (multi-class classification)")

In [None]:
#quick overview
#head() returns the first 5 rows; as the last expression in the cell the notebook renders it as a table
df.head()

In [None]:
#inspect how patients are spread over the target classes
#raw counts first, then the same breakdown as percentages to reveal any class imbalance
stage_column = df['Disease_Stage']

print("Disease Stage Distribution:")
print(stage_column.value_counts())
print("\nClass balance:")
print(stage_column.value_counts(normalize=True) * 100)

In [None]:
#bar chart of the number of patients in each disease stage
stage_counts = df['Disease_Stage'].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
stage_counts.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Distribution of Disease Stages')
ax.set_xlabel('Disease Stage')
ax.set_ylabel('Number of Patients')
#tilt the stage names so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
#list the columns currently in the dataframe, numbered from 1
print("Current features:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position}. {column_name}")

In [None]:
#derive four extra columns from the existing measurements
#assign() returns a new dataframe, so the original df is left untouched
df_fe = df.assign(
    #genetic risk score: CAG repeats × gene expression
    Genetic_Risk_Score=df['HTT_CAG_Repeat_Length'] * df['HTT_Gene_Expression_Level'],
    #age-adjusted CAG: repeat length per year of age (earlier onset = more aggressive)
    #NOTE(review): an Age of 0 would give inf here - presumably impossible after cleaning, confirm
    Age_Adjusted_CAG=df['HTT_CAG_Repeat_Length'] / df['Age'],
    #brain health index: remaining brain volume relative to protein damage
    #the +1 keeps the denominator non-zero when aggregation level is 0
    Brain_Health_Index=(100 - df['Brain_Volume_Loss']) / (df['Protein_Aggregation_Level'] + 1),
    #motor-cognitive composite: chorea score multiplied by brain volume loss
    #Cognitive_Decline is categorical, so it is not part of this product
    Motor_Cognitive_Composite=df['Chorea_Score'] * df['Brain_Volume_Loss'],
)

print("Created 4 new features")
print(f"Total features now: {df_fe.shape[1]}")

In [None]:
#summary statistics for the four engineered columns only
engineered_columns = [
    'Genetic_Risk_Score',
    'Age_Adjusted_CAG',
    'Brain_Health_Index',
    'Motor_Cognitive_Composite',
]
df_fe[engineered_columns].describe()

In [None]:
from sklearn.preprocessing import LabelEncoder

#turn the categorical columns into numeric ones
#one-hot encoding for these four; drop_first removes one dummy per variable
categorical_columns = ['Sex', 'Family_History', 'Motor_Symptoms', 'Cognitive_Decline']
df_fe = pd.get_dummies(df_fe, columns=categorical_columns, drop_first=True)

#gene mutation type becomes a single integer-coded column that replaces the original text column
#NOTE(review): integer codes impose an order on the mutation types - confirm that is acceptable for the linear/distance-based models
le = LabelEncoder()
mutation_codes = le.fit_transform(df_fe['Gene_Mutation_Type'])
df_fe = df_fe.drop(columns='Gene_Mutation_Type').assign(Gene_Mutation_Type_Encoded=mutation_codes)

print("Encoded categorical variables")
print(f"Total features after encoding: {df_fe.shape[1]}")

In [None]:
#list every column of the engineered and encoded dataframe, numbered from 1
print("Final features after engineering:")
for position, column_name in enumerate(df_fe.columns, start=1):
    print(f"{position}. {column_name}")

## 4. Feature Selection

In [None]:
#split the engineered table into the target column (y) and every other column (X)
target_column = 'Disease_Stage'
y = df_fe[target_column]
X = df_fe.drop(columns=target_column)

sample_count, feature_count = X.shape
print(f"Features: {feature_count}")
print(f"Samples: {sample_count}")
print(f"Target classes: {y.nunique()}")

In [None]:
#method 1: ANOVA f-test
#tests relationship between each feature and target
#works well for numerical features in classification
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X, y)

anova_features = X.columns[selector.get_support()].tolist()
print("Top 10 features by ANOVA:")
for i, feat in enumerate(anova_features, 1):
    print(f"{i}. {feat}")

In [None]:
#method 2: mutual information
#captures non-linear relationships between features and target
#complements ANOVA which only finds linear relationships
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X, y, random_state=RANDOM_STATE)
mi_features = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

print("Top 10 features by Mutual Information:")
for i, (feat, score) in enumerate(mi_features.head(10).items(), 1):
    print(f"{i}. {feat}: {score:.4f}")

In [None]:
#method 3: random forest importance
#embedded method that considers feature interactions
#importance based on how much each feature improves tree splits
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X, y)

rf_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

print("Top 10 features by Random Forest:")
for i, (feat, score) in enumerate(rf_importance.head(10).items(), 1):
    print(f"{i}. {feat}: {score:.4f}")

In [None]:
#find consensus features across methods
#features appearing in top 10 of at least 2 methods are most reliable
anova_top10 = set(anova_features)
mi_top10 = set(mi_features.head(10).index)
rf_top10 = set(rf_importance.head(10).index)

#count how many methods selected each feature
all_features = anova_top10 | mi_top10 | rf_top10
feature_counts = {}
for feat in all_features:
    count = 0
    if feat in anova_top10: count += 1
    if feat in mi_top10: count += 1
    if feat in rf_top10: count += 1
    feature_counts[feat] = count

#select features with consensus (appear in 2+ methods)
selected_features = [feat for feat, count in feature_counts.items() if count >= 2]

print(f"\nConsensus features (≥2 methods): {len(selected_features)}")
for feat in sorted(selected_features):
    methods = []
    if feat in anova_top10: methods.append('ANOVA')
    if feat in mi_top10: methods.append('MI')
    if feat in rf_top10: methods.append('RF')
    print(f"  {feat} [{', '.join(methods)}]")

In [None]:
#create final dataset with selected features
X_selected = X[selected_features]

print(f"\nOriginal features: {X.shape[1]}")
print(f"Selected features: {X_selected.shape[1]}")
print(f"Reduction: {((X.shape[1] - X_selected.shape[1]) / X.shape[1] * 100):.1f}%")

## 5. Machine Learning Models

In [None]:
#split data into train and test sets
#stratify maintains class balance in both sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")

In [None]:
from sklearn.preprocessing import StandardScaler

#standardise the features for the scale-sensitive models (svm, knn)
#tree-based models (random forest) don't need scaling and keep using the unscaled frames
#the mean/std are learned from the training rows only, then applied to both sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled (mean=0, std=1)")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

#model 1: logistic regression (baseline)
#simple, interpretable linear model that also provides probability estimates
#trained and evaluated on the scaled features
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE).fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

#weighted F1 averages the per-class scores by class frequency
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr, average='weighted')

print("Logistic Regression:")
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"F1-Score: {lr_f1:.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

#model 2: random forest
#ensemble of 100 trees; handles non-linear relationships and exposes feature importances
#trained on the unscaled features, using all available cores (n_jobs=-1)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf, average='weighted')

print("Random Forest:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"F1-Score: {rf_f1:.4f}")

In [None]:
# TODO: xgboost has an import error - investigate later
# (or find an alternative to use in its place)
# until then this cell trains nothing and only records that the model was skipped
print("XGBoost: Skipped")

In [None]:
from sklearn.svm import SVC

#model 3: support vector machine
#rbf kernel captures non-linear patterns; effective in high-dimensional space
#trained and evaluated on the scaled features
svm_model = SVC(kernel='rbf', random_state=RANDOM_STATE).fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm, average='weighted')

print("Support Vector Machine:")
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"F1-Score: {svm_f1:.4f}")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#model 4: k-nearest neighbors
#non-parametric: each test sample is classified from its 5 most similar training samples
#trained and evaluated on the scaled features
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)

knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn, average='weighted')

print("K-Nearest Neighbors:")
print(f"Accuracy: {knn_accuracy:.4f}")
print(f"F1-Score: {knn_f1:.4f}")

In [None]:
#side-by-side comparison of the four trained models on the test set
#test-set predictions keyed by display name, in the order the models were trained
predictions = {
    'Logistic Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
    'SVM': y_pred_svm,
    'KNN': y_pred_knn,
}

#one row per model with accuracy and weighted F1, best accuracy first
results = pd.DataFrame({
    'Model': list(predictions),
    'Accuracy': [accuracy_score(y_test, y_pred) for y_pred in predictions.values()],
    'F1-Score': [f1_score(y_test, y_pred, average='weighted') for y_pred in predictions.values()],
}).sort_values('Accuracy', ascending=False)

print("\nModel Comparison:")
print(results.to_string(index=False))