# Lab 3: Weakest Link Classification
## 4DV652 - Data Intensive Systems

**Project:** Automated Movement Assessment (AIMO)  
**Task:** Classify which joint/body region is the weakest link during a movement

### Data Structure (UPDATED)
The data is now provided as:
- **Single CSV file:** train_dataset.csv
- **Pre-merged:** Movement features + weak link scores combined
- **Pre-split:** The file contains only the training portion; a validation subset is held out from it in Step 3

### Weak Link Classes (14 categories)
ForwardHead, LeftArmFallForward, RightArmFallForward, LeftShoulderElevation, RightShoulderElevation, ExcessiveForwardLean, LeftAsymmetricalWeightShift, RightAsymmetricalWeightShift, LeftKneeMovesInward, RightKneeMovesInward, LeftKneeMovesOutward, RightKneeMovesOutward, LeftHeelRises, RightHeelRises

## Step 0: Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Plot defaults for the notebook: seaborn's white grid theme and a 14x7 figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 7)})

# Confirm the imports worked and record the pandas version in the cell output.
for status_line in ("✓ All imports successful", f"✓ Pandas version: {pd.__version__}"):
    print(status_line)

## Step 1: Data Loading & Exploration

**What we're doing:**
- Loading the pre-merged CSV file
- The file contains both movement features (38) and weak link scores (14)
- Creating the classification target: for each row, the weak link column with the highest score

**What to expect:**
- CSV loads successfully
- ~1600-1700 rows
- 38 movement features
- 14 weak link score columns

In [None]:
import os

# Location of the pre-merged training CSV. The previous hard-coded location is
# kept as the default so existing runs behave the same, but it can now be
# overridden without editing this cell by setting the TRAIN_DATA_PATH
# environment variable before starting the notebook.
TRAIN_DATA_PATH = os.environ.get(
    "TRAIN_DATA_PATH",
    "/Users/reemothman/Downloads/Data-intensive-systems/A3/A3_Data/train_dataset.csv",
)

print("="*80)
print("STEP 1: DATA LOADING & EXPLORATION")
print("="*80)

# Only the read itself can raise FileNotFoundError, so it is the only statement
# inside the try; the reporting below runs in the else branch on success.
try:
    df = pd.read_csv(TRAIN_DATA_PATH)
except FileNotFoundError:
    print(f"❌ Error: File not found at {TRAIN_DATA_PATH}")
    print("Please update TRAIN_DATA_PATH to your actual file location")
    # Re-raise so later cells do not run against a missing `df`.
    raise
else:
    print(f"\n✓ Loaded training dataset: {df.shape}")
    print(f"  Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print(f"\nFirst few rows:")
    print(df.head())
    print(f"\nColumn names:")
    print(df.columns.tolist())

In [None]:
# The 14 weak link categories; a column is a weak link score column when its
# name contains one of these strings.
WEAK_LINK_TYPES = [
    'ForwardHead',
    'LeftArmFallForward',
    'RightArmFallForward',
    'LeftShoulderElevation',
    'RightShoulderElevation',
    'ExcessiveForwardLean',
    'LeftAsymmetricalWeightShift',
    'RightAsymmetricalWeightShift',
    'LeftKneeMovesInward',
    'RightKneeMovesInward',
    'LeftKneeMovesOutward',
    'RightKneeMovesOutward',
    'LeftHeelRises',
    'RightHeelRises',
]

print(f"Defined {len(WEAK_LINK_TYPES)} weak link categories")

# Identify columns by substring match on the column name. The two checks are
# independent, so a column matching both patterns lands in both lists.
movement_markers = ('Angle', 'NASM', 'Time')
weak_link_cols = []
movement_cols = []
for column_name in df.columns:
    if any(link_name in column_name for link_name in WEAK_LINK_TYPES):
        weak_link_cols.append(column_name)
    if any(marker in column_name for marker in movement_markers):
        movement_cols.append(column_name)

print(f"✓ Found {len(movement_cols)} movement feature columns")
print(f"✓ Found {len(weak_link_cols)} weak link score columns")

In [None]:
print("\nCreating target variable...")

# Take the score columns once; the target is the name of the column holding the
# row-wise maximum, and the score is that maximum itself.
weak_link_scores = df[weak_link_cols]
df['Weakest_Link'] = weak_link_scores.idxmax(axis=1)
df['Weakest_Link_Score'] = weak_link_scores.max(axis=1)

print(f"\n✓ Added 'Weakest_Link' target column")
print(f"\nClass distribution:")
class_dist = df['Weakest_Link'].value_counts()
print(class_dist)

# Ratio between the most and the least frequent class.
imbalance_ratio = class_dist.max() / class_dist.min()
print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")

In [None]:
print("\nPreparing dataset for modeling...")

# Work on copies so later imputation never writes back into `df`.
X = df[movement_cols].copy()
y = df['Weakest_Link'].copy()

print(f"✓ Feature matrix X shape: {X.shape}")
print(f"✓ Target vector y shape: {y.shape}")

missing_in_X = X.isnull().sum().sum()
missing_in_y = y.isnull().sum()
print(f"\nMissing values in X: {missing_in_X}")
print(f"Missing values in y: {missing_in_y}")

if missing_in_X == 0 and missing_in_y == 0:
    print("✓ No missing values - data is clean")
else:
    # Fill feature gaps with the column mean, drop rows that have no label,
    # then realign X to the surviving label index.
    X = X.fillna(X.mean())
    y = y.dropna()
    X = X.loc[y.index]
    print(f"After handling: X={X.shape}, y={y.shape}")

## Step 2: Exploratory Data Analysis (Feature Importance)

In [None]:
banner = "="*80
print("\n" + banner)
print("STEP 2: FEATURE IMPORTANCE ANALYSIS")
print(banner)

# mutual_info_classif is given integer-encoded labels; this encoder is
# temporary and separate from the one fitted for modelling in Step 3.
le_temp = LabelEncoder()
y_encoded_temp = le_temp.fit_transform(y)
mi_scores = mutual_info_classif(X, y_encoded_temp, random_state=42)

# One row per feature, ordered from most to least informative.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Mutual_Information': mi_scores
})
feature_importance = feature_importance.sort_values('Mutual_Information', ascending=False)

print(f"\nTop 10 Most Informative Features:")
top_ten = feature_importance.head(10)
print(top_ten.to_string(index=False))

In [None]:
# Horizontal bar chart of the first 15 rows of `feature_importance`.
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['Mutual_Information'].values)

# One tick per bar, labelled with the feature name.
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['Feature'].values)

ax.set_xlabel('Mutual Information Score', fontsize=12, fontweight='bold')
ax.set_title('Top 15 Most Informative Features', fontsize=14, fontweight='bold')

# Flip the axis so the first row is drawn at the top of the chart.
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## Step 3: Data Preprocessing - Train-Validation Split & Scaling

In [None]:
banner = "="*80
print("\n" + banner)
print("STEP 3: TRAIN-VALIDATION SPLIT & SCALING")
print(banner)

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")

# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

print(f"\n✓ Feature scaling completed")
print(f"  Mean: {X_train_scaled.mean():.6f}")
print(f"  Std Dev: {X_train_scaled.std():.6f}")

# Map class names to integer ids; the mapping is learned from the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

print(f"\n✓ Target variable encoded")
print(f"  Classes: {list(label_encoder.classes_)}")

## Step 4: Train Classification Models

In [None]:
banner = "="*80
print("\n" + banner)
print("STEP 4: TRAINING CLASSIFICATION MODELS")
print(banner)

# Candidate classifiers keyed by display name; the insertion order is the
# order in which they are trained and reported.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN (k=5)': KNeighborsClassifier(n_neighbors=5),
    'KNN (k=10)': KNeighborsClassifier(n_neighbors=10),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Random Forest (10 trees)': RandomForestClassifier(n_estimators=10, random_state=42),
    'Random Forest (50 trees)': RandomForestClassifier(n_estimators=50, random_state=42),
}

# Per-model stores, all keyed by the same display name as `models`.
trained_models = {}
train_predictions = {}
val_predictions = {}

print("\nTraining models...\n")

for name, model in models.items():
    print(f"Training {name}...", end=' ')

    # fit() returns the estimator itself, so the stored object is the fitted model.
    trained_models[name] = model.fit(X_train_scaled, y_train_encoded)

    # Keep predictions on both splits for the evaluation step.
    train_predictions[name] = model.predict(X_train_scaled)
    val_predictions[name] = model.predict(X_val_scaled)

    train_acc = accuracy_score(y_train_encoded, train_predictions[name])
    val_acc = accuracy_score(y_val_encoded, val_predictions[name])
    print(f"✓ (Train: {train_acc:.4f}, Val: {val_acc:.4f})")

print(f"\n✓ All {len(models)} models trained")

## Step 5: Model Evaluation

In [None]:
banner = "="*80
print("\n" + banner)
print("STEP 5: MODEL EVALUATION")
print(banner)

# Shared settings for precision/recall/F1: weighted averaging across classes,
# with undefined ratios reported as 0.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

results = []
for model_name in models:
    val_pred = val_predictions[model_name]
    train_acc = accuracy_score(y_train_encoded, train_predictions[model_name])
    val_acc = accuracy_score(y_val_encoded, val_pred)

    results.append({
        'Model': model_name,
        'Train_Accuracy': train_acc,
        'Val_Accuracy': val_acc,
        'Precision': precision_score(y_val_encoded, val_pred, **weighted_kwargs),
        'Recall': recall_score(y_val_encoded, val_pred, **weighted_kwargs),
        'F1_Weighted': f1_score(y_val_encoded, val_pred, **weighted_kwargs),
        # Positive gap = the model scores better on data it was trained on.
        'Overfitting_Gap': train_acc - val_acc,
    })

# Rank the candidates by validation accuracy, best first.
results_df = pd.DataFrame(results).sort_values('Val_Accuracy', ascending=False)

print("\nMODEL COMPARISON:")
print(results_df.to_string(index=False))

# The first row after sorting is the winner.
top_row = results_df.iloc[0]
best_model_name = top_row['Model']
best_model = trained_models[best_model_name]

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"    Validation Accuracy: {top_row['Val_Accuracy']:.4f}")
print(f"    F1-Score: {top_row['F1_Weighted']:.4f}")

In [None]:
print("\nCLASSIFICATION REPORT:")
y_pred_best = val_predictions[best_model_name]

# Pass the full set of encoded class ids explicitly. Without `labels`, the
# report infers them from the labels present in y_true/y_pred; if a rare class
# is absent from both, the inferred count no longer matches `target_names`
# and scikit-learn raises a ValueError.
report_labels = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_val_encoded,
    y_pred_best,
    labels=report_labels,
    target_names=label_encoder.classes_,
    digits=4,
    zero_division=0,  # same convention as the weighted metrics in Step 5
))

In [None]:
# Fix the label set to every class known to the encoder so the matrix always
# has one row/column per class. Without `labels`, a class missing from both the
# validation targets and the predictions shrinks the matrix, and the tick
# labels below (one per encoder class) would no longer line up with the cells.
matrix_labels = np.arange(len(label_encoder.classes_))

fig, ax = plt.subplots(figsize=(12, 10))
cm = confusion_matrix(y_val_encoded, y_pred_best, labels=matrix_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title(f'Confusion Matrix: {best_model_name}')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Step 6: Cross-Validation

In [None]:
from sklearn.pipeline import make_pipeline

print("\n" + "="*80)
print("STEP 6: CROSS-VALIDATION")
print("="*80)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\nPerforming 5-fold cross-validation...\n")

for model_name, model in models.items():
    # Cross-validate on the unscaled training features with the scaler inside
    # the pipeline, so it is re-fitted on the training part of every fold.
    # Using the pre-scaled matrix would let each held-out fold influence the
    # scaling statistics. cross_val_score clones the pipeline, so the models
    # fitted in Step 4 are not modified.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train_encoded, cv=skf, scoring='accuracy')
    print(f"{model_name}:")
    print(f"  Mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"  Folds: {[f'{x:.4f}' for x in cv_scores]}")
    print()

## Step 7: Feature Selection

In [None]:
from sklearn.base import clone

print("\n" + "="*80)
print("STEP 7: FEATURE SELECTION (RFE)")
print("="*80)

best_model_instance = trained_models[best_model_name]
total_features = X_train.shape[1]

# Cap every candidate at the number of available features and drop duplicates,
# so no configuration is fitted twice when there are fewer than 30 features.
n_features_to_try = sorted({min(n, total_features) for n in (10, 15, 20, 25, 30, total_features)})

# RFE ranks features through the estimator's `coef_` or `feature_importances_`.
# Some candidates (KNN, QDA) expose neither, and RFE would raise if one of them
# won Step 5. In that case a Random Forest is used only to rank the features;
# the model evaluated on the reduced feature set is still the best model.
if hasattr(best_model_instance, 'coef_') or hasattr(best_model_instance, 'feature_importances_'):
    ranking_estimator = best_model_instance
else:
    ranking_estimator = RandomForestClassifier(n_estimators=50, random_state=42)
    print(f"\nℹ {best_model_name} has no coef_/feature_importances_; "
          f"ranking features with a Random Forest (50 trees) instead")

rfe_results = []

print(f"\nTesting different feature counts...\n")

for n_features in n_features_to_try:
    # RFE fits its own clone of the ranking estimator, so the trained best
    # model is left untouched.
    rfe = RFE(ranking_estimator, n_features_to_select=n_features)
    rfe.fit(X_train_scaled, y_train_encoded)

    X_train_rfe = rfe.transform(X_train_scaled)
    X_val_rfe = rfe.transform(X_val_scaled)

    # Fresh, unfitted copy of the best model with identical hyper-parameters.
    model_rfe = clone(best_model_instance)
    model_rfe.fit(X_train_rfe, y_train_encoded)

    y_val_pred_rfe = model_rfe.predict(X_val_rfe)
    val_acc_rfe = accuracy_score(y_val_encoded, y_val_pred_rfe)

    # Share of the original features that was removed, in percent.
    reduction = (1 - n_features / total_features) * 100
    print(f"Features: {n_features:2d} | Reduction: {reduction:5.1f}% | Val Acc: {val_acc_rfe:.4f}")

    rfe_results.append({'Features': n_features, 'Reduction_%': reduction, 'Val_Accuracy': val_acc_rfe})

## Step 8: Final Summary

In [None]:
banner = "="*80
print("\n" + banner)
print("FINAL SUMMARY")
print(banner)

# Size of the modelling dataset and number of encoded target classes.
n_samples, n_columns = X.shape[0], X.shape[1]
print(f"\n📊 DATASET:")
print(f"  Samples: {n_samples}")
print(f"  Features: {n_columns}")
print(f"  Classes: {len(label_encoder.classes_)}")

# `results_df` is sorted by validation accuracy, so its first row is the champion.
champion = results_df.iloc[0]
print(f"\n🏆 CHAMPION MODEL:")
print(f"  Name: {best_model_name}")
print(f"  Validation Accuracy: {champion['Val_Accuracy']:.4f}")
print(f"  F1-Score: {champion['F1_Weighted']:.4f}")
print(f"  Overfitting Gap: {champion['Overfitting_Gap']:.4f}")

print(f"\n✓ Lab 3 - ML Phase: COMPLETE")

## Step 9: Save Models

In [None]:
import joblib
import os

# All artefacts needed to reuse the classifier are written to one directory.
output_path = "./models"
os.makedirs(output_path, exist_ok=True)

best_model_instance = trained_models[best_model_name]

# Persist the fitted model together with the scaler and label encoder that
# were fitted alongside it in Step 3.
joblib.dump(best_model_instance, os.path.join(output_path, 'weakest_link_classifier.pkl'))
joblib.dump(scaler, os.path.join(output_path, 'scaler.pkl'))
joblib.dump(label_encoder, os.path.join(output_path, 'label_encoder.pkl'))

# One feature name per line, in the column order of X. The encoding is fixed
# so the file reads back identically on every platform, and the lines are
# written in a single call.
with open(os.path.join(output_path, 'feature_names.txt'), 'w', encoding='utf-8') as f:
    f.writelines(f"{feat}\n" for feat in X.columns)

results_df.to_csv(os.path.join(output_path, 'model_results.csv'), index=False)

print(f"✓ Model saved to {output_path}/weakest_link_classifier.pkl")
print(f"✓ Scaler saved to {output_path}/scaler.pkl")
print(f"✓ Label encoder saved to {output_path}/label_encoder.pkl")
print(f"✓ Feature names saved to {output_path}/feature_names.txt")
print(f"✓ Results saved to {output_path}/model_results.csv")