In [8]:
# =============================================================================
# IMPORTS & SETUP
# =============================================================================
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn - Preprocessing & Selection
from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.cluster import KMeans  # <--- NEW: For Clustering

# Sklearn - Models & Metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score,
    accuracy_score, classification_report, confusion_matrix
)

warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")

# Configuration
DATA_DIR = Path(".")  # Update to your data path
KEY_COLS = ["student_id", "code_module", "code_presentation"]
TARGET_COL = "target_pass"
PASS_MARK = 40

# Leakage columns to drop later
LEAKAGE_COLS = [
    "final_result", "date_unregistration", "target_score", 
    "target_score_norm", "weight_covered", "target_score_x", "target_score_y", "target_pass"
]

print("✓ Libraries & Config Loaded")

✓ Libraries & Config Loaded


In [9]:
# =============================================================================
# LOAD DATA
# =============================================================================
def load_csv(name: str) -> pd.DataFrame:
    return pd.read_csv(DATA_DIR / name)

print("Loading OULAD tables...")
assessments          = load_csv("./assessments.csv")
courses              = load_csv("./courses.csv")
student_assessment   = load_csv("./studentAssessment.csv")
student_info         = load_csv("./studentInfo.csv")
student_registration = load_csv("./studentRegistration.csv")
student_vle          = load_csv("./studentVle.csv")
vle                  = load_csv("./vle.csv")

# Standardize IDs
for df in [student_assessment, student_info, student_registration, student_vle]:
    df.rename(columns={"id_student": "student_id"}, inplace=True)

print("✓ Data Loaded")

Loading OULAD tables...
✓ Data Loaded


In [10]:
# =============================================================================
# 1. BASE FEATURES (Demographics & Registration)
# =============================================================================
print("Building Base Features...")

# Demographics
demographics = student_info[KEY_COLS + [
    "gender", "region", "highest_education", "imd_band",
    "age_band", "num_of_prev_attempts", "studied_credits", "disability", "final_result"
]].copy()

# Fix Targets (Binary Class + Regression)
demographics["target_pass"] = demographics["final_result"].isin(["Pass", "Distinction"]).astype(int)

# Score Aggregation (Fixes the duplicate merge bug)
score_df = student_assessment.merge(
    assessments[["id_assessment", "code_module", "code_presentation", "weight"]], 
    on="id_assessment", how="left"
)
score_agg = (
    score_df.groupby(KEY_COLS)
    .apply(lambda g: pd.Series({
        "target_score": np.average(g["score"].fillna(0), weights=g["weight"].fillna(1)) 
                        if g["weight"].sum() > 0 else np.nan
    })).reset_index()
)

# Merge targets ONCE (This fixes the bug!)
demographics = demographics.merge(score_agg, on=KEY_COLS, how="left")

# =============================================================================
# 2. TEMPORAL VLE FEATURES (Velocity & Recency)
# =============================================================================
print("Building Temporal VLE Features...")

# Prepare VLE interactions
vle_interactions = student_vle.merge(
    vle[["id_site", "code_module", "code_presentation", "activity_type"]],
    on=["id_site", "code_module", "code_presentation"], how="left"
)
vle_interactions["week"] = (vle_interactions["date"] // 7).astype(int)
vle_interactions = vle_interactions[vle_interactions["week"] >= 0]

# Aggregate by Student-Week
weekly_vle = vle_interactions.groupby(KEY_COLS + ["week"]).agg(
    weekly_clicks=("sum_click", "sum"),
    active_days=("date", "nunique")
).reset_index()

# Create a Skeleton (Student x Week) to handle weeks with ZERO activity
# This is crucial for "Weeks Since Last Login"
unique_students = demographics[KEY_COLS].drop_duplicates()
weeks = range(0, 40) # Course length approx 39 weeks
skeleton = pd.concat([unique_students.assign(week=w) for w in weeks], ignore_index=True)

# Merge actual data into skeleton (fill missing weeks with 0)
ews_df = skeleton.merge(weekly_vle, on=KEY_COLS + ["week"], how="left")
ews_df[["weekly_clicks", "active_days"]] = ews_df[["weekly_clicks", "active_days"]].fillna(0)

# Sort for rolling calculations
ews_df = ews_df.sort_values(KEY_COLS + ["week"])

# --- NEW FEATURE: CUMULATIVE ---
ews_df["cum_clicks"] = ews_df.groupby(KEY_COLS)["weekly_clicks"].cumsum()

# --- NEW FEATURE: VELOCITY (Trend) ---
# Difference in clicks from previous week
ews_df["clicks_velocity"] = ews_df.groupby(KEY_COLS)["weekly_clicks"].diff().fillna(0)

# --- NEW FEATURE: RECENCY (Weeks since last activity) ---
# If active_days > 0, set current week. Forward fill this, then subtract from current week.
ews_df["last_active_week"] = ews_df["week"].where(ews_df["active_days"] > 0)
ews_df["last_active_week"] = ews_df.groupby(KEY_COLS)["last_active_week"].ffill().fillna(-1)
ews_df["weeks_since_last_activity"] = ews_df["week"] - ews_df["last_active_week"]

# --- NEW FEATURE: COHORT RELATIVE PERFORMANCE ---
# Compare student clicks to the average of their module-presentation cohort
cohort_means = ews_df.groupby(["code_module", "code_presentation", "week"])["cum_clicks"].transform("mean")
ews_df["clicks_vs_cohort"] = ews_df["cum_clicks"] / (cohort_means + 1) # +1 to avoid div/0

print(f"✓ Temporal Features Built: {ews_df.shape}")

Building Base Features...
Building Temporal VLE Features...
✓ Temporal Features Built: (1303720, 11)


In [11]:
# =============================================================================
# 3. UNSUPERVISED CLUSTERING (Student Segmentation)
# =============================================================================
print("Running Clustering...")

# We will cluster students based on their behavior pattern (Average Clicks & Variability)
cluster_data = ews_df.groupby(KEY_COLS).agg({
    "weekly_clicks": ["mean", "std", "max"],
    "active_days": "mean"
}).fillna(0)
cluster_data.columns = ["_".join(col) for col in cluster_data.columns]

# Scaling
scaler_cl = StandardScaler()
X_cluster = scaler_cl.fit_transform(cluster_data)

# K-Means (4 Clusters: e.g., High Flyers, Steady, Struggling, Ghosts)
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_data["cluster_id"] = kmeans.fit_predict(X_cluster)

# Merge Cluster ID back into main EWS dataframe
ews_df = ews_df.merge(cluster_data["cluster_id"], on=KEY_COLS, how="left")

# One-Hot Encode the Cluster ID for the Supervised Model
ews_df = pd.get_dummies(ews_df, columns=["cluster_id"], prefix="cluster")

print("✓ Clustering Completed & Merged")

Running Clustering...
✓ Clustering Completed & Merged


In [12]:
# =============================================================================
# 4. FINAL MERGE & CLEANING
# =============================================================================

# Merge Demographics
feature_store = ews_df.merge(demographics, on=KEY_COLS, how="left")

# Filter: Only weeks 0 to 38 (standard OULAD length)
feature_store = feature_store[(feature_store["week"] >= 0) & (feature_store["week"] <= 38)]

# Impute Missing Values
# IMD Band -> Mode
feature_store["imd_band"] = feature_store["imd_band"].fillna(feature_store["imd_band"].mode()[0])
# Age Band -> Mode
feature_store["age_band"] = feature_store["age_band"].fillna(feature_store["age_band"].mode()[0])

print("="*60)
print("FINAL FEATURE STORE READY")
print(f"Shape: {feature_store.shape}")
print(f"Columns: {list(feature_store.columns)}")
print("="*60)

FINAL FEATURE STORE READY
Shape: (1271127, 26)
Columns: ['student_id', 'code_module', 'code_presentation', 'week', 'weekly_clicks', 'active_days', 'cum_clicks', 'clicks_velocity', 'last_active_week', 'weeks_since_last_activity', 'clicks_vs_cohort', 'cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result', 'target_pass', 'target_score']


In [13]:
# =============================================================================
# 5. MODEL TRAINING (Temporal Split)
# =============================================================================

def train_at_week(week_num):
    print(f"\n--- Training for Week {week_num} ---")
    
    # 1. Snapshot: Take data only up to this week
    # For our temporal structure, we just take the row corresponding to this week
    # because it contains cumulative data + current velocity.
    df_week = feature_store[feature_store["week"] == week_num].copy()
    
    # 2. Define Features & Target
    drop_cols = KEY_COLS + LEAKAGE_COLS + ["week"]
    X = df_week.drop(columns=[c for c in drop_cols if c in df_week.columns])
    y = df_week["target_pass"]
    
    # 3. Preprocessing
    # Identify Cat/Num columns
    cat_cols = X.select_dtypes(include=['object']).columns
    num_cols = X.select_dtypes(include=['number']).columns
    
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])
    
    # 4. Split (GroupShuffleSplit to prevent student leakage)
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(X, y, groups=df_week["student_id"]))
    
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # 5. Pipeline & Train
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    model.fit(X_train, y_train)
    
    # 6. Evaluate
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]
    
    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC AUC:  {auc:.4f}")
    
    # Feature Importance (Optional)
    if hasattr(model['classifier'], 'feature_importances_'):
        # Get feature names after one-hot encoding
        ohe = model['preprocessor'].named_transformers_['cat']
        ohe_cols = ohe.get_feature_names_out(cat_cols)
        all_cols = list(num_cols) + list(ohe_cols)
        
        importances = pd.Series(model['classifier'].feature_importances_, index=all_cols)
        print("\nTop 5 Predictors:")
        print(importances.sort_values(ascending=False).head(5))

# --- Run for key intervention points ---
for w in [5, 10, 20]:
    train_at_week(w)


--- Training for Week 5 ---
Accuracy: 0.7259
ROC AUC:  0.8035

Top 5 Predictors:
clicks_vs_cohort    0.155136
cum_clicks          0.121007
weekly_clicks       0.108594
clicks_velocity     0.090902
active_days         0.067936
dtype: float64

--- Training for Week 10 ---
Accuracy: 0.7640
ROC AUC:  0.8444

Top 5 Predictors:
clicks_vs_cohort             0.160643
cum_clicks                   0.119714
last_active_week             0.095663
clicks_velocity              0.085349
weeks_since_last_activity    0.082028
dtype: float64

--- Training for Week 20 ---
Accuracy: 0.8448
ROC AUC:  0.9150

Top 5 Predictors:
last_active_week             0.151124
clicks_vs_cohort             0.141537
weeks_since_last_activity    0.123668
weekly_clicks                0.109970
active_days                  0.094397
dtype: float64
