**Imports + Paths**

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_DIR = Path("../../data/raw")
PROCESSED_DIR = Path("../../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

KEY_COLS = ["student_id", "code_module", "code_presentation"]  # keep consistent everywhere


**Load all tables + basic inspection**

In [11]:
def load_csv(name: str) -> pd.DataFrame:
    return pd.read_csv(DATA_DIR / name)

assessments          = load_csv("assessments.csv")
courses              = load_csv("courses.csv")
student_assessment   = load_csv("studentAssessment.csv")
student_info         = load_csv("studentInfo.csv")
student_registration = load_csv("studentRegistration.csv")
student_vle          = load_csv("studentVle.csv")
vle                  = load_csv("vle.csv")

# normalize id_student -> student_id everywhere it appears
student_assessment   = student_assessment.rename(columns={"id_student": "student_id"})
student_info         = student_info.rename(columns={"id_student": "student_id"})
student_registration = student_registration.rename(columns={"id_student": "student_id"})
student_vle          = student_vle.rename(columns={"id_student": "student_id"})

tables = {
    "assessments": assessments,
    "courses": courses,
    "student_assessment": student_assessment,
    "student_info": student_info,
    "student_registration": student_registration,
    "student_vle": student_vle,
    "vle": vle,
}

for name, df in tables.items():
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} cols")


assessments: 206 rows, 6 cols
courses: 22 rows, 3 cols
student_assessment: 173912 rows, 5 cols
student_info: 32593 rows, 12 cols
student_registration: 32593 rows, 5 cols
student_vle: 10655280 rows, 6 cols
vle: 6364 rows, 6 cols


**Demographics** (static features)

In [12]:
demographics = student_info[KEY_COLS + [
    "gender",
    "region",
    "highest_education",
    "imd_band",
    "age_band",
    "num_of_prev_attempts",
    "studied_credits",
    "disability",
]].copy()


**Course features** (static)

In [13]:
course_features = courses[["code_module", "code_presentation", "module_presentation_length"]].copy()


**Targets (classification and regression)**

Classification: Pass vs fail/withdraw

In [14]:
targets_class = student_info[KEY_COLS + ["final_result"]].copy()
targets_class["target_pass"] = targets_class["final_result"].isin(["Pass", "Distinction"]).astype(int)


Regression: Compute weighted final score from assesment + studentAssesment

In [15]:
# Join student scores to assessment weights + module keys
sa = student_assessment.merge(
    assessments[["id_assessment", "code_module", "code_presentation", "weight"]],
    on="id_assessment",
    how="left",
)

# numeric safety
sa["score"] = pd.to_numeric(sa["score"], errors="coerce")
sa["weight"] = pd.to_numeric(sa["weight"], errors="coerce")

# drop rows lacking weight/score
sa = sa.dropna(subset=["weight", "score", "code_module", "code_presentation"])

# weighted contribution (0-100 scale)
sa["weighted_contrib"] = sa["score"] * (sa["weight"] / 100.0)

targets_reg = (
    sa.groupby(KEY_COLS, as_index=False)
      .agg(
          target_score=("weighted_contrib", "sum"),
          weight_covered=("weight", "sum"),
      )
)

# Optional normalized score (performance on submitted work only)
targets_reg["target_score_norm"] = np.where(
    targets_reg["weight_covered"] > 0,
    targets_reg["target_score"] * (100.0 / targets_reg["weight_covered"]),
    np.nan
)

targets = targets_class.merge(
    targets_reg[KEY_COLS + ["target_score", "target_score_norm", "weight_covered"]],
    on=KEY_COLS,
    how="left"
)

targets.head()


Unnamed: 0,student_id,code_module,code_presentation,final_result,target_pass,target_score,target_score_norm,weight_covered
0,11391,AAA,2013J,Pass,1,82.4,82.4,100.0
1,28400,AAA,2013J,Pass,1,65.4,65.4,100.0
2,30268,AAA,2013J,Withdrawn,0,,,
3,31604,AAA,2013J,Pass,1,76.3,76.3,100.0
4,32885,AAA,2013J,Pass,1,55.0,55.0,100.0


**Registration features**

In [16]:
registration = student_registration[KEY_COLS + [
    "date_registration",
    "date_unregistration"
]].copy()

registration["date_registration"] = pd.to_numeric(
    registration["date_registration"], errors="coerce"
)
registration["date_unregistration"] = pd.to_numeric(
    registration["date_unregistration"], errors="coerce"
)

registration_features = registration.copy()


**Vle Enrichment + weekly aggregation**

In [17]:
# --- VLE enrichment: add activity_type ---
vle_enriched = student_vle.merge(
    vle[["id_site", "code_module", "code_presentation", "activity_type"]],
    on=["id_site", "code_module", "code_presentation"],
    how="left",
)

# normalise activity_type so column names are stable
vle_enriched["activity_type"] = (
    vle_enriched["activity_type"]
    .fillna("Unknown")
    .astype(str)
    .str.strip()
    .str.lower()
)

# week bins (date is "days since start")
vle_enriched["week"] = (pd.to_numeric(vle_enriched["date"], errors="coerce") // 7).astype("Int64")

weekly_by_type = (
    vle_enriched
    .groupby(KEY_COLS + ["week", "activity_type"], as_index=False)
    .agg(
        weekly_clicks=("sum_click", "sum"),
        weekly_active_days=("date", "nunique"),
        weekly_n_resources=("id_site", "nunique"),
    )
)


**Pivot and count**

In [18]:
def pivot_metric(df: pd.DataFrame, value_col: str, prefix: str) -> pd.DataFrame:
    wide = (
        df.pivot_table(
            index=KEY_COLS + ["week"],
            columns="activity_type",
            values=value_col,
            aggfunc="sum",
            fill_value=0,
        )
        .reset_index()
    )
    wide.columns = [
        f"{prefix}_{c}".lower().replace(" ", "_")
        if c not in KEY_COLS + ["week"] else c
        for c in wide.columns
    ]
    return wide

weekly_clicks_wide = pivot_metric(weekly_by_type, "weekly_clicks", "weekly_clicks")
weekly_days_wide   = pivot_metric(weekly_by_type, "weekly_active_days", "weekly_days")
weekly_res_wide    = pivot_metric(weekly_by_type, "weekly_n_resources", "weekly_resources")

weekly_wide = (
    weekly_clicks_wide
    .merge(weekly_days_wide, on=KEY_COLS + ["week"], how="left")
    .merge(weekly_res_wide, on=KEY_COLS + ["week"], how="left")
)

# cumulative features
weekly_wide = weekly_wide.sort_values(KEY_COLS + ["week"]).copy()
weekly_feature_cols = [c for c in weekly_wide.columns if c not in KEY_COLS + ["week"]]
cum_cols = ["cum_" + c for c in weekly_feature_cols]

weekly_wide[cum_cols] = (
    weekly_wide.groupby(KEY_COLS)[weekly_feature_cols]
              .cumsum()
              .rename(columns={c: "cum_" + c for c in weekly_feature_cols})
)

ews_features = weekly_wide[KEY_COLS + ["week"] + cum_cols].copy()
ews_features.head()


Unnamed: 0,student_id,code_module,code_presentation,week,cum_weekly_clicks_dataplus,cum_weekly_clicks_dualpane,cum_weekly_clicks_externalquiz,cum_weekly_clicks_folder,cum_weekly_clicks_forumng,cum_weekly_clicks_glossary,...,cum_weekly_resources_ouelluminate,cum_weekly_resources_ouwiki,cum_weekly_resources_page,cum_weekly_resources_questionnaire,cum_weekly_resources_quiz,cum_weekly_resources_repeatactivity,cum_weekly_resources_resource,cum_weekly_resources_sharedsubpage,cum_weekly_resources_subpage,cum_weekly_resources_url
0,6516,AAA,2014J,-4,0,0,0,0,33,0,...,0,0,0,0,0,0,1,0,1,0
1,6516,AAA,2014J,-3,0,0,0,0,46,0,...,0,0,0,0,0,0,5,0,4,1
2,6516,AAA,2014J,-2,0,0,0,0,47,0,...,0,0,0,0,0,0,5,0,4,1
3,6516,AAA,2014J,-1,0,0,0,0,64,0,...,0,0,0,0,0,0,7,0,7,2
4,6516,AAA,2014J,0,0,0,0,0,124,0,...,0,0,0,0,0,0,8,0,9,4


**Build full feature list**

In [19]:
feature_store = (
    ews_features
    .merge(demographics, on=KEY_COLS, how="left")
    .merge(course_features, on=["code_module", "code_presentation"], how="left")
    .merge(registration_features, on=KEY_COLS, how="left")
    .merge(targets, on=KEY_COLS, how="left")
)


**Export**

In [20]:
out_path = PROCESSED_DIR / "ews_feature_store.csv"
feature_store.to_csv(out_path, index=False)
print(f"Saved feature store to: {out_path.resolve()}")

Saved feature store to: C:\Users\ollie\Desktop\WM9QG Group Project\VLE-analysis\WM9QG-15_Group_Project\data\processed\ews_feature_store.csv


**Inspection**

In [21]:
print("Weeks:", feature_store["week"].min(), "→", feature_store["week"].max())
print("\nClass balance (target_pass):")
print(feature_store["target_pass"].value_counts(dropna=False))

print("\nMissing targets:")
print("target_pass:", feature_store["target_pass"].isna().sum())
print("target_score:", feature_store["target_score"].isna().sum())

print("\nWeight covered stats:")
print(feature_store["weight_covered"].describe())

print("\nData Head")
print(feature_store.head())


Weeks: -4 → 38

Class balance (target_pass):
target_pass
1    462871
0    164160
Name: count, dtype: int64

Missing targets:
target_pass: 0
target_score: 11373

Weight covered stats:
count    615658.000000
mean        105.298787
std          59.920430
min           0.000000
25%          81.000000
50%         100.000000
75%         125.000000
max         200.000000
Name: weight_covered, dtype: float64

Data Head
   student_id code_module code_presentation  week  cum_weekly_clicks_dataplus  \
0        6516         AAA             2014J    -4                           0   
1        6516         AAA             2014J    -3                           0   
2        6516         AAA             2014J    -2                           0   
3        6516         AAA             2014J    -1                           0   
4        6516         AAA             2014J     0                           0   

   cum_weekly_clicks_dualpane  cum_weekly_clicks_externalquiz  \
0                           0    