In [1]:
import pickle
import numpy as np
import pandas as pd

In [2]:
# === CONFIG ===
INPUT_PATH   = "data/oulad/oulad_full_raw.pkl"
BAG_INDEX    = 800   # change to view a different bag

# === LOAD DATA ===
with open(INPUT_PATH, "rb") as f:
    data = pickle.load(f)

raw_bags = data["raw_bags"]
labels   = data["labels"]
bag_ids  = data["bag_ids"]

# === DISPLAY ONE BAG ===
idx = BAG_INDEX
print(f"\n=== RAW FULL BAG at index {idx} ===")
print(f"Bag ID    : {bag_ids[idx]}")
print(f"Label     : {labels[idx]}")
print(f"Num Items : {len(raw_bags[idx])}\n")
print("Instances:")
for j, inst in enumerate(raw_bags[idx], start=1):
    print(f"  Instance {j}: {inst}")



=== RAW FULL BAG at index 800 ===
Bag ID    : ('BBB', '2013B', 185240)
Label     : 0
Num Items : 12

Instances:
  Instance 1: [('code_module', 'BBB')]
  Instance 2: [('code_presentation', '2013B')]
  Instance 3: [('gender', 'F')]
  Instance 4: [('region', 'West Midlands Region')]
  Instance 5: [('imd_band', '40-50%')]
  Instance 6: [('age_band', '35-55')]
  Instance 7: [('disability', 'N')]
  Instance 8: [('highest_education', 'Lower Than A Level')]
  Instance 9: [('module_presentation_length', 240)]
  Instance 10: [('num_of_prev_attempts', 1)]
  Instance 11: [('studied_credits', 120)]
  Instance 12: [('date_registration', -87.0)]


In [3]:
# === CONFIG ===
INPUT_PATH   = "data/oulad/oulad_full.pkl"
BAG_INDEX    = 800   # same index as above for comparison

# === LOAD DATA ===
with open(INPUT_PATH, "rb") as f:
    data = pickle.load(f)

bags    = data["bags"]
labels  = data["labels"]
bag_ids = data["bag_ids"]

# === DISPLAY ONE BAG ===
idx = BAG_INDEX
print(f"\n=== ENCODED FULL BAG at index {idx} ===")
print(f"Bag ID    : {bag_ids[idx]}")
print(f"Label     : {labels[idx]}")
print(f"Num Items : {bags[idx].shape[0]}\n")
print("Instances (vectors):")
for j, vec in enumerate(bags[idx], start=1):
    print(f"  Instance {j}: {np.array2string(vec, precision=4, floatmode='fixed')}")



=== ENCODED FULL BAG at index 800 ===
Bag ID    : ('BBB', '2013B', 185240)
Label     : 0
Num Items : 12

Instances (vectors):
  Instance 1: [2.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000]
  Instance 2: [0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000]
  Instance 3: [0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000]
  Instance 4: [ 0.0000  0.0000  0.0000 12.0000  0.0000  0.0000  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
  0.0000  0.0000]
  Instance 5: [0.0000 0.0000 0.0000 0.0000 5.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000]
  Instance 6: [0.0000 0.0000 0.0000 0.0000 0.0000 2.0000 0.0000 0.0

In [4]:
# 1) Load full raw bags
with open("data/oulad/oulad_full_raw.pkl", "rb") as f:
    data = pickle.load(f)

raw_bags = data["raw_bags"]

# 2) Compute number of instances per bag
lengths = [len(bag) for bag in raw_bags]

# 3) Aggregate statistics
stats = pd.Series(lengths).agg(['mean', 'median', 'min', 'max'])
print("Instances per bag (full raw):")
print(stats)


Instances per bag (full raw):
mean       209.393769
median     157.000000
min         12.000000
max       1302.000000
dtype: float64


In [5]:
# === CONFIG ===
INPUT_PATH = "data/oulad/oulad_aggregated_raw.pkl"
BAG_INDEX   = 800   # change this to inspect a different bag

# === LOAD DATA ===
with open(INPUT_PATH, "rb") as f:
    data = pickle.load(f)

raw_bags = data["raw_bags"]
labels   = data["labels"]
bag_ids  = data["bag_ids"]

# === DISPLAY ONE BAG ===
idx = BAG_INDEX
print(f"\n=== RAW BAG at index {idx} ===")
print(f"Bag ID    : {bag_ids[idx]}")
print(f"Label     : {labels[idx]}")
print(f"Num Items : {len(raw_bags[idx])}\n")
print("Instances:")
for j, inst in enumerate(raw_bags[idx], start=1):
    print(f"  Instance {j}: {inst}")



=== RAW BAG at index 800 ===
Bag ID    : ('BBB', '2013B', 185240)
Label     : 0
Num Items : 12

Instances:
  Instance 1: [('code_module', 'BBB')]
  Instance 2: [('code_presentation', '2013B')]
  Instance 3: [('gender', 'F')]
  Instance 4: [('region', 'West Midlands Region')]
  Instance 5: [('imd_band', '40-50%')]
  Instance 6: [('age_band', '35-55')]
  Instance 7: [('disability', 'N')]
  Instance 8: [('highest_education', 'Lower Than A Level')]
  Instance 9: [('module_presentation_length', 240)]
  Instance 10: [('num_of_prev_attempts', 1)]
  Instance 11: [('studied_credits', 120)]
  Instance 12: [('date_registration', -87.0)]


In [6]:
# === CONFIG ===
INPUT_PATH = "data/oulad/oulad_aggregated.pkl"
BAG_INDEX   = 800   # same index as above to compare

# === LOAD DATA ===
with open(INPUT_PATH, "rb") as f:
    data = pickle.load(f)

bags    = data["bags"]
labels  = data["labels"]
bag_ids = data["bag_ids"]

# === DISPLAY ONE BAG ===
idx = BAG_INDEX
print(f"\n=== ENCODED BAG at index {idx} ===")
print(f"Bag ID    : {bag_ids[idx]}")
print(f"Label     : {labels[idx]}")
print(f"Num Items : {bags[idx].shape[0]}\n")
print("Instances (full vectors):")
for j, vec in enumerate(bags[idx], start=1):
    print(f"  Instance {j}: {np.array2string(vec, precision=4, floatmode='fixed')}")



=== ENCODED BAG at index 800 ===
Bag ID    : ('BBB', '2013B', 185240)
Label     : 0
Num Items : 12

Instances (full vectors):
  Instance 1: [2.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000]
  Instance 2: [0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000]
  Instance 3: [0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000]
  Instance 4: [ 0.0000  0.0000  0.0000 12.0000  0.0000  0.0000  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  0.0000]
  Instance 5: [0.0000 0.0000 0.0000 0.0000 5.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
 0.0000

In [7]:
# 1) Load full raw bags
with open("data/oulad/oulad_aggregated_raw.pkl", "rb") as f:
    data = pickle.load(f)

raw_bags = data["raw_bags"]

# 2) Compute number of instances per bag
lengths = [len(bag) for bag in raw_bags]

# 3) Aggregate statistics
stats = pd.Series(lengths).agg(['mean', 'median', 'min', 'max'])
print("Instances per bag (full raw):")
print(stats)


Instances per bag (full raw):
mean      26.425545
median    26.000000
min       12.000000
max       39.000000
dtype: float64


In [8]:
# Paths to encoded datasets
datasets = {
    "full":       "data/oulad/oulad_full.pkl",
    "aggregated": "data/oulad/oulad_aggregated.pkl"
}

summary = []
for name, path in datasets.items():
    with open(path, "rb") as f:
        data = pickle.load(f)
    bags = data["bags"]
    # feature dimension is the vector length of any instance
    feature_dim   = bags[0].shape[1] if bags else 0
    # maximum number of instances in any bag
    max_instances = max(len(bag) for bag in bags) if bags else 0
    summary.append({
        "dataset":       name,
        "feature_dim":   feature_dim,
        "max_instances": max_instances
    })

df = pd.DataFrame(summary)
print(df)


      dataset  feature_dim  max_instances
0        full           20           1302
1  aggregated           22             39


In [9]:
import pandas as pd

# After loading raw_bags, labels, bag_ids:
rows = []
for bag_idx, instances in enumerate(raw_bags):
    bag_label = labels[bag_idx]
    bag_id    = bag_ids[bag_idx]
    for instance_idx, inst in enumerate(instances):
        # assume inst is a dict of categorical features
        row = {
            "bag_idx":   bag_idx,
            "bag_id":    bag_id,
            "bag_label": bag_label,
            "instance_idx":  instance_idx,
        }
        # merge in all key/value pairs from the instance
        # if your inst is not a dict, you could wrap it first
        row.update(inst)
        rows.append(row)

meta_df = pd.DataFrame(rows)
print(meta_df.columns)   # you'll see bag_idx, bag_label, inst_idx, plus all your inst-keys

# Save
meta_df.to_csv("instance_metadata.csv", index=False)


Index(['bag_idx', 'bag_id', 'bag_label', 'instance_idx', 'code_module',
       'code_presentation', 'gender', 'region', 'imd_band', 'age_band',
       'disability', 'highest_education', 'module_presentation_length',
       'num_of_prev_attempts', 'studied_credits', 'date_registration',
       'assessment_type', 'score', 'weight', 'date_submitted', 'is_banked',
       'activity_type', 'sum_click', 'first_click_day', 'last_click_day',
       'click_days'],
      dtype='object')


In [10]:
attn_df = pd.read_csv("/projects/prjs1491/MasterThesisNinaBraakman/runs/classification/seed_4/oulad_aggregated_subset/instances/tabular/label/bag_size_20/MeanMLP_22_16_22/neg_policy_only_loss_attention_reg_sum_sample_static/attention_log.csv")
meta_df = pd.read_csv("instance_metadata.csv")  # now with all your categorical fields

full_df = attn_df.merge(meta_df, on=["bag_idx","instance_idx"], how="left")
full_df.head()

Unnamed: 0,epoch,batch_idx,bag_idx,instance_idx,logit,prob,chosen,bag_id,bag_label,code_module,...,assessment_type,score,weight,date_submitted,is_banked,activity_type,sum_click,first_click_day,last_click_day,click_days
0,0,0,0,0,0.091539,0.008087,1,"('AAA', '2013J', 11391)",1.0,AAA,...,,,,,,,,,,
1,0,0,0,1,-0.05409,0.006043,0,"('AAA', '2013J', 11391)",1.0,,...,,,,,,,,,,
2,0,0,0,2,0.209513,0.010239,1,"('AAA', '2013J', 11391)",1.0,,...,,,,,,,,,,
3,0,0,0,3,-0.02957,0.006347,0,"('AAA', '2013J', 11391)",1.0,,...,,,,,,,,,,
4,0,0,0,4,0.072425,0.007783,1,"('AAA', '2013J', 11391)",1.0,,...,,,,,,,,,,


In [11]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt

# --- Configuration ---
LOG_FILES = {
    "MaxMLP": "MaxMLP_attention_log.csv",
    "MeanMLP": "MeanMLP_attention_log.csv",
    "Repset": "Repset_attention_log.csv",
    "AttentionMLP": "AttentionMLP_attention_log.csv",
}
INPUT_PATH = "/mnt/data/oulad/oulad_full_raw.pkl"
BAG_INDEX = 0  # adjust to your bag of interest

# --- Load raw metadata ---
with open(INPUT_PATH, "rb") as f:
    data = pickle.load(f)
raw_bags = data["raw_bags"]

# --- Read logs and merge with metadata ---
dfs = {}
for name, path in LOG_FILES.items():
    df = pd.read_csv(path)
    # filter to epoch=0, batch_idx=0, bag_idx=0 for simplicity, and our bag index
    df = df[(df["epoch"] == 0) & (df["batch_idx"] == 0) & (df["bag_idx"] == 0)]
    # attach instance metadata
    metadata = pd.DataFrame({
        "instance_idx": list(range(len(raw_bags[BAG_INDEX]))),
        "instance": raw_bags[BAG_INDEX]
    })
    df = pd.merge(df, metadata, on="instance_idx", how="left")
    dfs[name] = df

# --- Plotting attention probabilities and highlighting chosen ---
plt.figure(figsize=(12, 8))
for i, (name, df) in enumerate(dfs.items(), 1):
    plt.subplot(2, 2, i)
    bars = plt.bar(df["instance_idx"], df["prob"], alpha=0.6, label="prob")
    # highlight chosen
    for bar, chosen in zip(bars, df["chosen"]):
        if chosen:
            bar.set_edgecolor("red")
            bar.set_linewidth(2)
    plt.title(name)
    plt.xlabel("Instance Index")
    plt.ylabel("Attention Probability")
    plt.ylim(0, df["prob"].max() * 1.1)
    plt.legend()
plt.tight_layout()
plt.show()

# --- Display merged table for one of the methods as example ---
dfs["AttentionMLP"]


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/oulad/oulad_full_raw.pkl'

In [4]:
import pandas as pd
import pickle

# Path to one of your generated pickle files
# Adjust if your SAVE_DIR is different in the main script
# From your prepare_oulad_data.py, it would be something like:
# "data/seed_0/oulad_aggregated_subset/instances/tabular/train.pickle"

# file_path = "data/seed_0/oulad_aggregated_subset/instances/tabular/train.pickle" 
# file_path = "data/seed_0/oulad_aggregated_subset/instances/tabular/val.pickle" 
file_path = "data/seed_0/oulad_aggregated_subset/instances/tabular/test.pickle" 

try:
    df = pd.read_pickle(file_path)
    if not df.empty and "bag_embeddings" in df.columns:
        print(f"Loaded DataFrame from: {file_path}")
        first_bag_embedding = df["bag_embeddings"].iloc[0]
        print(f"Shape of the first 'bag_embeddings' entry: {first_bag_embedding.shape}")
        print(f"Dtype of the first 'bag_embeddings' entry: {first_bag_embedding.dtype}")
    else:
        print(f"DataFrame loaded from {file_path} is empty or missing 'bag_embeddings' column.")

except FileNotFoundError:
    print(f"ERROR: Could not find the file at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

Loaded DataFrame from: data/seed_0/oulad_aggregated_subset/instances/tabular/test.pickle
Shape of the first 'bag_embeddings' entry: (39, 22)
Dtype of the first 'bag_embeddings' entry: float32
