In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score


In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
path = '/content/drive/MyDrive/PSC/skill_builder_data_corrected_collapsed.csv'
data = pd.read_csv(path, encoding="latin1")

  data = pd.read_csv(path, encoding="latin1")


In [4]:
df = data[["user_id", "problem_id", "skill_id", "correct", "order_id"]]
df = df.dropna(subset=["skill_id"]) # remove rows with no concept

In [8]:
df["skill_id"] = df["skill_id"].astype(int)
df["correct"] = df["correct"].astype(int)

In [9]:
df = df.sort_values(["user_id", "order_id"])

In [10]:
q_map = {q:i for i,q in enumerate(df["problem_id"].unique())}
c_map = {c:i for i,c in enumerate(df["skill_id"].unique())}

df["q"] = df["problem_id"].map(q_map)
df["c"] = df["skill_id"].map(c_map)
df["r"] = df["correct"]

In [11]:
  from sklearn.model_selection import GroupShuffleSplit

  gss = GroupShuffleSplit(
      n_splits=1,
      test_size=0.2,
      random_state=42
  )

  groups = df["user_id"].values
  X = df.index.values
  y = df["correct"].values

  train_idx, test_idx = next(gss.split(X, y, groups=groups))

  df_train = df.iloc[train_idx].copy()
  df_test  = df.iloc[test_idx].copy()

In [12]:
nb_question = len(q_map)
nb_concept = len(c_map)
print(nb_question, nb_concept)

17751 149


In [13]:
def build_student_seqs(d):
    seqs = []
    for uid, u_df in d.groupby("user_id"):
        q = u_df["q"].values
        c = u_df["c"].values
        r = u_df["r"].values
        if len(q) >= 2:   # need history + target
            seqs.append((q, c, r))
    return seqs

In [14]:
train_student_seqs = build_student_seqs(df_train)
test_student_seqs  = build_student_seqs(df_test)

In [15]:
print("train users:", df_train["user_id"].nunique())
print("test users:", df_test["user_id"].nunique())

train users: 3330
test users: 833


In [16]:
context_size = 20 # including query

In [17]:
def build_samples(student_seqs):
    samples = []
    for q, c, r in student_seqs:
        L = len(q)
        for i in range(0, L - context_size + 1):
            q_hist = q[i:i+context_size-1]
            c_hist = c[i:i+context_size-1]
            r_hist = r[i:i+context_size-1]

            q_query = q[i+context_size-1]
            c_query = c[i+context_size-1]
            r_target = r[i+context_size-1]

            samples.append((q_hist, c_hist, r_hist, q_query, c_query, r_target))
    return samples


In [18]:
train_samples = build_samples(train_student_seqs)
test_samples  = build_samples(test_student_seqs)


In [19]:
print(len(train_samples), len(test_samples))

179601 45412


In [20]:
# # Nb of questions
# nb_question = 30
# # Nb of concepts
# nb_concept = 10
# Embedding dimension
embedding_dim = 16
# Nb of attention heads
nb_head = 8

# Attentive Knowledge Tracing

In [21]:
class Head(nn.Module):
    def __init__(self, D = embedding_dim, head_size = embedding_dim//8, T = context_size, monotonic = False):
        super().__init__()

        # embedding dim
        self.D = D

        # monotonic attention head or simple attention head
        self.monotonic = monotonic

        # decay parameter
        if monotonic:
            self.theta_raw = nn.Parameter(torch.tensor(-2.0, dtype=torch.float32))

        # Dk = Dq = Dv
        self.head_size = head_size

        self.query_key = nn.Linear(D, head_size, bias=False)
        self.value = nn.Linear(D, head_size, bias=False)

        self.register_buffer('tril', torch.tril(torch.ones(T, T)))

    def forward(self, q_in, k_in, v_in):
        # x is of size B, T, D
        B, Tq, D = q_in.shape #Tq = Tk = T-1
        Tk = k_in.size(1)
        assert v_in.size(1) == Tk


        k = self.query_key(k_in) #  B, Tk, head_size
        q = self.query_key(q_in) #  B, Tq, head_size
        v = self.value(v_in) # B, Tk, head_size

        # (B, Tq, head_size) @ (B, head_size, Tq) = (B, Tq, Tq)
        weights = q @ k.transpose(-2, -1) * self.head_size**(-0.5)


        mask = self.tril[:Tq, :Tk]

        if self.monotonic:
            # COMPUTE d(t, tau)
            with torch.no_grad():

                scores_masked = weights.masked_fill(mask == 0, -1e32)
                gamma = F.softmax(scores_masked, dim = -1)
                gamma = gamma * mask.float()

                prefix = torch.cumsum(gamma, dim = -1)  # (B,Tq,Tk)

                temp = torch.arange(Tq, device=q_in.device)

                # prefix_at_t = prefix[b,t,t]
                prefix_at_t = prefix[:, temp, temp].unsqueeze(-1)  # (B,Tq,1)
                sum_tau1_to_t = prefix_at_t - prefix  # (B,Tq,Tq)

                abs_dtau = (temp.view(1, Tq, 1) - temp.view(1, 1, Tq)).abs().float()  # (1,Tq,Tq)
                d = torch.clamp(abs_dtau * sum_tau1_to_t, min = 0.) # (B, Tq, Tk), clamp to be non-negative

                d = d.sqrt().detach()

            decay_rate = F.softplus(self.theta_raw)
            # Clamp for numerical stability
            decay_rate = torch.clamp(decay_rate, max=10.0)

            # Compute final weights
            factor = torch.clamp(torch.clamp((-d*decay_rate).exp(), min = 1e-5), max = 1e5) # of size (B,Tq, Tq)
            weights = weights * factor # of size (B,Tq, Tq)

        # Apply mask after monotonic attention
        weights = weights.masked_fill(mask == 0, float('-inf'))
        weights = F.softmax(weights, dim = -1)

        # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
        out = weights @ v

        return out # (B, T, head_size)

In [22]:
class Multi(nn.Module):
    def __init__(self, nb_head, monotonic, D = embedding_dim, head_size = embedding_dim//8, T = context_size, dropout = 0.2):
        super().__init__()
        self.nb_head = nb_head
        self.head_size = head_size

        # selfcount = False if monotonic True otherwise
        self.heads = nn.ModuleList([Head(D, head_size, T, monotonic) for _ in range(nb_head)])

        self.proj = nn.Linear(nb_head * head_size, D)
        self.drop = nn.Dropout(dropout)


    def forward(self, q_in, k_in, v_in):
        out = torch.cat([h(q_in, k_in, v_in) for h in self.heads], dim=-1)        # (B,T-1,nb_head*head_size)
        out = self.drop(self.proj(out))    # (B,T-1,D)

        return out



In [23]:
class EncoderBlock(nn.Module):
    def __init__(self, nb_head = 8, D = embedding_dim, head_size = embedding_dim//8, T = context_size, dropout = 0.2):
        super().__init__()

        self.sa_encoder = Multi(nb_head,False,D,head_size, T, dropout)

        self.norm1 = nn.LayerNorm(D)
        self.act1 = nn.ReLU()
        self.drop1 = nn.Dropout(dropout)

        self.ffn = nn.Sequential(
            nn.Linear(D, 4 * D),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(4 * D, D),
        )

        self.norm2 = nn.LayerNorm(D)
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x):
        out = self.sa_encoder(x, x, x)
        out = self.norm1(x + self.drop1(out))          # (B,T-1,D)
        ffn_out = self.ffn(out)
        out = self.norm2(out + self.drop2(ffn_out))
        return out



In [24]:
class DecoderBlock(nn.Module):
    def __init__(self, nb_head = 8, D = embedding_dim, head_size = embedding_dim//8, T = context_size, dropout = 0.2):
        super().__init__()

        self.sa_decoder = Multi(nb_head,True,D,head_size, T, dropout)

        self.norm1 = nn.LayerNorm(D)
        self.act1 = nn.ReLU()
        self.drop1 = nn.Dropout(dropout)

        hidden = 4 * D
        self.ffn = nn.Sequential(
            nn.Linear(D, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, D),
        )

        self.norm2 = nn.LayerNorm(D)
        self.drop2 = nn.Dropout(dropout)

    def forward(self, q_in, k_in, v_in):
        out = self.sa_decoder(q_in, k_in, v_in)
        out = self.norm1(q_in + self.drop1(out))         # (B,T-1,D)
        ffn_out = self.ffn(out)
        out = self.norm2(out + self.drop2(ffn_out))
        return out



In [25]:
class AKT(nn.Module):

    def __init__(self, C = nb_concept, Q = nb_question, D = embedding_dim):
        """
        AKT
        C : int = nb Concepts
        Q : int = nb Questions
        D : int = embedding Dimension
        """
        super().__init__()

        self.C = C
        self.Q = Q
        self.D = D

        # Rasch model-based embeddings
        # c_c
        self.c_embedding = nn.Embedding(C, D)

        # d_c
        self.d_embedding = nn.Embedding(C, D)

        # difficulty mu_q
        self.mu_embedding = nn.Embedding(Q, 1)

        # correct or wrong answer g_r
        self.g_embedding = nn.Embedding(2, D)

        # f_(c, r)
        self.f_embedding = nn.Embedding(2*C, D)


        self.question_encoder = EncoderBlock(nb_head,D,D//8,context_size,0.2)
        self.knowledge_encoder = EncoderBlock(nb_head,D,D//8,context_size-1,0.2)
        self.knowledge_retriever = DecoderBlock(nb_head,D,D//8,context_size,0.2)

        self.prediction_layer = nn.Sequential(
            nn.Linear(2*D, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )


    def forward(self, q_hist, c_hist, r_hist, q_query, c_query, r_target=None):
        """
        q_hist : tensor of size B, T-1   (question history)
        c_hist : tensor of size B, T-1   (concept history)
        r_hist : tensor of size B, T-1   (response history)
        q_query : tensor of size B, 1   (question query)
        c_query : tensor of size B, 1   (concept query)
        r_target : tensor of size B, 1  (response target) (optional)
        ---------------
        returns : tensor of size B, T-1 (predicted probabilities)
        """
        B, Th = q_hist.shape
        T = Th + 1

        question = torch.cat([q_hist, q_query], dim=1) # size B, T
        concept = torch.cat([c_hist, c_query], dim=1) # size B, T

        x = self.c_embedding(concept) + self.mu_embedding(question)*self.d_embedding(concept) # size B, T, D
        y_hist = self.c_embedding(c_hist) + self.g_embedding(r_hist) + self.mu_embedding(q_hist)*self.f_embedding(c_hist+r_hist*self.C) # size B, T-1, D

        x_hat = self.question_encoder(x) # size B, T, D
        y_hat = self.knowledge_encoder(y_hist) # size B, Th, D

        x_q = x_hat[:, 1:, :] # queries B,T-1,D : question at time t
        x_k = x_hat[:, :-1, :] # keys B,T-1,D : questions up to t-1
        y_v = y_hat # values B,T-1,D  : responses up to t-1

        h = self.knowledge_retriever(x_q,x_k,y_v) # size B, T, D

        out = torch.cat([h, x[:, 1:, :]], dim=-1) # B, T-1, 2D
        out = self.prediction_layer(out) # size B, T-1, 1
        out = torch.sigmoid(out) # size B, T-1, 1

        if r_target is None:
            loss = None
        else:
            target = torch.cat([r_hist, r_target.long()], dim=1)[:, 1:].float().unsqueeze(-1)  # B,T-1,1
            loss = F.binary_cross_entropy(out, target)
        return out, loss



In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AKT().to(device)

In [27]:
# Count total parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size: {total_params * 4 / 1024**2:.2f} MB (assuming float32)")

Total parameters: 184,736
Trainable parameters: 184,736
Model size: 0.70 MB (assuming float32)


In [28]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)

In [29]:
from torch.utils.data import Dataset, DataLoader

class AKTDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        qh, ch, rh, qq, cq, rt = self.samples[idx]
        return (
            torch.tensor(qh, dtype=torch.long),      # (T-1,)
            torch.tensor(ch, dtype=torch.long),      # (T-1,)
            torch.tensor(rh, dtype=torch.long),      # (T-1,)
            torch.tensor([qq], dtype=torch.long),    # (1,)
            torch.tensor([cq], dtype=torch.long),    # (1,)
            torch.tensor([rt], dtype=torch.long),    # (1,) keep long for concat; convert later in loss
        )

In [30]:

train_loader = DataLoader(AKTDataset(train_samples), batch_size=64, shuffle=True, drop_last=True)
test_loader  = DataLoader(AKTDataset(test_samples), batch_size=256, shuffle=False)

In [33]:
epochs = 6
for epoch in range(epochs):
    model.train()
    total = 0.0

    for qh, ch, rh, qq, cq, rt in train_loader:
        qh, ch, rh, qq, cq, rt = [t.to(device, non_blocking=True) for t in (qh, ch, rh, qq, cq, rt)]

        optimizer.zero_grad()
        _, loss = model(qh, ch, rh, qq, cq, rt)  # rt is (B,1)
        loss.backward()
        optimizer.step()

        total += loss.item()

    print(f"epoch {epoch+1} loss {total/len(train_loader):.4f}")

epoch 1 loss 0.4811
epoch 2 loss 0.4737
epoch 3 loss 0.4681
epoch 4 loss 0.4639
epoch 5 loss 0.4607
epoch 6 loss 0.4579


In [34]:

model.eval()
all_p, all_y = [], []

with torch.no_grad():
    for qh, ch, rh, qq, cq, rt in test_loader:
        qh, ch, rh, qq, cq, rt = [t.to(device) for t in (qh, ch, rh, qq, cq, rt)]

        # predict using ONLY (qh, ch, rh, qq, cq)
        p, _ = model(qh, ch, rh, qq, cq, r_target=None)  # (B, T-1, 1)

        # take prediction at time T (the query step) = last output position
        pT = p[:, -1, 0].detach().cpu().numpy()           # (B,)
        yT = rt[:, 0].detach().cpu().numpy().astype(int)  # (B,)

        all_p.append(pT)
        all_y.append(yT)

all_p = np.concatenate(all_p)
all_y = np.concatenate(all_y)

acc = accuracy_score(all_y, (all_p >= 0.5).astype(int))
auc = roc_auc_score(all_y, all_p) if len(np.unique(all_y)) > 1 else float("nan")

print("test accuracy:", acc)
print("test auc:", auc)

test accuracy: 0.7317889544613758
test auc: 0.7642758875766416
