In [1]:
import os
import sys
import torch
import pandas as pd
import torch.nn as nn
sys.path.append('../..')
from Model.helper import *
from Config import Config
from sklearn.metrics import roc_curve
sys.path.append(os.path.join(os.getcwd(), '../../Data'))
from Data import *
from choosedataset import *
from torch.utils.data import Dataset
from sklearn.metrics import confusion_matrix
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
config = Config()
data = [Codeworkout, Falcon][config.dataset]()
df = data.df
padding_size_code = 765
loss_func = False
df['num_snapshots'] = df['prev_tasks'].apply(lambda x: [len(i) for i in x])

10 False


In [3]:
all_future_q = set()
for i in df['new_task_id']:
    all_future_q.add(i)

all_prev_q = set()
for i in df['prev_tasks_id']:
    all_prev_q = all_prev_q.union(set(i))
all_problems = all_future_q.union(all_prev_q)
vocab = {name: idx for idx, name in enumerate(all_problems)}

# Option 1 - With last attempt

In [4]:
class StudentDataset(Dataset):
    def __init__(self, df, text_tokenizer, max_len_code=768, padding_size_code=100, padding_size_q=30):
        self.df = df
        self.vocab = text_tokenizer
        self.vocab['empty'] = len(text_tokenizer)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            'prev_problems_id': torch.tensor([self.vocab.get(pid) for pid in row["prev_tasks_id"]]),
            'prev_labels': torch.tensor(row['prev_labels']),
            'future_problem_id': torch.tensor(self.vocab.get(row['new_task_id'])),
            'label': torch.tensor(row['Label'])
        }

In [5]:
class DKTWithFutureTaskID(nn.Module):
    def __init__(self, num_tasks, embed_size=3, lstm_hidden_size=512):
        super(DKTWithFutureTaskID, self).__init__()
        self.task_embedding = nn.Embedding(num_tasks, embed_size)        # Embedding layer for the task IDs
        self.lstm = nn.LSTM(input_size=embed_size + 1, hidden_size=lstm_hidden_size, num_layers=1, batch_first=True)  # Task embedding + binary success/failure
        self.fc = nn.Linear((lstm_hidden_size + embed_size), 1)
        self.fc_all = nn.Linear(lstm_hidden_size, 1)
        self.sig = nn.Sigmoid()
    
    def forward(self, past_task_ids, past_successes, future_task_id):
        past_task_embeddings = self.task_embedding(past_task_ids)  # Shape: (batch_size, num_past_tasks, embed_size)
        past_input = torch.cat([past_task_embeddings, past_successes.unsqueeze(-1).float()], dim=-1)          
        lstm_out, _ = self.lstm(past_input)  
        final_lstm_out = lstm_out[:, -1, :] # Take the final LSTM output (from the last time step)        
        future_task_embeddings = self.task_embedding(future_task_id) # Embed the future task ID 
        combined_input = torch.cat([final_lstm_out, future_task_embeddings], dim=-1) # Combine the LSTM output with the future task embedding
        output = self.fc(combined_input)
        all_question_preds = self.sig(self.fc_all(lstm_out))
        return all_question_preds, self.sig(output)

# Option 2 - With all attempts

In [4]:
class StudentDataset(Dataset):
    def __init__(self, df, text_tokenizer, max_len_code=768, padding_size_code=100, padding_size_q=30):
        self.df = df
        self.padding = padding_size_code
        self.vocab = text_tokenizer
        self.vocab['empty'] = len(text_tokenizer)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        prev_problem_id = torch.zeros((self.padding), dtype=torch.long)
        prev_label = torch.zeros((self.padding), dtype=torch.long)
        c = 0
        for i, s in enumerate(row['num_snapshots']):
            prev_problem_id_i = self.vocab.get(row["prev_tasks_id"][i])
            for j in range(s):
                prev_problem_id[c] = prev_problem_id_i
                prev_label[c] = row["prev_labels"][i] if s - 1 == j else False
                c += 1
        return {
            'code_num': torch.tensor(sum(row['num_snapshots'])), 
            'prev_problems_id': prev_problem_id,
            'prev_labels': prev_label,
            'future_problem_id': torch.tensor(self.vocab.get(row['new_task_id'])),
            'label': torch.tensor(row['Label'])
        }

In [5]:
class DKTWithFutureTaskID(nn.Module):
    def __init__(self, num_tasks, embed_size=3, lstm_hidden_size=64):
        super(DKTWithFutureTaskID, self).__init__()
        self.task_embedding = nn.Embedding(num_tasks, embed_size)        # Embedding layer for the task IDs
        self.lstm = nn.LSTM(input_size=embed_size + 1, hidden_size=lstm_hidden_size, num_layers=1, batch_first=True)  # Task embedding + binary success/failure
        self.fc = nn.Linear((lstm_hidden_size + embed_size), 1)
        self.fc_all = nn.Linear(lstm_hidden_size, 1)
        self.sig = nn.Sigmoid()
    
    def forward(self, code_num, past_task_ids, past_successes, future_task_id):
        past_task_embeddings = self.task_embedding(past_task_ids)  # Shape: (batch_size, num_past_tasks, embed_size)
        past_input = torch.cat([past_task_embeddings, past_successes.unsqueeze(-1).float()], dim=-1)
        snapshots_lstm = pack_padded_sequence(
            past_input,
            lengths=code_num.to('cpu'),
            batch_first=True,
            enforce_sorted=False
        )
        packed_output, (hn, cn) = self.lstm(snapshots_lstm)  
        lstm_out, _ = pad_packed_sequence(packed_output, batch_first=True)  # (batch_size, max_seq_length, lstm_hidden_size)
        future_task_embeddings = self.task_embedding(future_task_id) # Embed the future task ID 
        combined_input = torch.cat([hn[-1], future_task_embeddings], dim=-1) # Combine the LSTM output with the future task embedding
        output = self.fc(combined_input)
        all_question_preds = self.sig(self.fc_all(lstm_out))
        return all_question_preds, self.sig(output)

# Loss Function Option - caculate also all the past tasks loss

In [6]:
lossfunc = True
class lossFunc(nn.Module):
    def __init__(self):
        super(lossFunc, self).__init__()
        self.crossEntropy = nn.BCELoss()

    def forward(self, all_pred, target_prev, code_num, target_label):
        loss = 0
        pred, target_q = all_pred
        pred = pred.to('cpu')
        code_num = code_num.to('cpu')
        target_q = target_q.to('cpu')
        target_prev = target_prev.to('cpu')
        target_label = target_label.to('cpu').unsqueeze(1)
        for batch in range(pred.shape[0]):
            s = code_num[batch]
            p = torch.cat([pred.squeeze(-1)[batch, :s], target_q[batch]])
            a = torch.cat([target_prev.squeeze(-1)[batch, :s], target_label[batch]])
            loss += self.crossEntropy(p, a)
        return loss

# Start

In [7]:
def caculate_1loss(batch, model, device, criterion):
    dict_batch = {k: v.to(device) for k, v in batch.items()}
    model_params = {k: v for k, v in dict_batch.items() if k != 'label'}
    logits = model(*model_params.values())
    label = dict_batch['label'].float()
    if not criterion:
        return logits[1], label
    if loss_func == False:
        return criterion(logits[1], label.unsqueeze(1)) 
    return criterion(logits, batch['prev_label'], batch['code_num'], label)

In [8]:
model = DKTWithFutureTaskID(len(vocab))
caculate_func = caculate_1loss
criterion = nn.BCEWithLogitsLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=1e-4)

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Split test, val, train

In [None]:
train_dataloader, valid_dataloader, test_dataloader = create_data_loader(df, StudentDataset, padding_size_code=padding_size_code, 
                                                                         text_tokenizer=vocab, batch_size=config.batch_size, create_split=True)

Load existing splitting


In [7]:
print(len(train_dataloader), len(valid_dataloader), len(test_dataloader), flush=True)
print(train_dataloader.dataset.df['Label'].value_counts())
print(valid_dataloader.dataset.df['Label'].value_counts())
print(test_dataloader.dataset.df['Label'].value_counts())
print(len(set(train_dataloader.dataset.df['student_id'])), len(set(valid_dataloader.dataset.df['student_id'])), len(set(test_dataloader.dataset.df['student_id'])))

263 39 75
Label
False    6372
True     2018
Name: count, dtype: int64
Label
False    921
True     300
Name: count, dtype: int64
Label
False    1826
True      558
Name: count, dtype: int64
441 63 126


In [10]:
model = model.to(device)
model = training_loop(model=model, train_dataloader=train_dataloader, test_dataloader=valid_dataloader, 
                      optimizer=optimizer, criterion=criterion, device=device, name='a', caculate_func=caculate_func)

17/02/2025_23:27:54
263 39
Epoch: 0
Batch 0 from 263


  'label': torch.tensor(row['Label'])


Batch 100 from 263
Batch 200 from 263
Test Batch 0 from 39
Epoch [1], LR: 0.000100, Loss: 0.7142, Val Loss: 0.6932, patience: 5
success deep copy
success save in a
Epoch: 1
Batch 0 from 263
Batch 100 from 263
Batch 200 from 263
Test Batch 0 from 39
Epoch [2], LR: 0.000100, Loss: 0.6932, Val Loss: 0.6932, patience: 5
success deep copy
success save in a
Epoch: 2
Batch 0 from 263
Batch 100 from 263
Batch 200 from 263
Test Batch 0 from 39
Epoch [3], LR: 0.000100, Loss: 0.6932, Val Loss: 0.6932, patience: 5
success deep copy
success save in a
Epoch: 3
Batch 0 from 263
Batch 100 from 263
Batch 200 from 263
Test Batch 0 from 39
Epoch [4], LR: 0.000100, Loss: 0.6932, Val Loss: 0.6932, patience: 5
success deep copy
success save in a
Epoch: 4
Batch 0 from 263
Batch 100 from 263
Batch 200 from 263
Test Batch 0 from 39
Epoch [5], LR: 0.000100, Loss: 0.6932, Val Loss: 0.6932, patience: 5
success deep copy
success save in a
Epoch: 5
Batch 0 from 263
Batch 100 from 263
Batch 200 from 263
Test Batch 0

In [11]:
def results(threshold, y_true, y_prob):
    y_prob = np.array(y_prob)
    y_true = np.array(y_true)
    y_pred = np.where(y_prob > threshold, 1, 0)
    roc_auc = roc_auc_score(y_true, y_prob)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    best = "best"
    if threshold == 0.5:
        best = "0.5"
    #  df = pd.concat([pd.DataFrame([[model_name, threshold, roc_auc, accuracy, precision, recall, f1]], columns=df.columns), df], ignore_index=True)
    print({"threshold": threshold, "roc_auc": roc_auc, "accuracy": accuracy, f"precision_{best}": precision, f"recall_{best}": recall, f"f1_{best}": f1})
    cm = confusion_matrix(y_true, y_pred)
    print(cm)

In [12]:
all_labels, all_probs = eval_loop(model, valid_dataloader, device, caculate_func=caculate_func)

fpr, tpr, thresholds = roc_curve(all_labels, all_probs)
J = tpr - fpr
best_index = J.argmax()

y_labels, y_probs = eval_loop(model, test_dataloader, device, caculate_func=caculate_func)
results(thresholds[best_index], y_labels, y_probs)

Test Batch 0 from 39
Test Batch 0 from 75


  'label': torch.tensor(row['Label'])
  'label': torch.tensor(row['Label'])


{'threshold': np.float32(0.500011), 'roc_auc': np.float64(0.29765297750140346), 'accuracy': 0.7655201342281879, 'precision_best': np.float64(0.0), 'recall_best': np.float64(0.0), 'f1_best': np.float64(0.0)}
[[1825    1]
 [ 558    0]]


# 5 - fold

In [9]:
data_loaders = create_data_loader_k_fold(df, StudentDataset, vocab, batch_size=config.batch_size)

In [10]:
def num_of(train_dataloader, test_dataloader):
    print(len(train_dataloader), len(test_dataloader))
    print(len(set(train_dataloader.dataset.df['student_id'])), len(set(test_dataloader.dataset.df['student_id'])))
    print(set(train_dataloader.dataset.df['student_id']).intersection(set(test_dataloader.dataset.df['student_id'])))
    print(train_dataloader.dataset.df.Label.value_counts(normalize=True))
    print(test_dataloader.dataset.df.Label.value_counts(normalize=True))

for train, test in data_loaders:
    num_of(train, test)

300 76
504 126
set()
Label
False    0.765811
True     0.234189
Name: proportion, dtype: float64
Label
False    0.738085
True     0.261915
Name: proportion, dtype: float64
300 76
504 126
set()
Label
False    0.757667
True     0.242333
Name: proportion, dtype: float64
Label
False    0.770444
True     0.229556
Name: proportion, dtype: float64
302 74
504 126
set()
Label
False    0.770388
True     0.229612
Name: proportion, dtype: float64
Label
False    0.71871
True     0.28129
Name: proportion, dtype: float64
299 77
504 126
set()
Label
False    0.752907
True     0.247093
Name: proportion, dtype: float64
Label
False    0.788807
True     0.211193
Name: proportion, dtype: float64
301 74
504 126
set()
Label
False    0.754337
True     0.245663
Name: proportion, dtype: float64
Label
False    0.784206
True     0.215794
Name: proportion, dtype: float64


In [None]:
fold_results = {'ROC-AUC' : [], 'f1' : [], 'recall': [], "precision": []}

for fold, (train_dataloader, test_dataloader) in enumerate(data_loaders):
    print(f"Fold {fold + 1}:")    # Prepare data for current fold
    m = DKTWithFutureTaskID(len(vocab))
    loss_fn = None
    optimizer = torch.optim.Adam(m.parameters(), lr=config.lr, weight_decay=1e-4)

    m = m.to(device)
    print(m)
    # Training Loop
    for epoch in range(config.epoch):
        total_loss = train_loop(m, train_dataloader, device, optimizer, criterion, caculate_func)

        # Optional: Print metrics every few epochs
        if epoch % 10 == 0:
            print(f"Fold {fold + 1}, Epoch {epoch}: Loss = {total_loss / len(train_dataloader)}")

    y_labels, y_probs = eval_loop(m, test_dataloader, device, caculate_func=caculate_func)
    y_prob = np.array(y_probs)
    y_true = np.array(y_labels)
    y_pred = np.where(y_prob > 0.25, 1, 0)

    fold_results['ROC-AUC'].append(roc_auc_score(y_true, y_prob))
    fold_results['precision'].append(precision_score(y_true, y_pred))
    fold_results['recall'].append(recall_score(y_true, y_pred))
    fold_results['f1'].append(f1_score(y_true, y_pred))

In [None]:
fold_results