# Import data and data description

In [1]:
import pandas as pd
data = pd.read_csv('2015_100_skill_builders_main_problems.csv')
data.head()

Unnamed: 0,user_id,log_id,sequence_id,correct
0,50121,167478035,7014,0.0
1,50121,167478043,7014,1.0
2,50121,167478053,7014,1.0
3,50121,167478069,7014,1.0
4,50964,167478041,7014,1.0


In [2]:
def compute_descriptive_stats(data):
    # Compute basic statistics
    stats = data.describe()
    
    # Compute additional statistics
    distinct_counts = data.nunique()
    
    return distinct_counts

In [3]:
compute_descriptive_stats(data)

user_id         19917
log_id         708631
sequence_id       100
correct            11
dtype: int64

# Data pre-processing

In [4]:
data['sequence_id'] = data['sequence_id'].rank(method='dense').astype(int)
data.head()


Unnamed: 0,user_id,log_id,sequence_id,correct
0,50121,167478035,26,0.0
1,50121,167478043,26,1.0
2,50121,167478053,26,1.0
3,50121,167478069,26,1.0
4,50964,167478041,26,1.0


## Convert the raw data to train-data and test-data

In [5]:
import pandas as pd

def read_data_from_csv(read_file, write_train, write_test):
    df = pd.read_csv(read_file)
    df['sequence_id'] = df['sequence_id'].rank(method='dense').astype(int)
    ui_df = df.groupby(['user_id'], sort=False)
    user_inters = []
    for ui in ui_df:
        tmp_inter = ui[1]
        tmp_seq_len = len(tmp_inter)  # Ensure `len` is not reassigned
        tmp_questions = [str(x) for x in list(tmp_inter['sequence_id'])]
        tmp_ans = ['1' if x == 1.0 else '0' for x in list(tmp_inter['correct'])]
        user_inters.append([str(tmp_seq_len), tmp_questions, tmp_ans])

    train = user_inters[: int(0.8 * len(user_inters))]
    test = user_inters[int(0.8 * len(user_inters)) :]
    write_datafile(write_train, train)
    write_datafile(write_test, test)
    return

def write_datafile(file, data):
    with open(file, 'w') as f:
        for index, dd in enumerate(data):
            if index == 0:
                continue  # Skip writing the header for the first row
            row = [str(dd[0]), ','.join(dd[1]), ','.join(dd[2])]
            for item in row:
                f.write(item + '\n')



if __name__ == '__main__':
    read_data_from_csv('2015_100_skill_builders_main_problems.csv', 'train-data.csv', 'test-data.csv')


The provided code defines a class called RNN that represents a Recurrent Neural Network. The class has methods for building, training, and evaluating the RNN model. It uses linear layers, activation functions, and loss functions from the Torch library. The code also includes functions for saving and loading the model, calculating gradients, and performing forward propagation. The RNN class is initialized with parameters such as the number of questions, hidden units, dropout settings, and maximum steps. The code demonstrates how to create, train, and use an RNN model for sequential data analysis.


# Data loader and Read data function

In [6]:
%%writefile Data/dataloader.py
import torch
import torch.utils.data as Data
from Data.readdata import DataReader


def getDataLoader(batch_size, num_of_questions, max_step):
    handle = DataReader('train-data.csv',
                        'test-data.csv', max_step,
                        num_of_questions)
    dtrain = torch.tensor(handle.getTrainData().astype(float).tolist(),
                          dtype=torch.float32)
    dtest = torch.tensor(handle.getTestData().astype(float).tolist(),
                         dtype=torch.float32)
    trainLoader = Data.DataLoader(dtrain, batch_size=batch_size, shuffle=True)
    testLoader = Data.DataLoader(dtest, batch_size=batch_size, shuffle=False)
    return trainLoader, testLoader



Overwriting Data/dataloader.py


In [7]:
%%writefile Data/readdata.py


import numpy as np
import itertools


class DataReader():
    def __init__(self, train_path, test_path, maxstep, numofques):
        self.train_path = train_path
        self.test_path = test_path
        self.maxstep = maxstep
        self.numofques = numofques

    def getData(self, file_path):
        data = []
        with open(file_path, 'r') as file:
            for len, ques, ans in itertools.zip_longest(*[file] * 3):
                len = int(len.strip().strip(','))
                ques = [int(q) for q in ques.strip().strip(',').split(',')]
                ans = [int(a) for a in ans.strip().strip(',').split(',')]
                slices = len//self.maxstep + (1 if len % self.maxstep > 0 else 0)
                for i in range(slices):
                    temp = temp = np.zeros(shape=[self.maxstep, 2 * self.numofques])
                    if len > 0:
                        if len >= self.maxstep:
                            steps = self.maxstep
                        else:
                            steps = len
                        for j in range(steps):
                            if ans[i*self.maxstep + j] == 1:
                                temp[j][ques[i*self.maxstep + j]] = 1
                            else:
                                temp[j][ques[i*self.maxstep + j] + self.numofques] = 1
                        len = len - self.maxstep
                    data.append(temp.tolist())
            print('done: ' + str(np.array(data).shape))
        return data

    def getTrainData(self):
        print('loading train data...')
        trainData = self.getData(self.train_path)
        return np.array(trainData)

    def getTestData(self):
        print('loading test data...')
        testData = self.getData(self.test_path)
        return np.array(testData)

Overwriting Data/readdata.py


Overall, this code trains an RNN model on the provided data and saves the trained model and training progress to files. It uses semi-sorted mini-batches for training and evaluates the model's accuracy on the test data.



# RNN Model

In [8]:
%%writefile model/DKT/RNNModel.py

import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, device):
        super(RNNModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='tanh')
        self.fc = nn.Linear(self.hidden_dim, self.output_dim)
        self.sig = nn.Sigmoid()
        self.device = device

    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=self.device)
        out, hn = self.rnn(x, h0)
        res = self.sig(self.fc(out))
        return res


Overwriting model/DKT/RNNModel.py


# Evaluation

1. performance(ground_truth, prediction): Calculates and prints evaluation metrics such as AUC, F1 score, recall, and precision based on the ground truth labels and predicted probabilities.


2. lossFunc(num_of_questions, max_step, device): Implements a custom loss function for the DKT model. Computes the loss, prediction, and ground truth tensors for evaluation.


3. train_epoch(model, trainLoader, optimizer, loss_func, device): Performs a single training epoch. Computes forward pass, loss, backpropagation, and updates model parameters.


4. test_epoch(model, testLoader, loss_func, device): Evaluates the DKT model on test data. Computes forward pass, loss, and collects predicted probabilities and ground truth labels. Prints evaluation metrics using performance function.

In [None]:
%%writefile Evaluation/eval.py

import tqdm
import torch
import logging

import torch.nn as nn
from sklearn import metrics

logger = logging.getLogger('main.eval')


def performance(ground_truth, prediction):
    fpr, tpr, thresholds = metrics.roc_curve(ground_truth.detach().cpu().numpy(),
                                             prediction.detach().cpu().numpy())
    auc = metrics.auc(fpr, tpr)

    f1 = metrics.f1_score(ground_truth.detach().cpu().numpy(),
                          torch.round(prediction).detach().cpu().numpy())
    recall = metrics.recall_score(ground_truth.detach().cpu().numpy(),
                                  torch.round(prediction).detach().cpu().numpy())
    precision = metrics.precision_score(
        ground_truth.detach().cpu().numpy(),
        torch.round(prediction).detach().cpu().numpy())
    logger.info('auc: ' + str(auc) + ' f1: ' + str(f1) + ' recall: ' +
                str(recall) + ' precision: ' + str(precision))
    print('auc: ' + str(auc) + ' f1: ' + str(f1) + ' recall: ' + str(recall) +
          ' precision: ' + str(precision))


class lossFunc(nn.Module):
    def __init__(self, num_of_questions, max_step, device):
        super(lossFunc, self).__init__()
        self.crossEntropy = nn.BCELoss()
        self.num_of_questions = num_of_questions
        self.max_step = max_step
        self.device = device

    def forward(self, pred, batch):
        loss = 0
        prediction = torch.tensor([], device=self.device)
        ground_truth = torch.tensor([], device=self.device)
        for student in range(pred.shape[0]):
            delta = batch[student][:, 0:self.num_of_questions] + batch[
                student][:, self.num_of_questions:]  # shape: [length, questions]
            temp = pred[student][:self.max_step - 1].mm(delta[1:].t())
            index = torch.tensor([[i for i in range(self.max_step - 1)]],
                                 dtype=torch.long, device=self.device)
            p = temp.gather(0, index)[0]
            a = (((batch[student][:, 0:self.num_of_questions] -
                   batch[student][:, self.num_of_questions:]).sum(1) + 1) //
                 2)[1:]
            for i in range(len(p) - 1, -1, -1):
                if p[i] > 0:
                    p = p[:i + 1]
                    a = a[:i + 1]
                    break
            loss += self.crossEntropy(p, a)
            prediction = torch.cat([prediction, p])
            ground_truth = torch.cat([ground_truth, a])
        return loss, prediction, ground_truth


def train_epoch(model, trainLoader, optimizer, loss_func, device):
    model.to(device)
    for batch in tqdm.tqdm(trainLoader, desc='Training:    ', mininterval=2):
        batch = batch.to(device)
        pred = model(batch)
        loss, prediction, ground_truth = loss_func(pred, batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return model, optimizer

def test_epoch(model, testLoader, loss_func, device):
    model.to(device)
    ground_truth = torch.tensor([], device=device)
    prediction = torch.tensor([], device=device)
    for batch in tqdm.tqdm(testLoader, desc='Testing:     ', mininterval=2):
        batch = batch.to(device)
        pred = model(batch)
        loss, p, a = loss_func(pred, batch)
        prediction = torch.cat([prediction, p])
        ground_truth = torch.cat([ground_truth, a])
    performance(ground_truth, prediction)
    return prediction




# Write main function

In [18]:
%%writefile run.py

"""
Usage:
    run.py rnn --hidden=<h> [options]

Options:
    --length=<int>                      max length of question sequence [default: 50]
    --questions=<int>                   num of question [default: 150]
    --lr=<float>                        learning rate [default: 0.001]
    --bs=<int>                          batch size [default: 64]
    --seed=<int>                        random seed [default: 13]
    --epochs=<int>                      number of epochs [default: 2]
    --cuda=<int>                        use GPU id [default: 0]
    --hidden=<int>                      dimension of hidden state [default: 50]
    --layers=<int>                      layers of rnn [default: 4]
    --dropout=<float>                   dropout rate [default: 0.1]
"""


import os
import random
import logging
import torch

import torch.optim as optim
import numpy as np

from datetime import datetime
from docopt import docopt
from Data.dataloader import getDataLoader
from Evaluation import eval


def setup_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def main():

    args = docopt(__doc__)
    length = int(args['--length'])
    questions = int(args['--questions'])
    lr = float(args['--lr'])
    bs = int(args['--bs'])
    seed = int(args['--seed'])
    epochs = int(args['--epochs'])
    cuda = args['--cuda']
    hidden = int(args['--hidden'])
    layers = int(args['--layers'])
    dropout = float(args['--dropout'])
    model_type = 'RNN'

    logger = logging.getLogger('main')
    logger.setLevel(level=logging.DEBUG)
    date = datetime.now()
    setup_seed(seed)

    if torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    trainLoader, testLoader = getDataLoader(bs, questions, length)

    from model.DKT.RNNModel import RNNModel
    model = RNNModel(questions * 2, hidden, layers, questions, device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_func = eval.lossFunc(questions, length, device)

    predicted_var = None  # Initialize a variable to store the predicted variable

    
    for epoch in range(epochs):
        print('epoch: ' + str(epoch))
        model, optimizer = eval.train_epoch(model, trainLoader, optimizer,
                                          loss_func, device)
        logger.info(f'epoch {epoch}')

    # Save the model
    model_dir = 'Result'  
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, 'model.pth')
    torch.save(model.state_dict(), model_path)
    



if __name__ == '__main__':
    main()


   


Overwriting run.py


In [19]:
!python run.py rnn --hidden=50


loading train data...
done: (22037, 50, 300)
loading test data...
done: (4691, 50, 300)
epoch: 0
Training:    : 100%|██████████████████████████| 345/345 [00:17<00:00, 19.93it/s]
epoch: 1
Training:    : 100%|██████████████████████████| 345/345 [00:17<00:00, 20.11it/s]
Traceback (most recent call last):
  File "/Users/nguyenhoangngocha21/Documents/GitHub/Knowledge-tracing-model/run.py", line 96, in <module>
    main()
  File "/Users/nguyenhoangngocha21/Documents/GitHub/Knowledge-tracing-model/run.py", line 91, in main
    return predicted_values
NameError: name 'predicted_values' is not defined


In [2]:

import streamlit as st
import pandas as pd
import torch
import torch.utils.data as Data
from Data.dataloader import getDataLoader
from model.DKT.RNNModel import RNNModel
from Evaluation.eval import performance, lossFunc

# Function to load and preprocess the data
def load_data(train_file, test_file, max_step, num_of_questions):
    from Data.readdata import readdata
    handle = DataLoader(train_file, test_file, max_step, num_of_questions)
    dtest = torch.tensor(handle.getTestData().astype(float).tolist(), dtype=torch.float32)
    test_loader = Data.DataLoader(dtest, batch_size=batch_size, shuffle=False)
    return test_loader

# Function to perform prediction
def predict(model, test_loader, num_of_questions):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    prediction = torch.tensor([], device=device)
    ground_truth = torch.tensor([], device=device)

    for batch in test_loader:
        batch = batch.to(device)
        pred = model(batch)
        prediction = torch.cat([prediction, pred])
        ground_truth = torch.cat([ground_truth, batch[:, :, :num_of_questions].sum(dim=1)])

    return prediction, ground_truth

# Function to display performance metrics
def display_performance(ground_truth, prediction):
    st.subheader('Performance Metrics')
    performance(ground_truth, prediction)

# Function to recommend items
def recommend_items():
    # Add your recommendation logic here
    # You can display recommended items based on the prediction or any other recommendation algorithm

    st.subheader('Recommendation')
    # Display recommended items

# Main Streamlit web application code
st.title('DKT Model Evaluation and Recommendation System')
st.sidebar.title('Configuration')

# Read input files
train_file = 'train.data.csv'
test_file = 'test.data.csv'

# Read parameters from user input or use default values
max_step = st.sidebar.number_input('Max length of question sequence', value=50)
num_of_questions = st.sidebar.number_input('Number of questions', value=150)
input_dim = st.sidebar.number_input('Input dimension', value=300)
hidden_dim = st.sidebar.number_input('Hidden dimension', value=50)
layer_dim = st.sidebar.number_input('Number of layers', value=4)
output_dim = st.sidebar.number_input('Output dimension', value=150)

if train_file is not None and test_file is not None:
    # Load data
    test_loader = load_data(train_file, test_file, max_step, num_of_questions)

    # Perform prediction
    model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim, device)
    model.load_state_dict(torch.load(model_path))
    prediction, ground_truth = predict(model, test_loader, num_of_questions)

    # Display performance metrics
    display_performance(ground_truth, prediction)

    # Recommend items
    recommend_items()




NameError: name 'DataReader' is not defined