# MultiSkill Knowledge Tracing predictions from BKT parameters
In this notebook, we will at first learn BKT parameters from train data. We will take a look at the performance on BKT predictions with these parameters on test data. Then, we will integrate these parameters into our algorithm and check its performance on test data. Finally, we will compare the two algorithms.

In [1]:
import sys
sys.path.append("/Users/olivier/PycharmProjects/bayesian-kst/")  # for mac

import pyAgrum as gum
from kgraph.expert_layer.domain import Domain
from kgraph.expert_layer.knowledge_components import KnowledgeComponent
from kgraph.expert_layer.link import Link
from kgraph.resources_layer.exercise import Exercise
from kgraph.learner_layer.answer import LearnerAnswer
from kgraph.learner_layer.learner import Learner
from kgraph.learner_layer.learner_pool import LearnerPool
from kgraph.helpers.truthtable import truthtable
from math import floor
import pyAgrum.lib.notebook as gnb
import pyAgrum.lib.dynamicBN as gdyn
import random
import itertools
import numpy as np
import tqdm
import sklearn.metrics as sk_metrics
from sklearn.model_selection import KFold

## Import the data

In [2]:
# we define the KCs

KC_A = KnowledgeComponent(55365, "Déterminer l'appartenance d'un nombre réel à un intervalle fini")
KC_B = KnowledgeComponent(55363, "Déterminer l'appartenance d'un nombre réel à un intervalle infini")
KC_C = KnowledgeComponent(55364, "Déterminer l'appartenance d'un nombre réel à un intervalle simple")
KC_D = KnowledgeComponent(50988, "Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R")
KC_E = KnowledgeComponent(50989, "Déterminer l'appartenance d'un nombre réel à une réunion d'intervalles de R")

A_2_C = Link(source=KC_A, target=KC_C)
B_2_C = Link(source=KC_B, target=KC_C)
C_2_D = Link(source=KC_C, target=KC_D)
C_2_E = Link(source=KC_C, target=KC_E)
domain = Domain([KC_A, KC_B, KC_C, KC_D, KC_E], [A_2_C, B_2_C, C_2_D, C_2_E])

params = {"slip": .01, "guess":.01}

# there are 5 exercises corresponding to KC A
ex_A_1 = Exercise(237957, KC_A, "qcm", ex_content="", params=params)
ex_A_2 = Exercise(237958, KC_A, "qcm", ex_content="", params=params)
ex_A_3 = Exercise(237959, KC_A, "qcm", ex_content="", params=params)
ex_A_4 = Exercise(237960, KC_A, "qcm", ex_content="", params=params)
ex_A_5 = Exercise(237961, KC_A, "qcm", ex_content="", params=params)

# there are also 5 exercises corresponding to KC B
ex_B_1 = Exercise(237947, KC_B, "qcm", ex_content="", params=params)
ex_B_2 = Exercise(237948, KC_B, "qcm", ex_content="", params=params)
ex_B_3 = Exercise(237949, KC_B, "qcm", ex_content="", params=params)
ex_B_4 = Exercise(237950, KC_B, "qcm", ex_content="", params=params)
ex_B_5 = Exercise(237951, KC_B, "qcm", ex_content="", params=params)

ex_C_1 = Exercise(237952, KC_C, "qcm", ex_content="", params=params)
ex_C_2 = Exercise(237953, KC_C, "qcm", ex_content="", params=params)
ex_C_3 = Exercise(237954, KC_C, "qcm", ex_content="", params=params)
ex_C_4 = Exercise(237955, KC_C, "qcm", ex_content="", params=params)
ex_C_5 = Exercise(237956, KC_C, "qcm", ex_content="", params=params)


ex_D_1 = Exercise(225183, KC_D, "qcm", ex_content="", params=params)
ex_D_2 = Exercise(225184, KC_D, "qcm", ex_content="", params=params)
ex_D_3 = Exercise(225185, KC_D, "qcm", ex_content="", params=params)
ex_D_4 = Exercise(225186, KC_D, "qcm", ex_content="", params=params)
ex_D_5 = Exercise(225187, KC_D, "qcm", ex_content="", params=params)

ex_E_1 = Exercise(225165, KC_E, "qcm", ex_content="", params=params)
ex_E_2 = Exercise(225166, KC_E, "qcm", ex_content="", params=params)
ex_E_3 = Exercise(225167, KC_E, "qcm", ex_content="", params=params)
ex_E_4 = Exercise(225168, KC_E, "qcm", ex_content="", params=params)
ex_E_5 = Exercise(225169, KC_E, "qcm", ex_content="", params=params)

def get_KC_from_exercise_id(exercise_id):
    if exercise_id in range(237957, 237962):
        return KC_A
    elif exercise_id in range(237947, 237952):
        return KC_B
    elif exercise_id in range(237952, 237957):
        return KC_C
    elif exercise_id in range(225183, 225188):
        return KC_D
    else:
        return KC_E

## Clean the data

## Define train and test datasets

         idx  doc_id  exercise_id  evaluation_id  success  user_id  \
0          1   50988       225183      109276367        0   757204   
1          2   50988       225183      109293461        1  2052585   
2          3   50988       225183      109293517        1  2052585   
3          4   50988       225183      109293574        1  2052585   
4          5   50988       225183      109307385        1  1896564   
...      ...     ...          ...            ...      ...      ...   
42478  42479   55365       237961      151659716        1  1278392   
42479  42480   55365       237961      151664581        1  3926123   
42480  42481   55365       237961      151667070        1  3275043   
42481  42482   55365       237961      151667191        1  3275043   
42482  42483   55365       237961      151669835        1  1699544   

                 createdAt  
0      2019-08-12 12:57:49  
1      2019-08-12 19:18:03  
2      2019-08-12 19:20:45  
3      2019-08-12 19:23:40  
4      2019-08

## Learn the BKT parameters from the train dataset

In [4]:
from pyBKT.models import Model
import pandas as pd

def get_strongest_folds(full, axis="user_id", nb_folds=5):
    all_elements = full[axis].unique()

    kfold = KFold(nb_folds, shuffle=True)
    folds = []
    for i, (train, test) in enumerate(kfold.split(all_elements)):
        list_of_test_ids = []
        for element_id in test:
            list_of_test_ids += list(full.query(f'{axis} == {all_elements[element_id]}').index)
        folds.append(np.array(list_of_test_ids))
    
    return folds

folds = get_strongest_folds(df, "user_id", 2)
test_ids = folds[0]

train_ids = list(set(list(df.index.values)) - set(test_ids))

df_train = df[df.index.isin(train_ids)]
df_test = df[df.index.isin(test_ids)]


# Initialize the model with an optional seed
model = Model(seed = 42, num_fits = 1)
defaults = {'order_id': 'idx', 'skill_name': 'doc_id', 'correct': 'success'}

model.fit(data = df_train, defaults = defaults)
preds_df = model.predict(data=df_test)

expected_values = df_test["success"]
predicted_values = list(preds_df["correct_predictions"])
print('ACC', model.evaluate(data = df_test, metric = 'accuracy'))
print('AUC', model.evaluate(data = df_test, metric = 'auc'))
print('kappa', sk_metrics.cohen_kappa_score(np.array(expected_values), [1 if predicted_values[i]>.6 else 0 for i in range(len(predicted_values))]))

### Score with pyBKT

ACC 0.7362683438155136
AUC 0.69986070273449
kappa 0.24418008730785223


## Compute the score of MSKT on the test dataset with trained BKT parameters

In [6]:
def get_predictions_from_dataset(dataset, learner_pool, floor_idx=0, verbose=False):
    """
    Computes the prediction performance of the MSKT.
    :param metrics: list or str, the metrics that are computed to measure the prediction performance
    :param dataset: pandas DataFrame, the dataset on which the performance is computed
    :param floor_index: int, the minimal number of steps from which the performance is computed
    :param verbose: bool, verbose info
    :return the value of the metrics on the dataset
    """
    all_exp, all_pred = [], []
    for learner_id in tqdm.tqdm(dataset["user_id"].unique()[:100]):
        learner = Learner(learner_id, learner_pool)
        learner_df = dataset[dataset["user_id"] == learner_id]

        if len(learner_df["doc_id"].unique()) > 1:
            learner_traces = [
                [get_KC_from_exercise_id(row["exercise_id"]), row["success"]] for i, row in learner_df.iterrows()]

            n_traces = len(learner_traces)
            key_idx = [i for i in range(1, n_traces) if learner_traces[i-1][0] is not learner_traces[i][0]]
            
            expected_values = [trace[1] for trace in np.array(learner_traces)[key_idx]]
            predicted_values = learner.predict_sequence(learner_traces, floor_idx, verbose)
            predicted_values = [predicted_values[f"eval({learner_traces[j][0].name}){j}"] for j in key_idx]
            
            all_exp.append(expected_values)
            all_pred.append(predicted_values)
            
            if verbose:
                print(f"learner #{learner.id}")
                print("learner traces:", [trace[0].name, trace[1] for trace in learner_traces])
                print("key indices", key_idx)
                print("predicted values", predicted_values)
    return all_exp, all_pred


def get_prediction_performance(metrics, dataset, learner_pool, floor_idx=0, verbose=False):
    if not isinstance(metrics, list):
        if isinstance(metrics, str):
            metrics = [metrics]
    for metric in metrics:
        assert metric in ('roc_auc', 'accuracy', 'cohen_kappa'), f"Metric {metric} not handled: only AUC, ACC, " \
                                                                  f"and Cohen Kappa can be considered."

    all_exp, all_pred = get_predictions_from_dataset(dataset, learner_pool, floor_idx, verbose)
    res = [getattr(sk_metrics, f'{metric}_score')(all_exp, all_pred) for metric in metrics if metric!='cohen_kappa']
    return res

SyntaxError: invalid syntax (<ipython-input-6-c12fc3b10d7c>, line 31)

### No linking -- should be equivalent to BKT

In [None]:
no_linking = {KC_A: {KC_C: 'not existing'}, KC_B:{KC_C: 'not existing'}, 
                  KC_C: {KC_A: 'not existing', KC_B:'not existing', KC_D: 'not existing', KC_E:'not existing'},
                  KC_D: {KC_C: 'not existing'}, KC_E:{KC_C: 'not existing'}}
learner_pool = LearnerPool(domain, no_linking, 'no links')

for kc in learner_pool.get_knowledge_components():
    learner_pool.set_learn(kc, model.params().loc[f'{kc.id}', 'learns', 'default'].value)
    learner_pool.set_prior(kc, model.params().loc[f'{kc.id}', 'prior', 'default'].value)
    learner_pool.set_slip(kc, model.params().loc[f'{kc.id}', 'slips', 'default'].value)
    learner_pool.set_guess(kc, model.params().loc[f'{kc.id}', 'guesses', 'default'].value)
    learner_pool.set_forget(kc, model.params().loc[f'{kc.id}', 'forgets', 'default'].value)
    print(model.params().loc[f'{kc.id}', 'prior', 'default'].value)

    
metrics = ['accuracy', 'roc_auc', 'cohen_kappa']

all_exp, all_pred = get_predictions_from_dataset(df, learner_pool, floor_idx=0, verbose=False)

In [None]:
acc = sk_metrics.accuracy_score(all_exp, [1 if all_pred[i]>.5 else 0 for i in range(len(all_pred))])
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = sk_metrics.cohen_kappa_score(np.array(all_exp), [1 if all_pred[i]>.6 else 0 for i in range(len(all_pred))])

print(acc, auc, cohen_kappa)

In [None]:
import sys 
np.set_printoptions(threshold=sys.maxsize)
print(df[df["user_id"].isin(df["user_id"].unique()[:100])])
print(np.array(all_exp), np.array(all_pred))

ACC 0.7314253222137983
AUC 0.6760619811530224
MAE 0.37621672612561663


### Weak linking

In [None]:
weak_linking = {KC_A: {KC_C: 'weak'}, KC_B:{KC_C: 'weak'}, 
                  KC_C:{KC_A: 'weak', KC_B:'weak', KC_D: 'weak', KC_E:'weak'},
                  KC_D: {KC_C: 'weak'}, KC_E:{KC_C: 'weak'}}
learner_pool = LearnerPool(domain, weak_linking, 'weak')

for kc in learner_pool.get_knowledge_components():
    learner_pool.set_learn(kc, model.params().loc[f'{kc.id}', 'learns', 'default'].value)
    learner_pool.set_prior(kc, model.params().loc[f'{kc.id}', 'prior', 'default'].value)
    learner_pool.set_slip(kc, model.params().loc[f'{kc.id}', 'slips', 'default'].value)
    learner_pool.set_guess(kc, model.params().loc[f'{kc.id}', 'guesses', 'default'].value)
    learner_pool.set_forget(kc, model.params().loc[f'{kc.id}', 'forgets', 'default'].value)


all_exp, all_pred = get_predictions_from_dataset(df, learner_pool, floor_idx=0, verbose=True)

acc = sk_metrics.accuracy_score(all_exp, [1 if all_pred[i]>.5 else 0 for i in range(len(all_pred))])
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = sk_metrics.cohen_kappa_score(np.array(all_exp), [1 if all_pred[i]>.6 else 0 for i in range(len(all_pred))])

print(acc, auc, cohen_kappa)

In [None]:
import sys
np.set_printoptions(threshold=sys.maxsize)
print(np.array(all_exp), np.array(all_pred))

spread au moment de regarder la next step

ACC 0.7332687159604911
AUC 0.689356682892977
MAE 0.367123167694489

avnt
ACC 0.7314253222137983
AUC 0.688447139953072
MAE 0.3572016288976596


### Strong linking

In [None]:
strong_linking = {KC_A: {KC_C: 'strong'}, KC_B:{KC_C: 'strong'}, 
                  KC_C:{KC_A: 'strong', KC_B:'strong', KC_D: 'strong', KC_E:'strong'},
                  KC_D: {KC_C: 'strong'}, KC_E:{KC_C: 'strong'}}
learner_pool = LearnerPool(domain, strong_linking, 'strong')

for kc in learner_pool.get_knowledge_components():
    learner_pool.set_learn(kc, model.params().loc[f'{kc.id}', 'learns', 'default'].value)
    learner_pool.set_prior(kc, 0)
    learner_pool.set_slip(kc, model.params().loc[f'{kc.id}', 'slips', 'default'].value)
    learner_pool.set_guess(kc, model.params().loc[f'{kc.id}', 'guesses', 'default'].value)
    learner_pool.set_forget(kc, model.params().loc[f'{kc.id}', 'forgets', 'default'].value)

    
student_df = df[df["user_id"] == np.unique(df["user_id"])[1]]
all_exp, all_pred = get_predictions_from_dataset(student_df, learner_pool, floor_idx=0, verbose=True)

acc = sk_metrics.accuracy_score(all_exp, [1 if all_pred[i]>.5 else 0 for i in range(len(all_pred))])
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = sk_metrics.cohen_kappa_score(np.array(all_exp), [1 if all_pred[i]>.6 else 0 for i in range(len(all_pred))])

print(acc, auc, cohen_kappa)

In [None]:
import sys 
np.set_printoptions(threshold=sys.maxsize)
print(np.array(all_exp), np.array(all_pred))

spread au momentde la préd

ACC 0.7332687159604911
AUC 0.6889450343589241
MAE 0.3671020179386424



ACC 0.7314253222137983
AUC 0.6884344591123446
MAE 0.3571178246783089
