# Closing bracket transition prediction performance
In this notebook, we focus on the "closing bracket" domain knowledge structure. We study the transition prediction performance and how the prerequisite links between KCs influence it. The transition prediction performance is the capacity of our model to predict the success of a learner on an exercise when they have done an exercise on another KC right before. 

In [1]:
import sys
sys.path.append("/Users/olivier/PycharmProjects/bayesian-kst/")  # for mac

import pyAgrum as gum
from kgraph.expert_layer.domain import Domain
from kgraph.expert_layer.knowledge_components import KnowledgeComponent
from kgraph.expert_layer.link import Link
from kgraph.resources_layer.exercise import Exercise
from kgraph.learner_layer.answer import LearnerAnswer
from kgraph.learner_layer.learner import Learner
from kgraph.learner_layer.learner_pool import LearnerPool
from kgraph.helpers.truthtable import truthtable
from math import floor
import pyAgrum.lib.notebook as gnb
import pyAgrum.lib.dynamicBN as gdyn
import random
import itertools
import numpy as np
import tqdm
import sklearn.metrics as sk_metrics
from sklearn.model_selection import KFold
import pandas as pd

## Import the data
We use the three following KCs as KC A, KC B and KC C:
- KC A: "Déterminer l'appartenance d'un nombre réel à un intervalle fini"
- KC B: "Déterminer l'appartenance d'un nombre réel à un intervalle infini"
- KC C: "Déterminer l'appartenance d'un nombre réel à un intervalle simple"

Each of those is related to five Kartable exercises.

In [2]:
# The three knowledge components (KCs) of the domain.
KC_A = KnowledgeComponent(55365, "A")  # Determine whether a real number belongs to a finite interval
KC_B = KnowledgeComponent(55363, "B")  # Determine whether a real number belongs to an infinite interval
KC_C = KnowledgeComponent(55364, "C")  # Determine whether a real number belongs to a simple interval

# Two links, both targeting KC C: A -> C and B -> C.
A_2_C = Link(source=KC_A, target=KC_C)
B_2_C = Link(source=KC_B, target=KC_C)
domain = Domain([KC_A, KC_B, KC_C], [A_2_C, B_2_C])

# Slip/guess parameters shared by every exercise below (the very same dict object is passed each time).
params = {"slip": .01, "guess": .01}

# Each KC comes with 5 exercises whose ids are consecutive; exercises are built in increasing id order.
# KC A: ids 237957 to 237961
ex_A_1, ex_A_2, ex_A_3, ex_A_4, ex_A_5 = [
    Exercise(exercise_id, KC_A, "qcm", ex_content="", params=params)
    for exercise_id in range(237957, 237962)
]

# KC B: ids 237947 to 237951
ex_B_1, ex_B_2, ex_B_3, ex_B_4, ex_B_5 = [
    Exercise(exercise_id, KC_B, "qcm", ex_content="", params=params)
    for exercise_id in range(237947, 237952)
]

# KC C: ids 237952 to 237956
ex_C_1, ex_C_2, ex_C_3, ex_C_4, ex_C_5 = [
    Exercise(exercise_id, KC_C, "qcm", ex_content="", params=params)
    for exercise_id in range(237952, 237957)
]

def get_KC_from_exercise_id(exercise_id):
    """
    Maps an exercise id to the knowledge component it is attached to.
    :param exercise_id: the id of the exercise
    :return: KC_A for ids 237957 to 237961, KC_B for ids 237947 to 237951, KC_C for ids 237952 to 237956,
    None for any other id
    """
    id_ranges = (
        (range(237957, 237962), KC_A),
        (range(237947, 237952), KC_B),
        (range(237952, 237957), KC_C),
    )
    for id_range, knowledge_component in id_ranges:
        if exercise_id in id_range:
            return knowledge_component
    return None

## Clean the data
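For now, we only keep the answers to the exercises related to KC A, KC B and KC C (exercise ids 237947 to 237961). Further cleaning: TBD.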

In [3]:
# Load the traces and keep only the answers to the 15 exercises of KC A, B and C
# (exercise ids 237947 to 237961 included).
df = pd.read_csv("5_KCs_example_data.csv").loc[
    lambda traces: traces["exercise_id"].isin(range(237947, 237962))
]

## Score of the transition prediction performance

In [4]:
def get_transition_predictions(dataset, learner_pool, floor_idx=0, verbose=False):
    """
    Collects the expected and predicted outcomes of the "transition" answers of every learner, i.e. the answers
    given right after an answer on another KC.
    :param dataset: pandas DataFrame, the traces; the columns "user_id", "doc_id", "exercise_id" and "success"
    are read
    :param learner_pool: the learner pool every Learner built here is attached to
    :param floor_idx: int, forwarded as is to Learner.predict_sequence (the minimal number of steps from which
    the performance is computed)
    :param verbose: bool, verbose info
    :return: tuple (expected values, predicted values) of two 1-D float numpy arrays aligned element-wise; both
    are empty when no learner qualifies
    """
    all_exp, all_pred = [], []
    for learner_id in tqdm.tqdm(dataset["user_id"].unique()):
        learner = Learner(learner_id, learner_pool)
        learner_df = dataset[dataset["user_id"] == learner_id]

        # learners whose traces all share a single doc_id are skipped
        if len(learner_df["doc_id"].unique()) <= 1:
            continue

        if verbose:
            print(f"learner #{learner.id}")

        # iterrows is kept on purpose: the values handed to predict_sequence keep the exact types they had
        learner_traces = [
            [get_KC_from_exercise_id(row["exercise_id"]), row["success"]] for _, row in learner_df.iterrows()]

        # indices of the traces whose KC is not the KC of the previous trace
        key_idx = [i for i in range(1, len(learner_traces)) if learner_traces[i-1][0] is not learner_traces[i][0]]

        expected_values = [learner_traces[i][1] for i in key_idx]
        all_predicted_values = learner.predict_sequence(learner_traces, floor_idx, verbose)
        predicted_values = [all_predicted_values[f"eval({learner_traces[j][0].name}){j}"] for j in key_idx]

        # accumulate in plain lists and convert once at the end instead of re-allocating an array per learner
        all_exp.extend(expected_values)
        all_pred.extend(predicted_values)
        if verbose:
            print("predicted values", predicted_values)

    return np.asarray(all_exp, dtype=float), np.asarray(all_pred, dtype=float)


def get_transition_prediction_performance(metrics, dataset, learner_pool, floor_idx=0, verbose=False, threshold=.5):
    """
    Computes the transition prediction performance on a dataset.
    :param metrics: list or str, the metrics to compute, among 'roc_auc', 'accuracy' and 'cohen_kappa'
    :param dataset: pandas DataFrame, forwarded to get_transition_predictions
    :param learner_pool: the learner pool, forwarded to get_transition_predictions
    :param floor_idx: int, forwarded to get_transition_predictions
    :param verbose: bool, verbose info
    :param threshold: float, a prediction strictly above this value counts as a predicted success for the
    label-based metrics ('accuracy' and 'cohen_kappa')
    :return: list, the value of every requested metric, in the order of metrics
    """
    if isinstance(metrics, str):
        metrics = [metrics]
    for metric in metrics:
        assert metric in ('roc_auc', 'accuracy', 'cohen_kappa'), f"Metric {metric} not handled: only AUC, ACC, " \
                                                                  f"and Cohen Kappa can be considered."

    all_exp, all_pred = get_transition_predictions(dataset, learner_pool, floor_idx, verbose)

    # accuracy and Cohen's kappa compare class labels, so the predicted scores are binarized for them;
    # the ROC AUC is the only metric computed on the raw scores
    predicted_labels = (np.asarray(all_pred) > threshold).astype(int)
    return [
        getattr(sk_metrics, f'{metric}_score')(all_exp, all_pred if metric == 'roc_auc' else predicted_labels)
        for metric in metrics
    ]

### No linking -- should be equivalent to BKT

In [5]:
# "No linking" configuration: every pair of linked KCs is declared as 'not existing', in both directions.
no_linking = {
    KC_A: {KC_C: 'not existing'},
    KC_B: {KC_C: 'not existing'},
    KC_C: {KC_A: 'not existing', KC_B: 'not existing'},
}
learner_pool = LearnerPool(domain, no_linking, 'no links')

# Every KC gets the same parameter values; the setters are called in the listed order.
for kc in learner_pool.get_knowledge_components():
    for set_parameter, parameter_value in (
        (learner_pool.set_learn, .1),
        (learner_pool.set_prior, .1),
        (learner_pool.set_slip, .1),
        (learner_pool.set_guess, .25),
        (learner_pool.set_forget, 0),
    ):
        set_parameter(kc, parameter_value)

metrics = ['accuracy', 'roc_auc', 'cohen_kappa']

all_exp, all_pred = get_transition_predictions(df, learner_pool, floor_idx=0, verbose=False)

100%|██████████| 785/785 [02:31<00:00,  5.19it/s]


In [6]:
# Accuracy and Cohen's kappa are computed on binarized predictions, the ROC AUC on the raw scores.
# NOTE(review): accuracy binarizes at .5 whereas Cohen's kappa binarizes at .6 — confirm this is intended.
predicted_scores = np.asarray(all_pred)
acc = sk_metrics.accuracy_score(all_exp, (predicted_scores > .5).astype(int))
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = sk_metrics.cohen_kappa_score(np.array(all_exp), (predicted_scores > .6).astype(int))

print(acc, auc, cohen_kappa)

0.18444266238973536 0.5 0.0


### Weak linking

In [7]:
# "Weak linking" configuration: every pair of linked KCs is declared as 'weak', in both directions.
weak_linking = {
    KC_A: {KC_C: 'weak'},
    KC_B: {KC_C: 'weak'},
    KC_C: {KC_A: 'weak', KC_B: 'weak'},
}
learner_pool = LearnerPool(domain, weak_linking, 'weak')

# Every KC gets the same parameter values; the setters are called in the listed order.
for kc in learner_pool.get_knowledge_components():
    for set_parameter, parameter_value in (
        (learner_pool.set_learn, .1),
        (learner_pool.set_prior, .1),
        (learner_pool.set_slip, .1),
        (learner_pool.set_guess, .25),
        (learner_pool.set_forget, 0),
    ):
        set_parameter(kc, parameter_value)


all_exp, all_pred = get_transition_predictions(df, learner_pool, floor_idx=0, verbose=False)

100%|██████████| 785/785 [04:39<00:00,  2.81it/s]


In [8]:
# Accuracy and Cohen's kappa are computed on binarized predictions, the ROC AUC on the raw scores.
# NOTE(review): accuracy binarizes at .5 whereas Cohen's kappa binarizes at .6 — confirm this is intended.
predicted_scores = np.asarray(all_pred)
acc = sk_metrics.accuracy_score(all_exp, (predicted_scores > .5).astype(int))
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = sk_metrics.cohen_kappa_score(np.array(all_exp), (predicted_scores > .6).astype(int))

print(acc, auc, cohen_kappa)

0.3744987971130714 0.26599974349108635 -0.2634769691819696


### Strong linking

In [11]:
# "Strong linking" configuration: every pair of linked KCs is declared as 'strong', in both directions.
strong_linking = {
    KC_A: {KC_C: 'strong'},
    KC_B: {KC_C: 'strong'},
    KC_C: {KC_A: 'strong', KC_B: 'strong'},
}
learner_pool = LearnerPool(domain, strong_linking, 'strong')

# Every KC gets the same parameter values; the setters are called in the listed order.
for kc in learner_pool.get_knowledge_components():
    for set_parameter, parameter_value in (
        (learner_pool.set_learn, .1),
        (learner_pool.set_prior, .1),
        (learner_pool.set_slip, .1),
        (learner_pool.set_guess, .25),
        (learner_pool.set_forget, 0),
    ):
        set_parameter(kc, parameter_value)


all_exp, all_pred = get_transition_predictions(df, learner_pool, floor_idx=0, verbose=False)

100%|██████████| 785/785 [04:44<00:00,  2.76it/s]


In [12]:
# Accuracy and Cohen's kappa are computed on binarized predictions, the ROC AUC on the raw scores.
# NOTE(review): accuracy binarizes at .5 whereas Cohen's kappa binarizes at .6 — confirm this is intended.
predicted_scores = np.asarray(all_pred)
acc = sk_metrics.accuracy_score(all_exp, (predicted_scores > .5).astype(int))
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = sk_metrics.cohen_kappa_score(np.array(all_exp), (predicted_scores > .6).astype(int))

print(acc, auc, cohen_kappa)

0.37530072173215717 0.27199777692274807 -0.26314887250853203
