# HOW TO: dynamically infer the learner knowledge states 

### Package importation

In [1]:
import sys
sys.path.append("/Users/olivier/PycharmProjects/bayesian-kst/")  # for mac

import pyAgrum as gum
from kgraph.expert_layer.domain import Domain
from kgraph.expert_layer.knowledge_components import KnowledgeComponent
from kgraph.expert_layer.link import Link
from kgraph.resources_layer.exercise import Exercise
from kgraph.learner_layer.answer import LearnerAnswer
from kgraph.learner_layer.learner import Learner
from kgraph.learner_layer.learner_pool import LearnerPool
from kgraph.helpers.truthtable import truthtable
from math import floor
import pyAgrum.lib.notebook as gnb
import pyAgrum.lib.dynamicBN as gdyn
import random
import itertools
import numpy as np
from sklearn.metrics import roc_auc_score

## Modélisation du réseau bayésien dynamique à partir du modèle du domaine

Exporting the domain from dataset

### Defining the domain

We consider the following domain: 
- KC A : "Déterminer l'appartenance d'un nombre réel à un intervalle fini"
- KC B : "Déterminer l'appartenance d'un nombre réel à un intervalle infini"
- KC C : "Déterminer l'appartenance d'un nombre réel à un intervalle simple"
- KC D : "Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R"
- KC E : "Déterminer l'appartenance d'un nombre réel à une réunion d'intervalles de R"

In [2]:
# we define the KCs

KC_A = KnowledgeComponent(55365, "Déterminer l'appartenance d'un nombre réel à un intervalle fini")
KC_B = KnowledgeComponent(55363, "Déterminer l'appartenance d'un nombre réel à un intervalle infini")
KC_C = KnowledgeComponent(55364, "Déterminer l'appartenance d'un nombre réel à un intervalle simple")
KC_D = KnowledgeComponent(50988, "Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R")
KC_E = KnowledgeComponent(50989, "Déterminer l'appartenance d'un nombre réel à une réunion d'intervalles de R")

A_2_C = Link(source=KC_A, target=KC_C)
B_2_C = Link(source=KC_B, target=KC_C)
C_2_D = Link(source=KC_C, target=KC_D)
C_2_E = Link(source=KC_C, target=KC_E)
domain = Domain([KC_A, KC_B, KC_C, KC_D, KC_E], [A_2_C, B_2_C, C_2_D, C_2_E])

params = {"slip": .01, "guess":.01}

# there are 5 exercises corresponding to KC A
ex_A_1 = Exercise(237957, KC_A, "qcm", ex_content="", params=params)
ex_A_2 = Exercise(237958, KC_A, "qcm", ex_content="", params=params)
ex_A_3 = Exercise(237959, KC_A, "qcm", ex_content="", params=params)
ex_A_4 = Exercise(237960, KC_A, "qcm", ex_content="", params=params)
ex_A_5 = Exercise(237961, KC_A, "qcm", ex_content="", params=params)

# there are also 5 exercises corresponding to KC B
ex_B_1 = Exercise(237947, KC_B, "qcm", ex_content="", params=params)
ex_B_2 = Exercise(237948, KC_B, "qcm", ex_content="", params=params)
ex_B_3 = Exercise(237949, KC_B, "qcm", ex_content="", params=params)
ex_B_4 = Exercise(237950, KC_B, "qcm", ex_content="", params=params)
ex_B_5 = Exercise(237951, KC_B, "qcm", ex_content="", params=params)

ex_C_1 = Exercise(237952, KC_C, "qcm", ex_content="", params=params)
ex_C_2 = Exercise(237953, KC_C, "qcm", ex_content="", params=params)
ex_C_3 = Exercise(237954, KC_C, "qcm", ex_content="", params=params)
ex_C_4 = Exercise(237955, KC_C, "qcm", ex_content="", params=params)
ex_C_5 = Exercise(237956, KC_C, "qcm", ex_content="", params=params)


ex_D_1 = Exercise(225183, KC_D, "qcm", ex_content="", params=params)
ex_D_2 = Exercise(225184, KC_D, "qcm", ex_content="", params=params)
ex_D_3 = Exercise(225185, KC_D, "qcm", ex_content="", params=params)
ex_D_4 = Exercise(225186, KC_D, "qcm", ex_content="", params=params)
ex_D_5 = Exercise(225187, KC_D, "qcm", ex_content="", params=params)

ex_E_1 = Exercise(225165, KC_E, "qcm", ex_content="", params=params)
ex_E_2 = Exercise(225166, KC_E, "qcm", ex_content="", params=params)
ex_E_3 = Exercise(225167, KC_E, "qcm", ex_content="", params=params)
ex_E_4 = Exercise(225168, KC_E, "qcm", ex_content="", params=params)
ex_E_5 = Exercise(225169, KC_E, "qcm", ex_content="", params=params)

def get_KC_from_exercise_id(exercise_id):
    if exercise_id in range(237957, 237962):
        return KC_A
    elif exercise_id in range(237947, 237952):
        return KC_B
    elif exercise_id in range(237952, 237957):
        return KC_C
    elif exercise_id in range(225183, 225188):
        return KC_D
    else:
        return KC_E
    

## Modélisation de l'apprentissage de l'apprenant à partir d'ajout de preuves au modèle du domaine

In [3]:
import pandas as pd

df = pd.read_csv("5_KCs_example_data.csv")

print(df)

       exercise_id  evaluation_id  success  user_id            createdAt
0           225183      109276367        0   757204  2019-08-12 12:57:49
1           225183      109293461        1  2052585  2019-08-12 19:18:03
2           225183      109293517        1  2052585  2019-08-12 19:20:45
3           225183      109293574        1  2052585  2019-08-12 19:23:40
4           225183      109307385        1  1896564  2019-08-13 11:16:49
...            ...            ...      ...      ...                  ...
42006       237961      151532622        1  3940614  2021-09-08 12:49:53
42007       237961      151546756        1  1970804  2021-09-08 23:48:52
42008       237961      151546760        1  1970804  2021-09-08 23:49:21
42009       237961      151549675        1  3943368  2021-09-09 11:07:45
42010       237961      151549982        0  3940672  2021-09-09 11:22:25

[42011 rows x 5 columns]


### Data cleaning

In [4]:
g = df.groupby('user_id')
df = g.filter(lambda x: len(x) > 10)


In [5]:
strong_linking = {KC_A: {KC_C: 'strong'}, KC_B:{KC_C: 'strong'}, 
                  KC_C:{KC_A: 'strong', KC_B:'strong', KC_D: 'strong', KC_E:'strong'},
                  KC_D: {KC_C: 'strong'}, KC_E:{KC_C: 'strong'}}
learner_pool = LearnerPool(domain, strong_linking, 'strong')

random_learner_id = random.choice(df["user_id"].unique())
learner = Learner(random_learner_id, learner_pool)

learner_evals = df[df["user_id"]==random_learner_id]
print(random_learner_id, len(learner_evals.index))

1220158 55


## Compute learner answers prediction

In [6]:
answers = [[get_KC_from_exercise_id(row["exercise_id"]), row["success"]] for i, row in learner_evals.iterrows()]

import time


print("The answers of the learner are:", [[answers[i][0].name, answers[i][1]] for i in range(len(answers))], "\n")
start = time.time()

predicted_answers = learner.predict_answers(answers, verbose=False)
end = time.time()
print("Elapsed time for prediction: ", end - start, "\n")

print("We store all the information from the prediction:")
print(dict(list(predicted_answers[0].items())[:3]), "...\n")

print("But the real prediction information is predicted success on evaluated KCs :")
print(np.array([[answers[i][1], predicted_answers[1][i]] for i in range(len(predicted_answers[1]))]))

The answers of the learner are: [["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 0], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 0], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 0], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 1], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 1], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 1], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 0], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 1], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 1], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles de R", 1], ["Déterminer l'appartenance d'un nombre réel à une intersection d'intervalles

## Compute the score of the prediction 

In [7]:
def compute_score(learner, evaluations):
    n_eval = len(evaluations)
    train_idx = random.sample(range(n_eval), floor(.7*n_eval))
    train_evaluations = [evaluations[i] for i in train_idx]
    test_idx = [x for x in range(n_eval) if x not in train_idx]
    test_evaluations = [evaluations[j] for j in test_idx]
    predicted_states, predicted_answers = learner.predict_answers_with_partial_evidences(
        evaluations, train_idx, verbose=True)
    expected_test_answers = [evaluations[i][1] for i in test_idx]
    predicted_test_answers = [predicted_answers[i] for i in test_idx]
    print(expected_test_answers) 
    print(predicted_test_answers)
    return roc_auc_score(expected_test_answers, predicted_test_answers)

def find_learner_id(df):
    g = df.groupby('user_id')
    df = g.filter(lambda x: len(x) < 20 )
    random_learner_id = random.choice(df["user_id"].unique())
    while (df[df["user_id"] == random_learner_id]["success"].sum()) > 0.9*len(
        df[df["user_id"] == random_learner_id].index):
        random_learner_id = random.choice(df["user_id"].unique())
    return random_learner_id

random_learner_id = find_learner_id(df)
learner = Learner(random_learner_id, learner_pool)
learner_evals = df[df["user_id"]==random_learner_id]
print(random_learner_id, len(learner_evals.index))
answers = [[get_KC_from_exercise_id(row["exercise_id"]), row["success"]] for i, row in learner_evals.iterrows()]


predicted_answers = compute_score(learner, answers)

3164637 13
[0, 1, 0, 0]
[0.8353156042617215, 0.8786627262377198, 0.6204869125702017, 0.6200218596878533]


In [14]:

def compute_score_2(learner, evaluations):
    n_eval = len(evaluations)
    floor_idx = 0  # floor(.7*n_eval)
    print(n_eval, floor_idx)
    expected_values = []
    predicted_values = []
    for i in range(floor_idx, n_eval):
        y, pred = learner.predict_answers(evaluations[:i+1], verbose=False)
        predicted_values.append(y[f"eval({evaluations[i][0].name}){i+1}"])
        expected_values.append(evaluations[i][1])
        
    print(expected_values) 
    print(predicted_values)
    return roc_auc_score(expected_values, predicted_values)

def compute_score_4(learner, evaluations):
    n_eval = len(evaluations)
    floor_idx = floor(.7*n_eval)
    print(n_eval, floor_idx)
    expected_values = []
    predicted_values = []
    for i in range(floor_idx, n_eval):
        y, pred = learner.predict_answers_without_evidences(evaluations[:i+1], verbose=False)
        predicted_values.append(y[f"eval({evaluations[i][0].name}){i+1}"])
        expected_values.append(evaluations[i][1])
        
    print(expected_values) 
    print(predicted_values)
    return roc_auc_score(expected_values, predicted_values)

def compute_score_3(learner, evaluations, floor_idx=0):
    print(floor_idx, len(evaluations))
    n_eval = len(evaluations)
    expected_values = []
    predicted_values = []
    for i in range(floor_idx, n_eval):
        y, pred = learner.predict_answers_with_partial_evidences(evaluations[:i+1], range(i), verbose=False)
        predicted_values.append(y[f"eval({evaluations[i][0].name}){i}"])
        expected_values.append(evaluations[i][1])
        
    print("Expected values", expected_values) 
    print("\n","Predicted values", predicted_values)
    print("\n AUC",roc_auc_score(expected_values, predicted_values))
    cond_pbas = []
    for i in range(len(expected_values)):
        cond_pbas.append(predicted_values[i] if expected_values[i] == 1 else 1 - predicted_values[i])

    return - np.log(sum(cond_pbas))

In [15]:
def predict_next_step(learner, evaluations, floor_idx=0):

    n_eval = len(evaluations)
    expected_values = []
    predicted_values = []
    for i in range(floor_idx, n_eval):
        y, pred = learner.predict_answers_with_partial_evidences(evaluations[:i+1], range(i), verbose=False)
        predicted_values.append(y[f"eval({evaluations[i][0].name}){i+1}"])
        expected_values.append(evaluations[i][1])
        
    return (expected_values, predicted_values)

In [16]:
# score with all evidences 
print(compute_score_2(learner, answers))

# score with partial evidences on t-1 time steps
print(compute_score_3(learner, answers, floor_idx=floor(.5*len(answers))))

# score without evidences
print(compute_score_4(learner, answers))

44 0
[1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1]
[0.6659341964603872, 0.254723174172434, 0.672977007057434, 0.8624030701895619, 0.8910516368599408, 0.8943821730667172, 0.6213795168838097, 0.8550783277130423, 0.8904291853193443, 0.8943384715798663, 0.8947547313422837, 0.8947989954967125, 0.8948037432317323, 0.8948042820100396, 0.8948043674955837, 0.623049059019062, 0.8554809684525783, 0.5050407496218405, 0.8272826413037162, 0.8871471018019846, 0.231774379588024, 0.6880292204097812, 0.8673764299549153, 0.8918357443608754, 0.6122884034755395, 0.8533653554371166, 0.5001346004391779, 0.8260523120904328, 0.4424108280451418, 0.1690013859155085, 0.612312473509592, 0.8530778897578327, 0.890191628767715, 0.8943131665116809, 0.6212338850502988, 0.8551378437757434, 0.8904478483324965, 0.894342400568891, 0.6213467377949624, 0.8551572883568849, 0.8904494301795018, 0.607321996783112, 0.8522784034540107, 0.8901170

In [17]:
print(compute_score_4(learner, answers))

link_strength_combinations = {}

weak_linking = {KC_A: {KC_C: 'weak'}, KC_B:{KC_C: 'weak'}, 
                  KC_C:{KC_A: 'weak', KC_B:'weak', KC_D: 'weak', KC_E:'weak'},
                  KC_D: {KC_C: 'weak'}, KC_E:{KC_C: 'weak'}}
learner_pool_2 = LearnerPool(domain, weak_linking, 'weak')

random_learner_id = random.choice(df["user_id"].unique())
learner_2 = Learner(random_learner_id, learner_pool_2)
answers_2 = [[get_KC_from_exercise_id(row["exercise_id"]), row["success"]] for i, row in learner_evals.iterrows()]

learner_evals = df[df["user_id"]==random_learner_id]
print(random_learner_id, len(learner_evals.index))
print(compute_score_4(learner_2, answers_2, floor_idx=floor(.5*len(answers_2))))


44 30
[1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1]
[0.5115402227704262, 0.5153747678683306, 0.5190218105624206, 0.5224889871855275, 0.5257838513818169, 0.5289138364523386, 0.5318862244723246, 0.5347081214612179, 0.5373864378977026, 0.5399278739047274, 0.542338908475292, 0.544625792162573, 0.5467945427135845, 0.5488509431811174]
0.393939393939394
1034821 35


TypeError: compute_score_4() got an unexpected keyword argument 'floor_idx'

In [12]:
print(roc_auc_score(exp, pred))

KeyboardInterrupt: 

In [None]:

learner = Learner(2, learner_pool)
compute_score_2(learner, [[KC_A, True] for _ in range(5)] 
                + [[KC_C, True] if i!=2 else [KC_C, False] for i in range(5)]
                + [[KC_E, False], [KC_E, False], [KC_E, True], [KC_E, False], [KC_E, True]])



In [None]:
print(compute_score_2_bis(learner, answers))