# Notebook: learner answers' prediction with DBN inference

### Package importation

In [1]:
import sys
sys.path.append("/Users/olivier/PycharmProjects/bayesian-kst/")  # for mac
sys.path.append("/home/olivier/PycharmProjects/bayesian-kst/")  # for ubuntu

from kgraph.expert_layer.domain import Domain
from kgraph.expert_layer.knowledge_components import KnowledgeComponent
from kgraph.expert_layer.link import Link
from kgraph.resources_layer.exercise import Exercise
from kgraph.learner_layer.answer import LearnerAnswer
from kgraph.learner_layer.learner import Learner
from kgraph.learner_layer.learner_pool import LearnerPool
from kgraph.helpers.truthtable import truthtable
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
import pyAgrum.lib.dynamicBN as gdyn
import random
import itertools
import numpy as np
from sklearn.metrics import roc_auc_score


## Modélisation du réseau bayésien dynamique à partir du modèle du domaine

Exporting the domain from dataset

### Defining the domain

In [2]:
# we define the KCs
KC_A = KnowledgeComponent(55594, "A")
KC_B = KnowledgeComponent(55596, "B")

A_2_B = Link(source=KC_A, target=KC_B)

domain = Domain([KC_A, KC_B], [A_2_B])

params = {"slip": .01, "guess":.01}

# there are 5 exercises corresponding to KC A
ex_A_1 = Exercise(240521, KC_A, "qcm", ex_content="", params=params)
ex_A_2 = Exercise(240522, KC_A, "qcm", ex_content="", params=params)
ex_A_3 = Exercise(240523, KC_A, "qcm", ex_content="", params=params)
ex_A_4 = Exercise(240524, KC_A, "qcm", ex_content="", params=params)
ex_A_5 = Exercise(240525, KC_A, "qcm", ex_content="", params=params)

# there are also 5 exercises corresponding to KC B
ex_B_1 = Exercise(240526, KC_B, "qcm", ex_content="", params=params)
ex_B_2 = Exercise(240527, KC_B, "qcm", ex_content="", params=params)
ex_B_3 = Exercise(240528, KC_B, "qcm", ex_content="", params=params)
ex_B_4 = Exercise(240529, KC_B, "qcm", ex_content="", params=params)
ex_B_5 = Exercise(240530, KC_B, "qcm", ex_content="", params=params)

## Modélisation de l'apprentissage de l'apprenant à partir d'ajout de preuves au modèle du domaine

In [3]:
import pandas as pd
# default params

strong_linking = {KC_A:{KC_B: 'strong'}, KC_B:{KC_A: 'strong'}}
learner_pool_strong = LearnerPool(domain, strong_linking, 'strong')

weak_linking = {KC_A:{KC_B: 'weak'}, KC_B:{KC_A: 'weak'}}
learner_pool_weak = LearnerPool(domain, weak_linking, 'weak')

no_linking = {KC_A:{KC_B: None}, KC_B:{KC_A: None}}
learner_pool_non_existing = LearnerPool(domain, no_linking, 'not existing')

evaluation_df = pd.read_csv('/Users/olivier/PycharmProjects/bayesian-kst/data/2KC_data_example.csv')
print(evaluation_df)

learners = []
student_ids = pd.unique(evaluation_df['user_id'])
for stud_id in student_ids:
    learners.append(Learner(stud_id, None))
    
evaluation_df.sort_values(by=['user_id'])

evaluations, stud_evals = [], []
stud = evaluation_df.iloc[0]['user_id']
for i, row in evaluation_df.iterrows():
    if row['user_id'] != stud:
        evaluations.append(stud_evals)
        stud_evals = []
        stud = row["user_id"]
    else:
        kc = KC_A if row['doc_id'] == 55594 else KC_B
        stud_evals.append((kc, int(row["success"])))

      success  doc_id  question_id  user_id            createdAt
0           1   55596       240526  3827148  2021-06-08 11:59:29
1           1   55596       240526  3827148  2021-06-08 11:59:29
2           1   55596       240526  3827148  2021-06-08 11:59:29
3           0   55596       240526  3827148  2021-06-08 11:59:29
4           1   55596       240526  3827148  2021-06-08 11:59:29
...       ...     ...          ...      ...                  ...
3140        1   55594       240521  1034821  2020-10-12 21:27:56
3141        1   55594       240522  1034821  2020-10-12 21:27:56
3142        1   55594       240523  1034821  2020-10-12 21:27:56
3143        1   55594       240524  1034821  2020-10-12 21:27:56
3144        1   55594       240525  1034821  2020-10-12 21:27:56

[3145 rows x 5 columns]


In [4]:
def compute_likelihood(predicted, expected):
    score=1
    for i in range(len(predicted)):
        if expected[i] == 1:
            score = score * predicted[i]
        else:
            score = score * (1-predicted[i])
    return -np.log(score)

In [5]:

best_score, best_link_status = 1000000, 0

y_plt = []

# for link_possibility in link_a_to_b:
res = {}

for i in range(len(learners)):
    if evaluations[i]: 
        student = learners[i]
        # for every student, we search for the one that corresponds the most to stereotypes
        stud_score, selected_stereotype = 1000000, 0
        for learner_pool in [learner_pool_strong, learner_pool_weak, learner_pool_non_existing]:
            student.change_learner_pool(learner_pool)
            exp_vals = [int(evaluations[i][j][1]) for j in range(len(evaluations[i]))]
            pred_vals = student.predict_answers(evaluations[i]) 
            temp_score = compute_likelihood(pred_vals, exp_vals)
            if temp_score < stud_score:
                stud_score = temp_score
                selected_stereotype = learner_pool
        res[student.id] = {'learner_pool': selected_stereotype.desc, 'score': stud_score}

res_df = pd.DataFrame.from_dict(res,orient='index')


In [6]:
res_df.head(10)

Unnamed: 0,learner_pool,score
3827148,strong,34.062546
3305118,strong,5.227734
3028687,strong,0.463867
3558036,not existing,5.219241
2907259,strong,0.463867
3798348,weak,11.963312
3574744,weak,14.80363
3775207,not existing,2.577224
3739049,not existing,2.577224
2920058,not existing,2.577224


# Experiment 2
We suppose here that all learners belong in the same learner pool and we search for the best status of the link strength (between strong, weak and not existing). 

In [7]:
best_score, best_link_status = 1000000, 0

y_plt = []

# for link_possibility in link_a_to_b:
res = {}
for learner_pool in [learner_pool_strong, learner_pool_weak, learner_pool_non_existing]:
    pool_score = 0
    for i in range(len(learners)):
        if evaluations[i]: 
            student = learners[i]
            # for every student, we search for the one that corresponds the most to stereotypes
            stud_score, selected_stereotype = 1000000, 0
            student.change_learner_pool(learner_pool)
            exp_vals = [int(evaluations[i][j][1]) for j in range(len(evaluations[i]))]
            pred_vals = student.predict_answers(evaluations[i]) 
            pool_score += compute_likelihood(pred_vals, exp_vals)
    print(learner_pool.desc, pool_score)
    if pool_score < best_score:
        best_score = pool_score
        selected_stereotype = learner_pool
print(selected_stereotype)


strong 651.419498238151
weak 650.6033794375064
not existing 659.0845658548253
0
