# Closing bracket: overall prediction performance
In this notebook, we focus on the "closing bracket" domain knowledge structure. We study the overall prediction performance and how the prerequisite links between KCs influence it. In contrast to the transition prediction performance, which measures the capacity of our model to predict a learner's success on an exercise when the learner has just completed an exercise on another KC, we evaluate the model here on every learner interaction trace.

In [1]:
import sys
sys.path.append("/Users/olivier/PycharmProjects/bayesian-kst/")  # for mac

import pyAgrum as gum
from kgraph.expert_layer.domain import Domain
from kgraph.expert_layer.knowledge_components import KnowledgeComponent
from kgraph.expert_layer.link import Link
from kgraph.resources_layer.exercise import Exercise
from kgraph.learner_layer.evaluation import LearnerTrace
from kgraph.learner_layer.learner import Learner
from kgraph.learner_layer.learner_pool import LearnerPool
from kgraph.helpers.truthtable import truthtable
from math import floor
import pyAgrum.lib.notebook as gnb
import pyAgrum.lib.dynamicBN as gdyn
import random
import itertools
import numpy as np
import tqdm
import sklearn.metrics as sk_metrics
from sklearn.model_selection import KFold
import pandas as pd

In [2]:
def setup_domain_and_resources_from_dataset(dataset):
    """
    Build the domain and the exercise list described by an interaction dataset.

    One KnowledgeComponent is created per distinct ``kd_id`` (named after the ``kd_name``
    column when it exists, otherwise after the id itself) and one Exercise per distinct
    ``kae_id``, constructed with the knowledge component of the first row it appears in.

    :param dataset: a pandas DataFrame, or the path of a CSV file readable by ``pd.read_csv``
    :return: ``(domain, exercises)`` -- a Domain built from the knowledge components and the
        list of Exercise objects, both in order of first appearance in the dataset
    """
    assert isinstance(dataset, (str, pd.DataFrame)), "dataset must be str or dataframe"
    df = dataset if isinstance(dataset, pd.DataFrame) else pd.read_csv(dataset)
    has_kc_name = 'kd_name' in df.columns

    knowledge_components, exercises = [], []
    # Lookups keyed on the ids stored by the created objects: constant-time membership tests
    # instead of rebuilding the id lists for every row.
    kc_by_id = {}
    known_exercise_ids = set()
    for _, row in df.iterrows():
        kc_id = row["kd_id"]
        if kc_id not in kc_by_id:
            kc_name = row['kd_name'] if has_kc_name else kc_id
            kc = KnowledgeComponent(kc_id, kc_name)
            knowledge_components.append(kc)
            # setdefault keeps the first component registered under a given id
            kc_by_id.setdefault(kc.id, kc)
        else:
            kc = kc_by_id[kc_id]
        exercise_id = row['kae_id']
        if exercise_id not in known_exercise_ids:
            exercise = Exercise(exercise_id, kc)
            exercises.append(exercise)
            known_exercise_ids.add(exercise.id)
    domain = Domain(knowledge_components)
    return domain, exercises

def deduce_learner_traces_from_dataset(dataset, domain, exercises, learner_pool):
    """
    Turn an interaction dataset into one list of LearnerTrace objects per learner.

    Learners are created in increasing ``user_id`` order (``np.unique`` sorts the ids); the
    traces of a learner follow the row order of the dataset.

    :param dataset: a pandas DataFrame, or the path of a CSV file readable by ``pd.read_csv``;
        the ``user_id``, ``kae_id`` and ``uea_success`` columns are read
    :param domain: unused here, kept for interface compatibility
    :param exercises: Exercise objects, matched to the rows through their ``id``
    :param learner_pool: pool handed to every Learner that is created
    :return: dict mapping each Learner to the list of its LearnerTrace objects
    :raises ValueError: if a row references a ``kae_id`` that matches none of ``exercises``
    """
    assert isinstance(dataset, (str, pd.DataFrame)), "dataset must be str or dataframe"
    df = dataset if isinstance(dataset, pd.DataFrame) else pd.read_csv(dataset)

    # Index the exercises once; setdefault keeps the first exercise on duplicate ids, which is
    # what a linear search through the list would find.
    exercise_by_id = {}
    for exercise in exercises:
        exercise_by_id.setdefault(exercise.id, exercise)

    # Read the ids from `df`, not from `dataset`: `dataset` may be a file path.
    learners = [Learner(learner_id, learner_pool) for learner_id in np.unique(df['user_id'])]
    learner_traces = {}
    for learner in learners:
        learner_df = df[df['user_id'] == learner.id]
        traces = []
        for exercise_id, success in zip(learner_df['kae_id'], learner_df['uea_success']):
            exercise = exercise_by_id.get(exercise_id)
            if exercise is None:
                # Fail loudly: returning the exception object would only break the caller later.
                raise ValueError(f'exercise is none (no exercise with id {exercise_id!r})')
            traces.append(LearnerTrace(learner, exercise, bool(success)))
        learner_traces[learner] = traces
    return learner_traces
    
def process_dataset(dataset, domain, exercises, learner_pool, c=None):
    """
    Compute observed and predicted outcomes for every interaction of a dataset.

    :param dataset: a pandas DataFrame, or the path of a CSV file readable by ``pd.read_csv``
    :param domain: forwarded to ``deduce_learner_traces_from_dataset``
    :param exercises: Exercise objects referenced by the dataset
    :param learner_pool: pool the learners are created in
    :param c: value passed to ``Learner.predict_sequence`` under the ``'c'`` key of its
        parameter dict; defaults to a fresh empty dict on every call
    :return: ``(exp_vals, pred_vals)`` -- two arrays, concatenated learner by learner, holding
        the observed successes and the outputs of ``predict_sequence``
    """
    assert isinstance(dataset, (str, pd.DataFrame)), "dataset must be str or dataframe"
    df = dataset if isinstance(dataset, pd.DataFrame) else pd.read_csv(dataset)
    if c is None:
        # A literal `{}` default would be one dict shared (and possibly mutated) across calls.
        c = {}

    # Pass the loaded frame on so a CSV path is not read a second time.
    learner_traces = deduce_learner_traces_from_dataset(df, domain, exercises, learner_pool)

    # Collect per-learner chunks and concatenate once at the end. The leading empty float array
    # keeps the float result dtype that concatenating onto an empty list produces.
    exp_chunks = [np.array([], dtype=float)]
    pred_chunks = [np.array([], dtype=float)]
    for learner, traces in learner_traces.items():
        exp_chunks.append([trace.get_success() for trace in traces])
        pred_chunks.append(learner.predict_sequence(traces, 'NoisyOR', {'c': c}))

    return np.concatenate(exp_chunks), np.concatenate(pred_chunks)


## Import the data

In [3]:
# Load the cleaned interaction log; the cells below read its kd_id, kae_id, user_id, uea_id
# and uea_success columns.
df = pd.read_csv("quadratic_equations_cleaned.csv")

In [4]:
# Per knowledge component, count the non-null entries of every column and rank the
# components by their number of recorded outcomes (uea_success), largest first.
max_ids = (
    df.groupby('kd_id', sort=False)
    .count()
    .sort_values(by='uea_success', ascending=False)
)
max_ids.head(10)

Unnamed: 0_level_0,Unnamed: 0,uea_id,uea_answer,uea_success,kae_id,uea_created_at,uea_updated_at,uaa_id,uaa_status,user_id,...,chapter_contains_documents,chapter_display_order,chapter_displayed_on_front,lhc_id,course_id,level_id,teaching_id,lhc_displayed_on_front,schoolyear_id,exercise_duration
kd_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3828,2832,2832,2832,2832,2832,2832,2832,2832,2832,2832,...,2832,2832,2832,2832,2832,2832,2832,2832,2832,2831
52084,2608,2608,2608,2608,2608,2608,2608,2608,2608,2608,...,2608,2608,2608,2608,2608,2608,2608,2608,2608,2608
52089,2197,2197,2197,2197,2197,2197,2197,2197,2197,2197,...,2197,2197,2197,2197,2197,2197,2197,2197,2197,2197
52091,2133,2133,2133,2133,2133,2133,2133,2133,2133,2133,...,2133,2133,2133,2133,2133,2133,2133,2133,2133,2133
52090,2041,2041,2041,2041,2041,2041,2041,2041,2041,2041,...,2041,2041,2041,2041,2041,2041,2041,2041,2041,2041
53667,1986,1986,1986,1986,1986,1986,1986,1986,1986,1986,...,1986,1986,1986,1986,1986,1986,1986,1986,1986,1986
52086,1964,1964,1964,1964,1964,1964,1964,1964,1964,1964,...,1964,1964,1964,1964,1964,1964,1964,1964,1964,1964
52092,1850,1850,1850,1850,1850,1850,1850,1850,1850,1850,...,1850,1850,1850,1850,1850,1850,1850,1850,1850,1849
52085,1557,1557,1557,1557,1557,1557,1557,1557,1557,1557,...,1557,1557,1557,1557,1557,1557,1557,1557,1557,1557
52093,1490,1490,1490,1490,1490,1490,1490,1490,1490,1490,...,1490,1490,1490,1490,1490,1490,1490,1490,1490,1489


In [5]:
from pyBKT.models import Model
import pandas as pd

def get_strongest_folds(full, axis="user_id", nb_folds=5, random_state=None):
    """
    Split the rows of ``full`` into folds that never separate the rows of one ``axis`` value.

    The distinct values of ``full[axis]`` are shuffled into ``nb_folds`` groups with KFold; each
    returned fold holds the index labels of all rows belonging to the values of one group.

    :param full: pandas DataFrame to split
    :param axis: name of the column whose values must stay together (e.g. the learner id)
    :param nb_folds: number of folds
    :param random_state: forwarded to ``KFold``; the default ``None`` keeps the previous
        behaviour (shuffle driven by NumPy's global random state), an int makes the split
        reproducible
    :return: list of ``nb_folds`` numpy arrays of row index labels
    """
    all_elements = full[axis].unique()
    # Row labels of every element, gathered in a single pass. This replaces a string-built
    # `query` per element, which rescanned the frame each time and only worked for values that
    # are valid unquoted literals (numeric ids).
    rows_by_element = full.groupby(axis, sort=False).groups

    kfold = KFold(nb_folds, shuffle=True, random_state=random_state)
    folds = []
    for _, test in kfold.split(all_elements):
        list_of_test_ids = []
        for element_position in test:
            list_of_test_ids.extend(rows_by_element.get(all_elements[element_position], []))
        folds.append(np.array(list_of_test_ids))

    return folds

# Seed NumPy's global generator before splitting: KFold(shuffle=True) without an explicit
# random_state draws from it, so without this the learner split -- and every score computed
# from it -- changes on each run.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Two folds grouped by learner; the first one is held out for testing.
folds = get_strongest_folds(df, "user_id", 2)
test_ids = folds[0]

train_ids = list(set(list(df.index.values)) - set(test_ids))

df_train = df[df.index.isin(train_ids)]
df_test = df[df.index.isin(test_ids)]


# Initialize the model with a fixed seed
model = Model(seed = 42, num_fits = 1)
# Mapping from the column roles pyBKT expects to the column names of this dataset
defaults = {'order_id': 'uea_id',
            'skill_name': 'kd_id',
            'correct': 'uea_success',
            'user_id': 'user_id',
            'multilearn': 'kd_id',
            'multiprior': 'uea_success',
            'multipair': 'kd_id',
            'multigs': 'kae_id',
            'folds': 'user_id'
           }

# Show which knowledge components each split contains
print(np.unique(df_train['kd_id']))
print(np.unique(df_test['kd_id']))
model.fit(data = df_train, defaults = defaults,multigs = True, forgets = False, multilearn = True)
preds_df = model.predict(data=df_test)

# Observed outcomes and predicted success probabilities on the held-out interactions
all_exp = df_test["uea_success"]
all_pred = list(preds_df["correct_predictions"])

[ 3815  3816  3819  3820  3821  3822  3828  3829  3837 50978 50979 50980
 51736 51737 52084 52085 52086 52088 52089 52090 52091 52092 52093 52094
 52095 52096 52097 52098 52099 52101 52102 52104 52105 52106 52107 52108
 52109 52110 52111 52112 52113 52115 52116 52118 52119 52120 52121 52122
 52123 52124 53667 53668 53937 54456 54457]
[ 3815  3816  3819  3820  3821  3822  3828  3829  3837 50978 50979 50980
 51736 51737 52084 52085 52086 52088 52089 52090 52091 52092 52093 52094
 52095 52096 52097 52098 52099 52101 52102 52104 52105 52106 52107 52108
 52109 52110 52111 52112 52113 52115 52116 52118 52119 52120 52121 52122
 52123 52124 53667 53668 53937 54456 54457]


In [6]:
# Accuracy at the 0.5 cut-off, threshold-free ROC AUC, and the best Cohen's kappa over
# 100 evenly spaced cut-offs in [0, 1].
acc = sk_metrics.accuracy_score(all_exp, [int(score > .5) for score in all_pred])
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = max(
    sk_metrics.cohen_kappa_score(
        np.array(all_exp),
        [int(score > threshold) for score in all_pred],
    )
    for threshold in np.linspace(0, 1, 100)
)

print(acc, auc, cohen_kappa)

0.780400421496312 0.7630698759578757 0.3370502979777492


In [11]:
# Share of successful attempts among the held-out interactions (majority-class baseline).
# NOTE(review): relies on all_exp still being df_test["uea_success"]; later cells reassign
# all_exp, and this cell's execution count is out of sequence -- confirm on a fresh run.
print(sum(all_exp)/len(df_test.index))

0.7593782929399367


## Score of the overall prediction performance

### No linking -- should be equivalent to BKT

In [7]:
def set_learning_pool_parameters_from_bkt_parameters(learner_pool, bkt_params):
    """
    Copy fitted BKT parameters into a learner pool.

    For every knowledge component of the pool the prior, learn and forget values are set, and
    for every exercise of that component the guess and slip values.

    :param learner_pool: pool to configure; it is modified in place
    :param bkt_params: parameter table addressed as ``bkt_params.loc[skill, param, class].value``
        (the layout of ``model.params()``), with skill and class labels given as strings
    :return: the same ``learner_pool`` object
    """
    def value_of(skill, parameter, label):
        # Read from the table that was passed in, not from a global model: the function then
        # works with any parameter table and does not rebuild it for every lookup.
        return bkt_params.loc[skill, parameter, label].value

    for kc in learner_pool.get_knowledge_components():
        kc_key = f'{kc.id}'
        learner_pool.set_prior(kc, value_of(kc_key, 'prior', 'default'))

        learner_pool.set_learn(kc, value_of(kc_key, 'learns', kc_key))
        learner_pool.set_forget(kc, value_of(kc_key, 'forgets', kc_key))

        for exercise in kc.get_exercises():
            exercise_key = f'{exercise.id}'
            learner_pool.set_guess(exercise, value_of(kc_key, 'guesses', exercise_key))
            learner_pool.set_slip(exercise, value_of(kc_key, 'slips', exercise_key))
    return learner_pool

In [8]:
# Build the knowledge components and exercises from the held-out interactions only.
# NOTE(review): KCs/exercises that occur only in df_train are therefore not created, and
# an exercise seen only in df_test may have no fitted guess/slip value -- confirm.
domain, exercises = setup_domain_and_resources_from_dataset(df_test)
print(domain)

Domain on 55 KCs:
- KC 3828: Déterminer si un réel est racine d'un trinôme
- KC 52084: Trouver une racine évidente pour un polynôme du second degré
- KC 52112: Résoudre une inéquation du second degré à l'aide du tableau de signes de la fonction polynôme associée
- KC 3820: Effectuer un changement de variable pour retrouver une équation du second degré
- KC 52089: Identifier un polynôme du second degré sous forme développée
- KC 3829: Résoudre une équation irrationnelle
- KC 3815: Déterminer la forme canonique d'un trinôme
- KC 3816: Donner les racines d'un trinôme du second degré
- KC 3819: Factoriser un polynôme de degré 3
- KC 3821: Donner le tableau de signes d'un trinôme du second degré
- KC 3822: Résoudre une inéquation du second degré
- KC 52086: Calculer le discriminant d'un polynôme du second degré donné sous forme développée
- KC 52090: Identifier un polynôme du second degré sous forme factorisée
- KC 52091: Identifier un polynôme du second degré sous forme canonique
- KC 5366

In [9]:
# Learner pool with an empty link-strength mapping (no links between KCs), parameterised
# with the values fitted by the BKT model above.
params = model.params()
learner_pool = LearnerPool(domain, {})
learner_pool = set_learning_pool_parameters_from_bkt_parameters(learner_pool, params)

# NOTE(review): `metrics` is not read by any cell shown here -- confirm it is still needed.
metrics = ['accuracy', 'roc_auc', 'cohen_kappa']

# Overwrites all_exp / all_pred from the BKT baseline with this model's outcomes and predictions.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the run was aborted.
all_exp, all_pred = process_dataset(df_test, domain, exercises, learner_pool)
print(sum(all_pred))

KeyboardInterrupt: 

In [None]:
# Scores of the model without links: accuracy at the 0.5 cut-off, ROC AUC, and the best
# Cohen's kappa over 100 evenly spaced cut-offs in [0, 1].
acc = sk_metrics.accuracy_score(all_exp, [int(score > .5) for score in all_pred])
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = max(
    sk_metrics.cohen_kappa_score(np.array(all_exp), [int(score > threshold) for score in all_pred])
    for threshold in np.linspace(0, 1, 100)
)

print(acc, auc, cohen_kappa)

### Weak linking

In [None]:
# The two knowledge components joined by the weak link.
# NOTE(review): these names describe limits of sequences, while the dataset loaded above is
# about quadratic equations -- confirm that both names exist in `domain`.
complete_limits_kc = domain.get_kc_by_name("Compléter les limites d'une somme de suites dont on connaît la limite")
determine_limit_kc = domain.get_kc_by_name("Déterminer la limite d'une somme de suites dont on connaît la limite")

# Link-strength mapping handed to the learner pool: one 'weak' link between the two KCs.
link_strength = {complete_limits_kc: {determine_limit_kc: 'weak'}}

learner_pool = set_learning_pool_parameters_from_bkt_parameters(
    LearnerPool(domain, link_strength),
    params,
)

# Value forwarded to predict_sequence under the 'c' key, keyed in the opposite direction.
c = {determine_limit_kc: {complete_limits_kc: 1}}

all_exp, all_pred = process_dataset(df_test, domain, exercises, learner_pool, c)
print(sum(all_pred))

In [None]:
# Scores of the weakly linked model: accuracy at the 0.5 cut-off, ROC AUC, and the best
# Cohen's kappa over 100 evenly spaced cut-offs in [0, 1].
acc = sk_metrics.accuracy_score(all_exp, [int(score > .5) for score in all_pred])
auc = sk_metrics.roc_auc_score(all_exp, all_pred)
cohen_kappa = max(
    sk_metrics.cohen_kappa_score(np.array(all_exp), [int(score > threshold) for score in all_pred])
    for threshold in np.linspace(0, 1, 100)
)

print(acc, auc, cohen_kappa)

In [None]:
# List the names of all knowledge components of the domain
print([kc.name for kc in domain.get_knowledge_components()])

import copy

from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt
import numpy as np
    

# Rebuild the per-learner traces of the held-out interactions
learner_traces = deduce_learner_traces_from_dataset(df_test, domain, exercises, learner_pool)

# NOTE(review): presumably searches parameters for the 'NoisyOR' model on these traces --
# confirm against LearnerPool.get_optimized_parameters.
result = learner_pool.get_optimized_parameters(learner_traces, 'NoisyOR')


In [None]:
# Show the `params` attribute of the optimisation result
print(result.params)

In [None]:
# NOTE(review): presumably displays the candidates examined during the optimisation -- confirm
result.show_candidates()