In [2]:
import os
import sys
import torch
import pandas as pd
import torch.nn as nn
sys.path.append('../..')
from Model.helper import *
from Config import Config
from sklearn.metrics import roc_curve
from sklearn.metrics import brier_score_loss
sys.path.append(os.path.join(os.getcwd(), '../../Data'))
from Data import *
from choosedataset import *
from torch.utils.data import Dataset
from sklearn.metrics import confusion_matrix
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = Config()
data = [Codeworkout, Falcon][1]()
df = data.df
padding_size_code = 30
loss_func = False
df['num_snapshots'] = df['prev_tasks'].apply(lambda x: [len(i) for i in x])

10 False similarity: False
[100.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]
None


In [43]:
all_future_q = set()
for i in df['new_task_id']:
    all_future_q.add(i)

all_prev_q = set()
for i in df['prev_tasks_id']:
    all_prev_q = all_prev_q.union(set(i))
all_problems = all_future_q.union(all_prev_q)
vocab = {name: idx for idx, name in enumerate(all_problems)}

In [44]:
class SAKTDatasetFromDataFrame(Dataset):
    def __init__(self, df, text_tokenizer, max_len_code=768, padding_size_code=100, padding_size_q=30):
        self.df = df.reset_index(drop=True)
        self.seq_len = padding_size_code
        self.vocab = text_tokenizer
        self.question_num = len(text_tokenizer)
        print(self.question_num)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]

        # Extract relevant data
        prev_q = row['prev_tasks_id']
        prev_a = row['prev_labels']
        target_q = row['new_task_id']
        label = row['Label']

        # Convert to SAKT input format (q_id if correct, q_id + QUESTION_NUM if incorrect)
        input_seq = []
        for q, a in zip(prev_q, prev_a):
            q = self.vocab[q]
            a = bool(a)
            input_seq.append(q if a else q + self.question_num)

        # Padding/truncation
        if len(input_seq) >= self.seq_len:
            input_seq = input_seq[-self.seq_len:]
        else:
            input_seq = [0] * (self.seq_len - len(input_seq)) + input_seq

        return {
            'label': torch.tensor([int(label)]).long(),
            'input': torch.tensor(input_seq).long(),
            'target_id': torch.tensor(self.vocab[target_q]).long()
        }

In [45]:
class SAKTDatasetAttemptFromDataFrame(Dataset):
    def __init__(self, df, text_tokenizer, max_len_code=768, padding_size_code=100, padding_size_q=30):
        self.df = df.reset_index(drop=True)
        self.seq_len = padding_size_code
        self.vocab = text_tokenizer
        self.question_num = len(text_tokenizer)
        print(self.question_num)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]

        # Extract relevant data
        prev_q = row['prev_tasks_id']
        prev_a = row['prev_labels']
        target_q = row['new_task_id']
        label = row['Label']
        num_snapshots = row['num_snapshots']

        # Convert to SAKT input format (q_id if correct, q_id + QUESTION_NUM if incorrect)
        input_seq = []
        for i, (q, a) in enumerate(zip(prev_q, prev_a)):
            q = self.vocab[q]
            a = bool(a)
            for j in range(num_snapshots[i] - 1):
                input_seq.append(q + self.question_num)
            input_seq.append(q if a else q + self.question_num)

        # Padding/truncation
        if len(input_seq) >= self.seq_len:
            input_seq = input_seq[-self.seq_len:]
        else:
            input_seq = [0] * (self.seq_len - len(input_seq)) + input_seq

        return {
            'label': torch.tensor([int(label)]).long(),
            'input': torch.tensor(input_seq).long(),
            'target_id': torch.tensor(self.vocab[target_q]).long()
        }

In [46]:
"""
Based on Annotated Transformer from Harvard NLP:
https://nlp.seas.harvard.edu/2018/04/03/attention.html#applications-of-attention-in-our-model
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

def get_pad_mask(seq, pad_idx):
    return (seq != pad_idx).unsqueeze(-2)


def get_subsequent_mask(seq):
    ''' For masking out the subsequent info. '''
    sz_b, len_s = seq.size()
    subsequent_mask = (1 - torch.triu(torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
    return subsequent_mask


def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model, bias=False), 4) # Q, K, V, last
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
            .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)


class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))


class SAKTLayer(nn.Module):
    """
    Single Encoder block of SAKT
    """
    def __init__(self, hidden_dim, num_head, dropout):
        super().__init__()
        self._self_attn = MultiHeadedAttention(num_head, hidden_dim, dropout)
        self._ffn = PositionwiseFeedForward(hidden_dim, hidden_dim, dropout)
        self._layernorms = clones(nn.LayerNorm(hidden_dim, eps=1e-6), 2)

    def forward(self, query, key, mask=None):
        """
        query: question embeddings
        key: interaction embeddings
        """
        # self-attention block
        output = self._self_attn(query=query, key=key, value=key, mask=mask)
        output = self._layernorms[0](key + output)
        # feed-forward block
        output = self._layernorms[1](output + self._ffn(output))
        return output


class SAKT(nn.Module):
    """
    Transformer-based
    all hidden dimensions (d_k, d_v, ...) are the same as hidden_dim
    """
    def __init__(self, hidden_dim, question_num, num_layers, num_head, dropout):
        super().__init__()
        self._hidden_dim = hidden_dim
        self._question_num = question_num

        # Blocks
        self._layers = clones(SAKTLayer(hidden_dim, num_head, dropout), num_layers)

        # prediction layer
        self._prediction = nn.Linear(hidden_dim, 1)

        # Embedding layers
        self._positional_embedding = nn.Embedding(padding_size_code+1, hidden_dim, padding_idx=0)
        self._interaction_embedding = nn.Embedding(2*question_num+1, hidden_dim, padding_idx=0)
        self._question_embedding = nn.Embedding(question_num+1, hidden_dim, padding_idx=0)

    def _transform_interaction_to_question_id(self, interaction):
        """
        get question_id from interaction index
        if interaction index is a number in [0, question_num], then leave it as-is
        if interaction index is bigger than question_num (in [question_num + 1, 2 * question_num]
        then subtract question_num
        interaction: integer tensor of shape (batch_size, sequence_size)
        """
        return interaction - self._question_num * (interaction > self._question_num).long()

    def _get_position_index(self, question_id):
        """
        [0, 0, 0, 4, 12] -> [0, 0, 0, 1, 2]
        """
        batch_size = question_id.shape[0]
        position_indices = []
        for i in range(batch_size):
            non_padding_num = (question_id[i] != 0).sum(-1).item()
            position_index = [0] * (padding_size_code - non_padding_num) + list(range(1, non_padding_num+1))
            position_indices.append(position_index)
        return torch.tensor(position_indices, dtype=int).to(question_id.device)

    def forward(self, interaction_id, target_id):
        """
        Query: Question (skill, exercise, ...) embedding
        Key, Value: Interaction embedding + positional embedding
        """
        question_id = self._transform_interaction_to_question_id(interaction_id)
        question_id = torch.cat([question_id[:, 1:], target_id.unsqueeze(1)], dim=-1)
        interaction_vector = self._interaction_embedding(interaction_id)
        question_vector = self._question_embedding(question_id)
        position_index = self._get_position_index(question_id)
        position_vector = self._positional_embedding(position_index)

        mask = get_pad_mask(question_id, 0) & get_subsequent_mask(question_id)
        x = interaction_vector + position_vector

        for layer in self._layers:
            x = layer(query=question_vector, key=x, mask=mask)

        output = self._prediction(x)
        output = output[:, -1, :]
        return output

In [47]:
class DKT(nn.Module):
    """
    LSTM based model
    """
    def __init__(self, input_dim, hidden_dim, num_layers, question_num, dropout):
        super().__init__()
        self._hidden_dim = hidden_dim
        self._num_layers = num_layers
        self._lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self._encoder = nn.Embedding(num_embeddings=2*question_num+1, embedding_dim=input_dim, padding_idx=0)
        self._decoder = nn.Linear(hidden_dim, question_num)

    def init_hidden(self, batch_size):
        """
        initialize hidden layer as zero tensor
        batch_size: single integer
        """
        weight = next(self.parameters())
        return (weight.new_zeros(self._num_layers, batch_size, self._hidden_dim),
                weight.new_zeros(self._num_layers, batch_size, self._hidden_dim))

    def forward(self, input, target_id):
        """
        get model output (before taking sigmoid) for target_id
        input: (batch_size, sequence_size)
        target_id: (batch_size)
        return output, a tensor of shape (batch_size, 1)
        """
        batch_size = input.shape[0]
        hidden = self.init_hidden(batch_size)
        input = self._encoder(input)
        output, _ = self._lstm(input, (hidden[0].detach(), hidden[1].detach()))
        output = self._decoder(output[:, -1, :])
        output = torch.gather(output, -1, target_id.unsqueeze(1))
        return output

# Start

In [48]:
def caculate_1loss(batch, model, device, criterion, eval=False):
    dict_batch = {k: v.to(device) for k, v in batch.items()}
    model_params = {k: v for k, v in dict_batch.items() if k != 'label'}
    logits = model(*model_params.values())
    label = dict_batch['label'].float()
    if not criterion:
        return logits, label
    return criterion(logits, label)

In [49]:
model = SAKT(100, len(vocab), 1, 5, 0.2)
# model = DKT(100, 100, 1, len(vocab), 0.2)
caculate_func = caculate_1loss
criterion = nn.BCEWithLogitsLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=1e-4)

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Split test, val, train

In [22]:
train_dataloader, valid_dataloader, test_dataloader = create_data_loader(df, SAKTDatasetFromDataFrame, padding_size_code=padding_size_code, 
                                                                         text_tokenizer=vocab, batch_size=config.batch_size
                                                                         , create_split=False, ids_filepath_prefix='/home/nogaschw/Codeworkout/UseData/falcon/split_ids_')

Load existing splitting
408
408
408


In [24]:
print(len(train_dataloader), len(valid_dataloader), len(test_dataloader), flush=True)
print(train_dataloader.dataset.df['Label'].value_counts())
print(valid_dataloader.dataset.df['Label'].value_counts())
print(test_dataloader.dataset.df['Label'].value_counts())
print(len(set(train_dataloader.dataset.df['student_id'])), len(set(valid_dataloader.dataset.df['student_id'])), len(set(test_dataloader.dataset.df['student_id'])))

559 81 164
Label
False    10813
True      7053
Name: count, dtype: int64
Label
False    1511
True     1054
Name: count, dtype: int64
Label
False    3244
True     2003
Name: count, dtype: int64
930 132 267


In [25]:
model = model.to(device)
model = training_loop(model=model, train_dataloader=train_dataloader, test_dataloader=valid_dataloader, 
                      optimizer=optimizer, criterion=criterion, device=device, name='a', caculate_func=caculate_func, use_wandb=False)

18/04/2025_20:10:20
559 81
Epoch: 0
Batch 0 from 559
Batch 100 from 559
Batch 200 from 559
Batch 300 from 559
Batch 400 from 559
Batch 500 from 559
Test Batch 0 from 81
Epoch [1], LR: 0.000100, Loss: 0.5842, Val Loss: 0.6021, patience: 5
success deep copy
success save in a
Epoch: 1
Batch 0 from 559
Batch 100 from 559
Batch 200 from 559
Batch 300 from 559
Batch 400 from 559
Batch 500 from 559
Test Batch 0 from 81
Epoch [2], LR: 0.000100, Loss: 0.5777, Val Loss: 0.6017, patience: 5
success deep copy
success save in a
Epoch: 2
Batch 0 from 559
Batch 100 from 559
Batch 200 from 559
Batch 300 from 559
Batch 400 from 559
Batch 500 from 559
Test Batch 0 from 81
Epoch [3], LR: 0.000100, Loss: 0.5728, Val Loss: 0.6028, patience: 5
Epoch: 3
Batch 0 from 559
Batch 100 from 559
Batch 200 from 559
Batch 300 from 559
Batch 400 from 559
Batch 500 from 559
Test Batch 0 from 81
Epoch [4], LR: 0.000100, Loss: 0.5672, Val Loss: 0.6053, patience: 4
Epoch: 4
Batch 0 from 559
Batch 100 from 559
Batch 200 fr

In [26]:
def results(threshold, y_true, y_prob):
    y_prob = np.array(y_prob)
    y_true = np.array(y_true)
    y_pred = np.where(y_prob > threshold, 1, 0)
    roc_auc = roc_auc_score(y_true, y_prob)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    best = "best"
    if threshold == 0.5:
        best = "0.5"
    #  df = pd.concat([pd.DataFrame([[model_name, threshold, roc_auc, accuracy, precision, recall, f1]], columns=df.columns), df], ignore_index=True)
    print({"threshold": threshold, "roc_auc": roc_auc, "accuracy": accuracy, f"precision_{best}": precision, f"recall_{best}": recall, f"f1_{best}": f1})
    cm = confusion_matrix(y_true, y_pred)
    print(cm)

In [27]:
all_labels, all_probs = eval_loop(model, valid_dataloader, device, caculate_func=caculate_func)

fpr, tpr, thresholds = roc_curve(all_labels, all_probs)
J = tpr - fpr
best_index = J.argmax()

y_labels, y_probs = eval_loop(model, test_dataloader, device, caculate_func=caculate_func)
results(0.5, y_labels, y_probs)

Test Batch 0 from 81


Test Batch 0 from 164
Test Batch 100 from 164
{'threshold': 0.5, 'roc_auc': np.float64(0.7367446056562506), 'accuracy': 0.7032590051457976, 'precision_0.5': np.float64(0.6519073569482289), 'recall_0.5': np.float64(0.4777833250124813), 'f1_0.5': np.float64(0.5514261019878998)}
[[2733  511]
 [1046  957]]


In [17]:
y_pred = np.where(y_probs > 0.5, 1, 0)

In [18]:
df_results = pd.DataFrame({
    'y_true': y_labels.squeeze(1),
    'y_probs': y_probs.squeeze(1),
    'y_pred': y_pred.squeeze(1)
})

In [19]:
df_results.to_csv('results.csv')

In [1]:
sum([2733 , 511,1046 , 957])

5247

In [None]:
y_pred

(2384, 1)

# 5 - fold

In [50]:
data_loaders = create_data_loader_k_fold(df, SAKTDatasetFromDataFrame, vocab, batch_size=config.batch_size, padding_size_code=30)

408
408
408
408
408
408
408
408
408
408


In [51]:
def num_of(train_dataloader, test_dataloader):
    print(len(train_dataloader), len(test_dataloader))
    print(len(set(train_dataloader.dataset.df['student_id'])), len(set(test_dataloader.dataset.df['student_id'])))
    print(set(train_dataloader.dataset.df['student_id']).intersection(set(test_dataloader.dataset.df['student_id'])))
    print(train_dataloader.dataset.df.Label.value_counts(normalize=True))
    print(test_dataloader.dataset.df.Label.value_counts(normalize=True))

for train, test in data_loaders:
    num_of(train, test)

642 162
1064 266
set()
Label
False    0.610281
True     0.389719
Name: proportion, dtype: float64
Label
False    0.591271
True     0.408729
Name: proportion, dtype: float64
642 162
1064 266
set()
Label
False    0.607456
True     0.392544
Name: proportion, dtype: float64
Label
False    0.602549
True     0.397451
Name: proportion, dtype: float64
642 162
1064 266
set()
Label
False    0.6049
True     0.3951
Name: proportion, dtype: float64
Label
False    0.612691
True     0.387309
Name: proportion, dtype: float64
643 162
1064 266
set()
Label
False    0.606084
True     0.393916
Name: proportion, dtype: float64
Label
False    0.607995
True     0.392005
Name: proportion, dtype: float64
646 158
1064 266
set()
Label
False    0.603631
True     0.396369
Name: proportion, dtype: float64
Label
False    0.618084
True     0.381916
Name: proportion, dtype: float64


In [53]:
fold_results = {'ROC-AUC' : [], 'f1' : [], 'recall': [], "precision": [], 'calibration': [], 'f1-0.5': [], 'recall-0.5': [], 'precision-0.5': []}

for fold, (train_dataloader, test_dataloader) in enumerate(data_loaders):
    print(f"Fold {fold + 1}:")    # Prepare data for current fold
    m = SAKT(100, len(vocab), 1, 5, 0.2)
    loss_fn = None
    optimizer = torch.optim.Adam(m.parameters(), lr=config.lr, weight_decay=1e-4)

    m = m.to(device)
    print(m)
    # Training Loop
    for epoch in range(config.epoch):
        total_loss = train_loop(m, train_dataloader, device, optimizer, criterion, caculate_func)

        # Optional: Print metrics every few epochs
        if epoch % 10 == 0:
            print(f"Fold {fold + 1}, Epoch {epoch}: Loss = {total_loss / len(train_dataloader)}")

    y_labels, y_probs = eval_loop(m, test_dataloader, device, caculate_func=caculate_func)
    y_prob = np.array(y_probs)
    y_true = np.array(y_labels)
    y_pred = np.where(y_prob > 0.4, 1, 0)

    fold_results['ROC-AUC'].append(roc_auc_score(y_true, y_prob))
    fold_results['calibration'].append(brier_score_loss(y_true, y_prob))
    fold_results['precision'].append(precision_score(y_true, y_pred))
    fold_results['recall'].append(recall_score(y_true, y_pred))
    fold_results['f1'].append(f1_score(y_true, y_pred))

    y_pred = np.where(y_prob > 0.5, 1, 0)
    fold_results['precision-0.5'].append(precision_score(y_true, y_pred))
    fold_results['recall-0.5'].append(recall_score(y_true, y_pred))
    fold_results['f1-0.5'].append(f1_score(y_true, y_pred))

Fold 1:
SAKT(
  (_layers): ModuleList(
    (0): SAKTLayer(
      (_self_attn): MultiHeadedAttention(
        (linears): ModuleList(
          (0-3): 4 x Linear(in_features=100, out_features=100, bias=False)
        )
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (_ffn): PositionwiseFeedForward(
        (w_1): Linear(in_features=100, out_features=100, bias=True)
        (w_2): Linear(in_features=100, out_features=100, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (_layernorms): ModuleList(
        (0-1): 2 x LayerNorm((100,), eps=1e-06, elementwise_affine=True)
      )
    )
  )
  (_prediction): Linear(in_features=100, out_features=1, bias=True)
  (_positional_embedding): Embedding(31, 100, padding_idx=0)
  (_interaction_embedding): Embedding(817, 100, padding_idx=0)
  (_question_embedding): Embedding(409, 100, padding_idx=0)
)
Batch 0 from 642
Batch 100 from 642
Batch 200 from 642
Batch 300 from 642
Batch 400 from 642
Batch 500 from 642
Batc

### Falcon paper

In [54]:
avg_results = {metric: np.mean(vals) for metric, vals in fold_results.items()}
print("Average Fold Results:")
for metric, avg in avg_results.items():
    print(f"{metric}: {avg:.4f}")

Average Fold Results:
ROC-AUC: 0.7078
f1: 0.5816
recall: 0.6056
precision: 0.5605
calibration: 0.2082
f1-0.5: 0.5317
recall-0.5: 0.4657
precision-0.5: 0.6214


In [55]:
fold_results

{'ROC-AUC': [np.float64(0.7150437953976683),
  np.float64(0.7129646501457727),
  np.float64(0.697864655079685),
  np.float64(0.708057629893216),
  np.float64(0.7049015365108301)],
 'f1': [0.5946960807322225,
  0.5815230365242301,
  0.5612648221343873,
  0.5874970950499652,
  0.582881395897194],
 'recall': [0.6013289036544851,
  0.5918367346938775,
  0.5674325674325674,
  0.6257425742574257,
  0.6417445482866043],
 'precision': [0.5882079851439183,
  0.5715626466447677,
  0.555229716520039,
  0.5536574682435392,
  0.5339092872570195],
 'calibration': [np.float64(0.20792951829272677),
  np.float64(0.20687835195458407),
  np.float64(0.2086698211655877),
  np.float64(0.20839414344421903),
  np.float64(0.20937398016348213)],
 'f1-0.5': [0.5377410468319559,
  0.5332955511476339,
  0.5070671378091873,
  0.5348314606741573,
  0.5455561766349916],
 'recall-0.5': [0.4632178452776459,
  0.45724003887269193,
  0.43006993006993005,
  0.47128712871287126,
  0.5067497403946002],
 'precision-0.5': [0.

### CW paper

In [41]:
avg_results = {metric: np.mean(vals) for metric, vals in fold_results.items()}
print("Average Fold Results:")
for metric, avg in avg_results.items():
    print(f"{metric}: {avg:.4f}")

Average Fold Results:
ROC-AUC: 0.7620
f1: 0.5263
recall: 0.6500
precision: 0.4431
calibration: 0.1529
f1-0.5: 0.4214
recall-0.5: 0.3332
precision-0.5: 0.5778


In [40]:
fold_results

{'ROC-AUC': [np.float64(0.7740033688938798),
  np.float64(0.766852629855958),
  np.float64(0.7290721755962397),
  np.float64(0.7838333531999034),
  np.float64(0.7561909398720871)],
 'f1': [0.5734177215189873,
  0.5206489675516224,
  0.5187032418952618,
  0.5317750182615048,
  0.48689771766694845],
 'recall': [0.7167721518987342,
  0.6383363471971067,
  0.6274509803921569,
  0.7040618955512572,
  0.5636007827788649],
 'precision': [0.4778481012658228,
  0.4396014943960149,
  0.44208289054197664,
  0.4272300469483568,
  0.42857142857142855],
 'calibration': [np.float64(0.15695245200834182),
  np.float64(0.14797644107157734),
  np.float64(0.17756763596985),
  np.float64(0.13690343877733022),
  np.float64(0.14516276947394102)],
 'f1-0.5': [0.47092469018112487,
  0.43132803632236094,
  0.3800813008130081,
  0.431980906921241,
  0.392811296534018],
 'recall-0.5': [0.39082278481012656,
  0.3435804701627486,
  0.28205128205128205,
  0.35009671179883944,
  0.299412915851272],
 'precision-0.5': 

### CW

#### without attempts

In [20]:
np.mean([np.float64(0.7687714553763709),
  np.float64(0.7627186350314897),
  np.float64(0.7411323079772276),
  np.float64(0.7726746847475827),
  np.float64(0.7605864307791853)])

np.float64(0.7611767027823713)

In [22]:
np.mean([np.float64(0.7535154834078422),
  np.float64(0.7587025316455696),
  np.float64(0.7116689015084738),
  np.float64(0.7471109165634106),
  np.float64(0.7500534814585316)])

np.float64(0.7442102629167655)

##### DKT

In [23]:
np.mean([np.float64(0.7649885571326023),
  np.float64(0.7639131383675252),
  np.float64(0.7478029991398976),
  np.float64(0.7896520879431288),
  np.float64(0.7630081133743692)])

np.float64(0.7658729791915045)

In [28]:
np.mean([np.float64(0.7495451282525106),
  np.float64(0.7461553750701503),
  np.float64(0.7324244383067913),
  np.float64(0.7507600215159963),
  np.float64(0.7284490798554578)])

np.float64(0.7414668086001812)

### Falcon

#### Without attempt

In [4]:
np.mean([np.float64(0.7097928165956),
np.float64(0.698991739552964),
np.float64(0.7030397767688579),
np.float64(0.7021694955962242),
np.float64(0.7122294548603094)])

np.float64(0.7052446566747911)

In [19]:
np.mean([np.float64(0.6943674191888805),
  np.float64(0.6967422316912113),
  np.float64(0.6914856535949052),
  np.float64(0.7075981329380943),
  np.float64(0.6930449739495101)])

np.float64(0.6966476822725202)

##### DKT

In [20]:
np.mean([np.float64(0.7123435255808972),
  np.float64(0.7005605080860182),
  np.float64(0.6949550291830051),
  np.float64(0.7020821153293113),
  np.float64(0.7046139466983556)])

np.float64(0.7029110249755174)

In [25]:
np.mean([np.float64(0.7222091684137489),
  np.float64(0.7216677879943186),
  np.float64(0.7136651633809828),
  np.float64(0.7179334171846805),
  np.float64(0.7110996674852108)])

np.float64(0.7173150408917884)