# EVALUATE THE ATTENTION MODEL

## 1. Get the model to be evaluated

In [1]:
import math

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

import attention_models
import better_attention_models

In [19]:
# Checkpoint to evaluate. Alternative used earlier: "attention_data/model_2_train_76_val_68".
MODEL_PATH = "model_train_70_val_68"
max_sequence_length = 283

# NOTE: this cell reads `voc` and `device`, which are created by cells further
# down in the notebook; those cells must have been run first.
# NOTE(review): the hyper-parameters below presumably have to match the ones
# used when the checkpoint was trained — confirm against the training notebook.
model = better_attention_models.MyAttentionModelWithPoolingBatchNormSkip(
    vocab_size=len(voc),
    embedding_dim=64,
    max_seq_len=max_sequence_length,
    num_heads=4,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.5,
).to(device)
# Alternative architecture kept for reference:
#attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkipCORRECTEDwithMaskOnPaddingBothWaysTransposed(vocab_size=len(voc), embedding_dim=8, max_seq_len=max_sequence_length, num_heads=2, dim_feedforward=8, num_layers=1, dropout=0.1)

# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

<All keys matched successfully>

## 2. Get the data to evaluate on

In [9]:
#import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import help_functions

In [1]:
def load_data_from_file():
    """Interactively ask whether to load a CSV file and, if so, read it.

    The user is prompted twice on stdin: first for a yes/no answer, then
    (only after a "y") for the filename.

    Returns:
        The result of ``pd.read_csv(filename)`` when the user answers "y",
        otherwise ``None``.
    """
    should_load = input("Do you wish to load data from a file? (y/n): ")
    # Normalise the answer so that "Y" or " y " is not silently treated as "no".
    if should_load.strip().lower() != "y":
        return None

    filename = input("Specify the filename to load from: ").strip()
    return pd.read_csv(filename)

In [5]:
def load_pickle_data(filename):
    """Open `filename` in binary mode and return the object unpickled from it."""
    # NOTE(review): unpickling can execute arbitrary code; only use on trusted files.
    with open(filename, "rb") as handle:
        unpickled = pickle.load(handle)
    return unpickled

In [34]:
# Prompts on stdin for a CSV path; `data` ends up as None if the answer is not "y".
data = load_data_from_file()

In [18]:
# Load the pickled vocabulary object; other cells call `voc.encode`,
# `voc.get_pad_idx` and `len(voc)` on it.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
voc = load_pickle_data("/home/lovhag/storage/data/attention_data/_voc.pickle")

In [35]:
# Split a doubled index range with a fixed seed, then keep only the even
# indices and halve them to obtain row positions in `data`.
# NOTE(review): presumably this reproduces a split made at training time over
# a dataset holding two examples per row — TODO confirm.
_, test_indices = train_test_split(range(2*len(data)), test_size=0.33, random_state=42)
# Floor division keeps the positions as ints; `index/2` yields floats
# (e.g. 3.0), while positional indexing with `.iloc` expects integers.
test_indices = [index // 2 for index in test_indices if index % 2 == 0]

In [45]:
# Select the held-out rows by position; the copy makes `test_data` independent of `data`.
test_data = data.iloc[test_indices].copy()
test_data.head()

Unnamed: 0,text,lemma,word_pos,sense_key,lemmatized_text,sensed_lemma,sense_encoded_text
4660,Article 4 of the Constitution stipulates that ...,case.n,36,case%1:04:00::,article 4 of the Constitution stipulate that :...,case_5,article 4 of the Constitution stipulate that :...
67883,The keynotes of this style are activism and em...,major.a,61,major%3:00:01::,the keynote of this style be activism and emph...,major_1,the keynote of this style be activism and emph...
10751,The State party should ensure that counter-ter...,life.n,60,life%1:28:02::,the State party should ensure that counter-ter...,life_5,the State party should ensure that counter-ter...
71501,Already in its general comment No . 4 on the r...,security.n,92,security%1:21:01::,already in -PRON- general comment no . 4 on th...,security_5,already in -PRON- general comment no . 4 on th...
39417,The subsidiary bodies of the Commission on Nar...,lead.v,69,lead%2:41:12::,the subsidiary body of the Commission on Narco...,lead_7,the subsidiary body of the Commission on Narco...


In [46]:
# Inspect one example: in `sense_encoded_text` the target word appears as a
# sense-tagged token (here `life_5`).
test_data.iloc[2].sense_encoded_text

'the State party should ensure that counter-terrorism measure be in full conformity with the Covenant and , in particular , that the legislation adopt in this context be limit to crime that would justify be assimilate to terrorism and attract the often grave consequence associate with -PRON- . -PRON- should allow for some degree of judicial discretion in sentence to life_5 imprisonment . the State party be also request to inform the Committee on whether the Terrorism Act have ever be apply .'

In [53]:
def pad_sequence(sequence, max_sequence_len):
    """Return `sequence` right-padded with the vocabulary's pad index.

    The pad index is read from the notebook-level `voc` object. A new list is
    returned; when `sequence` already has `max_sequence_len` or more items,
    no padding is appended and the items are returned as they are.
    """
    pad_idx = voc.get_pad_idx()
    nbr_missing = max_sequence_len - len(sequence)
    # A non-positive count multiplies out to an empty list, i.e. no padding.
    return sequence + [pad_idx] * nbr_missing

In [70]:
# Length every encoded sentence is padded to (also the length of each mask row).
max_sequence_len = 283
def create_eval_data_per_lemma(data, with_word_position_mask=True):
    """Build per-lemma evaluation examples from sense-annotated rows.

    For each row, the sentence in `sense_encoded_text` is encoded as-is (the
    correct variant, always stored first) and then once more for every other
    sense of the row's lemma, with the token at `word_pos` replaced by that
    sense's token.

    Relies on the notebook-level names `voc`, `pad_sequence`,
    `help_functions` and `max_sequence_len`.

    Args:
        data: DataFrame with the columns `lemma`, `sense_key`, `word_pos` and
            `sense_encoded_text`.
        with_word_position_mask: when True, every entry carries one boolean
            mask row per sentence variant; when False, `mask_data` is None.

    Returns:
        dict mapping each lemma to a list of dicts with the keys
        "X_data" (list of encoded sentence variants, correct one first),
        "mask_data" (list of boolean rows, or None) and
        "y_data" (``[[0]]``, the index of the correct variant).

    Raises:
        ValueError: if a row's `sense_key` is not among the senses that
            `build_sense_dict` returned for its lemma.
    """
    def build_X_elem(X):
        X_elem = voc.encode([X])[0]
        if max_sequence_len:
            return pad_sequence(X_elem, max_sequence_len)
        return X_elem

    sense_dict = help_functions.build_sense_dict(data.lemma.to_list(), data.sense_key.to_list())
    eval_data = {}

    for _, row in data.iterrows():
        tokens = row.sense_encoded_text.split(" ")
        lemma_senses = sense_dict[row.lemma]
        # Drop the last two characters of the lemma (e.g. "life.n" -> "life").
        lemma_stem = row.lemma[:-2]

        # The correct variant goes first, hence the label 0.
        # NOTE(review): with the answer always at index 0, a model that scores
        # all variants equally is counted as correct by an argmax.
        X_data = [build_X_elem(list(tokens))]
        y_data = [[0]]

        # Append one faulty variant per remaining sense of this lemma.
        available_senses = list(lemma_senses.keys())
        available_senses.remove(row.sense_key)
        for faulty_sense in available_senses:
            faulty_text = list(tokens)
            faulty_text[row.word_pos] = lemma_stem+"_"+str(lemma_senses[faulty_sense])
            X_data.append(build_X_elem(faulty_text))

        if with_word_position_mask:
            mask_vec = [False]*max_sequence_len
            # NOTE(review): the mask marks word_pos + 1 while the token swap
            # uses word_pos; presumably the encoded sequence has one extra
            # leading token — TODO confirm against voc.encode.
            mask_vec[row.word_pos+1] = True
            # One independent row per variant, so editing one cannot affect the others.
            mask_data = [list(mask_vec) for _ in X_data]
        else:
            # Previously left unbound, which made the dict construction below fail.
            mask_data = None

        dict_entry = {"X_data": X_data, "mask_data": mask_data, "y_data": y_data}
        eval_data.setdefault(row.lemma, []).append(dict_entry)

    return eval_data

In [56]:
# Build the evaluation set from the held-out rows (word-position masks included by default).
eval_data = create_eval_data_per_lemma(test_data)

In [62]:
# Number of distinct lemmas in the evaluation set.
len(eval_data.keys())

30

In [69]:
# Peek at one entry's label: the correct variant is stored at index 0.
eval_data["keep.v"][2]["y_data"]

[[0]]

In [11]:
# Replaces `eval_data` with a previously pickled version instead of the one built above.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
eval_data = load_pickle_data("/home/lovhag/storage/data/attention_data/_eval_data.pickle")

## 2. Define an evaluation method

In [12]:
import numpy as np

In [21]:
def evaluate_model_per_lemma(model, eval_data):
    """Compute the model's accuracy separately for every lemma.

    Each entry holds several variants of one sentence; the model scores all
    variants in one call and the highest-scoring variant is taken as its
    guess. The guess counts as correct when it equals the entry's label.

    Tensors are moved to the notebook-level `device`. The model is switched
    to eval mode and left in that mode.

    Args:
        model: module called as ``model(inputs, mask)``.
        eval_data: dict mapping lemma -> list of dicts with the keys
            "X_data", "mask_data" and "y_data" (the label, e.g. ``[[0]]``).

    Returns:
        dict mapping lemma -> fraction of entries guessed correctly; ``nan``
        for a lemma without entries (previously a ZeroDivisionError).
    """
    print("Model evaluation started!")
    test_acc = {}
    model.eval()
    with torch.no_grad():
        for lemma, entries in eval_data.items():
            print(f"Evaluating model for lemma {lemma}...")
            nbr_correct = 0
            for dict_entry in entries:
                inputs = torch.LongTensor(dict_entry["X_data"]).to(device)
                word_mask = torch.BoolTensor(dict_entry["mask_data"]).to(device)
                # Call the module itself rather than .forward() so that any
                # registered hooks are run as well.
                output = model(inputs, word_mask)
                correct_index_guess = int(np.argmax(output.cpu().numpy().flatten()))
                # The label may be nested (``[[0]]``); take its first scalar.
                target_index = int(np.ravel(dict_entry["y_data"])[0])
                nbr_correct += int(correct_index_guess == target_index)
            test_acc[lemma] = nbr_correct/len(entries) if entries else float("nan")
    return test_acc

## 3. Evaluate the model

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The model-loading cell and the evaluation function both read `device`,
# so this cell has to be run before them.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using', device)

Using cuda


In [22]:
# Per-lemma accuracy of the currently loaded model on `eval_data`.
test_acc = evaluate_model_per_lemma(model, eval_data)

Model evaluation started!
Evaluating model for lemma case.n...
Evaluating model for lemma major.a...
Evaluating model for lemma life.n...
Evaluating model for lemma security.n...
Evaluating model for lemma lead.v...
Evaluating model for lemma extend.v...
Evaluating model for lemma point.n...
Evaluating model for lemma see.v...
Evaluating model for lemma force.n...
Evaluating model for lemma national.a...
Evaluating model for lemma line.n...
Evaluating model for lemma physical.a...
Evaluating model for lemma regular.a...
Evaluating model for lemma find.v...
Evaluating model for lemma bad.a...
Evaluating model for lemma positive.a...
Evaluating model for lemma hold.v...
Evaluating model for lemma professional.a...
Evaluating model for lemma bring.v...
Evaluating model for lemma serve.v...
Evaluating model for lemma critical.a...
Evaluating model for lemma common.a...
Evaluating model for lemma keep.v...
Evaluating model for lemma time.n...
Evaluating model for lemma active.a...
Evaluatin

## MODEL 3

In [23]:
# Per-lemma accuracies from the evaluation cell above.
test_acc

{'case.n': 0.23616734143049933,
 'major.a': 0.5288065843621399,
 'life.n': 0.2453900709219858,
 'security.n': 0.32045779685264664,
 'lead.v': 0.18457943925233644,
 'extend.v': 0.41947565543071164,
 'point.n': 0.5204402515723271,
 'see.v': 0.6167747914735866,
 'force.n': 0.1670428893905192,
 'national.a': 0.24039829302987198,
 'line.n': 0.8397849462365592,
 'physical.a': 0.4570552147239264,
 'regular.a': 0.20574886535552195,
 'find.v': 0.2012820512820513,
 'bad.a': 0.640625,
 'positive.a': 0.5607940446650124,
 'hold.v': 0.1586998087954111,
 'professional.a': 0.3174846625766871,
 'bring.v': 0.21385902031063322,
 'serve.v': 0.1659877800407332,
 'critical.a': 0.2955390334572491,
 'common.a': 0.3488773747841105,
 'keep.v': 0.3925438596491228,
 'time.n': 0.26519337016574585,
 'active.a': 0.3530701754385965,
 'place.n': 0.296875,
 'follow.v': 0.12401055408970976,
 'order.n': 0.212481426448737,
 'build.v': 0.17904993909866018,
 'position.n': 0.31850789096126253}

In [24]:
# Unweighted mean over lemmas (every lemma counts equally, whatever its number of examples).
mean_test_acc = sum(test_acc.values())/len(test_acc)
mean_test_acc

0.3342334377265451

## MODEL 2

In [129]:
# NOTE(review): the output below carries a much higher execution count than the
# evaluation cell above, so it presumably stems from an earlier run with a
# different checkpoint loaded — TODO confirm.
test_acc

{'case.n': 0.08232118758434548,
 'major.a': 0.16049382716049382,
 'life.n': 0.21560283687943263,
 'security.n': 0.09585121602288985,
 'lead.v': 0.03387850467289719,
 'extend.v': 0.3096129837702871,
 'point.n': 0.0660377358490566,
 'see.v': 0.335032437442076,
 'force.n': 0.07674943566591422,
 'national.a': 0.11095305832147938,
 'line.n': 0.8204301075268817,
 'physical.a': 0.09815950920245399,
 'regular.a': 0.09228441754916793,
 'find.v': 0.1064102564102564,
 'bad.a': 0.2760416666666667,
 'positive.a': 0.07444168734491315,
 'hold.v': 0.06309751434034416,
 'professional.a': 0.18404907975460122,
 'bring.v': 0.06332138590203107,
 'serve.v': 0.03767820773930754,
 'critical.a': 0.15613382899628253,
 'common.a': 0.4542314335060449,
 'keep.v': 0.1737938596491228,
 'time.n': 0.09116022099447514,
 'active.a': 0.14035087719298245,
 'place.n': 0.1890625,
 'follow.v': 0.13896218117854,
 'order.n': 0.25705794947994054,
 'build.v': 0.02679658952496955,
 'position.n': 0.1649928263988522}

In [132]:
# Unweighted mean over lemmas for the accuracies shown directly above.
mean_test_acc = sum(test_acc.values())/len(test_acc)
mean_test_acc

0.16983297742422357

## MODEL 1

In [107]:
# NOTE(review): the output below carries a much higher execution count than the
# evaluation cell above, so it presumably stems from an earlier run with a
# different checkpoint loaded — TODO confirm.
test_acc

{'case.n': 0.22807017543859648,
 'major.a': 0.294238683127572,
 'life.n': 0.20851063829787234,
 'security.n': 0.20457796852646637,
 'lead.v': 0.17523364485981308,
 'extend.v': 0.3620474406991261,
 'point.n': 0.5849056603773585,
 'see.v': 0.6167747914735866,
 'force.n': 0.14672686230248308,
 'national.a': 0.0,
 'line.n': 0.8397849462365592,
 'physical.a': 0.2254601226993865,
 'regular.a': 0.4281391830559758,
 'find.v': 0.0,
 'bad.a': 0.59375,
 'positive.a': 0.5682382133995038,
 'hold.v': 0.1615678776290631,
 'professional.a': 0.22392638036809817,
 'bring.v': 0.17204301075268819,
 'serve.v': 0.0,
 'critical.a': 0.2843866171003718,
 'common.a': 0.46459412780656306,
 'keep.v': 0.3925438596491228,
 'time.n': 0.26519337016574585,
 'active.a': 0.30701754385964913,
 'place.n': 0.240625,
 'follow.v': 0.25241864555848725,
 'order.n': 0.2213967310549777,
 'build.v': 0.1656516443361754,
 'position.n': 0.19655667144906744}

## Baseline: random-guess accuracy per lemma

In [25]:
# Count how many distinct senses each lemma has in the test data.
# NOTE(review): this cell needs `test_data` from the data-loading section. The
# recorded run failed with a NameError because `test_data` did not exist in
# that kernel session — run the data-loading cells before this one.
sense_dict = help_functions.build_sense_dict(test_data.lemma.to_list(), test_data.sense_key.to_list())
nbr_senses_per_lemma = {lemma: len(sense_dict[lemma]) for lemma in sense_dict.keys()}
nbr_senses_per_lemma

NameError: name 'test_data' is not defined

In [111]:
# Expected accuracy when picking one of a lemma's senses uniformly at random: 1 / number of senses.
random_guessing_accuracy_per_lemma = {lemma: 1/nbr_senses_per_lemma[lemma] for lemma in sense_dict.keys()}
random_guessing_accuracy_per_lemma

{'case.n': 0.125,
 'major.a': 0.25,
 'life.n': 0.1111111111111111,
 'security.n': 0.14285714285714285,
 'lead.v': 0.125,
 'extend.v': 0.14285714285714285,
 'point.n': 0.125,
 'see.v': 0.09090909090909091,
 'force.n': 0.125,
 'national.a': 0.16666666666666666,
 'line.n': 0.09090909090909091,
 'physical.a': 0.16666666666666666,
 'regular.a': 0.125,
 'find.v': 0.1,
 'bad.a': 0.25,
 'positive.a': 0.2,
 'hold.v': 0.09090909090909091,
 'professional.a': 0.2,
 'bring.v': 0.125,
 'serve.v': 0.1111111111111111,
 'critical.a': 0.2,
 'common.a': 0.25,
 'keep.v': 0.09090909090909091,
 'time.n': 0.2,
 'active.a': 0.2,
 'place.n': 0.14285714285714285,
 'follow.v': 0.09090909090909091,
 'order.n': 0.2,
 'build.v': 0.1,
 'position.n': 0.16666666666666666}

## Results formatted (MODEL 1 run)

In [125]:
# Unweighted mean accuracy over all lemmas in `test_acc`.
print(f"mean acc: {np.sum([value for key, value in test_acc.items()])/len(test_acc)}")

mean acc: 0.2941459936741436


In [123]:
# One row per lemma: the model's accuracy next to the random-guess baseline.
# Built as a two-row frame (one row per dict), flipped so lemmas become the
# index, and the positional column labels 0/1 are given descriptive names.
result_data = (
    pd.DataFrame(data=[test_acc, random_guessing_accuracy_per_lemma])
    .transpose()
    .rename(columns={0: "test_acc", 1: "random_guess_acc"})
)
result_data

Unnamed: 0,test_acc,random_guess_acc
case.n,0.22807,0.125
major.a,0.294239,0.25
life.n,0.208511,0.111111
security.n,0.204578,0.142857
lead.v,0.175234,0.125
extend.v,0.362047,0.142857
point.n,0.584906,0.125
see.v,0.616775,0.090909
force.n,0.146727,0.125
national.a,0.0,0.166667
