In [2]:
import pandas as pd
from baseline_utils import get_data_for_NN, set_seed
import os

In [3]:
set_seed(420)

In [4]:
def compute_accuracy(y, yhat):
    correct = sum(1 for yt, yh in zip(y, yhat) if yt == yh)
    return correct / len(y)

def compute_recall(y, yhat):
    true_positives = sum(1 for yt, yh in zip(y, yhat) if yt == 1 and yh == 1)
    false_negatives = sum(1 for yt, yh in zip(y, yhat) if yt == 1 and yh == 0)
    # Avoid division by zero
    if true_positives + false_negatives == 0:
        return 0.0
    return true_positives / (true_positives + false_negatives)

# Qualitative analysis

## Label length weirdness
While comparing actual and predicted labels, we came across the weird problem, that the labels did not match the original text inputs anymore. To be exact, the labels were suddenly more numerous than the tokens we had them for. Here is the reason: The tokeniser splits some words that we would otherwise not think to split (of row 426 of the evaluation data, i.e. after overflow handling):


These are the input tokens after tokenisation. The word „Fußball“ has been split into 3 tokens, at positions 19, 20, 21


In [7]:
print(tokens[19:22])
print(y[19:22])
print(yhat[19:22])

['Fu', '##ss', '##ball']
['0', '0', '0']
['0', '1', '0']


Curiously, they have different labels, according to our model at least. Originally, of course, theyre identical.
Here is another example for Wals-Siezenheim. Wals was apparently false, while Siezenheim was correct. Both town-names were split.

In [8]:
print(tokens[11:17])
print(y[11:17])
print(yhat[11:17])

['W', '##als', '-', 'Sie', '##zen', '##heim']
['1', '1', '0', '0', '0', '0']
['0', '0', '0', '0', '0', '1']


## Analysing the output of a model
The expected input is a delimiter text file, the delimiter being "[DELIM]", as commas and semicolons are too common to be usable

In [88]:
datapath = "../data/output/val_predictions_mbert_rawinput.csv"
with open(datapath, "r") as f:
    raw_data = f.readlines()

# each row is a CSV, but with "DELIM" instead of a comma as delimiter
df = pd.DataFrame([row.split("[DELIM]") for row in raw_data], columns=["features", "y", "yhat"])
# preprocess for a clean dataframe
df.features = df.features.apply(lambda x: [a for a in eval(x) if a not in ["[CLS]", ["PAD]"]]])
df.y = df.y.str[1:-1].str.split()
df.yhat = df.yhat.str[1:-2].str.split()
# the features often have words that are separated by the tokenizer
# the continuations are prepended by "##", so this string is the indicator that the word needs to be joined back together
# we make it easy: we join back the list, replace " ##" and split it again
df.features = df.features.apply(lambda x: " ".join(x).replace(" ##", "").split())
# split the features column into query and response
df["question"] = df.features.apply(lambda l: l[:l.index("[SEP]")])
df["response"] = df.features.apply(lambda l: l[l.index("[SEP]")+1 : l.index("[PAD]")-1])
# drop the features column
df = df.drop("features", axis=1)
# reorder the cols
df = df[["question", "response", "y", "yhat"]]
# compute the lengths of the token lists and the outputs:
#df["len_question"] = df.question.apply(lambda l: len(l))
df["len_response"] = df.response.apply(lambda l: len(l))
df["len_y"] = df.y.apply(lambda l: len(l))
df["len_yhat"] = df.yhat.apply(lambda l: len(l))
display(df)


def show_single_row(i):
    print(f"Row {i}")
    with pd.option_context('display.max_colwidth', None):
        display(df.iloc[i])

show_single_row(494)

Unnamed: 0,question,response,y,yhat,len_response,len_y,len_yhat
0,"[¿, El, municipio, Hüffelsheim, ,, está, ubica...","[hüffelsheim, ser, uno, municipio, ubicado, en...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...",19,19,19
1,"[¿, En, qué, país, se, encuentra, la, ciudad, ...","[el, ciudad, de, toruń, él, encontrar, en, el,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",12,12,12
2,"[¿, En, qué, estado, de, Estados, Unidos, se, ...","[lewistown, él, encontrar, en, el, estado, de,...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",12,12,12
3,"[When, did, Jef, Raskin, die, ?]","[Jef, Rakin, die, on, March, 1, ,, 2011, ., he...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0]","[1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1]",15,15,15
4,"[玳, 瑁, 蜗, 牛, 属, 的, 物, 种, 分, 布, 在, 哪, 些, 地, 区, ？]","[玳, 瑁, 螺, （, 学, 名, ：, Archachatina, marginata,...","[0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...",127,85,85
...,...,...,...,...,...,...,...
494,"[Wie, viele, Zuschauer, haben, Platz, im, Gill...","[der, Gilette, Stadium, sein, mit, ein, Kapazi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",20,18,18
495,"[Hur, många, år, har, Chris, Claremont, skrivi...","[den, finnas, ingen, känd, uppskattning, av, a...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, ...",18,17,17
496,"[金, 属, 键, 决, 定, 了, 金, 属, 的, 哪, 些, 物, 理, 性, 质, ？]","[金, 属, 晶, 体, 中, ，, 金, 属, 原, 子, 之, 间, 通, 过, 金, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, ...",143,90,90
497,"[Gegen, welchen, Verein, verlor, Hamilton, Aca...","[Hamilton, Academic, verlieren, in, der, Playo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",19,19,19


Row 494


question                                                  [Wie, viele, Zuschauer, haben, Platz, im, Gillette, Stadium, in, Foxborough, ?]
response        [der, Gilette, Stadium, sein, mit, ein, Kapazität, von, rund, 50, ., 000, Zuschauer, der, groß, Stadion, in, der, NFL, .]
y                                                                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1]
yhat                                                                               [0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0]
len_response                                                                                                                           20
len_y                                                                                                                                  18
len_yhat                                                                                                                               18
Name: 494, dtype: object

there are a lot of rows, where the number of labels does not match the number of tokens in the response, here we investigate.  
Row 494 is a German example:  
The original text was: "model_output_text":"\nDas Gilette Stadium ist mit einer Kapazit\u00e4t von rund 50.000 Zuschauern das gr\u00f6\u00dfte Stadion in der NFL." 
The Problem appears to be the number "50.000", which was split into 3 tokens by the tokenizer, so it makes sense that there are 3 tokens too many. But the labels and predictions are of equal length, so all we need to do is add a line that joins the numbers back together in the qualitative analysis!  
Update: This fixed ca 6 lines.

In [90]:
# for all lines, where there are too many tokens, look for 3 consecutive tokens in the pattern: number + period + number 
# join the tokens
import re 
def join_numbers_in_string(text):
    # Join the list into a single string
    joined = " ".join(text)
    # Use regex to remove spaces between number, period, and number
    cleaned = re.sub(r'(\d)\s*\.\s*(\d)', r'\1.\2', joined)
    # Split back into a list
    return cleaned.split()
    
df.loc[df.len_response != df.len_y, "response"] = df.loc[df.len_response != df.len_y, "response"].apply(join_numbers_in_string)
# update the list lengths
df["len_response"] = df.response.apply(lambda l: len(l))
df["len_y"] = df.y.apply(lambda l: len(l))
df["len_yhat"] = df.yhat.apply(lambda l: len(l))

show_single_row(494)

Row 494


question                                              [Wie, viele, Zuschauer, haben, Platz, im, Gillette, Stadium, in, Foxborough, ?]
response        [der, Gilette, Stadium, sein, mit, ein, Kapazität, von, rund, 50.000, Zuschauer, der, groß, Stadion, in, der, NFL, .]
y                                                                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1]
yhat                                                                           [0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0]
len_response                                                                                                                       18
len_y                                                                                                                              18
len_yhat                                                                                                                           18
Name: 494, dtype: object

Next example:  
Row 477, the original text: "Der Titel f\u00fcr das Oberhaus der Sakyapa-Linie im Tibetischen Vajrayana-Buddhism ist Gongma Tripa.\n\n<|im_end|>  \n"  
One potential problem might be that the tokenizer separated "im" to be "in der", but since the problem was the splitting of the tokens before, I suspect that it has to do with the weird punctuations at the end instead.

In [99]:
show_single_row(477)

Row 477


question                                                              [Was, ist, der, Titel, des, Oberhaupts, der, Sakya, -, Schule, im, tibetischen, Buddhismus, ?]
response        [der, Titel, für, der, Oberhaus, der, Sakyapa, -, Linie, in, der, tibetisch, Vajrayana, -, Buddhism, sein, Gongma, Tripa, ., <, |, im, _, end, |, >]
y                                                                                                    [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]
yhat                                                                                                 [0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]
len_response                                                                                                                                                      26
len_y                                                                                                                                                             21
len_yhat  

In [None]:
show_single_row(415)


Original text:  
Die Br\u00fcdere des Schauspieler Dave Francos heissen Tom und James Franco.\n<|im_end|>  \n"  
It has the same <|im end|> stuff at the end. Perhaps we can fix some errors by fully removing those tokens.  
the strings "<|im_end|>" are split into 7 pieces, to match the list lengths like the tokenizer saw it, they should be 2 pieces.  
Update: fixing this bug fixed another 8 lines

In [115]:
# the strings "<|im_end|>" are split into 7 pieces, to match the list lengths like the tokenizer saw it, they should be 2 pieces
df.response = df.response.apply(lambda x: " ".join(x).replace("< | im _ end | >", "<|im end|>").split())
# update the list lengths
df["len_response"] = df.response.apply(lambda l: len(l))
df["len_y"] = df.y.apply(lambda l: len(l))
df["len_yhat"] = df.yhat.apply(lambda l: len(l))

show_single_row(415)


Row 415


question                           [Wie, heißen, die, Brüder, des, amerikanischen, Schauspielers, Dave, Franco, ?]
response        [der, Brüdere, der, Schauspieler, Dave, Francos, heissen, Tom, und, James, Franco, ., <|im, end|>]
y                                                                       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
yhat                                                                    [0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
len_response                                                                                                    14
len_y                                                                                                           14
len_yhat                                                                                                        14
Name: 415, dtype: object

Here is another German text with an issue:  
Original response: "Der kazachischen Verein FC Astna-69 gewann seinen 2. Pokal gegen den FC Taraz.\n<|im_end|>  \n"

In [118]:
show_single_row(7)


Row 7


question         [Gegen, welchen, Verein, gewann, der, kasachische, Verein, FK, Astana, -, 1964, seinen, zweiten, Pokalsieg, ?]
response        [der, kazachisch, Verein, FC, Astna, -, 69, gewinnen, sein, 2, ., Pokal, gegen, der, FC, Taraz, ., <|im, end|>]
y                                                                                 [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]
yhat                                                                              [0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0]
len_response                                                                                                                 19
len_y                                                                                                                        15
len_yhat                                                                                                                     15
Name: 7, dtype: object

Another example, in English:  
Here the lengths are just off by 2.  
The 26th Biathlon World Championships were held in Rasnov, Romania. The event took place from February 23 to March 5, 2013. Biathlon is a winter sport that combines cross-country skiing and rifle shooting. Competitors race while wearing cross-country skis with rifles carried over their shoulders. They must stop at specific points along the course to shoot targets with their rifles. Failure to hit all the targets within a certain time limit results in penalties, such as additional laps or extra time added to their race."

In [121]:
show_single_row(370)

Row 370


question                                                                                                                                                               [In, which, city, were, the, 26th, biathlon, world, championships, held, ?]
response        [the, 26th, Biathlon, World, Championship, be, hold, in, Rasnov, ,, Romania, ., the, event, take, place, from, February, 23, to, March, 5, ,, 2013, ., Biathlon, be, a, winter, sport, that, combine, cross, -, country, ski, and]
y                                                                                                                                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
yhat                                                                                                                                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1]
len_response                

In [116]:
df[df.len_y != df.len_response]

Unnamed: 0,question,response,y,yhat,len_response,len_y,len_yhat
4,"[玳, 瑁, 蜗, 牛, 属, 的, 物, 种, 分, 布, 在, 哪, 些, 地, 区, ？]","[玳, 瑁, 螺, （, 学, 名, ：, Archachatina, marginata,...","[0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...",127,85,85
5,"[चीन, ने, २०१६, में, कौन, सा, उपग्रह, लांच, कि...","[चीन, ने, २०१६, में, "", Tiantan, -, 1, "", नामक...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",47,43,43
7,"[Gegen, welchen, Verein, gewann, der, kasachis...","[der, kazachisch, Verein, FC, Astna, -, 69, ge...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]","[0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0]",19,15,15
8,"[戴, 夫, ·, 弗, 兰, 科, 的, 代, 表, 作, 品, 有, 什, 么, ？]","[《, 美, 国, 派, 》, 系, 列, 、, 《, 欧, 洲, 性, 旅, 行, 》, ...","[0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0]","[0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0]",17,13,13
9,"[¿, Cuál, es, la, profesión, de, Ted, Lange, ?]","[ted, lange, ser, uno, actor, y, director, est...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, ...",40,38,38
...,...,...,...,...,...,...,...
485,"[Montako, aselajia, Ruotsin, merivoimiin, kuul...","[Ruotsi, meri, #, voima, (, Svenska, marinen, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, ...","[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...",53,34,34
487,"[加, 拿, 大, 三, 角, 洲, 市, 和, 哪, 些, 市, 镇, 相, 邻, 或, ...","[加, 拿, 大, 三, 角, 州, 市, （, Delta, ）, 位, 于, 不, 列,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, ...",115,82,82
488,"[彼, 得, 拉, ·, 范, 斯, 塔, 弗, 伦, 参, 加, 1984, 年, 奥, ...","[彼, 得, ·, 冯, ·, 斯, 塔, 芬, 在, 1880, 年, 出, 生, ，, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",36,24,24
495,"[Hur, många, år, har, Chris, Claremont, skrivi...","[den, finnas, ingen, känd, uppskattning, av, a...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, ...",18,17,17


In [119]:
with pd.option_context('display.max_colwidth', None):
    for i, row in df[df.len_y != df.len_response].iterrows():
        print(i)
        print(" ".join(row.response))
        print(row.y)
        print()

4
玳 瑁 螺 （ 学 名 ： Archachatina marginata ） ， 又 称 为 大 丽 螺 、 宝 螺 ， 是 属 于 腹 足 纲 宝 龟 科 的 一 个 物 种 。 它 主 要 分 布 在 非 洲 的 热 带 地 区 ， 包 括 东 非 、 南 非 、 马 达 加 斯 加 以 及 部 分 西 非 国 家 。 在 这 些 地 区 中 ， 它 可 以 在 各 种 生 境 中 找 到 ， 如 森 林 、 草 原 、 沼 泽 和 农 田 等 。 由 于 它 的 壳 具 有 很 高 的 商 业 价 值 ， 因 此 在 一 些 地 方
['0', '1', '1', '0', '0', '1', '1', '1', '0', '0', '0', '0', '1', '1', '1', '1', '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '0', '0', '0', '0']

5
चीन ने २०१६ में " Tiantan - 1 " नामक एक उपग्रह लांच करना था । Tianta n - 1 एक संचार उपग्रह है जो उद्देश्य चीन का संचार नेटवर्क को सुदृढ़ करना है । यह उपग्रह २०१६ का ९वाँ माह में लांच करना जाना था ।
['0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '1', '0

## Comuting metrics

In [68]:
df['accuracy'] = df.apply(lambda row: compute_accuracy(row['y'], row['yhat']), axis=1)
df['recall'] = df.apply(lambda row: compute_recall(row['y'], row['yhat']), axis=1)
df

Unnamed: 0,question,response,y,yhat,len_response,len_y,len_yhat,accuracy,recall
0,"[¿, El, municipio, Hüffelsheim, ,, está, ubica...","[hüffelsheim, ser, uno, municipio, ubicado, en...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...",19,19,19,0.315789,0.0
1,"[¿, En, qué, país, se, encuentra, la, ciudad, ...","[el, ciudad, de, toruń, él, encontrar, en, el,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",12,12,12,1.000000,0.0
2,"[¿, En, qué, estado, de, Estados, Unidos, se, ...","[lewistown, él, encontrar, en, el, estado, de,...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",12,12,12,0.833333,0.0
3,"[When, did, Jef, Raskin, die, ?]","[Jef, Rakin, die, on, March, 1, ,, 2011, ., he...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0]","[1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1]",15,15,15,0.533333,0.0
4,"[玳, 瑁, 蜗, 牛, 属, 的, 物, 种, 分, 布, 在, 哪, 些, 地, 区, ？]","[玳, 瑁, 螺, （, 学, 名, ：, Archachatina, marginata,...","[0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...",127,85,85,0.505882,0.0
...,...,...,...,...,...,...,...,...,...
494,"[Wie, viele, Zuschauer, haben, Platz, im, Gill...","[der, Gilette, Stadium, sein, mit, ein, Kapazi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",20,18,18,0.777778,0.0
495,"[Hur, många, år, har, Chris, Claremont, skrivi...","[den, finnas, ingen, känd, uppskattning, av, a...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, ...",18,17,17,0.470588,0.0
496,"[金, 属, 键, 决, 定, 了, 金, 属, 的, 哪, 些, 物, 理, 性, 质, ？]","[金, 属, 晶, 体, 中, ，, 金, 属, 原, 子, 之, 间, 通, 过, 金, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, ...",143,90,90,0.600000,0.0
497,"[Gegen, welchen, Verein, verlor, Hamilton, Aca...","[Hamilton, Academic, verlieren, in, der, Playo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",19,19,19,0.684211,0.0


the indices and tokens still dont quite match, investigate further

Above I print all evaluation results, where somehow the number of tokens used for training and the labels do not match.
This is the case for a variety of languages. For the languages with non-latin characters, I am at a loss. For the others though, I will continue investigating for bugs.


# Ideas to go on:

- Try other tokenisers, e.g. using a language specific one for each observation
- Thus far we havent lemmatized the query before training, maybe that would help
- Try different max_lens
- Try different batch sizes