In [1]:
import re

import datasets
import numpy as np
import pandas as pd
import scipy as sp
import sklearn.metrics as sm
import torch
import transformers
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Key of the dataset configuration to evaluate in this run.
selected_dataset = "wikipedia_1"

# Per-dataset settings:
#   path   -- path prefix of the pickled data ("_test.pkl" is appended later)
#   labels -- label column names; also used as model sub-directory names
parameters = dict(
    wikipedia_1=dict(
        path="../tmp/Wikipedia_Group_1_bias",
        labels=["label", "label_0", "label_1", "label_2"],
    ),
)

In [3]:
# Resolve the settings of the selected dataset. The dataset key is kept under
# its own name because it is also used to build the model directory paths.
dataset_name = selected_dataset
labels = parameters[dataset_name]["labels"]
path_prefix = parameters[dataset_name]["path"]
path_testset = f"{path_prefix}_test.pkl"

# One tokenizer is shared by every classifier that gets evaluated.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
# Number of texts tokenized and scored together in each step of predict().
batch_size = 16


def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    total = len(lst)
    for start in range(0, total, n):
        stop = start + n
        yield lst[start:stop]


def predict(x):
    """Classify a list of texts with the currently loaded classifier.

    Relies on the module-level names ``model``, ``tokenizer``, ``device`` and
    ``batch_size``; ``model`` must be bound before this function is called.

    Args:
        x: list of input strings.

    Returns:
        Integer ``np.ndarray`` with one entry per input text: 1 when the
        softmax probability of class index 1 is at least 0.5, otherwise 0.
    """
    val = []
    batches = list(chunks(x, batch_size))
    # Inference only: without no_grad() every forward pass would keep an
    # autograd graph alive and waste memory.
    with torch.no_grad():
        for batch in tqdm(batches):
            inputs = tokenizer(
                batch, truncation=True, padding=True, return_tensors="pt"
            ).to(device)
            outputs = model(**inputs)
            # Softmax over the class dimension. The functional form runs on
            # whatever device the logits live on, so nothing is tied to CUDA.
            probabilities = torch.softmax(outputs.logits, dim=1).cpu().numpy()
            # Threshold the probability of class index 1.
            val.extend(1 if row[1] >= 0.5 else 0 for row in probabilities)
    # Explicit dtype so an empty input still yields an integer array.
    return np.array(val, dtype=int)

In [5]:
# Cross-evaluation: every trained classifier is scored against every label
# column of the test set.
# results[i][j] == [gold values of labels[j], predictions of the labels[i] model]

# The test set is the same file for every (model, label) pair, so load it once.
df_test = pd.read_pickle(path_testset)
test_texts = df_test["text"].to_list()

results = []
# The loop variable is deliberately not named `selected_dataset`: rebinding that
# configuration name would break re-running the cell that indexes `parameters`.
for train_label in tqdm(labels):
    print(train_label)
    # predict() reads the module-level name `model`, so it has to be bound here.
    model = AutoModelForSequenceClassification.from_pretrained(
        f"../tmp/models/{dataset_name}/{train_label}/"
    ).to(device)
    # The predictions depend only on the model and the texts, so inference runs
    # once per model instead of once per test label.
    predictions = predict(test_texts)
    row = []
    for test_dataset in tqdm(labels):
        gold_label = df_test[test_dataset].to_list()
        row.append([gold_label, predictions])
        print("\t", test_dataset, ":", f1_score(gold_label, predictions))
    results.append(row)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

label


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1126.0), HTML(value='')))


	 label : 0.7814235234729935


HBox(children=(FloatProgress(value=0.0, max=1126.0), HTML(value='')))


	 label_0 : 0.49171350338665504


HBox(children=(FloatProgress(value=0.0, max=1126.0), HTML(value='')))


	 label_1 : 0.6300763972657821


HBox(children=(FloatProgress(value=0.0, max=1126.0), HTML(value='')))


	 label_2 : 0.7766513056835637

label_0


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1126.0), HTML(value='')))






KeyboardInterrupt: 

In [None]:
# Self-evaluation of the first classifier: results[0][0] holds the gold values
# and predictions for the model trained on labels[0], tested on labels[0].
# The title is derived from `labels` so it cannot go stale.
y_true, y_pred = results[0][0]
print(f"Performance of {labels[0]} classifier")
# Per-class precision / recall / F-score / support.
print(sm.precision_recall_fscore_support(y_true, y_pred))
# Macro average over the classes.
print(sm.precision_recall_fscore_support(y_true, y_pred, average="macro"))

In [None]:
# Self-evaluation of the second classifier: results[1][1] holds the gold values
# and predictions for the model trained on labels[1], tested on labels[1].
# The title is derived from `labels` so it cannot go stale.
y_true, y_pred = results[1][1]
print(f"Performance of {labels[1]} classifier")
# Per-class precision / recall / F-score / support.
print(sm.precision_recall_fscore_support(y_true, y_pred))
# Macro average over the classes.
print(sm.precision_recall_fscore_support(y_true, y_pred, average="macro"))

In [14]:
# Macro-averaged F1 for every (model, test label) pair, laid out like `results`:
# scores[i][j] is the score of model i on test label j.
# Each row is iterated directly, so every column is kept even when `results`
# is not square (using len(results) for both axes would drop columns).
scores = [
    [f1_score(gold, predicted, average="macro") for gold, predicted in row]
    for row in results
]

In [15]:
# Express the macro-F1 scores as percentages for the report table.
norm = np.array(scores) * 100
# Display names for the table axes.
# NOTE(review): assumed to follow the order of `labels`
# (label, label_0, label_1, label_2) -- confirm the mapping.
labels_p = ["Baseline", "Pessimistic", "Medium", "Optimistic"]
# Rows: label the model was trained on; columns: label it is evaluated against.
df_latex = pd.DataFrame(data=norm, index=labels_p, columns=labels_p)

In [16]:
# Render the score table as LaTeX with one decimal place per value.
one_decimal = "{:0.1f}".format
latex_table = df_latex.to_latex(float_format=one_decimal)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
{} &  Baseline &  Pessimistic &  Medium &  Optimistic \\
\midrule
Baseline    &      87.7 &         68.5 &    78.5 &        87.7 \\
Pessimistic &      78.8 &         80.2 &    80.6 &        71.0 \\
Medium      &      87.6 &         73.5 &    81.9 &        83.1 \\
Optimistic  &      84.6 &         64.3 &    74.4 &        87.5 \\
\bottomrule
\end{tabular}



In [17]:
# Display the score table at full precision for inspection.
df_latex

Unnamed: 0,Baseline,Pessimistic,Medium,Optimistic
Baseline,87.719993,68.519075,78.539329,87.722743
Pessimistic,78.755166,80.155828,80.633168,70.993525
Medium,87.644753,73.527809,81.930911,83.07768
Optimistic,84.642665,64.304939,74.442638,87.529714
