In [None]:
!pip install datasets -q
!pip install accelerate -U -q
!pip install transformers -q

In [None]:
import os
import urllib
import urllib.request
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import tqdm.notebook as tq
import transformers
from datasets import Dataset
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (
    BertModel,
    BertTokenizer,
)

In [None]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# TASK 1: Corpus


In [None]:
### Argument urls (one TSV file per split)
argument_urls = {
    "train": "https://zenodo.org/records/8248658/files/arguments-training.tsv?download=1",
    "validation": "https://zenodo.org/records/8248658/files/arguments-validation.tsv?download=1",
    "test": "https://zenodo.org/records/8248658/files/arguments-test.tsv?download=1",
}

### Human values urls (one TSV file per split)
level2_values_urls = {
    "train": "https://zenodo.org/records/8248658/files/labels-training.tsv?download=1",
    "validation": "https://zenodo.org/records/8248658/files/labels-validation.tsv?download=1",
    "test": "https://zenodo.org/records/8248658/files/labels-test.tsv?download=1",
}

### Check that the splits in the dict keys were not misspelled.
### Both dictionaries must describe exactly the same splits: a one-way
### membership test would miss a split that only exists in argument_urls.
assert (
    argument_urls.keys() == level2_values_urls.keys()
), "url dictionary keys misspelled"

In [None]:
### Create Data folder (no error if it already exists)
data_folder = Path.cwd().joinpath("Data")
data_folder.mkdir(parents=True, exist_ok=True)

### Path for each file
argument_paths = {
    split: data_folder.joinpath(f"arguments_{split}.tsv")
    for split in argument_urls.keys()
}
level2_values_paths = {
    split: data_folder.joinpath(f"labels_{split}.tsv")
    for split in level2_values_urls.keys()
}

### Download each file if it's not already there.
### The payload is written to a sibling "*.part" file first and renamed only
### once the transfer has finished, so an interrupted download is never
### mistaken for a complete file on the next run.
for urls, paths in (
    (argument_urls, argument_paths),
    (level2_values_urls, level2_values_paths),
):
    for file, path in paths.items():
        if path.exists():
            continue
        partial_path = path.with_name(path.name + ".part")
        urllib.request.urlretrieve(urls[file], filename=partial_path)
        partial_path.replace(path)

In [None]:
### Load every arguments TSV into a DataFrame, keyed by split (train/validation/test)
argument_dfs = {}
for split, path in argument_paths.items():
    argument_dfs[split] = pd.read_csv(path, sep="\t")

### Load every human values TSV the same way, keyed by split
level2_values_dfs = {}
for split, path in level2_values_paths.items():
    level2_values_dfs[split] = pd.read_csv(path, sep="\t")

In [None]:
### For every split, join the arguments with their level 2 value labels on the
### shared "Argument ID" column (default inner join).
args_level2vals_dfs = {
    split: arguments.merge(level2_values_dfs[split], on="Argument ID")
    for split, arguments in argument_dfs.items()
}

In [None]:
# TODO just for developing purposes
# Switch for the inspection cells: each of them is guarded by `if do_print:`.
do_print = True

In [None]:
if do_print:
    print([d.shape for d in level2_values_dfs.values()])
    # A bare expression nested inside `if` is not auto-rendered by Jupyter,
    # so the preview has to be displayed explicitly.
    display(level2_values_dfs["train"].head(2))

In [None]:
# Preview the first two training arguments (rendered as the cell's last expression).
argument_dfs["train"].head(2)

In [None]:
if do_print:
    print([d.shape for d in argument_dfs.values()])
    # A bare expression nested inside `if` is not auto-rendered by Jupyter,
    # so the preview has to be displayed explicitly.
    display(argument_dfs["train"].head(2))

In [None]:
if do_print:
    # Explicit display: an expression nested inside `if` is not auto-rendered.
    display(args_level2vals_dfs["train"].describe())

In [None]:
if do_print:
    print([d.shape for d in args_level2vals_dfs.values()])
    # A bare expression nested inside `if` is not auto-rendered by Jupyter,
    # so the preview has to be displayed explicitly.
    display(args_level2vals_dfs["train"].head(2))

In [None]:
### Considering category ranges (0,3),(3,7),(7,13),(13,19)
### adding +4, considering the first 4 columns which are not categories
### Each (start, end) pair is a half-open positional column slice; it is applied
### with `.iloc[:, start:end]` to the merged arguments/labels frames.
### NOTE(review): the slices stop at column 23 (exclusive), so any label column
### at position 23 or later is ignored - confirm against the labels file.
### NOTE(review): "Conversation" presumably stands for "Conservation"; the string
### is also used as a column name by the tokenize_* functions, so it must be
### renamed everywhere or not at all.
level3_categories_ranges = {
    "Openness_to_change": (4, 7),
    "Self_enhancement": (7, 11),
    "Conversation": (11, 17),
    "Self_transcendence": (17, 23),
}
### Non-label columns copied unchanged into the final dataframes
columns_to_keep = ["Argument ID", "Conclusion", "Stance", "Premise"]
### This will be useful later
level_3_cat = list(level3_categories_ranges.keys())

In [None]:
### Creating final dataframes

train, validation, test = args_level2vals_dfs.keys()
assert train == "train" and validation == "validation" and test == "test"

### nm = not merged
train_df_nm = args_level2vals_dfs["train"]
validation_df_nm = args_level2vals_dfs["validation"]
test_df_nm = args_level2vals_dfs["test"]

### Define a mapping for "Stance" column (string -> 0/1 integer)
stance_mapping = {"in favor of": 1, "answer": 0} if False else {"in favor of": 1, "against": 0}


def _build_level3_df(df_nm):
    """Return the final dataframe of one split.

    The level 2 label columns of ``df_nm`` are merged into the level 3
    categories (logical OR over each column range), the columns listed in
    ``columns_to_keep`` are put in front and "Stance" is converted to 0/1.

    Raises:
        ValueError: if "Stance" holds a value that ``stance_mapping`` does not
            cover (it would otherwise silently turn into NaN).
    """
    ### Merge lvl2 to lvl 3 (any = OR)
    level3 = pd.DataFrame(
        {
            cat: df_nm.iloc[:, start:end].any(axis=1)
            for cat, (start, end) in level3_categories_ranges.items()
        }
    )
    ### Adding the columns to keep of the original df
    final_df = pd.concat([df_nm[columns_to_keep], level3], axis=1)
    final_df["Stance"] = final_df["Stance"].map(stance_mapping)

    unmapped = final_df["Stance"].isna()
    if unmapped.any():
        unknown = df_nm.loc[unmapped, "Stance"].unique().tolist()
        raise ValueError(f"Stance values without a mapping: {unknown}")
    return final_df


dfs = {split: _build_level3_df(df_nm) for split, df_nm in args_level2vals_dfs.items()}
train_df, validation_df, test_df = dfs["train"], dfs["validation"], dfs["test"]

In [None]:
if do_print:
    # Explicit display: an expression nested inside `if` is not auto-rendered.
    display(train_df.head())

In [None]:
if do_print:
    # Explicit display: an expression nested inside `if` is not auto-rendered.
    display(test_df.describe())

In [None]:
if do_print:
    # Explicit display: an expression nested inside `if` is not auto-rendered.
    display(test_df["Openness_to_change"])

# TASK 2: Model definition

## Baseline Model

In [None]:
def baseline_model(strategy, level_3_cat, train_df, columns_to_keep, random_state=None):
    """Fit one DummyClassifier per level 3 category.

    Args:
        strategy: DummyClassifier strategy, e.g. "uniform" or "most_frequent".
        level_3_cat: names of the target columns in ``train_df``.
        train_df: training dataframe holding feature and target columns.
        columns_to_keep: non-label column names; every entry except the first
            is passed to ``fit`` as the feature matrix.
        random_state: forwarded to every DummyClassifier so that random
            strategies can be made reproducible. The default (None) keeps the
            previous unseeded behaviour. Note that one shared integer seed
            makes all classifiers draw the same random sequence.

    Returns:
        List of fitted classifiers, in the order of ``level_3_cat``.
    """
    # TODO: revisit the parameters once a single dictionary holds all variables(?)
    features = train_df[columns_to_keep[1:]]
    clf_list = []
    for cat in level_3_cat:
        clf = DummyClassifier(strategy=strategy, random_state=random_state)
        clf.fit(X=features, y=train_df[cat])
        clf_list.append(clf)
    return clf_list

## Bert - base Classifier

In [None]:
### Convert dataframes into datasets
### (one `Dataset` per split, keyed like `dfs`: train/validation/test)
datasets = {split: Dataset.from_pandas(df) for split, df in dfs.items()}

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 targets.

    The sigmoid is applied inside the loss, so ``outputs`` must be logits.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
def add_labels(ds_row, labels):
    """Stack the label columns of a batch into a row-major list of lists.

    Args:
        ds_row: batch mapping; must contain "Conclusion" (used for the batch
            size) and one entry per name in ``labels``.
        labels: label column names, in the desired column order.

    Returns:
        One list of floats per example, holding that example's value for
        every label in ``labels``.
    """
    batch_size = len(ds_row["Conclusion"])
    targets = np.zeros((batch_size, len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = ds_row[name]
    return targets.tolist()

In [None]:
# Shared tokenizer, passed to every tokenize_* function through `fn_kwargs`.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### Conclusion Only Model

In [None]:
class BERTConclusionClass(torch.nn.Module):
    """BERT encoder followed by a two-layer linear head.

    The head emits one logit per level 3 category (``len(level_3_cat)``).
    """

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )
        self.dropout = torch.nn.Dropout(0.3)
        # NOTE(review): there is no activation between linear1 and linear2, so
        # the two layers compose into a single affine map.
        self.linear1 = torch.nn.Linear(768, 384)  # TODO check that it works
        self.linear2 = torch.nn.Linear(384, len(level_3_cat))

    def forward(self, input_ids, token_type_ids, attn_mask):
        """Return the category logits for a batch of tokenized inputs."""
        encoded = self.bert_model(
            input_ids=input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear2(self.linear1(pooled))


model_conclusion = BERTConclusionClass()

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model_conclusion.bert_model.parameters():
#     param.requires_grad = False

model_conclusion.to(device)

In [None]:
### Encoding for Conclusion only model
def tokenize_conclusion(ds_row, tokenizer):
    """Encode a batch for the conclusion-only model.

    Args:
        ds_row: batch mapping with a "Conclusion" text column and the four
            level 3 label columns.
        tokenizer: callable tokenizer exposing ``model_max_length``.

    Returns:
        Dict with the tokenizer outputs ("input_ids", "token_type_ids",
        "attention_mask"), one float tensor per label column and the combined
        "labels" matrix built by ``add_labels``.
    """
    ### Tokenize text columns (padded/truncated to half the model max length)
    encoding = tokenizer(
        ds_row["Conclusion"],
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length // 2,
        return_tensors="pt",
    )

    ### Combine text tokens with non-text features
    encoded_ds_row = {
        name: encoding[name]
        for name in ("input_ids", "token_type_ids", "attention_mask")
    }
    for category in (
        "Openness_to_change",
        "Self_enhancement",
        "Conversation",
        "Self_transcendence",
    ):
        encoded_ds_row[category] = torch.tensor(ds_row[category], dtype=torch.float)

    encoded_ds_row["labels"] = add_labels(ds_row, level_3_cat)
    return encoded_ds_row

In [None]:
### Training of the model
def train_model_conclusion(train_dl, model, optimizer):
    """Run one training epoch of the conclusion-only model.

    Args:
        train_dl: sized iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask", "token_type_ids" and "labels" tensors.
        model: module called as ``model(input_ids, token_type_ids, attn_mask)``
            that returns logits shaped like the labels.
        optimizer: optimizer updating the parameters of ``model``.

    Returns:
        Tuple ``(model, accuracy, losses)``: the model that was passed in, the
        element-wise accuracy over all labels seen in the epoch (0.0 when the
        loader yields no batch) and the per-batch losses as Python floats.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    ### set model to training mode (activate dropout, batch norm)
    model.train()
    ### initialize the progress bar
    batches = tq.tqdm(
        enumerate(train_dl), total=len(train_dl), leave=True, colour="steelblue"
    )
    for _, data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        labels = data["labels"].to(device, dtype=torch.float)

        ### Forward
        outputs = model(ids, token_type_ids, mask)
        loss = loss_fn(outputs, labels)
        batch_loss = loss.item()
        losses.append(batch_loss)

        ### Training accuracy, apply sigmoid, round (apply thresh 0.5)
        predictions = torch.sigmoid(outputs).detach().cpu().numpy().round()
        targets = labels.detach().cpu().numpy()
        correct_predictions += np.sum(predictions == targets)
        num_samples += targets.size  ### total number of elements in the 2D array

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar with a plain float: passing the tensor would
        ### print its repr and keep the autograd graph referenced.
        batches.set_postfix(batch_loss=batch_loss)

    ### Guard against an empty loader instead of dividing by zero
    accuracy = float(correct_predictions) / num_samples if num_samples else 0.0
    return model, accuracy, losses

### Conclusion - Premise

In [None]:
class BERTConclusionPremiseClass(torch.nn.Module):
    """BERT encoder followed by a two-layer linear head.

    The head emits one logit per level 3 category (``len(level_3_cat)``).
    """

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )
        self.dropout = torch.nn.Dropout(0.3)
        # NOTE(review): there is no activation between linear1 and linear2, so
        # the two layers compose into a single affine map.
        self.linear1 = torch.nn.Linear(768, 384)  # TODO check that it works
        self.linear2 = torch.nn.Linear(384, len(level_3_cat))

    def forward(self, input_ids, token_type_ids, attn_mask):
        """Return the category logits for a batch of tokenized inputs."""
        encoded = self.bert_model(
            input_ids=input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear2(self.linear1(pooled))


model_conclusion_premise = BERTConclusionPremiseClass()

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model_conclusion_premise.bert_model.parameters():
#     param.requires_grad = False

model_conclusion_premise.to(device)

In [None]:
### Encoding for Conclusion - Premise model
def tokenize_conclusion_premise(ds_row, tokenizer):
    """Encode a batch for the conclusion + premise model.

    Args:
        ds_row: batch mapping with "Conclusion" and "Premise" text columns and
            the four level 3 label columns.
        tokenizer: callable tokenizer exposing ``model_max_length``; it is
            called with the conclusions and premises as a text pair.

    Returns:
        Dict with the tokenizer outputs ("input_ids", "token_type_ids",
        "attention_mask"), one float tensor per label column and the combined
        "labels" matrix built by ``add_labels``.
    """
    ### Tokenize text columns (padded/truncated to half the model max length)
    encoding = tokenizer(
        ds_row["Conclusion"],
        ds_row["Premise"],
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length // 2,
        return_tensors="pt",
    )

    ### Combine text tokens with non-text features
    encoded_ds_row = {
        name: encoding[name]
        for name in ("input_ids", "token_type_ids", "attention_mask")
    }
    for category in (
        "Openness_to_change",
        "Self_enhancement",
        "Conversation",
        "Self_transcendence",
    ):
        encoded_ds_row[category] = torch.tensor(ds_row[category], dtype=torch.float)

    encoded_ds_row["labels"] = add_labels(ds_row, level_3_cat)
    return encoded_ds_row

In [None]:
### Training of the model
def train_model_conclusion_premise(train_dl, model, optimizer):
    """Run one training epoch of the conclusion + premise model.

    Args:
        train_dl: sized iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask", "token_type_ids" and "labels" tensors.
        model: module called as ``model(input_ids, token_type_ids, attn_mask)``
            that returns logits shaped like the labels.
        optimizer: optimizer updating the parameters of ``model``.

    Returns:
        Tuple ``(model, accuracy, losses)``: the model that was passed in, the
        element-wise accuracy over all labels seen in the epoch (0.0 when the
        loader yields no batch) and the per-batch losses as Python floats.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    ### set model to training mode (activate dropout, batch norm)
    model.train()
    ### initialize the progress bar
    batches = tq.tqdm(
        enumerate(train_dl), total=len(train_dl), leave=True, colour="steelblue"
    )
    for _, data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        labels = data["labels"].to(device, dtype=torch.float)

        ### Forward
        outputs = model(ids, token_type_ids, mask)
        loss = loss_fn(outputs, labels)
        batch_loss = loss.item()
        losses.append(batch_loss)

        ### Training accuracy, apply sigmoid, round (apply thresh 0.5)
        predictions = torch.sigmoid(outputs).detach().cpu().numpy().round()
        targets = labels.detach().cpu().numpy()
        correct_predictions += np.sum(predictions == targets)
        num_samples += targets.size  ### total number of elements in the 2D array

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar with a plain float: passing the tensor would
        ### print its repr and keep the autograd graph referenced.
        batches.set_postfix(batch_loss=batch_loss)

    ### Guard against an empty loader instead of dividing by zero
    accuracy = float(correct_predictions) / num_samples if num_samples else 0.0
    return model, accuracy, losses

### Conclusion - Premise - Stance

In [None]:
class BERTConclusionPremiseStanceClass(torch.nn.Module):
    """BERT encoder whose pooled output is extended with the stance feature.

    The stance is appended as one extra input column to the two-layer linear
    head, which emits one logit per level 3 category (``len(level_3_cat)``).
    """

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )
        self.dropout = torch.nn.Dropout(0.3)
        # NOTE(review): there is no activation between linear1 and linear2, so
        # the two layers compose into a single affine map.
        self.linear1 = torch.nn.Linear(769, 384)  ### 769 = 768 pooled features + stance
        self.linear2 = torch.nn.Linear(384, len(level_3_cat))

    def forward(self, input_ids, token_type_ids, attn_mask, stance):
        """Return the category logits for a batch of tokenized inputs and stances."""
        encoded = self.bert_model(
            input_ids=input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout(encoded.pooler_output)
        ### reshape stance to (batch, -1) and append it to the pooled features
        stance_column = stance.view(stance.shape[0], -1)
        features = torch.cat((pooled, stance_column), dim=1)
        return self.linear2(self.linear1(features))


model_conclusion_premise_stance = BERTConclusionPremiseStanceClass()

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model_conclusion_premise_stance.bert_model.parameters():
#     param.requires_grad = False

model_conclusion_premise_stance.to(device)

In [None]:
### Encoding for Conclusion - Premise - Stance model
def tokenize_conclusion_premise_stance(ds_row, tokenizer):
    """Encode a batch for the conclusion + premise + stance model.

    Args:
        ds_row: batch mapping with "Conclusion" and "Premise" text columns, a
            numeric "Stance" column and the four level 3 label columns.
        tokenizer: callable tokenizer exposing ``model_max_length``; it is
            called with the conclusions and premises as a text pair.

    Returns:
        Dict with the tokenizer outputs ("input_ids", "token_type_ids",
        "attention_mask"), a float "Stance" tensor, one float tensor per label
        column and the combined "labels" matrix built by ``add_labels``.
    """
    ### Tokenize text columns (padded/truncated to half the model max length)
    encoding = tokenizer(
        ds_row["Conclusion"],
        ds_row["Premise"],
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length // 2,
        return_tensors="pt",
    )

    ### Combine text tokens with non-text features
    encoded_ds_row = {
        name: encoding[name]
        for name in ("input_ids", "token_type_ids", "attention_mask")
    }
    ### 'Stance' first (assumed to be 0 or 1), then the label columns
    for column in (
        "Stance",
        "Openness_to_change",
        "Self_enhancement",
        "Conversation",
        "Self_transcendence",
    ):
        encoded_ds_row[column] = torch.tensor(ds_row[column], dtype=torch.float)

    encoded_ds_row["labels"] = add_labels(ds_row, level_3_cat)
    return encoded_ds_row

In [None]:
### Training of the model
def train_model_conclusion_premise_stance(train_dl, model, optimizer):
    """Run one training epoch of the conclusion + premise + stance model.

    Args:
        train_dl: sized iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask", "token_type_ids", "labels" and
            "Stance" tensors.
        model: module called as
            ``model(input_ids, token_type_ids, attn_mask, stance)`` that
            returns logits shaped like the labels.
        optimizer: optimizer updating the parameters of ``model``.

    Returns:
        Tuple ``(model, accuracy, losses)``: the model that was passed in, the
        element-wise accuracy over all labels seen in the epoch (0.0 when the
        loader yields no batch) and the per-batch losses as Python floats.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    ### set model to training mode (activate dropout, batch norm)
    model.train()
    ### initialize the progress bar
    batches = tq.tqdm(
        enumerate(train_dl), total=len(train_dl), leave=True, colour="steelblue"
    )
    for _, data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        labels = data["labels"].to(device, dtype=torch.float)
        stance = data["Stance"].to(device, dtype=torch.float)

        ### Forward
        outputs = model(ids, token_type_ids, mask, stance)
        loss = loss_fn(outputs, labels)
        batch_loss = loss.item()
        losses.append(batch_loss)

        ### Training accuracy, apply sigmoid, round (apply thresh 0.5)
        predictions = torch.sigmoid(outputs).detach().cpu().numpy().round()
        targets = labels.detach().cpu().numpy()
        correct_predictions += np.sum(predictions == targets)
        num_samples += targets.size  ### total number of elements in the 2D array

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar with a plain float: passing the tensor would
        ### print its repr and keep the autograd graph referenced.
        batches.set_postfix(batch_loss=batch_loss)

    ### Guard against an empty loader instead of dividing by zero
    accuracy = float(correct_predictions) / num_samples if num_samples else 0.0
    return model, accuracy, losses

# TASK 3: Metrics

### Baseline Metric

In [None]:
def f1_baseline(prediction, labels, data):
    """Score baseline predictions with F1.

    Args:
        prediction: 2D array of predictions, one column per entry of ``labels``.
        labels: names of the ground-truth columns in ``data``.
        data: dataframe holding the ground-truth columns.

    Returns:
        Tuple ``(f1_overall, f1_per_cat)``: the macro-averaged F1 over all
        categories and the list of per-category F1 scores.
    """
    ### Evaluate F1 per category
    f1_per_cat = []
    for column, cat in enumerate(labels):
        f1_per_cat.append(f1_score(y_true=data[cat], y_pred=prediction[:, column]))

    ### Evaluate F1 overall
    f1_overall = f1_score(y_true=data[labels], y_pred=prediction, average="macro")

    return f1_overall, f1_per_cat

### Bert Conclusion-Only Model Metric

In [None]:
def eval_model_conclusion(validation_dl, model):
    """Evaluate the conclusion-only model on a data loader.

    Args:
        validation_dl: sized iterable yielding at least one batch; each batch
            is a mapping with "input_ids", "attention_mask", "token_type_ids"
            and 2D "labels" tensors.
        model: module called as ``model(input_ids, token_type_ids, attn_mask)``
            that returns logits shaped like the labels.

    Returns:
        Tuple ``(accuracy, losses, f1_overall, f1_per_cat)``: element-wise
        accuracy, per-batch losses, mean of the per-category F1 scores and the
        per-category F1 array. A category without any true positive scores
        0.0 instead of NaN, so ``f1_overall`` is always comparable.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    num_categories = next(iter(validation_dl))["labels"].shape[1]

    ### accumulate data over each batch to compute the f1
    true_positives = np.zeros(num_categories, dtype=np.int64)
    false_positives = np.zeros(num_categories, dtype=np.int64)
    false_negatives = np.zeros(num_categories, dtype=np.int64)

    ### set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    ### show progress bar
    batches = tq.tqdm(
        enumerate(validation_dl),
        total=len(validation_dl),
        leave=True,
        colour="steelblue",
    )
    with torch.no_grad():
        for _, data in batches:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            labels = data["labels"].to(device, dtype=torch.float)
            outputs = model(ids, token_type_ids, mask)

            losses.append(loss_fn(outputs, labels).item())

            ### validation accuracy
            ### add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            predictions = torch.sigmoid(outputs).detach().cpu().numpy().round()
            targets = labels.detach().cpu().numpy()
            correct_predictions += np.sum(predictions == targets)
            num_samples += targets.size  ### total number of elements in the 2D array

            ### Per-category counts, summed over the batch axis
            ### TP: prediction == 1, true label == 1
            true_positives += np.sum((predictions == 1) & (targets == 1), axis=0)
            ### FP: prediction == 1, true label == 0
            false_positives += np.sum((predictions == 1) & (targets == 0), axis=0)
            ### FN: prediction == 0, true label == 1
            false_negatives += np.sum((predictions == 0) & (targets == 1), axis=0)

    accuracy = float(correct_predictions) / num_samples
    ### F1 = 2TP / (2TP + FP + FN) equals 2PR / (P + R) wherever the latter is
    ### defined; a zero denominator yields 0.0 instead of a NaN that would
    ### poison the mean.
    f1_denominator = 2 * true_positives + false_positives + false_negatives
    f1_per_cat = np.divide(
        2.0 * true_positives,
        f1_denominator,
        out=np.zeros(num_categories, dtype=float),
        where=f1_denominator > 0,
    )
    f1_overall = np.mean(f1_per_cat)

    return accuracy, losses, f1_overall, f1_per_cat

### Bert Conclusion - Premise Model Metric

In [None]:
def eval_model_conclusion_premise(validation_dl, model):
    """Evaluate the conclusion + premise model on a data loader.

    Args:
        validation_dl: sized iterable yielding at least one batch; each batch
            is a mapping with "input_ids", "attention_mask", "token_type_ids"
            and 2D "labels" tensors.
        model: module called as ``model(input_ids, token_type_ids, attn_mask)``
            that returns logits shaped like the labels.

    Returns:
        Tuple ``(accuracy, losses, f1_overall, f1_per_cat)``: element-wise
        accuracy, per-batch losses, mean of the per-category F1 scores and the
        per-category F1 array. A category without any true positive scores
        0.0 instead of NaN, so ``f1_overall`` is always comparable.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    num_categories = next(iter(validation_dl))["labels"].shape[1]

    ### accumulate data over each batch to compute the f1
    true_positives = np.zeros(num_categories, dtype=np.int64)
    false_positives = np.zeros(num_categories, dtype=np.int64)
    false_negatives = np.zeros(num_categories, dtype=np.int64)

    ### set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    ### show progress bar
    batches = tq.tqdm(
        enumerate(validation_dl),
        total=len(validation_dl),
        leave=True,
        colour="steelblue",
    )
    with torch.no_grad():
        for _, data in batches:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            labels = data["labels"].to(device, dtype=torch.float)
            outputs = model(ids, token_type_ids, mask)

            losses.append(loss_fn(outputs, labels).item())

            ### validation accuracy
            ### add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            predictions = torch.sigmoid(outputs).detach().cpu().numpy().round()
            targets = labels.detach().cpu().numpy()
            correct_predictions += np.sum(predictions == targets)
            num_samples += targets.size  ### total number of elements in the 2D array

            ### Per-category counts, summed over the batch axis
            ### TP: prediction == 1, true label == 1
            true_positives += np.sum((predictions == 1) & (targets == 1), axis=0)
            ### FP: prediction == 1, true label == 0
            false_positives += np.sum((predictions == 1) & (targets == 0), axis=0)
            ### FN: prediction == 0, true label == 1
            false_negatives += np.sum((predictions == 0) & (targets == 1), axis=0)

    accuracy = float(correct_predictions) / num_samples
    ### F1 = 2TP / (2TP + FP + FN) equals 2PR / (P + R) wherever the latter is
    ### defined; a zero denominator yields 0.0 instead of a NaN that would
    ### poison the mean.
    f1_denominator = 2 * true_positives + false_positives + false_negatives
    f1_per_cat = np.divide(
        2.0 * true_positives,
        f1_denominator,
        out=np.zeros(num_categories, dtype=float),
        where=f1_denominator > 0,
    )
    f1_overall = np.mean(f1_per_cat)

    return accuracy, losses, f1_overall, f1_per_cat

### Bert Conclusion - Premise - Stance Model Metric

In [None]:
def eval_model_conclusion_premise_stance(validation_dl, model):
    """Evaluate the conclusion + premise + stance model on a data loader.

    Args:
        validation_dl: sized iterable yielding at least one batch; each batch
            is a mapping with "input_ids", "attention_mask", "token_type_ids",
            "Stance" and 2D "labels" tensors.
        model: module called as
            ``model(input_ids, token_type_ids, attn_mask, stance)`` that
            returns logits shaped like the labels.

    Returns:
        Tuple ``(accuracy, losses, f1_overall, f1_per_cat)``: element-wise
        accuracy, per-batch losses, mean of the per-category F1 scores and the
        per-category F1 array. A category without any true positive scores
        0.0 instead of NaN, so ``f1_overall`` is always comparable.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    num_categories = next(iter(validation_dl))["labels"].shape[1]

    ### accumulate data over each batch to compute the f1
    true_positives = np.zeros(num_categories, dtype=np.int64)
    false_positives = np.zeros(num_categories, dtype=np.int64)
    false_negatives = np.zeros(num_categories, dtype=np.int64)

    ### set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    ### show progress bar
    batches = tq.tqdm(
        enumerate(validation_dl),
        total=len(validation_dl),
        leave=True,
        colour="steelblue",
    )
    with torch.no_grad():
        for _, data in batches:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            labels = data["labels"].to(device, dtype=torch.float)
            stance = data["Stance"].to(device, dtype=torch.float)
            outputs = model(ids, token_type_ids, mask, stance)

            losses.append(loss_fn(outputs, labels).item())

            ### validation accuracy
            ### add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            predictions = torch.sigmoid(outputs).detach().cpu().numpy().round()
            targets = labels.detach().cpu().numpy()
            correct_predictions += np.sum(predictions == targets)
            num_samples += targets.size  ### total number of elements in the 2D array

            ### Per-category counts, summed over the batch axis
            ### TP: prediction == 1, true label == 1
            true_positives += np.sum((predictions == 1) & (targets == 1), axis=0)
            ### FP: prediction == 1, true label == 0
            false_positives += np.sum((predictions == 1) & (targets == 0), axis=0)
            ### FN: prediction == 0, true label == 1
            false_negatives += np.sum((predictions == 0) & (targets == 1), axis=0)

    accuracy = float(correct_predictions) / num_samples
    ### F1 = 2TP / (2TP + FP + FN) equals 2PR / (P + R) wherever the latter is
    ### defined; a zero denominator yields 0.0 instead of a NaN that would
    ### poison the mean.
    f1_denominator = 2 * true_positives + false_positives + false_negatives
    f1_per_cat = np.divide(
        2.0 * true_positives,
        f1_denominator,
        out=np.zeros(num_categories, dtype=float),
        where=f1_denominator > 0,
    )
    f1_overall = np.mean(f1_per_cat)

    return accuracy, losses, f1_overall, f1_per_cat

# TASK 4 - Training and Evaluation

## Baseline

In [None]:
### Random-guess baseline: one "uniform" dummy classifier per category,
### scored on the test split.
model_uniform = baseline_model("uniform", level_3_cat, train_df, columns_to_keep)
prediction_uniform = np.array(
    [clf.predict(X=test_df[columns_to_keep[1:]]) for clf in model_uniform]
).transpose()
f1_overall, f1_percat = f1_baseline(
    prediction=prediction_uniform, labels=level_3_cat, data=test_df
)

In [None]:
# Report the macro F1 and the per-category F1 of the baseline scored just above.
print(f"f1_overall = {f1_overall}  \t f1 per category: {f1_percat}")

In [None]:
### Majority-class baseline: one "most_frequent" dummy classifier per category.
### The fitted classifiers and their predictions are kept in separate names and
### the score is computed from the majority predictions (not from the uniform
### model again).
model_majority = baseline_model(
    "most_frequent", level_3_cat, train_df, columns_to_keep
)
prediction_majority = np.array(
    [clf.predict(X=test_df[columns_to_keep[1:]]) for clf in model_majority]
).T
f1_overall, f1_percat = f1_baseline(
    prediction_majority, labels=level_3_cat, data=test_df
)

In [None]:
# Report the macro F1 and the per-category F1 of the baseline scored just above.
print(f"f1_overall = {f1_overall}  \t f1 per category: {f1_percat}")

## Bert-base models

In [None]:
# Hyperparameters shared by the three BERT-based models.
TRAIN_BATCH_SIZE = 32
VALIDATION_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
N_EPOCHS = 2
LEARNING_RATE = (1e-5) / 2  # evaluates to 5e-6
WEIGHT_DECAY = 0.01
# NOTE(review): none of the training cells visible here reads `seeds` -
# confirm where the seeds are applied.
seeds = [333, 666, 999]

### Bert Conclusion-Only

In [None]:
# Learning rate of the conclusion-only model (same value as LEARNING_RATE, i.e. (1e-5) / 2)
LEARNING_RATE_Co = LEARNING_RATE

In [None]:
# Tokenize every split with the conclusion-only encoding (batched map).
tokenized_datasets_conclusion = {
    split: ds.map(
        function=tokenize_conclusion, fn_kwargs={"tokenizer": tokenizer}, batched=True
    )
    for split, ds in datasets.items()
}

# Make the datasets return torch tensors when indexed.
for ds in tokenized_datasets_conclusion.values():
    ds.set_format(type="torch")

### Data loaders
### NOTE: the names train_dl / validation_dl / test_dl are rebound by the
### data-loader cells of the other models, so this cell must be (re)run right
### before training the conclusion-only model.
train_dl = torch.utils.data.DataLoader(
    tokenized_datasets_conclusion["train"],
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

validation_dl = torch.utils.data.DataLoader(
    tokenized_datasets_conclusion["validation"],
    batch_size=VALIDATION_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

test_dl = torch.utils.data.DataLoader(
    tokenized_datasets_conclusion["test"],
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

### define the optimizer
optimizer_conclusion = torch.optim.AdamW(
    model_conclusion.parameters(), lr=LEARNING_RATE_Co, weight_decay=WEIGHT_DECAY
)

In [None]:
history_conclusion = {}
best_accuracy = 0
best_f1_sum = 0

model_folder = Path.cwd().joinpath("models")
model_folder.mkdir(parents=True, exist_ok=True)

for epoch in range(1, N_EPOCHS + 1):
    print(f"Epoch {epoch}/{N_EPOCHS}")
    model, train_acc, train_losses = train_model_conclusion(
        train_dl, model_conclusion, optimizer_conclusion
    )
    val_acc, val_losses, f1_overall, f1_per_cat = eval_model_conclusion(
        validation_dl, model
    )

    print(
        f"train_loss={np.mean(train_losses):.4f}, val_loss={np.mean(val_losses):.4f}, ",
        f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, ",
        f"val_f1_overall={f1_overall:.4f}, " f"val_f1_per_cat={f1_per_cat}",
    )

    ### Per-batch losses are accumulated over all epochs; overwriting them
    ### would leave only the last epoch available for the loss plot.
    history_conclusion.setdefault("train_losses", []).extend(train_losses)
    history_conclusion.setdefault("val_losses", []).extend(val_losses)
    ### Scalar metrics hold the values of the most recent epoch
    history_conclusion.update(
        {
            "train_acc": train_acc,
            "val_acc": val_acc,
            "f1_overall": f1_overall,
            "f1_per_cat": f1_per_cat,
        }
    )

    ### save the best model (highest validation macro F1 so far)
    if f1_overall > best_f1_sum:  # val_acc > best_accuracy:
        ### (f1_overall - (np.max(f1_per_cat) - np.min(f1_per_cat)))
        torch.save(model.state_dict(), model_folder / "model_conclusion_only.bin")
        best_f1_sum = f1_overall

In [None]:
### Plot loss (labelled so the figure can be read on its own)
fig, ax = plt.subplots()
ax.plot(history_conclusion["train_losses"])
ax.set(
    title="Conclusion-only model: training loss",
    xlabel="Batch",
    ylabel="BCE-with-logits loss",
)
plt.show()

### Bert with Conclusion and Premise

In [None]:
# Learning rate of the conclusion + premise model (same value as LEARNING_RATE, i.e. (1e-5) / 2)
LEARNING_RATE_CP = LEARNING_RATE

In [None]:
# Tokenize every split with the conclusion + premise encoding (batched map).
tokenized_datasets_conclusion_premise = {
    split: ds.map(
        function=tokenize_conclusion_premise,
        fn_kwargs={"tokenizer": tokenizer},
        batched=True,
    )
    for split, ds in datasets.items()
}

# Make the datasets return torch tensors when indexed.
for ds in tokenized_datasets_conclusion_premise.values():
    ds.set_format(type="torch")

### Data loaders
### NOTE: this rebinds train_dl / validation_dl / test_dl, which the
### conclusion-only cells also define; after running this cell they refer to
### the conclusion + premise data.
train_dl = torch.utils.data.DataLoader(
    tokenized_datasets_conclusion_premise["train"],
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

validation_dl = torch.utils.data.DataLoader(
    tokenized_datasets_conclusion_premise["validation"],
    batch_size=VALIDATION_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

test_dl = torch.utils.data.DataLoader(
    tokenized_datasets_conclusion_premise["test"],
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

### define the optimizer
optimizer_conclusion_premise = torch.optim.AdamW(
    model_conclusion_premise.parameters(),
    lr=LEARNING_RATE_CP,
    weight_decay=WEIGHT_DECAY,
)

In [None]:
history_conclusion_premise = {}
best_accuracy = 0
best_f1_sum = 0

model_folder = Path.cwd().joinpath("models")
model_folder.mkdir(parents=True, exist_ok=True)

for epoch in range(1, N_EPOCHS + 1):
    print(f"Epoch {epoch}/{N_EPOCHS}")
    model, train_acc, train_losses = train_model_conclusion_premise(
        train_dl, model_conclusion_premise, optimizer_conclusion_premise
    )
    val_acc, val_losses, f1_overall, f1_per_cat = eval_model_conclusion_premise(
        validation_dl, model
    )
    ### `val_losses` is the name returned by the evaluation above (the former
    ### `val_loss` was undefined and raised a NameError).
    print(
        f"train_loss={np.mean(train_losses):.4f}, val_loss={np.mean(val_losses):.4f}, ",
        f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, ",
        f"val_f1_overall={f1_overall:.4f}, " f"val_f1_per_cat={f1_per_cat}",
    )

    ### Per-batch losses are accumulated over all epochs; overwriting them
    ### would leave only the last epoch available for the loss plot.
    history_conclusion_premise.setdefault("train_losses", []).extend(train_losses)
    history_conclusion_premise.setdefault("val_losses", []).extend(val_losses)
    ### Scalar metrics hold the values of the most recent epoch
    history_conclusion_premise.update(
        {
            "train_acc": train_acc,
            "val_acc": val_acc,
            "f1_overall": f1_overall,
            "f1_per_cat": f1_per_cat,
        }
    )

    ### save the best model (highest validation macro F1 so far)
    if f1_overall > best_f1_sum:  # val_acc > best_accuracy:
        ### (f1_overall - (np.max(f1_per_cat) - np.min(f1_per_cat)))
        ### NOTE(review): the file name is spelled "modell_..."; kept unchanged
        ### in case other cells load the checkpoint by this name.
        torch.save(
            model.state_dict(),
            model_folder / "modell_conclusion_premise.bin",
        )
        best_f1_sum = f1_overall

In [None]:
### Plot loss
### NOTE: the training loop rewrites history_conclusion_premise under the same
### keys on every epoch, so "train_losses" holds only what the last epoch's
### training call returned; earlier epochs are not part of this plot.
plt.plot(history_conclusion_premise["train_losses"])

### BERT with Conclusion, Premise and Stance

In [None]:
### Learning rate passed to the optimizer of the conclusion+premise+stance model.
### It currently aliases the shared LEARNING_RATE constant.
### NOTE(review): the trailing "1e-5" is presumably an alternative value that
### was tried, not necessarily the value of LEARNING_RATE — confirm.
LEARNING_RATE_CPS = LEARNING_RATE  # 1e-5

In [None]:
### Tokenize every split with the conclusion+premise+stance tokenization function
tokenized_datasets_conclusion_premise_stance = {
    split_name: split_ds.map(
        tokenize_conclusion_premise_stance,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )
    for split_name, split_ds in datasets.items()
}

### Switch each tokenized split to the "torch" output format
for ds in tokenized_datasets_conclusion_premise_stance.values():
    ds.set_format("torch")

### Data loaders: one per split, built in the order train -> validation -> test.
### Only the training loader shuffles its samples.
train_dl, validation_dl, test_dl = (
    torch.utils.data.DataLoader(
        dataset=tokenized_datasets_conclusion_premise_stance[split_name],
        batch_size=split_batch_size,
        shuffle=split_shuffle,
        num_workers=0,
    )
    for split_name, split_batch_size, split_shuffle in (
        ("train", TRAIN_BATCH_SIZE, True),
        ("validation", VALIDATION_BATCH_SIZE, False),
        ("test", TEST_BATCH_SIZE, False),
    )
)

### AdamW optimizer over all parameters of the conclusion+premise+stance model
optimizer_conclusion_premise_stance = torch.optim.AdamW(
    params=model_conclusion_premise_stance.parameters(),
    lr=LEARNING_RATE_CPS,
    weight_decay=WEIGHT_DECAY,
)

In [None]:
### Train / validate the conclusion+premise+stance model for N_EPOCHS epochs,
### saving the weights whenever the overall validation F1 improves.
###
### NOTE: the history dict is rewritten under the same keys on every epoch, so
### after the loop it only holds the values of the last epoch.
history_conclusion_premise_stance = {}
best_accuracy = 0
best_f1_sum = 0

### Checkpoint folder, created on first use
model_folder = Path.cwd().joinpath("models")
if not model_folder.exists():
    model_folder.mkdir(parents=True)

for epoch in range(1, N_EPOCHS + 1):
    print(f"Epoch {epoch}/{N_EPOCHS}")

    ### One pass over the training data, then evaluation on the validation split
    model, train_acc, train_losses = train_model_conclusion_premise_stance(
        train_dl,
        model_conclusion_premise_stance,
        optimizer_conclusion_premise_stance,
    )
    val_acc, val_losses, f1_overall, f1_per_cat = eval_model_conclusion_premise_stance(
        validation_dl,
        model,
    )

    print(
        f"train_loss={np.mean(train_losses):.4f}, val_loss={np.mean(val_losses):.4f}, ",
        f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, ",
        f"val_f1_overall={f1_overall:.4f}, " f"val_f1_per_cat={f1_per_cat}",
    )

    ### Store the metrics of this epoch (replacing those of the previous one)
    history_conclusion_premise_stance["train_acc"] = train_acc
    history_conclusion_premise_stance["train_losses"] = train_losses
    history_conclusion_premise_stance["val_acc"] = val_acc
    history_conclusion_premise_stance["val_losses"] = val_losses
    history_conclusion_premise_stance["f1_overall"] = f1_overall
    history_conclusion_premise_stance["f1_per_cat"] = f1_per_cat

    ### Keep the checkpoint with the best overall validation F1.
    ### Alternative criteria that were considered:
    ###   val_acc > best_accuracy
    ###   f1_overall - (np.max(f1_per_cat) - np.min(f1_per_cat))
    if f1_overall > best_f1_sum:
        checkpoint_path = model_folder.joinpath("model_conclusion_premise_stance.bin")
        torch.save(model.state_dict(), checkpoint_path)
        best_f1_sum = f1_overall

In [None]:
### Plot loss
### NOTE: the training loop rewrites history_conclusion_premise_stance under the
### same keys on every epoch, so "train_losses" holds only what the last epoch's
### training call returned; earlier epochs are not part of this plot.
plt.plot(history_conclusion_premise_stance["train_losses"])