In [1]:
### Install the third-party packages this notebook imports into the kernel's
### environment (-q keeps pip's output quiet, -U upgrades accelerate).
### NOTE(review): only scikit-learn is pinned; datasets / accelerate /
### transformers are unpinned, so a fresh run may resolve different versions.
%pip install datasets -q
%pip install accelerate -U -q
%pip install transformers -q
%pip install scikit-learn==1.3.2 -q

In [1]:
import os
import urllib
import urllib.request
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import tqdm.notebook as tq
from datasets import Dataset
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from transformers import (
    BertModel,
    BertTokenizer,
)

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


In [2]:
### Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cpu


## TASK 1: Corpus


In [3]:
### All corpus files live under the same Zenodo record; only the file prefix
### (arguments / labels) and the per-split suffix differ.
_zenodo_record_files = "https://zenodo.org/records/8248658/files"
_split_file_suffixes = {
    "train": "training",
    "validation": "validation",
    "test": "test",
}

### Argument urls
argument_urls = {
    name: f"{_zenodo_record_files}/arguments-{suffix}.tsv?download=1"
    for name, suffix in _split_file_suffixes.items()
}

### Human values urls
level2_values_urls = {
    name: f"{_zenodo_record_files}/labels-{suffix}.tsv?download=1"
    for name, suffix in _split_file_suffixes.items()
}

### Check that the splits in the dict keys were not misspelled
for split in level2_values_urls:
    assert split in argument_urls, "url dictionary keys misspelled"

In [4]:
### Create the Data folder next to the notebook's working directory.
### exist_ok makes this a no-op on re-runs without a separate existence check.
data_folder = Path.cwd().joinpath("Data")
data_folder.mkdir(parents=True, exist_ok=True)

### Local path for each file, keyed by split
argument_paths = {
    split: data_folder.joinpath(f"arguments_{split}.tsv")
    for split in argument_urls.keys()
}
level2_values_paths = {
    split: data_folder.joinpath(f"labels_{split}.tsv")
    for split in level2_values_urls.keys()
}

### Download each file if it's not already there.
### The download goes to a temporary "<name>.part" file that is renamed only
### once urlretrieve returns, so an interrupted download never leaves a
### truncated file that a later run would mistake for a complete one.
for urls, paths in (
    (argument_urls, argument_paths),
    (level2_values_urls, level2_values_paths),
):
    for file, path in paths.items():
        if not path.exists():
            partial_path = path.with_name(path.name + ".part")
            urllib.request.urlretrieve(urls[file], filename=partial_path)
            partial_path.replace(path)

In [5]:
### Load every downloaded TSV into a DataFrame, keyed by split
### (train / validation / test): first the arguments, then the human values.
argument_dfs = {}
for split, tsv_path in argument_paths.items():
    argument_dfs[split] = pd.read_csv(tsv_path, sep="\t")

level2_values_dfs = {}
for values_split, tsv_path in level2_values_paths.items():
    level2_values_dfs[values_split] = pd.read_csv(tsv_path, sep="\t")

In [6]:
### Attach the level 2 value labels to the arguments of the same split
### (train / validation / test), joining on the shared "Argument ID" column.
args_level2vals_dfs = {}
for split, argument in argument_dfs.items():
    labels_for_split = level2_values_dfs[split]
    args_level2vals_dfs[split] = argument.merge(labels_for_split, on="Argument ID")

In [None]:
level2_values_dfs["train"].head(2)

In [None]:
argument_dfs["train"].head(2)

In [None]:
args_level2vals_dfs["train"].describe()

In [None]:
args_level2vals_dfs["train"].head(2)

In [7]:
### The leading columns of the merged frames are argument metadata, not
### categories; the label columns start right after them.
columns_to_keep = ["Argument ID", "Conclusion", "Stance", "Premise"]

### Number of consecutive level 2 label columns folded into each level 3
### category, in column order (label ranges (0,3),(3,7),(7,13),(13,19)).
### NOTE(review): "Conversation" looks like a typo for "Conservation"; the key
### is also hard-coded in the remove_columns lists, so rename them together.
### NOTE(review): the ranges stop at label 19 (exclusive) - confirm against
### the TSV header that no trailing label column is left out.
_level3_group_sizes = {
    "Openness_to_change": 3,
    "Self_enhancement": 4,
    "Conversation": 6,
    "Self_transcendence": 6,
}

### Turn the sizes into (start, end) positional column ranges, shifted past
### the metadata columns.
level3_categories_ranges = {}
_range_start = len(columns_to_keep)
for _category, _size in _level3_group_sizes.items():
    level3_categories_ranges[_category] = (_range_start, _range_start + _size)
    _range_start += _size

level_3_cat = list(level3_categories_ranges.keys())

In [8]:
### Creating final dataframes: the splits must come in the expected order.
train, validation, test = args_level2vals_dfs.keys()
assert (train, validation, test) == ("train", "validation", "test")

### Mapping that turns the "Stance" strings into the integers 1 / 0
stance_mapping = {"in favor of": 1, "against": 0}

dfs = {}
for split_name, merged_df in args_level2vals_dfs.items():
    ### Merge lvl2 to lvl 3 (any = OR): a level 3 category is set when at
    ### least one of the level 2 columns in its range is set.
    level3_flags = pd.DataFrame(
        {
            cat: merged_df.iloc[:, start:end].any(axis=1)
            for cat, (start, end) in level3_categories_ranges.items()
        }
    )

    ### Put the kept metadata columns in front of the level 3 flags
    final_df = pd.concat([merged_df[columns_to_keep], level3_flags], axis=1)

    ### Apply the mapping to the "Stance" column
    final_df["Stance"] = final_df["Stance"].map(stance_mapping)
    dfs[split_name] = final_df

### nm = not merged (still carrying the level 2 columns)
train_df_nm = args_level2vals_dfs["train"]
validation_df_nm = args_level2vals_dfs["validation"]
test_df_nm = args_level2vals_dfs["test"]

train_df, validation_df, test_df = dfs["train"], dfs["validation"], dfs["test"]

In [None]:
### Number of training arguments flagged with the "Conversation" level 3 category.
### NOTE(review): "Conversation" looks like a typo for "Conservation"; the key is
### defined in level3_categories_ranges and repeated in the remove_columns
### lists, so any rename has to touch all of them.
train_df["Conversation"].sum()

In [None]:
train_df.head(2)

In [None]:
train_df.describe()

# TASK 2: Model definition

## Baseline Model

In [None]:
def baseline_model(strategy, level_3_cat, train_df, columns_to_keep):
    """Fit one DummyClassifier per level 3 category.

    Args:
        strategy: value forwarded to ``DummyClassifier(strategy=...)``.
        level_3_cat: names of the target columns in ``train_df``.
        train_df: training frame holding the feature and target columns.
        columns_to_keep: metadata column names; every one except the first
            is passed to ``fit`` as the feature matrix.

    Returns:
        List of fitted classifiers, in the order of ``level_3_cat``.
    """
    classifiers = [DummyClassifier(strategy=strategy) for _ in level_3_cat]
    for classifier, category in zip(classifiers, level_3_cat):
        classifier.fit(X=train_df[columns_to_keep[1:]], y=train_df[category])
    return classifiers

## BERT-base Classifier

In [9]:
### Convert dataframes into datasets: one `Dataset` per split, built from the
### final pandas frames in `dfs` (same keys: train / validation / test).
datasets = {split: Dataset.from_pandas(df) for split, df in dfs.items()}

In [10]:
def compute_class_weights(df, cat_labels):
    """Compute a negatives-to-positives weight for every label column.

    Args:
        df: frame holding the label columns.
        cat_labels: names of the label columns to weigh.

    Returns:
        Float tensor on ``device`` with one entry per label:
        ``(#rows - #ones) / (#ones + 1e-4)``.
    """
    label_matrix = df[cat_labels].to_numpy()
    n_rows = label_matrix.shape[0]
    positives = np.sum(label_matrix, axis=0, dtype=np.single)
    weights = np.empty_like(positives)

    # Scalar arithmetic per label; the small epsilon guards against a label
    # that never occurs.
    for label_idx, n_pos in enumerate(positives):
        n_neg = n_rows - n_pos
        weights[label_idx] = n_neg / (n_pos + 1e-4)

    print(f"weigts = {weights}")
    return torch.as_tensor(weights, dtype=torch.float).to(device)


def compute_class_weights_root(df, cat_labels):
    """Compute a square-root-damped weight for every label column.

    Args:
        df: frame holding the label columns.
        cat_labels: names of the label columns to weigh.

    Returns:
        Float tensor on ``device`` with one entry per label:
        ``sqrt(#rows / (#ones + 1e-4))``.
    """
    labels_array = df[cat_labels].to_numpy()
    n_ones = np.sum(labels_array, axis=0, dtype=np.single)
    weights = np.empty_like(n_ones)

    # Only the positive counts enter this formula (the unused negative counts
    # of the original were dropped); the epsilon guards against a label that
    # never occurs.
    for class_num, ones in enumerate(n_ones):
        weights[class_num] = np.sqrt(labels_array.shape[0] / (ones + 1e-4))

    print(f"weigts = {weights}")
    return torch.as_tensor(weights, dtype=torch.float).to(device)


def loss_fn(outputs, targets, pos_weight=None):
    """Binary cross-entropy on raw logits, optionally weighting positives.

    Args:
        outputs: logits produced by the model.
        targets: float targets with the same shape as ``outputs``.
        pos_weight: optional per-label weight forwarded to BCEWithLogitsLoss.

    Returns:
        The loss tensor returned by ``torch.nn.BCEWithLogitsLoss``.
    """
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    return criterion(outputs, targets)

In [11]:
def add_labels(ds_row, labels):
    """Stack the per-label columns of a batch into one row-major label matrix.

    Args:
        ds_row: batch mapping column names to equal-length sequences; must
            contain "Conclusion" (used for the row count) and every label.
        labels: label column names, in the desired output column order.

    Returns:
        List of rows, each a list of floats with one entry per label.
    """
    n_rows = len(ds_row["Conclusion"])
    matrix = np.zeros((n_rows, len(labels)))
    for column, name in enumerate(labels):
        matrix[:, column] = ds_row[name]
    return matrix.tolist()

### NEW
### General Model

In [12]:
class BERTClass(torch.nn.Module):
    """BERT encoder with dropout and a linear multi-label head.

    Args:
        cps: when True the head is sized for the pooled BERT output plus one
            extra "stance" feature (769 inputs); otherwise for the pooled
            output alone (768 inputs). ``forward`` must then be called with
            or without ``stance`` accordingly, or the linear layer receives
            the wrong number of features.
    """

    def __init__(self, cps=False):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )
        self.dropout = torch.nn.Dropout(0.3)
        # 768 pooled features; 769 when "stance" is concatenated in forward().
        # The cps branch previously also used 768, which cannot accept the
        # concatenated tensor.
        in_features = 769 if cps else 768
        self.linear = torch.nn.Linear(in_features, len(level_3_cat))

    def forward(self, input_ids, token_type_ids, attn_mask, stance=None):
        """Return one logit per level 3 category for every row of the batch.

        Args:
            input_ids, token_type_ids, attn_mask: tokenizer outputs passed
                straight to the BERT encoder.
            stance: optional per-row stance values; when given they are
                reshaped to (batch, -1) and appended to the pooled features.
        """
        output = self.bert_model(
            input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)

        if stance is None:
            return self.linear(output_dropout)

        ### concatenate stance
        stance = stance.view(stance.shape[0], -1)
        output_stack = torch.cat((output_dropout, stance), dim=1)
        return self.linear(output_stack)

In [13]:
### Tokenizer for the same "bert-base-uncased" checkpoint the models load.
### `tokenize` below binds this object as its default `tokenizer` argument.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [14]:
### Encoding
def tokenize(ds_row, tokenizer=tokenizer, premise=False, stance=False):
    """Encode a batch for the general BERT model.

    Args:
        ds_row: batch with "Conclusion", "Premise", "Stance" and the level 3
            label columns.
        tokenizer: tokenizer to apply (defaults to the notebook's tokenizer).
        premise: when True, encode "Conclusion" and "Premise" as a text pair;
            otherwise encode "Conclusion" alone.
        stance: when True, add "Stance" to the output as a float tensor.

    Returns:
        Dict with "input_ids", "token_type_ids", "attention_mask",
        optionally "Stance", and "labels".
    """
    ### Tokenize text columns: one text, or a (text, text_pair) couple
    texts = [ds_row["Conclusion"]]
    if premise:
        texts.append(ds_row["Premise"])

    # Pad / truncate everything to half of the tokenizer's maximum length.
    text_tokens = tokenizer(
        *texts,
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length // 2,
        return_tensors="pt",
    )

    ### Combine text tokens with non-text features
    encoded_ds_row = {
        key: text_tokens[key]
        for key in ("input_ids", "token_type_ids", "attention_mask")
    }
    if stance:
        ### Assuming 'Stance' is represented as 0 or 1
        encoded_ds_row["Stance"] = torch.tensor(ds_row["Stance"], dtype=torch.float)

    encoded_ds_row["labels"] = add_labels(ds_row, level_3_cat)
    return encoded_ds_row

### Encode every split with the general encoder (conclusion only, no stance)
### and drop the raw columns; the level 3 flags are kept inside "labels".
raw_columns = [
    "Argument ID",
    "Conclusion",
    "Stance",
    "Premise",
    "Openness_to_change",
    "Self_enhancement",
    "Conversation",
    "Self_transcendence",
]
tokenized_datasets = {}
for split, ds in datasets.items():
    tokenized_datasets[split] = ds.map(
        function=tokenize,
        fn_kwargs={"tokenizer": tokenizer, "premise": False, "stance": False},
        batched=True,
        remove_columns=raw_columns,
    )

In [15]:
### Training of the model
def train_model(train_dl, model, optimizer, class_weights, use_stance=False):
    """Train ``model`` for one pass over ``train_dl``.

    Args:
        train_dl: sized iterable of batches indexed by "input_ids",
            "attention_mask", "token_type_ids", "labels" (and "Stance" when
            ``use_stance`` is True).
        model: network to train; called as
            ``model(ids, token_type_ids, mask[, stance])`` and expected to
            return logits.
        optimizer: optimizer that is stepped once per batch.
        class_weights: forwarded to ``loss_fn`` as ``pos_weight``.
        use_stance: pass the batch's "Stance" values to the model.

    Returns:
        ``(model, accuracy, losses)`` where accuracy is the fraction of
        individual label decisions (threshold 0.5) that were correct and
        losses holds one detached loss value per batch.
    """
    # Train the model that was passed in. Re-creating a network here (as the
    # previous version did) would train nothing: the caller's optimizer would
    # keep stepping the original model's parameters, which never get gradients.
    model.to(device)
    losses = []
    correct_predictions = 0
    num_samples = 0

    ### activate dropout, batch norm
    model.train()

    ### initialize progress bar
    batches = tq.tqdm(train_dl, total=len(train_dl), leave=True, colour="steelblue")

    for data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        labels = data["labels"].to(device, dtype=torch.float)

        ### Forward
        if use_stance:
            stance = data["Stance"].to(device, dtype=torch.float)
            outputs = model(ids, token_type_ids, mask, stance)
        else:
            outputs = model(ids, token_type_ids, mask)

        loss = loss_fn(outputs, labels, class_weights)
        losses.append(loss.cpu().detach().numpy())

        ### apply thresh 0.5
        predictions = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = labels.cpu().detach().numpy()
        correct_predictions += np.sum(predictions == targets)
        num_samples += targets.size

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar with a plain float, not the live loss tensor
        batches.set_postfix(batch_loss=loss.item())

    accuracy = float(correct_predictions) / num_samples
    return model, accuracy, losses

### Conclusion Only Model

class BERTConclusionClass(torch.nn.Module):
    """BERT encoder, dropout and a linear head with one logit per level 3 category."""

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )
        self.dropout = torch.nn.Dropout(p=0.3)
        # 768 = size of the pooled output fed to the head.
        self.linear = torch.nn.Linear(
            in_features=768, out_features=len(level_3_cat)
        )

    def forward(self, input_ids, token_type_ids, attn_mask):
        """Encode the batch and return the logits of the linear head."""
        encoded = self.bert_model(
            input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

### Encoding for Conclusion only model
def tokenize_c(ds_row, tokenizer):
    """Encode a batch for the conclusion-only model.

    Args:
        ds_row: batch with "Conclusion" and the level 3 label columns.
        tokenizer: tokenizer applied to the "Conclusion" texts.

    Returns:
        Dict with "input_ids", "token_type_ids", "attention_mask" and "labels".
    """
    # Pad / truncate every conclusion to half of the tokenizer's maximum length.
    half_length = tokenizer.model_max_length // 2
    text_tokens = tokenizer(
        ds_row["Conclusion"],
        truncation=True,
        padding="max_length",
        max_length=half_length,
        return_tensors="pt",
    )

    ### Keep the three encoder inputs and append the label matrix
    encoded_ds_row = {
        key: text_tokens[key]
        for key in ("input_ids", "token_type_ids", "attention_mask")
    }
    encoded_ds_row["labels"] = add_labels(ds_row, level_3_cat)
    return encoded_ds_row

### Encode every split with the conclusion-only encoder and drop the raw
### columns; the level 3 flags are kept inside "labels".
raw_columns = [
    "Argument ID",
    "Conclusion",
    "Stance",
    "Premise",
    "Openness_to_change",
    "Self_enhancement",
    "Conversation",
    "Self_transcendence",
]
tokenized_datasets = {}
for split, ds in datasets.items():
    tokenized_datasets[split] = ds.map(
        function=tokenize_c,
        fn_kwargs={"tokenizer": tokenizer},
        batched=True,
        remove_columns=raw_columns,
    )

### Training of the model
def train_model_c(train_dl, model, optimizer, class_weights):
    """Train the conclusion-only ``model`` for one pass over ``train_dl``.

    Args:
        train_dl: sized iterable of batches indexed by "input_ids",
            "attention_mask", "token_type_ids" and "labels".
        model: network to train; called as ``model(ids, token_type_ids, mask)``
            and expected to return logits.
        optimizer: optimizer that is stepped once per batch.
        class_weights: forwarded to ``loss_fn`` as ``pos_weight``.

    Returns:
        ``(model, accuracy, losses)`` where accuracy is the fraction of
        individual label decisions (threshold 0.5) that were correct and
        losses holds one detached loss value per batch.
    """
    # Train the model that was passed in. Re-creating a network here (as the
    # previous version did) would train nothing: the caller's optimizer would
    # keep stepping the original model's parameters, which never get gradients.
    model.to(device)
    losses = []
    correct_predictions = 0
    num_samples = 0

    ### activate dropout, batch norm
    model.train()

    ### initialize progress bar
    batches = tq.tqdm(train_dl, total=len(train_dl), leave=True, colour="steelblue")

    for data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        labels = data["labels"].to(device, dtype=torch.float)

        ### Forward
        outputs = model(ids, token_type_ids, mask)

        loss = loss_fn(outputs, labels, class_weights)
        losses.append(loss.cpu().detach().numpy())

        ### apply thresh 0.5
        predictions = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = labels.cpu().detach().numpy()
        correct_predictions += np.sum(predictions == targets)
        num_samples += targets.size

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar with a plain float, not the live loss tensor
        batches.set_postfix(batch_loss=loss.item())

    accuracy = float(correct_predictions) / num_samples
    return model, accuracy, losses

### Conclusion - Premise

class BERTConclusionPremiseClass(torch.nn.Module):
    """BERT encoder, dropout and a linear head with one logit per level 3 category."""

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )
        self.dropout = torch.nn.Dropout(p=0.3)
        # 768 = size of the pooled output fed to the head.
        self.linear = torch.nn.Linear(
            in_features=768, out_features=len(level_3_cat)
        )

    def forward(self, input_ids, token_type_ids, attn_mask):
        """Encode the batch and return the logits of the linear head."""
        encoded = self.bert_model(
            input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

### Encoding for Conclusion - Premise model
def tokenize_cp(ds_row, tokenizer):
    """Encode a batch for the conclusion + premise model.

    Args:
        ds_row: batch with "Conclusion", "Premise" and the level 3 label
            columns.
        tokenizer: tokenizer applied to the (conclusion, premise) text pairs.

    Returns:
        Dict with "input_ids", "token_type_ids", "attention_mask" and "labels".
    """
    # Pad / truncate every pair to half of the tokenizer's maximum length.
    half_length = tokenizer.model_max_length // 2
    text_tokens = tokenizer(
        ds_row["Conclusion"],
        ds_row["Premise"],
        truncation=True,
        padding="max_length",
        max_length=half_length,
        return_tensors="pt",
    )

    ### Keep the three encoder inputs and append the label matrix
    encoded_ds_row = {
        key: text_tokens[key]
        for key in ("input_ids", "token_type_ids", "attention_mask")
    }
    encoded_ds_row["labels"] = add_labels(ds_row, level_3_cat)
    return encoded_ds_row

### Training of the model
def train_model_cp(train_dl, model, optimizer, class_weights):
    """Train the conclusion + premise ``model`` for one pass over ``train_dl``.

    Args:
        train_dl: sized iterable of batches indexed by "input_ids",
            "attention_mask", "token_type_ids" and "labels".
        model: network to train; called as ``model(ids, token_type_ids, mask)``
            and expected to return logits. It is not moved to ``device`` here.
        optimizer: optimizer that is stepped once per batch.
        class_weights: forwarded to ``loss_fn`` as ``pos_weight``.

    Returns:
        ``(model, accuracy, losses)`` where accuracy is the fraction of
        individual label decisions (threshold 0.5) that were correct and
        losses holds one detached loss value per batch.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0

    ### activate dropout, batch norm
    model.train()

    ### initialize progress bar
    batches = tq.tqdm(train_dl, total=len(train_dl), leave=True, colour="steelblue")

    for data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        labels = data["labels"].to(device, dtype=torch.float)

        ### Forward
        outputs = model(ids, token_type_ids, mask)
        loss = loss_fn(outputs, labels, class_weights)
        losses.append(loss.cpu().detach().numpy())

        ### thresh 0.5
        predictions = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = labels.cpu().detach().numpy()
        correct_predictions += np.sum(predictions == targets)
        num_samples += targets.size

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar with a plain float; passing the loss tensor
        ### itself made the bar print the tensor's repr.
        batches.set_postfix(batch_loss=loss.item())

    accuracy = float(correct_predictions) / num_samples
    return model, accuracy, losses

### Conclusion - Premise - Stance

class BERTConclusionPremiseStanceClass(torch.nn.Module):
    """BERT encoder whose pooled output is joined with the stance before the head."""

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(
            "bert-base-uncased", return_dict=True
        )
        self.dropout = torch.nn.Dropout(p=0.3)
        ### 769! there is "stance" as another input next to the 768 pooled features
        self.linear = torch.nn.Linear(
            in_features=769, out_features=len(level_3_cat)
        )

    def forward(self, input_ids, token_type_ids, attn_mask, stance):
        """Encode the batch, append the stance and return the head's logits."""
        encoded = self.bert_model(
            input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids
        )
        pooled = self.dropout(encoded.pooler_output)

        ### concatenate stance: reshape it to (batch, -1) and append it
        ### to the pooled features along dim 1
        stance_column = stance.view(stance.shape[0], -1)
        features = torch.cat((pooled, stance_column), dim=1)
        return self.linear(features)

### Encoding for Conclusion - Premise - Stance model
def tokenize_cps(ds_row, tokenizer):
    """Encode a batch for the conclusion + premise + stance model.

    Args:
        ds_row: batch with "Conclusion", "Premise", "Stance" and the level 3
            label columns.
        tokenizer: tokenizer applied to the (conclusion, premise) text pairs.

    Returns:
        Dict with "input_ids", "token_type_ids", "attention_mask", "Stance"
        (float tensor) and "labels".
    """
    # Pad / truncate every pair to half of the tokenizer's maximum length.
    half_length = tokenizer.model_max_length // 2
    text_tokens = tokenizer(
        ds_row["Conclusion"],
        ds_row["Premise"],
        truncation=True,
        padding="max_length",
        max_length=half_length,
        return_tensors="pt",
    )

    ### Combine text tokens with non-text features
    encoded_ds_row = {
        key: text_tokens[key]
        for key in ("input_ids", "token_type_ids", "attention_mask")
    }
    ### Assuming 'Stance' is represented as 0 or 1
    encoded_ds_row["Stance"] = torch.tensor(ds_row["Stance"], dtype=torch.float)
    encoded_ds_row["labels"] = add_labels(ds_row, level_3_cat)
    return encoded_ds_row

### Training of the model
def train_model_cps(train_dl, model, optimizer, class_weights):
    """Train the conclusion + premise + stance ``model`` for one pass over ``train_dl``.

    Args:
        train_dl: sized iterable of batches indexed by "input_ids",
            "attention_mask", "token_type_ids", "labels" and "Stance".
        model: network to train; called as
            ``model(ids, token_type_ids, mask, stance)`` and expected to
            return logits. It is not moved to ``device`` here.
        optimizer: optimizer that is stepped once per batch.
        class_weights: forwarded to ``loss_fn`` as ``pos_weight``.

    Returns:
        ``(model, accuracy, losses)`` where accuracy is the fraction of
        individual label decisions (threshold 0.5) that were correct and
        losses holds one detached loss value per batch.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0

    ### activate dropout, batch norm
    model.train()

    ### initialize progress bar
    batches = tq.tqdm(train_dl, total=len(train_dl), leave=True, colour="steelblue")

    for data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        labels = data["labels"].to(device, dtype=torch.float)
        stance = data["Stance"].to(device, dtype=torch.float)

        ### Forward
        outputs = model(ids, token_type_ids, mask, stance)
        loss = loss_fn(outputs, labels, class_weights)
        losses.append(loss.cpu().detach().numpy())

        ### Training accuracy, apply sigmoid, round (apply thresh 0.5)
        predictions = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = labels.cpu().detach().numpy()
        correct_predictions += np.sum(predictions == targets)
        num_samples += targets.size

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar with a plain float; passing the loss tensor
        ### itself made the bar print the tensor's repr.
        batches.set_postfix(batch_loss=loss.item())

    accuracy = float(correct_predictions) / num_samples
    return model, accuracy, losses

# TASK 3: Metrics

### Baseline Metric

In [None]:
def f1_baseline(prediction, labels, data):
    """Score baseline predictions with macro F1 and per-category F1.

    Args:
        prediction: 2-D array of predictions, one column per entry of
            ``labels`` (column ``i`` is compared with ``data[labels[i]]``).
        labels: names of the ground-truth columns in ``data``.
        data: frame holding the ground-truth columns.

    Returns:
        ``(f1_overall, f1_per_cat)``: the macro-averaged F1 over all
        categories and the list of per-category F1 scores.
    """
    ### Evaluate F1 overall
    f1_overall = f1_score(
        y_true=data[labels],
        y_pred=prediction,
        average="macro",
        zero_division=np.nan,
    )

    ### Evaluate F1 per category
    # NOTE(review): these calls keep f1_score's default zero_division while
    # the macro score above uses np.nan - confirm the difference is intended.
    f1_per_cat = []
    for column, cat in enumerate(labels):
        f1_per_cat.append(f1_score(y_true=data[cat], y_pred=prediction[:, column]))

    return f1_overall, f1_per_cat

### NEW
### BERT BASE METRIC

In [16]:
def eval_model(validation_dl, model, class_weights, use_stance=False):
    """Evaluate ``model`` on ``validation_dl`` without updating it.

    Args:
        validation_dl: sized, re-iterable collection of batches indexed by
            "input_ids", "attention_mask", "token_type_ids", "labels" (and
            "Stance" when ``use_stance`` is True).
        model: network called as ``model(ids, token_type_ids, mask[, stance])``
            and expected to return logits.
        class_weights: forwarded to ``loss_fn`` as ``pos_weight``.
        use_stance: pass the batch's "Stance" values to the model.

    Returns:
        ``(accuracy, losses, f1_overall, f1_per_cat)``: the fraction of
        correct label decisions (threshold 0.5), one loss value per batch,
        the mean of the per-category F1 scores, and the per-category F1
        array. A category's F1 is NaN only when it has neither positive
        labels nor positive predictions.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    # Peek at one batch to learn how many label columns there are.
    num_categories = next(iter(validation_dl))["labels"].shape[1]

    ### accumulate data over each batch to compute the f1
    true_positives = np.zeros(num_categories, dtype=int)
    false_positives = np.zeros(num_categories, dtype=int)
    false_negatives = np.zeros(num_categories, dtype=int)

    ### turn off dropout, fix batch norm
    model.eval()

    ### show progress bar
    batches = tq.tqdm(
        validation_dl,
        total=len(validation_dl),
        leave=True,
        colour="steelblue",
    )
    with torch.no_grad():
        for data in batches:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            labels = data["labels"].to(device, dtype=torch.float)
            if use_stance:
                stance = data["Stance"].to(device, dtype=torch.float)
                outputs = model(ids, token_type_ids, mask, stance)
            else:
                outputs = model(ids, token_type_ids, mask)

            loss = loss_fn(outputs, labels, class_weights)
            losses.append(loss.cpu().detach().numpy())

            ### validation accuracy
            ### (in training the sigmoid lives inside BCEWithLogitsLoss)
            predictions = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets = labels.cpu().detach().numpy()
            correct_predictions += np.sum(predictions == targets)
            num_samples += targets.size

            ### Per-category confusion counts, summed over the batch rows
            ### TP: prediction == 1, true label == 1
            true_positives += np.sum(
                np.logical_and(predictions == 1, targets == 1), axis=0
            )
            ### FP: prediction == 1, true label == 0
            false_positives += np.sum(
                np.logical_and(predictions == 1, targets == 0), axis=0
            )
            ### FN: prediction == 0, true label == 1
            false_negatives += np.sum(
                np.logical_and(predictions == 0, targets == 1), axis=0
            )

    accuracy = float(correct_predictions) / num_samples

    # F1 = 2TP / (2TP + FP + FN). This equals 2PR / (P + R) whenever that is
    # defined, but it also yields 0 (instead of NaN from 0/0) when a category
    # has no true positives, so one missed category no longer turns the
    # overall score into NaN.
    f1_denominator = 2 * true_positives + false_positives + false_negatives
    f1_per_cat = np.array(
        [
            2 * tp / denominator if denominator > 0 else np.nan
            for tp, denominator in zip(true_positives, f1_denominator)
        ]
    )
    f1_overall = np.mean(f1_per_cat)
    return accuracy, losses, f1_overall, f1_per_cat

### Bert Conclusion-Only Model Metric

def eval_model_c(validation_dl, model, class_weights):
    """Evaluate a conclusion-only model on ``validation_dl``.

    This used to be a line-for-line copy of ``eval_model`` with
    ``use_stance=False``; it now delegates to it so the two cannot drift
    apart.

    Args:
        validation_dl: sized, re-iterable collection of batches indexed by
            "input_ids", "attention_mask", "token_type_ids" and "labels".
        model: network called as ``model(ids, token_type_ids, mask)``.
        class_weights: forwarded to ``loss_fn`` as ``pos_weight``.

    Returns:
        ``(accuracy, losses, f1_overall, f1_per_cat)`` as returned by
        ``eval_model``.
    """
    return eval_model(validation_dl, model, class_weights, use_stance=False)

### Bert Conclusion - Premise Model Metric

def eval_model_cp(validation_dl, model, class_weights):
    """Evaluate a conclusion + premise model on ``validation_dl``.

    This used to be a line-for-line copy of ``eval_model`` with
    ``use_stance=False``; it now delegates to it so the two cannot drift
    apart.

    Args:
        validation_dl: sized, re-iterable collection of batches indexed by
            "input_ids", "attention_mask", "token_type_ids" and "labels".
        model: network called as ``model(ids, token_type_ids, mask)``.
        class_weights: forwarded to ``loss_fn`` as ``pos_weight``.

    Returns:
        ``(accuracy, losses, f1_overall, f1_per_cat)`` as returned by
        ``eval_model``.
    """
    return eval_model(validation_dl, model, class_weights, use_stance=False)

### Bert Conclusion - Premise - Stance Model Metric

def eval_model_cps(validation_dl, model, class_weights):
    """Evaluate the conclusion-premise-stance model on a data loader.

    Puts `model` in eval mode and, with gradients disabled, runs it on every
    batch of `validation_dl`, collecting the loss and the per-category
    true-positive / false-positive / false-negative counts.

    Args:
        validation_dl: sized iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask", "token_type_ids", "labels" and
            "Stance" tensors. "labels" must be 2-D (its second dimension is
            taken as the number of categories).
        model: callable invoked as `model(ids, token_type_ids, mask, stance)`.
        class_weights: forwarded unchanged to `loss_fn`.

    Returns:
        A tuple `(accuracy, losses, f1_overall, f1_per_cat)` where
        `accuracy` is the fraction of individual label entries predicted
        correctly, `losses` is the list of per-batch loss values,
        `f1_per_cat` is an array with one F1 score per category and
        `f1_overall` is their unweighted mean.

    Precision, recall and F1 are defined as 0 whenever their denominator is
    0 (no predicted and/or no true positives for a category) instead of
    propagating NaN into `f1_overall`.
    """

    def _safe_divide(numerator, denominator):
        ### element-wise division that yields 0 where the denominator is 0
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        return np.divide(
            numerator,
            denominator,
            out=np.zeros_like(numerator),
            where=denominator != 0,
        )

    losses = []
    correct_predictions = 0
    num_samples = 0
    num_categories = next(iter(validation_dl))["labels"].shape[1]

    ### accumulate data over each batch to compute the f1
    true_positives = np.zeros(num_categories, dtype=int)
    false_positives = np.zeros(num_categories, dtype=int)
    false_negatives = np.zeros(num_categories, dtype=int)

    ### turn off dropout, fix batch norm
    model.eval()

    ### show progress bar
    batches = tq.tqdm(
        validation_dl,
        total=len(validation_dl),
        leave=True,
        colour="steelblue",
    )
    with torch.no_grad():
        for data in batches:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            labels = data["labels"].to(device, dtype=torch.float)
            stance = data["Stance"].to(device, dtype=torch.float)
            outputs = model(ids, token_type_ids, mask, stance)

            loss = loss_fn(outputs, labels, class_weights)
            losses.append(loss.cpu().detach().numpy())

            ### validation accuracy
            ### the sigmoid is applied here explicitly; rounding its output
            ### thresholds every category at 0.5
            outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
            labels = labels.cpu().detach().numpy()
            correct_predictions += np.sum(outputs == labels)
            num_samples += labels.size

            predicted_pos = outputs[:, :num_categories] == 1
            predicted_neg = outputs[:, :num_categories] == 0
            actual_pos = labels[:, :num_categories] == 1
            actual_neg = labels[:, :num_categories] == 0

            ### TP: prediction == 1, true label == 1
            true_positives += np.sum(predicted_pos & actual_pos, axis=0)
            ### FP: prediction == 1, true label == 0
            false_positives += np.sum(predicted_pos & actual_neg, axis=0)
            ### FN: prediction == 0, true label == 1
            false_negatives += np.sum(predicted_neg & actual_pos, axis=0)

        accuracy = float(correct_predictions) / num_samples
        precision = _safe_divide(true_positives, true_positives + false_positives)
        recall = _safe_divide(true_positives, true_positives + false_negatives)
        f1_per_cat = _safe_divide(2 * (precision * recall), precision + recall)
        f1_overall = np.mean(f1_per_cat)
    return accuracy, losses, f1_overall, f1_per_cat

# TASK 4 - Training and Evaluation

## Baseline

In [None]:
### Uniform baseline: fit on the training frame, then predict and score on the test frame
model_uniform = baseline_model("uniform", level_3_cat, train_df, columns_to_keep)
### collect one prediction array per classifier, then transpose the stacked result
prediction_uniform = np.transpose(
    np.array([clf.predict(X=test_df[columns_to_keep[1:]]) for clf in model_uniform])
)
f1_overall, f1_percat = f1_baseline(
    prediction_uniform, labels=level_3_cat, data=test_df
)

In [None]:
### Report the scores returned by the most recent f1_baseline call
print(f"f1_overall = {f1_overall}  \t f1 per category: {f1_percat}")

In [None]:
### Majority ("most_frequent") baseline: fit on the training frame, then predict
### and score on the test frame with its own classifiers (not the uniform ones)
model_majority = baseline_model(
    "most_frequent", level_3_cat, train_df, columns_to_keep
)
prediction_majority = np.array(
    [clf.predict(X=test_df[columns_to_keep[1:]]) for clf in model_majority]
).T
f1_overall, f1_percat = f1_baseline(
    prediction_majority, labels=level_3_cat, data=test_df
)

In [None]:
### Report the scores returned by the most recent f1_baseline call
print(f"f1_overall = {f1_overall}  \t f1 per category: {f1_percat}")

## BERT-base models

In [17]:
def create_data_loaders(tokenized_datasets, batch_size):
    """Wrap the "train", "validation" and "test" splits in DataLoaders.

    Only the training loader shuffles its samples; the validation and test
    loaders keep the dataset order. All loaders load data in the main
    process (num_workers=0).

    Args:
        tokenized_datasets: mapping with "train", "validation" and "test" datasets.
        batch_size: number of samples per batch, shared by the three loaders.

    Returns:
        The tuple (train_dl, validation_dl, test_dl).
    """

    def _loader(split_name, shuffled):
        return torch.utils.data.DataLoader(
            tokenized_datasets[split_name],
            batch_size=batch_size,
            shuffle=shuffled,
            num_workers=0,
        )

    training_loader = _loader("train", True)
    validation_loader = _loader("validation", False)
    testing_loader = _loader("test", False)
    return training_loader, validation_loader, testing_loader

In [18]:
def setup(
    datasets,
    learning_rate,
    batch_size=32,
    weight_decay=0.01,
    premise=False,
    stance=False,
    ### from here on there is no need to specify the arguments
    tokenization_function=tokenize,
    tokenizer=tokenizer,
    model_class=BERTClass,
):
    """Prepare everything needed for one training run.

    Every dataset in `datasets` is tokenized with `tokenization_function`
    (which receives `tokenizer`, `premise` and `stance` as keyword arguments),
    stripped of its raw input columns and switched to torch format. The
    tokenized splits are wrapped in data loaders, a fresh `model_class()` is
    moved to `device`, and an AdamW optimizer is built over its parameters.

    Args:
        datasets: mapping from split name to dataset; must provide the splits
            expected by `create_data_loaders`.
        learning_rate: learning rate for AdamW.
        batch_size: batch size for all data loaders.
        weight_decay: weight decay for AdamW.
        premise: forwarded to the tokenization function.
        stance: forwarded to the tokenization function.

    Returns:
        ((train_dl, validation_dl, test_dl), model, optimizer)
    """
    ### raw columns removed from every split by the map call
    raw_columns = (
        "Argument ID",
        "Conclusion",
        "Stance",
        "Premise",
        "Openness_to_change",
        "Self_enhancement",
        "Conversation",
        "Self_transcendence",
    )

    ### tokenize each split by mapping the tokenization function over it
    tokenized_datasets = {}
    for split, ds in datasets.items():
        tokenized_datasets[split] = ds.map(
            function=tokenization_function,
            fn_kwargs=dict(tokenizer=tokenizer, premise=premise, stance=stance),
            batched=True,
            remove_columns=list(raw_columns),
        )

    for split in tokenized_datasets:
        tokenized_datasets[split].set_format(type="torch")

    training_loader, validation_loader, testing_loader = create_data_loaders(
        tokenized_datasets, batch_size
    )

    ### instantiate the model and place it on the selected device
    model = model_class()
    model.to(device)

    ### AdamW over all model parameters
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )

    loaders = (training_loader, validation_loader, testing_loader)
    return loaders, model, optimizer

In [19]:
def train_eval(
    dls,
    model,
    optimizer,
    class_weights,
    n_epochs=1,
    save_name="0",
    use_stance=False,
    ### from here on there is no need to specify the arguments
    train_model_f=train_model,
    eval_model_f=eval_model,
):
    """Train `model` for `n_epochs` epochs, validating after each one.

    The state dict of the best epoch (highest validation F1) is written to
    `<cwd>/models/model_<save_name>.bin`. The first epoch is always saved, so
    the checkpoint on disk always comes from this run even when the F1 is 0
    or undefined (NaN); a NaN score is ranked below every finite score.

    Args:
        dls: tuple (train_dl, validation_dl, test_dl); the test loader is unused.
        model: model to train; replaced each epoch by the one `train_model_f` returns.
        optimizer: optimizer forwarded to `train_model_f`.
        class_weights: forwarded to both `train_model_f` and `eval_model_f`.
        n_epochs: number of epochs to run.
        save_name: suffix used in the checkpoint file name.
        use_stance: forwarded to both `train_model_f` and `eval_model_f`.
        train_model_f: called as `(train_dl, model, optimizer, class_weights,
            use_stance)`; must return `(model, train_acc, train_losses)`.
        eval_model_f: called as `(validation_dl, model, class_weights,
            use_stance)`; must return `(val_acc, val_losses, f1_overall,
            f1_per_cat)`.

    Returns:
        A dict whose keys "train_acc", "train_losses", "val_acc",
        "val_losses", "f1_overall" and "f1_per_cat" hold the metrics of the
        last epoch, plus "epochs", a list with one such metrics dict per
        epoch. The dict is empty when `n_epochs` is 0.
    """
    model_folder = Path.cwd().joinpath("models")
    model_folder.mkdir(parents=True, exist_ok=True)
    checkpoint_path = model_folder / f"model_{save_name}.bin"

    history = {}
    best_f1 = -np.inf
    train_dl, validation_dl, _ = dls

    for epoch in range(1, n_epochs + 1):
        print(f"Epoch {epoch}/{n_epochs}")
        model, train_acc, train_losses = train_model_f(
            train_dl, model, optimizer, class_weights, use_stance
        )
        val_acc, val_losses, f1_overall, f1_per_cat = eval_model_f(
            validation_dl, model, class_weights, use_stance
        )

        print(
            f"train_loss={np.mean(train_losses):.4f}, val_loss={np.mean(val_losses):.4f}, ",
            f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, ",
            f"val_f1_overall={f1_overall:.4f}, " f"val_f1_per_cat={f1_per_cat}",
        )

        epoch_metrics = {
            "train_acc": train_acc,
            "train_losses": train_losses,
            "val_acc": val_acc,
            "val_losses": val_losses,
            "f1_overall": f1_overall,
            "f1_per_cat": f1_per_cat,
        }
        ### top-level keys keep the latest epoch (what the cells below read);
        ### the full per-epoch record is kept under "epochs"
        history.update(epoch_metrics)
        history.setdefault("epochs", []).append(epoch_metrics)

        ### save the best model; NaN never compares greater, so rank it lowest
        score = -np.inf if np.isnan(f1_overall) else f1_overall
        if epoch == 1 or score > best_f1:
            torch.save(model.state_dict(), checkpoint_path)
            best_f1 = score
    return history

In [20]:
### Generic parameters shared by the BERT runs below
BATCH_SIZE = 32
N_EPOCHS = 1
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01

### Seeds looped over by each training cell; the three-seed variant is kept for reference
# seeds = [333, 666 , 999]
seeds = [333, 666]
### Class weights handed to train_eval; the alternative weighting is kept for reference
# class_weights = compute_class_weights(train_df, level_3_cat)
class_weights = compute_class_weights_root(train_df, level_3_cat)

weigts = [1.6664435 1.5299385 1.1585379 1.1881835]


### BERT Conclusion-Only

In [21]:
### Learning rate of the conclusion-only run; starts from the shared default
LEARNING_RATE_Co = LEARNING_RATE

In [22]:
### loop over seeds: one freshly initialised conclusion-only model per seed
history_list_c = []
for seed_idx, seed in enumerate(seeds):
    ### seed torch and numpy before the model is created
    torch.manual_seed(seed)
    np.random.seed(seed)
    dls, model_c, optimizer_c = setup(
        datasets=datasets,
        learning_rate=LEARNING_RATE_Co,
        ### forward the configured batch size instead of relying on setup's default
        batch_size=BATCH_SIZE,
        weight_decay=WEIGHT_DECAY,
        premise=False,
        stance=False,
    )

    history = train_eval(
        dls=dls,
        model=model_c,
        optimizer=optimizer_c,
        class_weights=class_weights,
        n_epochs=N_EPOCHS,
        save_name=f"conclusion_{seed_idx}",
        use_stance=False,
    )
    history_list_c.append(history)

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Epoch 1/1


KeyboardInterrupt: 

In [None]:
### Plot the recorded training losses of the run at index 1 of history_list_c
plt.plot(history_list_c[1]["train_losses"])

### BERT with Conclusion and Premise

In [23]:
### Learning rate of the conclusion + premise run; starts from the shared default
LEARNING_RATE_CP = LEARNING_RATE

In [24]:
### loop over seeds: one freshly initialised conclusion + premise model per seed
history_list_cp = []
for seed_idx, seed in enumerate(seeds):
    ### seed torch and numpy before the model is created
    torch.manual_seed(seed)
    np.random.seed(seed)
    dls, model_cp, optimizer_cp = setup(
        datasets=datasets,
        learning_rate=LEARNING_RATE_CP,
        ### forward the configured batch size instead of relying on setup's default
        batch_size=BATCH_SIZE,
        weight_decay=WEIGHT_DECAY,
        premise=True,
        stance=False,
    )

    history = train_eval(
        dls=dls,
        model=model_cp,
        optimizer=optimizer_cp,
        class_weights=class_weights,
        n_epochs=N_EPOCHS,
        save_name=f"conclusion_premise_{seed_idx}",
        use_stance=False,
    )
    history_list_cp.append(history)

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Epoch 1/1


  0%|          | 0/169 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
### Plot the recorded training losses of the first run in history_list_cp
plt.plot(history_list_cp[0]["train_losses"])

### BERT with Conclusion, Premise and Stance

In [25]:
### Learning rate of the conclusion + premise + stance run; starts from the shared default
LEARNING_RATE_CPS = LEARNING_RATE

In [26]:
### loop over seeds: one freshly initialised conclusion + premise + stance model per seed
history_list_cps = []
for seed_idx, seed in enumerate(seeds):
    ### seed torch and numpy before the model is created
    torch.manual_seed(seed)
    np.random.seed(seed)
    dls, model_cps, optimizer_cps = setup(
        datasets=datasets,
        learning_rate=LEARNING_RATE_CPS,
        ### forward the configured batch size instead of relying on setup's default
        batch_size=BATCH_SIZE,
        weight_decay=WEIGHT_DECAY,
        premise=True,
        stance=True,
    )

    history = train_eval(
        dls=dls,
        model=model_cps,
        optimizer=optimizer_cps,
        class_weights=class_weights,
        n_epochs=N_EPOCHS,
        ### NOTE(review): unlike the other runs, there is no underscore before
        ### the seed index; kept as is so existing checkpoint names still match
        save_name=f"conclusion_premise_stance{seed_idx}",
        use_stance=True,
    )
    ### store in this variant's own list (not in the conclusion-only one)
    history_list_cps.append(history)

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
### Plot the recorded training losses of the first run in history_list_cps
plt.plot(history_list_cps[0]["train_losses"])

In [None]:
### Print the stored F1 scores of every model variant, one entry per seed
for name, hl in (
    ("C", history_list_c),
    ("CP", history_list_cp),
    ("CPS", history_list_cps),
):
    print(name)
    ### pair each history with its seed; extra entries on either side are ignored
    for h, s in zip(hl, seeds):
        print(f"SEED = {s}")
        print(f"F1 overall = {h['f1_overall']:.4f}")
        print(f"F1 per cat = {[ f'{i:.4f}' for i in h['f1_per_cat'] ]}\n")