In [1]:
#!pip install datasets
#!pip install accelerate -U
#!pip install transformers

In [2]:
import urllib
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


In [3]:
### Use the GPU when PyTorch can see one, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

Device: cpu


## TASK 1: Corpus


In [4]:
### All files live under the same Zenodo record and are named "<kind>-<split>.tsv"
ZENODO_FILES_URL = "https://zenodo.org/records/8248658/files"
### Split name used in this notebook -> split name used in the remote file names
SPLIT_FILE_SUFFIX = {"train": "training", "validation": "validation", "test": "test"}

### Argument urls
argument_urls = {
    split: f"{ZENODO_FILES_URL}/arguments-{suffix}.tsv?download=1"
    for split, suffix in SPLIT_FILE_SUFFIX.items()
}

### Human values urls
level2_values_urls = {
    split: f"{ZENODO_FILES_URL}/labels-{suffix}.tsv?download=1"
    for split, suffix in SPLIT_FILE_SUFFIX.items()
}

### Check that both dictionaries expose exactly the same splits (in both
### directions: a split missing from either side would break the merge later)
assert (
    argument_urls.keys() == level2_values_urls.keys()
), "url dictionary keys misspelled"

In [5]:
### Create Data folder (no-op when it already exists)
data_folder = Path.cwd().joinpath("Data")
data_folder.mkdir(parents=True, exist_ok=True)

### Path for each file
argument_paths = {
    split: data_folder.joinpath(f"arguments_{split}.tsv")
    for split in argument_urls.keys()
}
level2_values_paths = {
    split: data_folder.joinpath(f"labels_{split}.tsv")
    for split in level2_values_urls.keys()
}

### Download each file if it's not already there.
### The payload is written to a temporary "*.part" file and renamed only once
### the transfer has finished, so an interrupted download is never mistaken for
### a complete file by the `exists()` check on the next run.
for urls, paths in (
    (argument_urls, argument_paths),
    (level2_values_urls, level2_values_paths),
):
    for split, path in paths.items():
        if path.exists():
            continue
        partial_path = path.with_name(path.name + ".part")
        urllib.request.urlretrieve(urls[split], filename=partial_path)
        partial_path.replace(path)

In [6]:
def _read_tsv_per_split(paths):
    """Read every tab-separated file in ``paths`` into a DataFrame, keyed by split."""
    return {
        split_name: pd.read_csv(tsv_path, sep="\t")
        for split_name, tsv_path in paths.items()
    }


### Arguments, one DataFrame per split (train/validation/test)
argument_dfs = _read_tsv_per_split(argument_paths)

### Human values (level 2 labels), one DataFrame per split (train/validation/test)
level2_values_dfs = _read_tsv_per_split(level2_values_paths)

In [7]:
### Join each split's arguments with its level 2 value labels on "Argument ID"
### (split = train/validation/test). No `how` is given, so pandas' default join
### type applies.
# TODO find a better name for args_level2vals_dfs
args_level2vals_dfs = {
    split_name: arguments.merge(level2_values_dfs[split_name], on="Argument ID")
    for split_name, arguments in argument_dfs.items()
}

In [8]:
# TODO just for developing purposes
# NOTE(review): this rebinds the name `print` to a boolean, shadowing the builtin
# for every cell executed afterwards. The `if print:` cells below read this flag
# and also call `print(...)` inside the guard, so switching the flag to True
# would make those calls try to call a bool. Renaming the flag requires a
# coordinated change in every cell that reads it.
print = False

In [9]:
### Debug-only look at the level 2 label tables. `print` here is the notebook's
### boolean development flag (it shadows the builtin), not the print function.
if print:
    print([frame.shape for frame in level2_values_dfs.values()])
    level2_values_dfs["train"].head(2)

In [10]:
### Debug-only look at the argument tables. `print` here is the notebook's
### boolean development flag (it shadows the builtin), not the print function.
if print:
    print([frame.shape for frame in argument_dfs.values()])
    argument_dfs["train"].head(2)

In [11]:
### Debug-only summary statistics of the merged training frame, gated by the
### notebook's boolean `print` flag.
# NOTE(review): the expression sits inside an `if`, so it is presumably not
# rendered as cell output even when the flag is on -- confirm.
if print:
    args_level2vals_dfs["train"].describe()

In [12]:
### Debug-only look at the merged (arguments + level 2 labels) tables. `print`
### here is the notebook's boolean development flag (it shadows the builtin).
if print:
    print([frame.shape for frame in args_level2vals_dfs.values()])
    args_level2vals_dfs["train"].head(2)

In [13]:
# TODO REMOVE
# ### if we want to merge the 3 df into 1 we do this
# for split, df in args_level2vals_dfs.items():
#     df["split"] = split
# #
# big_df = pd.concat(args_level2vals_dfs.values(), axis="rows")
# print(big_df.shape)

In [14]:
### Metadata columns that come before the value-label columns in the merged frames
columns_to_keep = ["Argument ID", "Conclusion", "Stance", "Premise"]

### Level 2 label positions per level 3 category, as INCLUSIVE (first, last)
### ranges: (0,3),(3,7),(7,13),(13,19). Neighbouring ranges share their end
### point, i.e. the boundary label belongs to both categories.
# NOTE(review): "Conversation" is presumably a misspelling of "Conservation";
# the key is kept because other cells address the column by this exact name.
_level3_inclusive_label_ranges = {
    "Openness_to_change": (0, 3),
    "Self_enhancement": (3, 7),
    "Conversation": (7, 13),
    "Self_transcendence": (13, 19),
}

### Turn them into half-open (start, end) column slices for `iloc`:
###   * shift by the number of metadata columns that precede the labels
###   * add 1 to the end, because a slice excludes its end position
###     (without it the last label of every category is silently dropped)
_label_offset = len(columns_to_keep)
level3_categories_ranges = {
    cat: (first + _label_offset, last + _label_offset + 1)
    for cat, (first, last) in _level3_inclusive_label_ranges.items()
}

### This will be useful later
level_3_cat = list(level3_categories_ranges.keys())

In [15]:
### Creating final dataframes

train, validation, test = args_level2vals_dfs.keys()
assert train == "train" and validation == "validation" and test == "test"

### Define a mapping for "Stance" column
stance_mapping = {"in favor of": 1, "against": 0}


def _build_level3_df(merged_df, category_ranges, kept_columns, stance_map):
    """Collapse level 2 label columns into level 3 categories for one split.

    Args:
        merged_df: arguments merged with their level 2 labels.
        category_ranges: category name -> (start, end) positional column slice
            of ``merged_df`` holding that category's level 2 labels.
        kept_columns: columns of ``merged_df`` copied unchanged in front of the
            category columns.
        stance_map: string -> int mapping applied to the "Stance" column; values
            missing from the mapping become NaN (behaviour of ``Series.map``).

    Returns:
        A new DataFrame with ``kept_columns`` followed by one boolean column per
        category (True when any level 2 label in the slice is truthy).
    """
    ### Merge lvl2 to lvl 3 (any = OR)
    level3_labels = pd.DataFrame(
        {
            cat: merged_df.iloc[:, start:end].any(axis=1)
            for cat, (start, end) in category_ranges.items()
        }
    )
    ### Adding the columns to keep of the original df
    level3_df = pd.concat([merged_df[kept_columns], level3_labels], axis=1)
    ### Apply the mapping to convert the stance strings to 0/1
    level3_df["Stance"] = level3_df["Stance"].map(stance_map)
    return level3_df


### nm = not merged
train_df_nm = args_level2vals_dfs["train"]
validation_df_nm = args_level2vals_dfs["validation"]
test_df_nm = args_level2vals_dfs["test"]

### Creating final dataframes
train_df = _build_level3_df(
    train_df_nm, level3_categories_ranges, columns_to_keep, stance_mapping
)
validation_df = _build_level3_df(
    validation_df_nm, level3_categories_ranges, columns_to_keep, stance_mapping
)
test_df = _build_level3_df(
    test_df_nm, level3_categories_ranges, columns_to_keep, stance_mapping
)


### TODO see version 1
dfs = {"train": train_df, "validation": validation_df, "test": test_df}

In [16]:
### Debug-only peek at the final training frame, gated by the notebook's boolean
### `print` flag.
# NOTE(review): the expression sits inside an `if`, so it is presumably not
# rendered as cell output even when the flag is on -- confirm.
if print:
    train_df.head(2)

In [17]:
### Debug-only summary statistics of the final test frame, gated by the
### notebook's boolean `print` flag.
# NOTE(review): the expression sits inside an `if`, so it is presumably not
# rendered as cell output even when the flag is on -- confirm.
if print:
    test_df.describe()

In [18]:
### Debug-only look at one level 3 label column of the test frame, gated by the
### notebook's boolean `print` flag.
# NOTE(review): the expression sits inside an `if`, so it is presumably not
# rendered as cell output even when the flag is on -- confirm.
if print:
    test_df["Openness_to_change"]

In [19]:
### TODO remove
### Seeds NumPy's global random generator only.
# NOTE(review): presumably this is what makes the "uniform" DummyClassifier
# baseline repeatable, since no random_state is passed to it -- confirm.
np.random.seed(12345678)

# TASK 2: Model definition

### Baseline Model 

In [20]:
def baseline_model(strategy, level_3_cat, train_df, columns_to_keep, eval_df=None):
    """Fit one DummyClassifier per category and predict on an evaluation frame.

    Args:
        strategy: ``strategy`` argument forwarded to every ``DummyClassifier``.
        level_3_cat: names of the target columns, one classifier each.
        train_df: frame providing the training features and targets.
        columns_to_keep: metadata column names; all but the first are used as
            the feature columns.
        eval_df: frame to predict on. Defaults to the notebook-level ``test_df``
            so existing four-argument calls keep their behaviour.

    Returns:
        Array of shape (len(eval_df), len(level_3_cat)): one prediction column
        per category, in the order of ``level_3_cat``.
    """
    # TODO: revise the parameters once a dictionary for all variables is defined(?)
    if eval_df is None:
        eval_df = test_df

    feature_columns = columns_to_keep[1:]

    # Fit every classifier first, then predict, so the order of operations
    # (and therefore any random draws) is unchanged.
    clf_list = [DummyClassifier(strategy=strategy) for _ in level_3_cat]
    for clf, cat in zip(clf_list, level_3_cat):
        clf.fit(X=train_df[feature_columns], y=train_df[cat])

    prediction = np.array(
        [clf.predict(X=eval_df[feature_columns]) for clf in clf_list]
    ).T
    return prediction

## Bert-base Classifier

In [21]:
### One Hugging Face `Dataset` per split, built from the final pandas frames
datasets = dict(
    (split_name, Dataset.from_pandas(frame)) for split_name, frame in dfs.items()
)

In [22]:
### Huggingface provides the automodel for multi-label classification
def generate_bert_auto_model(model_card, labels):
    """Load a sequence-classification model together with its tokenizer and collator.

    Args:
        model_card: checkpoint identifier passed to ``from_pretrained``.
        labels: ordered label names; their positions become the class ids.

    Returns:
        Tuple ``(model, tokenizer, data_collator)``. The model is created with
        ``problem_type="multi_label_classification"`` and one output per label;
        the collator is a ``DataCollatorWithPadding`` built on the tokenizer.
    """
    # Position <-> name lookups stored in the model configuration.
    id2label = dict(enumerate(labels))
    label2id = {name: index for index, name in id2label.items()}

    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_card,
        problem_type="multi_label_classification",
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )

    card_tokenizer = AutoTokenizer.from_pretrained(model_card)
    padding_collator = DataCollatorWithPadding(tokenizer=card_tokenizer)

    return (classifier, card_tokenizer, padding_collator)

In [23]:
### We can use a Huggingface model as an encoder and define our classification heads
def generate_bert_model(model_card, labels):
    """Placeholder for a model with custom classification heads.

    Not implemented yet: both arguments are ignored and the function returns
    ``None``.
    """
    pass

### Conclusion Only Model

In [24]:
def add_labels(ds_row, labels):
    """Build the per-example label matrix of a batch.

    Args:
        ds_row: mapping from column name to a sequence with one entry per
            example (a batch). Only the columns named in ``labels`` are read,
            so the batch does not need any particular text column.
        labels: ordered names of the label columns.

    Returns:
        A list with one row per example; each row is a list of floats holding
        that example's value for every label, in the order of ``labels``.

    Raises:
        KeyError: if a name in ``labels`` is not a column of ``ds_row``.
    """
    label_names = list(labels)
    if not label_names:
        # No label columns requested: still return one (empty) row per example,
        # taking the batch size from whichever column comes first.
        batch_size = len(next(iter(ds_row.values()), ()))
        return [[] for _ in range(batch_size)]

    # Shape (num_labels, batch_size); transposed to one row per example.
    per_label = np.asarray([ds_row[name] for name in label_names], dtype=float)
    return per_label.T.tolist()

In [25]:
### Encoding for Conclusion only model
def tokenize_conclusion(ds_row, tokenizer):
    """Encode a batch for the conclusion-only model.

    Args:
        ds_row: batch mapping with a "Conclusion" text column and one column
            per name in the notebook-level ``level_3_cat``.
        tokenizer: callable tokenizer exposing ``model_max_length``.

    Returns:
        Dict with every field produced by the tokenizer (e.g. ``input_ids``,
        ``attention_mask`` and, when the tokenizer emits it, ``token_type_ids``),
        the level 3 label columns as float tensors, and ``labels``: the
        per-example label matrix built by ``add_labels``.
    """
    ### Tokenize text columns.
    ### No padding here: sequences are only truncated, and padding is left to
    ### the data collator, which pads each batch to its own longest sequence
    ### instead of padding every example to the model maximum.
    text_tokens = tokenizer(
        ds_row["Conclusion"],
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

    ### Combine text tokens with non-text features.
    ### All tokenizer outputs are copied as-is, so tokenizers that do not return
    ### "token_type_ids" work too.
    encoded_ds_row = dict(text_tokens)
    for cat in level_3_cat:
        encoded_ds_row[cat] = torch.tensor(ds_row[cat], dtype=torch.float)

    encoded_ds_row["labels"] = add_labels(
        ds_row, level_3_cat
    )  # TODO bring back code of function add_labels

    return encoded_ds_row

### Conclusion - Premise

### Conclusion - Premise - Stance

In [26]:
# TODO remove
### Tokenizer output fields followed by the level 3 label columns
_encoding_columns = ["input_ids", "token_type_ids", "attention_mask"]
_label_columns = [
    "Openness_to_change",
    "Self_enhancement",
    "Conversation",
    "Self_transcendence",
]
columns = _encoding_columns + _label_columns

In [27]:
### Sanity check
# print(train_tokenized_ds["Conclusion"][50])
# decoded_text = tokenizer.decode(train_tokenized_ds["input_ids"][50])
# print(decoded_text)

In [28]:
### TODO here we can put the ROBERTA tokenizations

# Task 3: Metrics

### Baseline Metric

In [29]:
def f1_baseline(prediction, labels, eval_df=None):
    """Score baseline predictions with the F1 metric.

    Args:
        prediction: 2-D array with one column per entry of ``labels``, in the
            same order.
        labels: names of the ground-truth columns in ``eval_df``.
        eval_df: frame holding the ground truth. Defaults to the notebook-level
            ``test_df`` so existing two-argument calls keep their behaviour.

    Returns:
        Tuple ``(f1_overall, f1_per_cat)``: the macro-averaged F1 over all
        labels, and a list with the F1 of each label taken on its own.
    """
    if eval_df is None:
        eval_df = test_df

    ### Evaluate F1 overall
    f1_overall = f1_score(y_true=eval_df[labels], y_pred=prediction, average="macro")

    ### Evaluate F1 per category
    f1_per_cat = [
        f1_score(y_true=eval_df[cat], y_pred=prediction[:, i])
        for i, cat in enumerate(labels)
    ]

    return f1_overall, f1_per_cat

### Bert Conclusion-Only Model Metric

In [30]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute the macro F1 of multi-label predictions given as raw scores.

    Args:
        predictions: raw model outputs, one row per example and one column per
            label.
        labels: ground-truth indicator matrix with the same layout.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        Dict with a single entry, ``"f1"``.
    """
    # Raw scores -> probabilities.
    probs = torch.sigmoid(torch.Tensor(predictions))
    # Probabilities -> 0/1 indicator matrix (floats, same shape as the scores).
    y_pred = (probs >= threshold).numpy().astype(float)
    return {"f1": f1_score(y_true=labels, y_pred=y_pred, average="macro")}


def compute_metrics(prediction):
    """Metric callback: unwrap the predictions and delegate to ``multi_label_metrics``.

    Args:
        prediction: object exposing ``predictions`` (an array, or a tuple whose
            first element is the array to score) and ``label_ids``.

    Returns:
        The dict returned by ``multi_label_metrics``.
    """
    raw_predictions = prediction.predictions
    if isinstance(raw_predictions, tuple):
        # Only the first element of a tuple output is scored.
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(
        predictions=raw_predictions, labels=prediction.label_ids
    )

### Bert Conclusion - Premise Model Encoding

In [31]:
### Encoding for Conclusion - Premise model
def tokenize_conclusion_premise(example, tokenizer=None):
    """Encode a batch for the Conclusion - Premise model.

    Args:
        example: batch mapping with "Conclusion" and "Premise" text columns and
            one column per name in the notebook-level ``level_3_cat``.
        tokenizer: callable tokenizer exposing ``model_max_length``. Defaults to
            the notebook-level ``tokenizer`` so existing one-argument calls keep
            working; pass it explicitly (e.g. through ``fn_kwargs``) to match
            ``tokenize_conclusion``.

    Returns:
        Dict with every field produced by the tokenizer, the level 3 label
        columns as float tensors, and ``labels``: the per-example label matrix
        built by ``add_labels``.
    """
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]

    ### Tokenize text columns as a (Conclusion, Premise) pair.
    ### No padding here: sequences are only truncated, and padding is left to
    ### the data collator, which pads each batch to its own longest sequence.
    text_tokens = tokenizer(
        example["Conclusion"],
        example["Premise"],
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

    ### Combine text tokens with non-text features.
    ### All tokenizer outputs are copied as-is, so tokenizers that do not return
    ### "token_type_ids" work too.
    encoded_example = dict(text_tokens)
    for cat in level_3_cat:
        encoded_example[cat] = torch.tensor(example[cat], dtype=torch.float)

    ### Same "labels" field as the conclusion-only encoding
    encoded_example["labels"] = add_labels(example, level_3_cat)

    return encoded_example

### Bert Conclusion - Premise - Stance Model Encoding

In [32]:
### Encoding for Conclusion - Premise - Stance model
def tokenize_conclusion_premise_stance(example, tokenizer=None):
    """Encode a batch for the Conclusion - Premise - Stance model.

    Args:
        example: batch mapping with "Conclusion" and "Premise" text columns, a
            numeric "Stance" column and one column per name in the
            notebook-level ``level_3_cat``.
        tokenizer: callable tokenizer exposing ``model_max_length``. Defaults to
            the notebook-level ``tokenizer`` so existing one-argument calls keep
            working; pass it explicitly (e.g. through ``fn_kwargs``) to match
            ``tokenize_conclusion``.

    Returns:
        Dict with every field produced by the tokenizer, "Stance" and the
        level 3 label columns as float tensors, and ``labels``: the per-example
        label matrix built by ``add_labels``.
    """
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]

    ### Tokenize text columns as a (Conclusion, Premise) pair.
    ### No padding here: sequences are only truncated, and padding is left to
    ### the data collator, which pads each batch to its own longest sequence.
    text_tokens = tokenizer(
        example["Conclusion"],
        example["Premise"],
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

    ### Combine text tokens with non-text features.
    ### All tokenizer outputs are copied as-is, so tokenizers that do not return
    ### "token_type_ids" work too.
    encoded_example = dict(text_tokens)

    ### Assuming 'Stance' is represented as 0 or 1
    # NOTE(review): the stance is only forwarded as a numeric column; it is not
    # part of the tokenized text. Whether the model consumes it depends on code
    # that is not visible here -- confirm.
    encoded_example["Stance"] = torch.tensor(example["Stance"], dtype=torch.float)

    for cat in level_3_cat:
        encoded_example[cat] = torch.tensor(example[cat], dtype=torch.float)

    ### Same "labels" field as the conclusion-only encoding
    encoded_example["labels"] = add_labels(example, level_3_cat)

    return encoded_example

# TASK 4 - Training and Evaluation

## Baseline

In [33]:
### Baseline with the "uniform" dummy strategy, then its F1 scores
prediction_uniform = baseline_model(
    strategy="uniform",
    level_3_cat=level_3_cat,
    train_df=train_df,
    columns_to_keep=columns_to_keep,
)
f1_overall, f1_percat = f1_baseline(prediction=prediction_uniform, labels=level_3_cat)

In [34]:
### Baseline with the "most_frequent" dummy strategy
prediction_majority = baseline_model(
    "most_frequent", level_3_cat, train_df, columns_to_keep
)
### Score the predictions produced in this cell (not the uniform ones)
f1_overall, f1_percat = f1_baseline(prediction_majority, labels=level_3_cat)

## Bert-base models

In [35]:
### Checkpoint name handed to generate_bert_auto_model in the cell below
model_card = "bert-base-uncased"

### Bert Conclusion-Only 

In [36]:
### Model, tokenizer and collator for the selected checkpoint
model, tokenizer, data_collator = generate_bert_auto_model(
    model_card, labels=level_3_cat
)

### Encode every split in batches with the conclusion-only encoding
conclusion_fn_kwargs = {"tokenizer": tokenizer}
tokenized_datasets = {
    split: raw_ds.map(
        function=tokenize_conclusion, fn_kwargs=conclusion_fn_kwargs, batched=True
    )
    for split, raw_ds in datasets.items()
}

### Have every split return torch tensors when indexed
for ds in tokenized_datasets.values():
    ds.set_format(type="torch")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

In [37]:
### TODO get data for 3 different random seeds

### Training configuration for the conclusion-only model: checkpoints and
### evaluation both happen once per epoch, and the best checkpoint according to
### the "f1" metric (the key returned by compute_metrics) is reloaded at the end.
# NOTE(review): newer transformers releases presumably rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="Models/BertBaseUncased",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    report_to="none",
    save_strategy="epoch",  #'no'
    evaluation_strategy="epoch",
    num_train_epochs=1,  ### fine tuning
    weight_decay=0.01,
    metric_for_best_model="f1",
)

### Train on the tokenized train split and evaluate on the validation split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

### The value of the last expression (the evaluation metrics) is the cell output
trainer.train()
trainer.evaluate()

  0%|          | 0/675 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

### Bert with Conclusion and Premise

In [None]:
### TODO setup datasets?

In [None]:
### TODO train/eval

### Bert with Conclusion Premise and Stance

In [None]:
### TODO setup datasets?

In [None]:
### TODO train/eval

## Roberta-base models

In [None]:
### RoBERTa base checkpoint. It is published as "roberta-base"; there is no
### "-uncased" variant of it to load.
model_card = "roberta-base"

In [None]:
### TODO here we can put the ROBERTA training and evaluation