# Contradictory sentences - baseline model
Create a baseline model for contradiction classification

Because this dataset is multi-lingual, we need to choose the best in class language model that is readily trainable (on kaggle TPUs?). One possibility is the [`XLM-RoBERTa`](https://huggingface.co/tomaarsen/span-marker-xlm-roberta-base-multinerd) model, but this model has fallen out of favor due to major tokenization limitations. The preferred model for multilanguage NER is this SpanMarker model using xlm-roberta-base as the underlying encoder, trained on the multinerd dataset: [`span-marker-xlm-roberta-base-multinerd`](https://huggingface.co/tomaarsen/span-marker-xlm-roberta-base-multinerd). The problem is that I wont want to be doing NER, I want to be doing sentence comparison.

A reasonable starting point is just the base [`XLM-RoBERTa`](https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/xlm-roberta) model.

This kaggle challenge was started as a reason to learn to use TPUs. You can use TPUs in PyTorch with the [`torch_xla`](https://pytorch.org/xla/release/2.0/index.html) package. See how to use it in this example kaggle code [here](https://www.kaggle.com/code/tanlikesmath/the-ultimate-pytorch-tpu-tutorial-jigsaw-xlm-r).  
For now, we will stick with CPU/GPU. Double check Apple silicon MPS devices [[ref]](https://developer.apple.com/metal/pytorch/).



In [1]:
# imports
import os
from pathlib import Path
import warnings
import time

import pandas as pd
import numpy as np
import kaggle
import wandb
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import lr_scheduler
from transformers import (
    TrainingArguments, Trainer, DataCollatorWithPadding,
    XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaConfig)
from datasets import DatasetDict
import evaluate

from utils import *

warnings.filterwarnings('ignore')

# Constants
DATA_PATH = "data"
WANDB_PROJECT = "contradictory"
RAW_DATA_AT = "contra_raw"
PROCESSED_DATA_AT = "contra_split"



In [2]:
def seed_everything(seed):
    os.environ['PYTHONHASHSEED']=str(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    
seed_everything(SEED)

In [3]:
device = "cpu"
if torch.cuda.is_available():
    print("Found GPU: ", torch.cuda.device_count())
    device = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    mps_device = torch.device("mps")
    print("Found MPS, may not work on some torch ops!" )
    device = "mps"

torch.device(device)

Found GPU:  1


device(type='cuda')

In [4]:
# define global parameters
MODEL_NAME = "xlm-roberta-base"  # "xlm-roberta-large"

NUM_EPOCHS = 5
BATCH_SIZE = 16 # hyperparameter, can iterate on this later

id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
label2id = {v:k for k,v in id2label.items()}

In [5]:
output_dir = os.path.join(DATA_PATH, f"contradiction-training-{str(int(time.time()))}")

train_config = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=1,
    report_to="wandb",  # enable logging to W&B
    run_name=f"{MODEL_NAME}-baseline",  # name of the W&B run (optional)
)

In [6]:
# init wandb
run = wandb.init(project=WANDB_PROJECT, entity=None, job_type="training", config=train_config)

[34m[1mwandb[0m: Currently logged in as: [33mmpesavento[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
processed_data_at = run.use_artifact(f'{PROCESSED_DATA_AT}:latest')
processed_dataset_dir = Path(processed_data_at.download())
df = pd.read_csv(processed_dataset_dir / 'data_split.csv')

# drop test for now, split in valid & train
df = df[df.Stage != 'test'].reset_index(drop=True)
df['is_valid'] = df.Stage == 'valid'


[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [8]:
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

In [9]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(df[df["is_valid"]!=True])
valid_dataset = Dataset.from_pandas(df[df["is_valid"]])
datasets = DatasetDict({"train": train_dataset, "validation": valid_dataset})
                                    

In [10]:
def tokenize_function_batch(examples):
    tokenized_examples = tokenizer(examples["premise"], examples["hypothesis"], 
                                   truncation=True, padding=True, return_tensors="pt",)
    return tokenized_examples


In [11]:
tokenized_datasets = datasets.map(tokenize_function_batch, batched=True)

Map:   0%|          | 0/9696 [00:00<?, ? examples/s]

Map:   0%|          | 0/1212 [00:00<?, ? examples/s]

In [12]:
tokenizer.decode(tokenized_datasets["train"][0]["input_ids"])

'<s> They look just as good as new." They cut them carefully and ripped away the oilskin.</s></s> The oilskin would be good for several months of use.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [13]:
accuracy_fn = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    ent_ix = np.where(labels==label2id["entailment"])[0]
    neut_ix = np.where(labels==label2id["neutral"])[0]
    contra_ix = np.where(labels==label2id["contradiction"])[0]
    metrics = {
        "accuracy": accuracy_fn.compute(
            predictions=predictions, references=labels)["accuracy"],
        "acc_entailment": accuracy_fn.compute(
            predictions=predictions[ent_ix], references=labels[ent_ix])["accuracy"],
        "acc_neutral": accuracy_fn.compute(
            predictions=predictions[neut_ix], references=labels[neut_ix])["accuracy"],
        "acc_contradiction": accuracy_fn.compute(
            predictions=predictions[contra_ix], references=labels[contra_ix])["accuracy"],
    }
    return metrics

In [14]:
from transformers import XLMRobertaForSequenceClassification

num_labels = len(np.unique(tokenized_datasets['train']["label"]))
model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# set up the trainer
trainer = Trainer(
    model=model,
    args=train_config,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [16]:
# train it!
model_trained = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Acc Entailment,Acc Neutral,Acc Contradiction
1,1.0363,0.905834,0.580858,0.58134,0.809278,0.362069
2,0.9618,0.758498,0.681518,0.779904,0.610825,0.647783
3,0.6403,0.770384,0.69967,0.705742,0.701031,0.692118
4,1.183,0.869413,0.70297,0.746411,0.675258,0.684729
5,0.1124,0.950777,0.707096,0.746411,0.662371,0.70936


In [17]:
def create_predictions_table(dataset, id2label):
    """Creates a wandb table with predictions and targets side by side"""
    predictions = trainer.predict(dataset, metric_key_prefix="validate")
    X_pred = np.argmax(predictions.predictions, axis=1)
    y_labels = predictions.label_ids
    if not np.array_equal(y_labels, [dataset[i]["label"] for i in range(len(dataset))]):
        raise ValueError("prediction labels do not match dataset labels")
    
    col_names = ["id", "premise", "hypothesis", "lang_abv", "label", "predict"]

    data_out = []
    for i, sample in tqdm(enumerate(dataset)):
        data_out.append({
            col:sample[col] for col in col_names[:-1]})
        data_out[-1][col_names[-1]] = X_pred[i]

    data_df = pd.DataFrame.from_records(data_out)
    table = wandb.Table(data=data_df)
    return table

In [19]:
table = create_predictions_table(tokenized_datasets['validation'], id2label)
wandb.log({"pred_table":table})

1212it [00:00, 4139.07it/s]


In [20]:
scores = trainer.evaluate()
for k,v in scores.items():
    wandb.summary[k] = v

In [21]:
wandb.finish()

VBox(children=(Label(value='0.817 MB of 0.817 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/acc_contradiction,▁▇████
eval/acc_entailment,▁█▅▇▇▇
eval/acc_neutral,█▁▄▃▃▃
eval/accuracy,▁▇████
eval/loss,▆▁▁▅██
eval/runtime,▄▁█▄▆▁
eval/samples_per_second,▅█▁▅▃█
eval/steps_per_second,▅█▁▅▃█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
epoch,5.0
eval/acc_contradiction,0.70936
eval/acc_entailment,0.74641
eval/acc_neutral,0.66237
eval/accuracy,0.7071
eval/loss,0.95078
eval/runtime,3.6988
eval/samples_per_second,327.678
eval/steps_per_second,41.095
eval_acc_contradiction,0.70936
