# Contradictory sentences - baseline model
Create a baseline model for contradiction classification

Because this dataset is multi-lingual, we need to choose the best in class language model that is readily trainable (on kaggle TPUs?). One possibility is the [`XLM-RoBERTa`](https://huggingface.co/tomaarsen/span-marker-xlm-roberta-base-multinerd) model, but this model has fallen out of favor due to major tokenization limitations. The preferred model for multilanguage NER is this SpanMarker model using xlm-roberta-base as the underlying encoder, trained on the multinerd dataset: [`span-marker-xlm-roberta-base-multinerd`](https://huggingface.co/tomaarsen/span-marker-xlm-roberta-base-multinerd). The problem is that I wont want to be doing NER, I want to be doing sentence comparison.

A reasonable starting point is just the base [`XLM-RoBERTa`](https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/xlm-roberta) model.

This kaggle challenge was started as a reason to learn to use TPUs. You can use TPUs in PyTorch with the [`torch_xla`](https://pytorch.org/xla/release/2.0/index.html) package. See how to use it in this example kaggle code [here](https://www.kaggle.com/code/tanlikesmath/the-ultimate-pytorch-tpu-tutorial-jigsaw-xlm-r).  
For now, we will stick with CPU/GPU. Double check Apple silicon MPS devices [[ref]](https://developer.apple.com/metal/pytorch/).



In [1]:
# imports
import os
from pathlib import Path
import warnings
import time

import pandas as pd
import numpy as np
import kaggle
import wandb
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import lr_scheduler
from transformers import (
    TrainingArguments, Trainer, 
    XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaConfig)
from datasets import DatasetDict

from utils import *

warnings.filterwarnings('ignore')

# Constants
DATA_PATH = "data"
WANDB_PROJECT = "contradictory"
RAW_DATA_AT = "contra_raw"
PROCESSED_DATA_AT = "contra_split"

In [2]:
def seed_everything(seed):
    os.environ['PYTHONHASHSEED']=str(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    
seed_everything(SEED)

In [3]:
device = "cpu"
if torch.cuda.is_available():
    print("Found GPU: ", torch.cuda.device_count())
    device = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    mps_device = torch.device("mps")
    print("Found MPS, may not work on some torch ops!" )
    device = "mps"

torch.device(device)

Found MPS, may not work on some torch ops!


device(type='mps')

In [4]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [5]:
MODEL_NAME = "xlm-roberta-base"  # "xlm-roberta-large"

BATCH_SIZE = 16 # hyperparameter, can iterate on this later

In [43]:
output_dir = os.path.join(DATA_PATH, f"contradiction-training-{str(int(time.time()))}")

train_config = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,  # can change to 100
    report_to="wandb",  # enable logging to W&B
    run_name=f"{MODEL_NAME}-baseline",  # name of the W&B run (optional)
    label_smoothing_factor=0.1
)

In [7]:
# init wandb
run = wandb.init(project=WANDB_PROJECT, entity=None, job_type="training", config=train_config)

[34m[1mwandb[0m: Currently logged in as: [33mmpesavento[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
processed_data_at = run.use_artifact(f'{PROCESSED_DATA_AT}:latest')
processed_dataset_dir = Path(processed_data_at.download())
df = pd.read_csv(processed_dataset_dir / 'data_split.csv')

# drop test for now, split in valid & train
df = df[df.Stage != 'test'].reset_index(drop=True)
df['is_valid'] = df.Stage == 'valid'


[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [9]:
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

In [10]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(df[df["is_valid"]!=True])
valid_dataset = Dataset.from_pandas(df[df["is_valid"]])
datasets = DatasetDict({"train": train_dataset, "validation": valid_dataset})
                                    

In [11]:
def tokenize_function_batch(examples):
    tokenized_examples = tokenizer(examples["premise"], examples["hypothesis"], 
                                   truncation=True, padding=True, return_tensors="pt",)
    return tokenized_examples


In [12]:
tokenized_datasets = datasets.map(tokenize_function_batch, batched=True)

Map:   0%|          | 0/9696 [00:00<?, ? examples/s]

Map:   0%|          | 0/1212 [00:00<?, ? examples/s]

In [13]:
tokenizer.decode(tokenized_datasets["train"][0]["input_ids"])

'<s> They look just as good as new." They cut them carefully and ripped away the oilskin.</s></s> The oilskin would be good for several months of use.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [36]:
# data collator
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
from transformers import DataCollatorWithPadding


@dataclass
class DataCollator:
    """
    Data collator that will dynamically pad the inputs
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [53]:
# original was from CustomRoberta here: https://www.kaggle.com/code/tanlikesmath/the-ultimate-pytorch-tpu-tutorial-jigsaw-xlm-r?scriptVersionId=37280514&cellId=27
# this is an attempt to manually set up a sequence classifier. It looks like i'm not calculating the loss correctly, likely from 
# not interpreting the source code correctly
# Using the XLMRobertaForSequenceClassification is the correct way to do this!
# check source code for important differences: https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L1162

class XLMRobertaContradict(nn.Module):
    def __init__(self, dropout=0.0):
        super(XLMRobertaContradict, self).__init__()
        self.num_labels = 3
        self.dropout_pct = dropout
        self.roberta = XLMRobertaModel.from_pretrained(MODEL_NAME, 
                                                       output_hidden_states=False, num_labels=self.num_labels)
        self.dropout = nn.Dropout(p=self.dropout_pct)
        self.feats = self.roberta.pooler.dense.out_features
        self.relu =  torch.nn.ReLU(inplace=True)
        self.linear = nn.Linear(self.feats, self.num_labels)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self,
                input_ids=None,
                attention_mask=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                # labels=None,
               ):
        x = self.roberta(input_ids,
                       attention_mask=attention_mask,
                       position_ids=position_ids,
                       head_mask=head_mask,
                       inputs_embeds=inputs_embeds)
        # these have an unknown effect, idk what the hid_mix is doing. 
        # maybe blending the last hidden states to extend the output embedding?
        # hidden_states = x[2]
        # hmix = []
        # for i in range(1, self.hid_mix + 1):
        #     hmix.append(hidden_states[-i][:, 0].reshape((-1, 1, self.feats)))
        # hmix_tensor = torch.cat(hmix, 1)
        # mean_tensor = torch.mean(hmix_tensor, 1)
        # pool_tensor = self.dropout(mean_tensor)
        # return self.classifier(pool_tensor)
        
        # 
        # x = x.pooler_output  # get just the [CLS] token
        x = x.last_hidden_state[:,0,:]
        # x = self.relu(x)
        # x = self.dropout(x)
        x = self.linear(x)
        x = self.softmax(x)
        # import pdb; pdb.set_trace()
        return x

# model = XLMRobertaContradict()

In [54]:
from transformers import XLMRobertaForSequenceClassification
model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:


# set up the trainer
trainer = Trainer(
    model=model,
    args=train_config,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    # data_collator=DataCollator(tokenizer=tokenizer),
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

In [56]:

trainer.train()

Step,Training Loss
1,0.5066


TrainOutput(global_step=1, training_loss=0.5066074132919312, metrics={'train_runtime': 20.9575, 'train_samples_per_second': 0.382, 'train_steps_per_second': 0.048, 'total_flos': 1011333119040.0, 'train_loss': 0.5066074132919312, 'epoch': 0.0})

In [None]:
# set up callbacks & metrics

callbacks = [
    SaveModelCallback(monitor='miou'),
    WandbCallback(log_preds=False, log_model=True)
]