# Using RoBERTa with Fastai - CB Tutorial 

Source:

Dev Sharma, Using RoBERTa with fast.ai for SuperGLUE Task CB
Finetuning state of the art RoBERTa with fast.ai on Commitment Bank NLP task.

This notebook follows the tutorial @ https://medium.com/@devkosal/superglue-roberta-with-fastai-for-rte-task-c362961be957

Related article:

Using RoBERTa with fast.ai for NLP
Implementing the current state of the art NLP model in fast.ai

https://medium.com/analytics-vidhya/using-roberta-with-fastai-for-nlp-7ed3fed21f6c

In [84]:
# !pip install fastai

In [105]:
import numpy as np
import pandas as pd

from pathlib import Path
from typing import *

import torch
import torch.optim as optim

from fastai import *
from fastai.text import *
from fastai.metrics import *
from fastai.text.data import *
from fastai.text.all import *

from transformers import RobertaTokenizer

In [106]:
# Creating a config object to store task specific information
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    task = "CB",
    testing=False,
    seed = 2019,
    roberta_model_name='roberta-base', # can also be exchanged with roberta-large 
    max_lr=1e-5,
    epochs=10,
    use_fp16=False,
    bs=4, 
    max_seq_len=256, 
    num_labels = 3,
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",
    end_tok = "</s>",
    mark_fields=True,
)

In [107]:
train = pd.read_json("./superglue_data/CB/train.jsonl",lines=True)
val = pd.read_json("./superglue_data/CB/val.jsonl",lines=True)
test = pd.read_json("./superglue_data/CB/test.jsonl",lines=True)

In [108]:
# drop the unnecessary idx column
for df in (train,val):
    if "idx" in df.columns: df.drop("idx",axis=1,inplace=True)
        
if config.testing:
    train = train[:100]
    val = val[:100]
    
print(df.shape)

(56, 3)


In [109]:
train.head()

Unnamed: 0,premise,hypothesis,label
0,It was a complex language. Not written down but handed down. One might say it was peeled down.,the language was peeled down,entailment
1,"It is part of their religion, a religion I do not scoff at as it holds many elements which match our own even though it lacks the truth of ours. At one of their great festivals they have the ritual of driving out the devils from their bodies. First the drummers come on - I may say that no women are allowed to take part in this ritual and the ladies here will perhaps agree with me that they are fortunate in that omission.",no women are allowed to take part in this ritual,entailment
2,"The Paris to Rouen railway was being extended to Le Havre, and the line cut straight through Dr Flaubert's land. Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy.",Gustave was shepherded into creative retreat at Croisset by epilepsy,entailment
3,Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy. You could also say he was driven there by the railway.,Gustave was driven to creative retreat in Croisset by the railway,entailment
4,"Some of them, like for instance the farm in Connecticut, are quite small. If I like a place I buy it. I guess you could say it's a hobby.",buying places is a hobby,entailment


In [110]:
train.label.value_counts()

contradiction    119
entailment       115
neutral           16
Name: label, dtype: int64

In [111]:
feat_cols = ["premise","hypothesis"]
label_cols = "label"

## Setting Up the Tokenizer

In [112]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs): 
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len 
    def __call__(self, *args, **kwargs): 
        return self 
    def tokenizer(self, t:str) -> List[str]: 
        """Adds Roberta bos and eos tokens and limits the maximum sequence length""" 
        if config.mark_fields:
            sub = 2 # subtraction in totoal seq_length to be made due to adding spcl tokens
            assert "xxfld" in t
            t = t.replace("xxfld 1","") # remove the xxfld 1 special token from fastai
            # converting fastai field sep token to Roberta
            t = re.split(r'xxfld \d+', t) 
            res = []
            for i in range(len(t)-1): # loop over the number of additional fields and the Roberta sep
                res += self._pretrained_tokenizer.tokenize(t[i]) + [config.end_tok, config.end_tok]
                sub += 2 # increase our subtractions since we added more spcl tokens
            res += self._pretrained_tokenizer.tokenize(t[-1]) # add the last sequence
            return [config.start_tok] + res[:self.max_seq_len - sub] + [config.end_tok] 
        
        res = self._pretrained_tokenizer.tokenize(t)
        return [config.start_tok] + res[:self.max_seq_len - sub] + [config.end_tok]

In [113]:
# create fastai tokenizer for roberta
# source: https://docs.fast.ai/text.core.html#tokenizer-

roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")

fastai_tokenizer = Tokenizer(tok=FastAiRobertaTokenizer(
    roberta_tok, max_seq_len=config.max_seq_len), rules=[]
                            )

In [114]:
# create fastai vocabulary for roberta    
path="./output"
roberta_tok.save_vocabulary(path)

with open('./output/vocab.json', 'r') as f:
    roberta_vocab_dict = json.load(f)
    
# fastai_roberta_vocab = Vocab(list(roberta_vocab_dict.keys()))
fastai_roberta_vocab = list(roberta_vocab_dict.keys())

In [125]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
         super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False, mark_fields=config.mark_fields)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, vocab=fastai_roberta_vocab, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for Roberta
    We remove sos and eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original Roberta model.
    """
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor(vocab=vocab)]

NameError: name 'TokenizeProcessor' is not defined

## Setting up the DataBunch

In [97]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

NameError: name 'TextDataBunch' is not defined

In [98]:
class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList

NameError: name 'TextList' is not defined

In [99]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

# creating our databunch 
data = ItemLists(".", RobertaTextList.from_df(train, ".", cols=feat_cols, processor=processor),
                      RobertaTextList.from_df(val, ".", cols=feat_cols, processor=processor)
                ) \
       .label_from_df(cols=label_cols, label_cls=CategoryList) \
       .add_test(RobertaTextList.from_df(test, ".", cols=feat_cols, processor=processor)) \
       .databunch(bs=config.bs,pad_first=False)

NameError: name 'get_roberta_processor' is not defined

# Building the Model

In [64]:
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification

# defining our model architecture 
class RobertaForSequenceClassificationModel(nn.Module):
    def __init__(self,num_labels=config.num_labels):
        super(RobertaForSequenceClassificationModel,self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaForSequenceClassification.from_pretrained(config.roberta_model_name,num_labels= self.num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids, token_type_ids, attention_mask)
        logits = outputs[0] 
        return logits

In [65]:
roberta_model = RobertaForSequenceClassificationModel() 

learn = Learner(data, roberta_model, metrics=[accuracy])

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: module 'fastai.text.data' has no attribute 'train_ds'

In [None]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

# Getting Predictions

In [69]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values

In [70]:
# val preds
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)

NameError: name 'DatasetType' is not defined

In [71]:
# accuracy for valid valid
(pred_values == data.valid_ds.y.items).mean()

NameError: name 'pred_values' is not defined

In [72]:
# test preds
_, test_pred_values = get_preds_as_nparray(DatasetType.Test)

NameError: name 'DatasetType' is not defined