<a href="https://colab.research.google.com/github/parrot-qa/models/blob/main/Bart_ELI5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-requisites

Upload the train, dev and test files generated by the DPR script to the notebook's working directory (the code below opens them by relative path):
- `parrot-qa-ctx-train.json`
- `parrot-qa-ctx-dev.json`
- `parrot-qa-ctx-test.json`



In [None]:
# Install the libraries imported below (transformers, datasets).
# NOTE(review): sentencepiece and rouge_score are presumably runtime
# dependencies of the tokenizer and of the "rouge" metric — confirm.
!pip install transformers datasets sentencepiece rouge_score

# Step 2: BART Fine-tuning

In [None]:
# True: fine-tune the checkpoint before evaluating it.
# False: skip training and evaluate the checkpoint zero-shot.
DO_FINE_TUNING = True

# Checkpoint to load, and the device the model is moved to.
MODEL_NAME = 'yjernite/bart_eli5'
DEVICE = 'cuda'

# Truncation limits (in tokens) used when tokenizing text:
# question + context on the input side, answer on the target side.
MAX_QUES_CTX_LENGTH, MAX_ANS_LENGTH = 1024, 512

# Batch sizes.
# - Training with mini-batches: TOKENIZER_BATCH_SIZE must be 2048.
# - Training one sample at a time: keep TOKENIZER_BATCH_SIZE small, e.g. 16.
TOKENIZER_BATCH_SIZE = 16
TRAIN_BATCH_SIZE = EVAL_BATCH_SIZE = 1

### Reformat dataset

In [None]:
import json
from datasets import Dataset


def create_dataset(file_path):
    """Load a JSON file of QA records and convert it into a `Dataset`.

    The file must contain an iterable of records, each providing the keys
    'question', 'answer' and 'contexts'. The entries of 'contexts' are
    joined with single spaces into one context string per record.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A `datasets.Dataset` with the columns 'question', 'answer' and
        'context', one row per record in the file.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(file_path, encoding='utf-8') as fp:
        records = json.load(fp)

    data = {
        'question': [item['question'] for item in records],
        'answer': [item['answer'] for item in records],
        'context': [' '.join(item['contexts']) for item in records],
    }
    return Dataset.from_dict(data)


# Build one Dataset per split. The paths are relative, so the three files
# must be present in the current working directory.
train = create_dataset('parrot-qa-ctx-train.json')
dev = create_dataset('parrot-qa-ctx-dev.json')
test = create_dataset('parrot-qa-ctx-test.json')

# Cell output: number of examples in each split.
len(train), len(dev), len(test)

### Load and perform tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
def tokenize_all(samples):
    """Tokenize a batch of examples into model inputs and training labels.

    Each input text is formatted as 'question: <q> context: <c>' and
    truncated to MAX_QUES_CTX_LENGTH tokens; each answer is truncated to
    MAX_ANS_LENGTH tokens. Both sides are padded to the longest sequence in
    this batch.

    Args:
        samples: Batch dict with parallel lists under the keys 'question',
            'context' and 'answer' (the shape `Dataset.map` passes when
            `batched=True`).

    Returns:
        Dict with 'input_ids', 'attention_mask', 'labels' and
        'decoder_attention_mask', each holding one list per example.
        Padded positions in 'labels' are set to -100.
    """
    prompts = [
        f'question: {qval} context: {cval}'
        for qval, cval in zip(samples['question'], samples['context'])
    ]
    inp = tokenizer(prompts, padding=True, truncation=True, max_length=MAX_QUES_CTX_LENGTH)
    outp = tokenizer(samples['answer'], padding=True, truncation=True, max_length=MAX_ANS_LENGTH)

    # The cross-entropy loss ignores label positions equal to -100. Without
    # this, every padded position would be trained as a "predict <pad>" target.
    # The attention mask (not a comparison with the pad id) decides what is
    # padding, so only positions the tokenizer itself padded are masked.
    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(outp.input_ids, outp.attention_mask)
    ]

    return {
        'input_ids': inp.input_ids,
        'attention_mask': inp.attention_mask,
        'labels': labels,
        'decoder_attention_mask': outp.attention_mask
    }


# Tokenize every split in chunks of TOKENIZER_BATCH_SIZE examples.
# tokenize_all is called once per chunk, so its padding=True pads only to the
# longest sequence within that chunk.
train = train.map(tokenize_all, batched=True, batch_size=TOKENIZER_BATCH_SIZE)
dev = dev.map(tokenize_all, batched=True, batch_size=TOKENIZER_BATCH_SIZE)
test = test.map(tokenize_all, batched=True, batch_size=TOKENIZER_BATCH_SIZE)

### Train model

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the sequence-to-sequence checkpoint and move it to DEVICE.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Training configuration.
args = Seq2SeqTrainingArguments(
    'output',  # output directory for checkpoints
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # Evaluate and checkpoint once per epoch; reloading the best checkpoint
    # at the end needs the evaluation and save strategies to match.
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Produce predictions with generate() so they can be decoded into text.
    predict_with_generate=True,
)

# The trainer is built even when fine-tuning is skipped, because the
# inference cell below calls trainer.predict().
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train,
    eval_dataset=dev,
    tokenizer=tokenizer,
)

# With DO_FINE_TUNING = False the loaded checkpoint is evaluated as-is.
if DO_FINE_TUNING:
    trainer.train()

### Perform inference

In [None]:
from datasets import load_metric

# ROUGE metric object used by calc_rouge.
# NOTE(review): newer `datasets` releases deprecate load_metric in favour of
# the separate `evaluate` package — confirm against the installed version.
rouge = load_metric("rouge")


def calc_rouge(dataset):
    """Generate answers for `dataset` and score them with ROUGE.

    Args:
        dataset: Tokenized split to run `trainer.predict` on. It must also
            still carry the raw 'answer' column, which is used as the
            reference text.

    Returns:
        The result of `rouge.compute` for the decoded predictions against
        the reference answers.
    """
    import numpy as np

    pred = trainer.predict(dataset)

    # The Trainer uses -100 as its padding index when it gathers outputs of
    # differing lengths. That value is not a valid token id and cannot be
    # decoded, so replace it with the pad token, which
    # skip_special_tokens=True then drops.
    token_ids = np.where(pred.predictions != -100, pred.predictions, tokenizer.pad_token_id)

    pred_answers = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    return rouge.compute(predictions=pred_answers, references=dataset['answer'])


# Score every split. Each call runs trainer.predict over the whole split,
# including the full training set.
train_rouge = calc_rouge(train)
dev_rouge = calc_rouge(dev)
test_rouge = calc_rouge(test)

In [None]:
def display_rouge(split, score):
    """Print one line with the ROUGE-1, ROUGE-2 and ROUGE-L scores of a split.

    Args:
        split: Label printed at the start of the line (e.g. 'Train').
        score: Mapping with the keys 'rouge1', 'rouge2' and 'rougeL'; each
            value must expose `.mid.fmeasure`.

    The mid F-measures are printed as percentages with two decimals.
    """
    def as_percent(metric):
        # The stored F-measure is a fraction; scale it to a percentage.
        return score[metric].mid.fmeasure * 100

    R1, R2, RL = (as_percent(metric) for metric in ('rouge1', 'rouge2', 'rougeL'))
    print(f'{split}: R1 = {R1:.2f}, R2 = {R2:.2f}, RL = {RL:.2f}')


# Print one summary line per split.
display_rouge('Train', train_rouge)
display_rouge('Dev', dev_rouge)
display_rouge('Test', test_rouge)