# Pre-requisites

**Need to Have:** The contextualized dataset file `parrot-qa-ctx.json` generated using the DPR pipeline.

Upload it to the `data` folder.


In [None]:
!pip install datasets transformers sentencepiece rouge_score

# Step 2: UnifiedQA

In [None]:
# Model identifier handed to `from_pretrained` for both the tokenizer and the model.
QA_MODEL_NAME = 'allenai/unifiedqa-t5-large'
# Number of formatted questions passed to the model per inference batch.
QA_BATCH_SIZE = 16
# Used as the tokenizer's `max_length`; with truncation enabled, longer inputs are cut to this size.
QA_MAX_INPUT_TOKENS = 1024

In [None]:
# Preprocess raw data into formatted input

import json


def format_qa_for_inference(qa_db):
    """Build model prompts and gold answers from the QA records.

    Each prompt is the record's question, followed by the separator
    (space, backslash, ``n``, space), followed by the record's contexts
    joined with single spaces.

    Args:
        qa_db: Iterable of mappings with ``'question'``, ``'contexts'``
            and ``'answer'`` keys.

    Returns:
        A ``(questions, answers)`` tuple of two equally long lists.
    """
    separator = ' \\n '
    # Single pass over qa_db so one-shot iterables behave like they did
    # with an explicit loop.
    formatted = [
        (entry['question'] + separator + ' '.join(entry['contexts']),
         entry['answer'])
        for entry in qa_db
    ]
    questions = [prompt for prompt, _ in formatted]
    answers = [gold for _, gold in formatted]
    return questions, answers


# Load the contextualized QA dataset and turn it into model-ready prompts.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('data/parrot-qa-ctx.json', encoding='utf-8') as fp:
    qa_db = json.load(fp)
questions, answers = format_qa_for_inference(qa_db)

# Sanity check (cell output): both lists should have one entry per QA pair.
len(questions), len(answers)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


# Load the pretrained tokenizer and model weights identified by QA_MODEL_NAME.
tokenizer = T5Tokenizer.from_pretrained(QA_MODEL_NAME)
# The model is moved to 'cuda', so this cell needs a CUDA-capable GPU runtime.
model = T5ForConditionalGeneration.from_pretrained(QA_MODEL_NAME).to('cuda')


def run_batch(model_inputs):
    """Generate one answer per prompt using the global tokenizer and model.

    Args:
        model_inputs: List of formatted prompt strings.

    Returns:
        List of decoded answer strings with special tokens stripped, one
        per entry of ``model_inputs``.
    """
    # Tokenize the whole batch at once; padding and truncation (capped at
    # QA_MAX_INPUT_TOKENS) are enabled so the prompts form one tensor.
    inputs = tokenizer(
        model_inputs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=QA_MAX_INPUT_TOKENS,
    )
    # Send the tensors to whichever device the model lives on, so they
    # cannot end up on a different device than the weights.
    device = model.device
    # NOTE(review): generation uses the library's default length limit,
    # which may cut long answers short -- confirm this is intended.
    res = model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
    )
    return tokenizer.batch_decode(res, skip_special_tokens=True)


# Run inference over every question in batches of QA_BATCH_SIZE, appending
# each batch's outputs so generated_answers follows the order of `questions`.
generated_answers = []
for start_idx in range(0, len(questions), QA_BATCH_SIZE):
    # Progress marker: index of the first question in the current batch.
    print(f'Inference for {start_idx}...')
    # Slicing past the end of the list is safe, so the last batch may be shorter.
    batch = questions[start_idx:start_idx+QA_BATCH_SIZE]
    generated_answers.extend(run_batch(batch))

# Cell output: number of answers generated.
len(generated_answers)

In [None]:
from datasets import load_metric
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against
# the installed version.
rouge = load_metric("rouge")

# Score the generated answers against the reference answers with ROUGE.
rouge.compute(predictions=generated_answers, references=answers)