In [None]:
!pip install trapper

# Training

This notebook is a walkthrough for training a model with the `trapper` package.

In [None]:
# Imports used throughout the notebook

from copy import deepcopy
import os
import json
from typing import Dict, List, Union

from jury import Jury
import requests
from tqdm import tqdm

from trapper.training.train import run_experiment
from trapper.common.notebook_utils import download_fixture_data, load_json, save_json

In [None]:
# Call trapper's notebook helper for downloading fixture data.
# NOTE(review): presumably this provides the files under test_fixtures/ that are read later on — confirm.
download_fixture_data()

Set logging configuration.

In [None]:
import logging
import sys

# You can customize your logger below.
# Root logger: INFO level, records written to stdout with timestamp and level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s : %(message)s",
)

In [None]:
# Define constants
EXPERIMENT_NAME = "roberta-base-training-example"

# All paths are derived from the directory the notebook is launched from.
WORKING_DIR = os.getcwd()
# Grandparent of the working directory.
PROJECT_ROOT = os.path.dirname(os.path.dirname(WORKING_DIR))
EXPERIMENT_DIR = os.path.join(WORKING_DIR, EXPERIMENT_NAME)
CONFIG_PATH = os.path.join(WORKING_DIR, "experiment.jsonnet")  # default experiment params

# Per-experiment sub-directories, all located directly under EXPERIMENT_DIR.
MODEL_DIR, CHECKPOINT_DIR, OUTPUT_DIR = (
    os.path.join(EXPERIMENT_DIR, leaf) for leaf in ("model", "checkpoints", "outputs")
)

In [None]:
# External variables that feed file paths into the jsonnet config file.
ext_vars = dict(
    OUTPUT_PATH=OUTPUT_DIR,
    CHECKPOINT_PATH=CHECKPOINT_DIR,
)

# Run the experiment described by the config; the return value is kept for inspection.
result = run_experiment(config_path=CONFIG_PATH, ext_vars=ext_vars)

In [None]:
# Display the value returned by run_experiment.
result

# Inference

This section shows how to use a pipeline for inference.

In [None]:
from trapper.pipelines.question_answering_pipeline import SquadQuestionAnsweringPipeline
from trapper.pipelines.pipeline import create_pipeline_from_checkpoint

## Helper Functions

The following helper functions are used in the inference steps.

In [None]:
def prepare_samples(data: Union[str, Dict]):
    """
    Flatten a nested article/paragraph/question structure into a list of samples.

    Args:
        data: Either a dict, or a string that is passed to ``load_json`` to
            obtain the dict. The dict must hold a list of articles under
            ``"data"``; each article has ``"paragraphs"``, each paragraph a
            ``"context"`` and a list of ``"qas"``, and each qa a
            ``"question"`` and a list of ``"answers"`` with a ``"text"`` field.

    Returns:
        One dict per question, in document order, with the keys
        ``"context"``, ``"question"`` and ``"gold_answers"`` (the list of
        answer texts for that question).
    """
    if isinstance(data, str):
        data = load_json(data)

    return [
        {
            "context": paragraph["context"],
            "question": qa["question"],
            "gold_answers": [answer["text"] for answer in qa["answers"]],
        }
        for article in data["data"]
        for paragraph in article["paragraphs"]
        for qa in paragraph["qas"]
    ]


def prepare_samples_for_pipeline(samples: List[Dict]):
    """
    Build pipeline-ready copies of ``samples`` without touching the originals.

    Each returned sample is a deep copy of the corresponding input sample with
    the ``"gold_answers"`` entry removed (if it has one) and an ``"id"`` entry
    guaranteed: samples lacking an id get their list position as a string.

    Args:
        samples: Sample dicts, e.g. as produced by ``prepare_samples``.
            Samples without ``"gold_answers"`` (unlabeled data) are accepted.

    Returns:
        A new list of sample dicts; the input list and its dicts are not modified.
    """
    pipeline_samples = deepcopy(samples)
    for index, sample in enumerate(pipeline_samples):
        # Unlabeled samples carry no gold answers; the default avoids a KeyError for them.
        sample.pop("gold_answers", None)
        # Keep an existing id, otherwise fall back to the position in the list.
        sample.setdefault("id", str(index))
    return pipeline_samples


def predict(pipeline, samples, **kwargs):
    """
    Run ``pipeline`` over ``samples`` and attach the predicted answer to each sample.

    The samples are first converted with ``prepare_samples_for_pipeline`` and
    the converted list is passed to ``pipeline`` together with ``kwargs``.
    For every prediction, the first entry is taken and the ``.text`` of its
    ``"answer"`` value is stored under ``"predicted_answer"`` on the sample
    at the same position.

    Args:
        pipeline: Callable invoked as ``pipeline(pipeline_samples, **kwargs)``.
        samples: List of sample dicts; modified in place.
        **kwargs: Forwarded to ``pipeline`` unchanged.

    Returns:
        The same ``samples`` list that was passed in, now carrying the
        ``"predicted_answer"`` entries.
    """
    model_inputs = prepare_samples_for_pipeline(samples)
    outputs = pipeline(model_inputs, **kwargs)
    for position, candidates in enumerate(outputs):
        top_candidate = candidates[0]
        samples[position]["predicted_answer"] = top_candidate["answer"].text
    return samples

In [None]:
# Dev set read for inference, located under the project's test fixtures.
SQUAD_DEV = os.path.join(PROJECT_ROOT, "test_fixtures/data/question_answering/squad_qa/dev.json")
# File the samples with their predictions are written to.
EXPORT_PATH = os.path.join(WORKING_DIR, "qa-outputs.json")

# The directory passed to training as OUTPUT_PATH is used as the checkpoint to load.
PRETRAINED_MODEL_PATH = OUTPUT_DIR
# NOTE(review): presumably written into the output directory by run_experiment — confirm.
EXPERIMENT_CONFIG = os.path.join(PRETRAINED_MODEL_PATH, "experiment_config.json")

In [None]:
# Local import: torch is only needed here to detect whether a CUDA GPU is present.
import torch

# Build the question answering pipeline from the trained checkpoint.
# Hugging Face pipeline convention: device 0 is the first CUDA GPU, -1 is the CPU.
# NOTE(review): assumes trapper forwards `device` to the underlying pipeline unchanged — confirm.
qa_pipeline = create_pipeline_from_checkpoint(
    checkpoint_path=PRETRAINED_MODEL_PATH,
    experiment_config_path=EXPERIMENT_CONFIG,
    task="squad-question-answering",
    # Fall back to the CPU instead of failing on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Flatten the dev set into one sample (context, question, gold answers) per question.
samples = prepare_samples(SQUAD_DEV)

In [None]:
# Run the pipeline over all samples. predict() returns the list it was given,
# so `predictions` and `samples` refer to the same list, now holding "predicted_answer" entries.
predictions = predict(qa_pipeline, samples)

In [None]:
# Hand the samples, including their predicted answers, to trapper's save_json helper with EXPORT_PATH as target.
save_json(predictions, EXPORT_PATH)

In [None]:
# Index-aligned evaluation inputs, one entry per sample:
# the list of gold answers as reference and the single predicted answer as hypothesis.
references = [sample["gold_answers"] for sample in predictions]
hypotheses = [sample["predicted_answer"] for sample in predictions]

In [None]:
# Create a Jury instance configured with the "squad" metric.
jury = Jury(metrics="squad")

In [None]:
# Evaluate the predicted answers against the gold answers; the returned value is shown as the cell output.
jury.evaluate(references=references, predictions=hypotheses)