In [1]:
pip install transformers datasets evaluate

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


# Training 


In [1]:
import os
import re
import sys
import string
import argparse
import datetime
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(0)

<torch._C.Generator at 0x210400f7a50>

Finetuning distillbert for context classification

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Creating training data

In [2]:
from datasets import load_dataset

squad = load_dataset("squad")

In [9]:
# generates the bad answers by randomly picking a span in the context
import random
def generate_bad_labels(example, answer_count=2):
    min_length = 1 
    max_length = 5
    words = example['context'].split()
    length = len(words)
    correct_ans = example['answers']['text'][0]
    answers = []
    while (len(answers) < answer_count):
        ans_len = random.randint(min_length, max_length)
        start = random.randint(0, length - ans_len - 1)
        ans = ' '.join(words[start: start + ans_len])
        if (ans != correct_ans):
            answers.append(ans)
    return answers

In [10]:
# assumes mi <= ma
def clamp(x, mi, ma):
    if x < mi:
        return mi
    elif x > ma:
        return ma
    return x

In [14]:
# generates the bad answers by extending from current answer
def generate_bad_labels_from_answers(example, answer_count=2):
    len_diff_max = 2
    strat_thresh = 2
    words = example['context'].split()
    word_count = length = len(words)
    correct_answer = example['answers']['text'][0].split()
    correct_answer_word_count = len(correct_answer)
    answer_start = example['answers']['answer_start'][0]
    answers = []
    while (len(answers) < answer_count):
        # two strategies, if correct_answer is longer than a certain length and if correct answer is shorter than a certain length
        if (correct_answer_word_count >= strat_thresh):
            #trim from answer
            trim_length = clamp(random.randint(1, len_diff_max), 1, correct_answer_word_count - 1)
            trim_from_front = random.randint(0, 1)
            if (trim_from_front == 0):
                answers.append(' '.join(correct_answer[trim_length:]))
            else:
                answers.append(' '.join(correct_answer[:-trim_length]))
        else:
            # deal with one word answers here, just randomly choose a singular word
            start = random.randint(0, word_count - 1)
            if (words[start] != correct_answer[0]):
                answers.append(words[start])
    return answers

In [15]:
def add_answers(dataset, generator=generate_bad_labels):
    bad_answer1 = []
    bad_answer2 = []
    correct_answer = []
    labels = []
    for example in dataset:
        bad_answers = generator(example)
        bad_answer1.append(bad_answers[0])
        bad_answer2.append(bad_answers[1])
        correct_answer.append(example['answers']['text'][0])
        labels.append(random.randint(0, 2))
    dataset = dataset.add_column('bad_answer1', bad_answer1)
    dataset = dataset.add_column('bad_answer2', bad_answer2)
    dataset = dataset.add_column('correct_answer', correct_answer)
    dataset = dataset.add_column('label', labels)
    return dataset


In [16]:
squad['train'] = add_answers(squad['train'], generator=generate_bad_labels_from_answers)
squad['validation'] = add_answers(squad['validation'], generator=generate_bad_labels_from_answers)

In [17]:
squad['train'][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]},
 'bad_answer1': 'Bernadette Soubirous',
 'bad_answ

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [20]:
ans_names = ['correct_answer', 'bad_answer1', 'bad_answer2']
def preprocess_function(examples):
    context = [[c] * 3 for c in examples["context"]]
    question = examples["question"]
    labels = examples["label"]
    qna = [
        [f"{q} {examples[ans][i]}" for ans in ans_names] for i, q in enumerate(question)
    ]
    for i, q in enumerate(qna):
        label = labels[i]
        q[0], q[label] = q[label], q[0]
    context = sum(context, [])
    qna = sum(qna, [])

    tokenized_examples = tokenizer(context, qna, truncation=True)
    return {k: [v[i : i + 3] for i in range(0, len(v), 3)] for k, v in tokenized_examples.items()}

In [21]:
tokenized_squad = squad.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [22]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [26]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForMultipleChoice were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Not computing metrics here due to issue with memory

In [27]:
training_args = TrainingArguments(
    output_dir="squad_qna_model",
    evaluation_strategy="steps",
    eval_steps = 5000,
    save_strategy="steps",
    save_steps = 5000,
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=True,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

trainer.train()

Step,Training Loss,Validation Loss
5000,0.2903,0.172063
10000,0.1883,0.177139
15000,0.143,0.127819
20000,0.1952,0.115576


TrainOutput(global_step=21900, training_loss=0.2183965109907873, metrics={'train_runtime': 6544.778, 'train_samples_per_second': 13.385, 'train_steps_per_second': 3.346, 'total_flos': 1.6633413141701616e+16, 'train_loss': 0.2183965109907873, 'epoch': 1.0})

In [18]:
trainer.push_to_hub()

'https://huggingface.co/Clyvey/squad_qna_model/tree/main/'

In [28]:
trainer.save_model('qna_model2.pt')

# Evaluation

In [2]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer

model = AutoModelForMultipleChoice.from_pretrained("./qna_model.pt")

tokenizer = AutoTokenizer.from_pretrained("./qna_model.pt")


## Data generation

don't need to run this if training data was previously generated

In [33]:
from datasets import load_dataset

squad = load_dataset("squad")

In [34]:
squad['validation'] = add_answers(squad['validation'])
squad['train'] = add_answers(squad['train'])

In [36]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'bad_answer1', 'bad_answer2', 'correct_answer', 'label'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'bad_answer1', 'bad_answer2', 'correct_answer', 'label'],
        num_rows: 10570
    })
})

In [38]:
ans_names = ['correct_answer', 'bad_answer1', 'bad_answer2']
def preprocess_function(examples):
    context = [[c] * 3 for c in examples["context"]]
    question = examples["question"]
    labels = examples["label"]
    qna = [
        [f"{q} {examples[ans][i]}" for ans in ans_names] for i, q in enumerate(question)
    ]
    for i, q in enumerate(qna):
        label = labels[i]
        q[0], q[label] = q[label], q[0]
    context = sum(context, [])
    qna = sum(qna, [])

    tokenized_examples = tokenizer(context, qna, truncation=True)
    print(tokenized_example)
    return {k: [v[i : i + 3] for i in range(0, len(v), 3)] for k, v in tokenized_examples.items()}

tokenized_squad = squad.map(preprocess_function, batched=True)
print(tokenized_squad)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

NameError: name 'tokenized_example' is not defined

In [29]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [30]:
training_args = TrainingArguments(
    output_dir="squad_qna_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

eval_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

In [31]:
eval_trainer.evaluate()

{'eval_loss': 0.11557555943727493,
 'eval_accuracy': 0.9785241248817408,
 'eval_runtime': 168.976,
 'eval_samples_per_second': 62.553,
 'eval_steps_per_second': 15.641}

'Make predictions based on 3 model predictions'

In [2]:
from datasets import load_dataset

squad = load_dataset("squad")

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
import json

pred_deberta = None
pred_albert = None
pred_electra = None
with open('./deberta_pred.json') as f:
    pred_deberta = json.load(f)
with open('./albert_pred.json') as f:
    pred_albert = json.load(f)
with open('./electra_pred.json') as f:
    pred_electra = json.load(f)

In [5]:
def add_preds(dataset):
    answer1 = []
    answer2 = []
    answer3 = []
    for example in dataset:
        answer1.append(pred_deberta[example['id']])
        answer2.append(pred_albert[example['id']])
        answer3.append(pred_electra[example['id']])
    dataset = dataset.add_column('pred_deberta', answer1)
    dataset = dataset.add_column('pred_albert', answer2)
    dataset = dataset.add_column('pred_electra', answer3)
    return dataset

In [6]:
squad['validation'] = add_preds(squad['validation'])

In [7]:
pred_names = ['pred_deberta', 'pred_albert', 'pred_electra']
def preprocess_function_for_pred(examples):
    context = [[c] * 3 for c in examples["context"]]
    question = examples["question"]
    qna = [
        [f"{q} {examples[ans][i]}" for ans in pred_names] for i, q in enumerate(question)
    ]
    context = sum(context, [])
    qna = sum(qna, [])

    tokenized_examples = tokenizer(context, qna, truncation=True)
    return {k: [v[i : i + 3] for i in range(0, len(v), 3)] for k, v in tokenized_examples.items()}

In [8]:
squad['train'] = None
tokenized_squad = squad['validation'].map(preprocess_function_for_pred, batched=True)

In [10]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoiceForPred:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        return batch

model saved at https://huggingface.co/Clyvey/squad_qna_model_v2

In [11]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained("./qna_model2.pt")

In [12]:
training_args = TrainingArguments(
    output_dir="squad_qna_model_v2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

eval_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_squad,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoiceForPred(tokenizer=tokenizer),
)

In [13]:
eval_trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

'https://huggingface.co/Clyvey/squad_qna_model_v2/tree/main/'

In [43]:
import numpy as np

eval_pred = eval_trainer.predict(tokenized_squad)

In [45]:
predictions = np.argmax(eval_pred[0], axis=1)

In [61]:
preds = {}
for i, example in enumerate(tokenized_squad):
    preds[example['id']] = example[pred_names[int(predictions[i])]]

In [14]:
import matplotlib.pyplot as plt

plt.hist(predictions)

NameError: name 'predictions' is not defined

In [63]:
with open('ensemble_pred.json', 'w') as f:
    f.write(json.dumps(preds))