## **1. Install and import libraries**

In [1]:
!pip install -qq transformers[sentencepiece]==4.35.2 datasets==2.16.1 evaluate==0.4.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!sudo apt-get install libomp-dev
!pip install -qq faiss-gpu

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libomp-14-dev libomp5-14
Suggested packages:
  libomp-14-doc
The following NEW packages will be installed:
  libomp-14-dev libomp-dev libomp5-14
0 upgraded, 3 newly installed, 0 to remove and 33 not upgraded.
Need to get 738 kB of archives.
After this operation, 8,991 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp5-14 amd64 1:14.0.0-1ubuntu1.1 [389 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp-14-dev amd64 1:14.0.0-1ubuntu1.1 [347 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libomp-dev amd64 1:14.0-55~exp2 [3,074 B]
Fetched 738 kB in 1s (656 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debcon

In [3]:
import numpy as np
import collections
import torch
import faiss
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
from tqdm.auto import tqdm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## **2. Download dataset**

In [13]:
DATASET_NAME = 'squad_v2'

# Concatenate the train and validation splits, then keep only shard 0 of 40
# (presumably to keep the rest of the notebook fast — the result has 3555 rows).
squad_all = load_dataset(DATASET_NAME, split='train+validation')
raw_datasets = squad_all.shard(num_shards=40, index=0)
raw_datasets

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 3555
})

## **3. Filter out non-answerable samples**

In [14]:
def _has_answer(example):
    """Return True when the example carries at least one answer text."""
    return len(example['answers']['text']) > 0


# Keep only the answerable questions (rows whose answer list is non-empty).
raw_datasets = raw_datasets.filter(_has_answer)
raw_datasets

Filter:   0%|          | 0/3555 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 2333
})

In [15]:
columns = raw_datasets.column_names
columns_to_keep = ['id', 'context', 'question', 'answers']
# Remove exactly the existing columns that are not wanted. A symmetric
# difference would additionally list any wanted column that is missing from the
# dataset, and remove_columns rejects names that do not exist. Building a list
# (instead of a set) also keeps the removal order deterministic.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
raw_datasets = raw_datasets.remove_columns(columns_to_remove)
raw_datasets

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 2333
})

## **4. Initialize pre-trained model**

In [16]:
MODEL_NAME = "distilbert-base-uncased"

# Load the tokenizer and the encoder from the same checkpoint, then move the
# encoder onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

## **5. Create vector embedding functions**

In [17]:
def cls_pooling(model_output):
    """Select the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: object exposing a ``last_hidden_state`` attribute;
            assumed to be laid out as (batch, sequence, hidden) — TODO confirm.

    Returns:
        ``last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

In [18]:
def get_embeddings(text_list):
    """Embed text with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text_list: a string or a list of strings; passed straight to the
            tokenizer, which pads and truncates the batch.

    Returns:
        The tensor produced by ``cls_pooling`` for the batch, located on
        ``device``. It carries no autograd history.
    """
    encoded_input = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    # Embeddings are only used for retrieval, never back-propagated through, so
    # skip building the autograd graph: less memory and a faster forward pass.
    # Callers that still call .detach() on the result keep working.
    with torch.no_grad():
        model_output = model(**encoded_input)

    return cls_pooling(model_output)

In [19]:
# Smoke test: embed the first question and check the resulting shape.
first_question = raw_datasets[0]['question']
embedding = get_embeddings(first_question)
embedding.shape

torch.Size([1, 768])

In [20]:
EMBEDDING_COLUMN = 'question_embedding'


def _embed_question(example):
    """Return the example's question embedding as a NumPy vector."""
    # Convert to a NumPy array (required for HF Datasets); [0] drops the batch axis.
    vector = get_embeddings(example['question']).detach().cpu().numpy()[0]
    return {EMBEDDING_COLUMN: vector}


embeddings_dataset = raw_datasets.map(_embed_question)

Map:   0%|          | 0/2333 [00:00<?, ? examples/s]

In [21]:
# Build a FAISS index over the embedding column; the nearest-example lookups
# further down query this index by column name.
embeddings_dataset.add_faiss_index(column=EMBEDDING_COLUMN)

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'context', 'question', 'answers', 'question_embedding'],
    num_rows: 2333
})

In [22]:
# Inspect one row to confirm the embedding column was added alongside the text fields.
embeddings_dataset[0]

{'id': '56be85543aeaaa14008c9063',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]},
 'question_embedding': [0.11258814483880997,
  -0.36605221033096313,
  -0.016732459887862206,
  -0.07673165202140808,
  -0.1046

## **6. Search similar samples with a question**

In [23]:
input_question = 'When did Beyonce start becoming popular?'

# Embed the query as a one-item batch and hand it to the index as a NumPy array.
question_tensor = get_embeddings([input_question])
input_quest_embedding = question_tensor.cpu().detach().numpy()
input_quest_embedding.shape

(1, 768)

In [24]:
TOP_K = 5
# Fetch the TOP_K indexed rows closest to the query embedding, with their scores.
scores, samples = embeddings_dataset.get_nearest_examples(
    index_name=EMBEDDING_COLUMN,
    query=input_quest_embedding,
    k=TOP_K,
)

In [25]:
# Lower scores appear to mean closer matches (the identical question scores 0.0).
ranked_hits = zip(scores, samples["question"], samples["context"])
for rank, (score, hit_question, hit_context) in enumerate(ranked_hits, start=1):
    print(f'Top {rank}\tScore: {score}')
    print(f'Question: {hit_question}')
    print(f'Context: {hit_context}')
    print()

Top 1	Score: 0.0
Question: When did Beyonce start becoming popular?
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".

Top 2	Score: 7.510955333709717
Question: What movie did Beyonce act in 2006?
Context: Her first acting role of 2006 was in the comedy film The Pink Panther starring opposite Steve Martin, grossing $158.8 million at the box offi

## **7. QA**

In [26]:
from transformers import pipeline

PIPELINE_NAME = 'question-answering'
# NOTE: this rebinds MODEL_NAME, which earlier named the embedding checkpoint.
MODEL_NAME = 'mf212/distilbert-finetuned-squadv2'
# Place the reader on the same device as the embedding model. Without `device`
# the pinned transformers version keeps the pipeline on CPU even when CUDA is
# available.
pipe = pipeline(PIPELINE_NAME, model=MODEL_NAME, device=device)

config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [28]:
print(f'Input question: {input_question}')
for rank, (score, context) in enumerate(zip(scores, samples["context"]), start=1):
    # Ask the reader to extract an answer from each retrieved passage.
    answer = pipe(question=input_question, context=context)
    print(f'Top {rank}\tScore: {score}')
    print(f'Context: {context}')
    print(f'Answer: {answer}')
    print()

Input question: When did Beyonce start becoming popular?
Top 1	Score: 0.0
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Answer: {'score': 0.6091989278793335, 'start': 276, 'end': 286, 'answer': 'late 1990s'}

Top 2	Score: 7.510955333709717
Context: Her first acting role of 2006 was in the comedy film The Pink Panther starring opposite Steve

In [29]:
# Load the complete validation split: unlike raw_datasets it is neither sharded
# nor filtered down to answerable questions.
test_datasets = load_dataset(DATASET_NAME, split='validation')
test_datasets

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})

In [30]:
TOP_K = 3
# The queries come from the indexed dataset itself, so each question is expected
# to retrieve its own row first (score 0.0 in the output below).
sample_questions = embeddings_dataset['question'][200:210]
for question_no, input_question in enumerate(sample_questions, start=1):
    input_quest_embedding = get_embeddings([input_question]).cpu().detach().numpy()
    scores, samples = embeddings_dataset.get_nearest_examples(
        EMBEDDING_COLUMN, input_quest_embedding, k=TOP_K
    )
    print(f'Question {question_no}: {input_question}')
    for rank, (score, context) in enumerate(zip(scores, samples['context']), start=1):
        print(f'Top {rank}\tScore: {score}')
        # Run the extractive reader against this retrieved passage.
        answer = pipe(question=input_question, context=context)
        print(f'Context: {context}')
        print(f'Answer: {answer}')
        print()
    print()

Question 1: What was the main subject of controversy that arose within the Reformed Church during the beginning of the Republic?
Top 1	Score: 0.0
Context: In the first years of the Republic, controversy arose within the Reformed Church, mainly around the subject of predestination. This has become known as the struggle between Arminianism and Gomarism, or between Remonstrants and Contra-Remonstrants. In 1618 the Synod of Dort tackled this issue, which led to the banning of the Remonstrant faith.
Answer: {'score': 0.9573731422424316, 'start': 111, 'end': 125, 'answer': 'predestination'}

Top 2	Score: 10.170092582702637
Context: Roman canon law had been criticized by the Presbyterians as early as 1572 in the Admonition to Parliament. The protest centered on the standard defense that canon law could be retained so long as it did not contradict the civil law. According to Polly Ha, the Reformed Church Government refuted this claiming that the bishops had been enforcing canon law for 1500 ye