## **1. Install and import libraries**

In [None]:
!pip install -qq transformers[sentencepiece]==4.35.2 datasets==2.16.1 evaluate==0.4.1

In [None]:
!sudo apt-get install libomp-dev
!pip install -qq faiss-gpu

In [None]:
import numpy as np
import collections
import torch
import faiss
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
from tqdm.auto import tqdm

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## **2. Download dataset**

In [None]:
# Template placeholders: both values are None here and must be filled in
# before the later cells can run.
# DATASET_NAME is the dataset identifier; a later cell passes it to
# load_dataset(DATASET_NAME, split='validation').
DATASET_NAME = None
# TODO: load the dataset here — presumably with load_dataset(DATASET_NAME, ...);
# the intended split is not shown in this notebook, confirm before filling in.
raw_datasets = None
# Bare expression: displays the value as the cell output.
raw_datasets

## **3. Filter out non-answerable samples**

In [None]:
# Template placeholder for the filtering step. As written, this assignment
# replaces whatever `raw_datasets` held with None.
# TODO: assign the dataset with non-answerable samples removed; the exact
# filter condition is not shown in this notebook.
raw_datasets = None
# Bare expression: displays the value as the cell output.
raw_datasets

In [None]:
# Keep only the fields used by the rest of the notebook and drop every other
# column of the dataset.
columns = raw_datasets.column_names
columns_to_keep = ['id', 'context', 'question', 'answers']
# Use a plain difference (dataset columns minus the keep-list). A symmetric
# difference would also contain any keep-list name that the dataset lacks, and
# asking remove_columns() to drop a column that does not exist is an error.
# Building a list (rather than a set) keeps the removal order deterministic.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
raw_datasets = raw_datasets.remove_columns(columns_to_remove)
# Bare expression: displays the trimmed dataset as the cell output.
raw_datasets

## **4. Initialize pre-trained model**

In [None]:
# Template placeholders — all three are None and must be filled in before the
# embedding cells can run.
# Checkpoint identifier for the embedding model.
MODEL_NAME = None
# Presumably AutoTokenizer.from_pretrained(MODEL_NAME) — TODO confirm.
tokenizer = None
# Presumably AutoModel.from_pretrained(MODEL_NAME), moved to `device` — TODO confirm.
model = None

## **5. Create vector embedding functions**

In [None]:
def cls_pooling(model_output):
    """Return the hidden state of the first token of every sequence.

    Args:
        model_output: Object exposing a `last_hidden_state` tensor.
            Assumed to be laid out as (batch, sequence, hidden) — TODO confirm
            against the model chosen for this notebook.

    Returns:
        The slice `last_hidden_state[:, 0]`: one vector per batch entry, taken
        at sequence position 0 (where BERT-style tokenizers place `[CLS]`).
    """
    return model_output.last_hidden_state[:, 0]

In [None]:
def get_embeddings(text_list):
    """Embed text with the notebook-level `tokenizer` and `model`.

    Args:
        text_list: A single string or a list of strings to embed.

    Returns:
        Whatever `cls_pooling` produces for the model output. Callers in this
        notebook treat it as a tensor with one row per input text (they call
        `.detach().cpu().numpy()` on it and index row 0).
    """
    # Pad and truncate so texts of different lengths form one rectangular batch.
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors='pt'
    )
    # The inputs must live on the same device as the model weights.
    # NOTE(review): assumes `model` is a transformers model exposing `.device`.
    encoded_input = {
        name: tensor.to(model.device) for name, tensor in encoded_input.items()
    }
    # Inference only: no autograd graph is needed to produce embeddings.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Test functionality
# Index the row first and the column second: this reads a single example
# instead of materialising the whole 'question' column to take one element.
embedding = get_embeddings(raw_datasets[0]['question'])
# Bare expression: displays the embedding's shape as the cell output.
embedding.shape

In [None]:
# Add one embedding vector per question as a new dataset column.
# The tensor is converted to a numpy array (required for HF Datasets).
EMBEDDING_COLUMN = 'question_embedding'


def _embed_question(example):
    """Return the new-column entry holding the embedding of one row's question."""
    question_vector = get_embeddings(example['question']).detach().cpu().numpy()[0]
    return {EMBEDDING_COLUMN: question_vector}


embeddings_dataset = raw_datasets.map(_embed_question)

In [None]:
# Build a FAISS index over the embedding column; the nearest-example lookups
# in the following cells query this index by the same column name.
embeddings_dataset.add_faiss_index(column=EMBEDDING_COLUMN)

In [None]:
# Bare expression: displays the first row of the embedded dataset as the cell
# output.
embeddings_dataset[0]

## **6. Search similar samples with a question**

In [None]:
# Free-form query to look up in the indexed questions.
input_question = 'When did Beyonce start becoming popular?'

# Embed the query as a one-element batch, then convert the tensor to numpy.
query_tensor = get_embeddings([input_question])
input_quest_embedding = query_tensor.cpu().detach().numpy()
# Bare expression: displays the query embedding's shape as the cell output.
input_quest_embedding.shape

In [None]:
# Retrieve the TOP_K indexed rows closest to the query embedding, together
# with their retrieval scores.
TOP_K = 5
scores, samples = embeddings_dataset.get_nearest_examples(
    index_name=EMBEDDING_COLUMN,
    query=input_quest_embedding,
    k=TOP_K,
)

In [None]:
# Print each retrieved sample with its 1-based rank and retrieval score.
ranked_hits = zip(scores, samples["question"], samples["context"])
for rank, (score, hit_question, hit_context) in enumerate(ranked_hits, start=1):
    print(f'Top {rank}\tScore: {score}')
    print(f'Question: {hit_question}')
    print(f'Context: {hit_context}')
    print()

## **7. QA**

In [None]:
from transformers import pipeline

# Template placeholders — both are None and must be filled in before this cell
# can build a working pipeline.
# PIPELINE_NAME is presumably the question-answering task, given that `pipe`
# is later called with `question=` and `context=` — TODO confirm.
PIPELINE_NAME = None
# NOTE: this reassigns MODEL_NAME, replacing the value set for the embedding
# model in section 4.
MODEL_NAME = None
pipe = pipeline(PIPELINE_NAME, model=MODEL_NAME)

In [None]:
# Run the pipeline on every retrieved context for the same input question and
# print the answer next to the context's rank and retrieval score.
print(f'Input question: {input_question}')
for rank, (score, context) in enumerate(zip(scores, samples["context"]), start=1):
    answer = pipe(question=input_question, context=context)
    print(f'Top {rank}\tScore: {score}')
    print(f'Context: {context}')
    print(f'Answer: {answer}')
    print()

In [None]:
# Load the validation split of the dataset named by DATASET_NAME.
# NOTE(review): `test_datasets` is not referenced by the remaining cells shown
# in this notebook — confirm whether the loop below was meant to use it.
test_datasets = load_dataset(DATASET_NAME, split='validation')
# Bare expression: displays the loaded split as the cell output.
test_datasets

In [None]:
# End-to-end check on ten indexed questions (rows 200-209): for each question,
# retrieve the TOP_K nearest indexed rows and run the pipeline on each
# retrieved context.
TOP_K = 3
# Slice the rows first and select the column second, so only these ten rows
# are read instead of the entire 'question' column.
sample_questions = embeddings_dataset[200:210]['question']
for idx, input_question in enumerate(sample_questions):
    input_quest_embedding = get_embeddings([input_question]).cpu().detach().numpy()
    scores, samples = embeddings_dataset.get_nearest_examples(
        EMBEDDING_COLUMN, input_quest_embedding, k=TOP_K
    )
    print(f'Question {idx + 1}: {input_question}')
    for jdx, (score, context) in enumerate(zip(scores, samples['context'])):
        print(f'Top {jdx + 1}\tScore: {score}')
        answer = pipe(
            question=input_question,
            context=context
        )
        print(f'Context: {context}')
        print(f'Answer: {answer}')
        print()
    print()