In [17]:
from google.colab import drive

# Mount Google Drive under /content/drive so the project data files are reachable.
# force_remount=True asks Colab to remount even if a mount is already present.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Preparing corpus

Loading the training set from the JSON file

In [18]:
import os
import json

path = 'proj-nlp-2024/data'
# NOTE: chdir changes the working directory for every later cell in this kernel.
os.chdir(f'/content/drive/MyDrive/{path}')
train_path = os.path.join(os.getcwd(), 'beerqa_train_v1.0.json')
# Read the JSON explicitly as UTF-8 instead of relying on the platform's
# default encoding, so non-ASCII passage text decodes the same way everywhere.
with open(train_path, 'r', encoding='utf-8') as file:
    train_dataset = json.load(file)

Preparing the corpus out of the training set contexts

In [19]:
from pandas.core.common import flatten

# Records stored under the top-level 'data' key; iterated as a list below, so
# the fallback is an empty list (the name is kept from the original cell).
data_dict = train_dataset.get('data', [])

# Each record's 'context' is expected to be a list of [title, paragraph] pairs
# -- TODO confirm against the dataset schema. Unpacking the pairs explicitly
# keeps only the paragraphs and fails loudly on a malformed entry, instead of
# silently shifting titles into the corpus the way a [1::2] stride over a fully
# flattened list would. It also needs no private pandas flatten helper.
corpus = [
    paragraph
    for entry in data_dict
    for _title, paragraph in entry.get('context', [])
]

In [20]:
# Preview the first three passages of the corpus.
corpus[:3]

['In the 19th century, Burke was praised by both liberals and conservatives. Burke\'s friend Philip Francis wrote that Burke "was a man who truly & prophetically foresaw all the consequences which would rise from the adoption of the French principles", but because Burke wrote with so much passion, people were doubtful of his arguments. William Windham spoke from the same bench in the House of Commons as Burke had when he had separated from Fox and an observer said Windham spoke "like the ghost of Burke" when he made a speech against peace with France in 1801. William Hazlitt, a political opponent of Burke, regarded him as amongst his three favourite writers (the others being Junius and Rousseau) and made it "a test of the sense and candour of any one belonging to the opposite party, whether he allowed Burke to be a great man". William Wordsworth was originally a supporter of the French Revolution and attacked Burke in "A Letter to the Bishop of Llandaff" (1793), but by the early 19th c

# Embedding and indexing the corpus

In [21]:
!pip install -U sentence-transformers
!pip install hnswlib



Choosing the best available pre-trained model

In [24]:
from sentence_transformers import SentenceTransformer
import torch

# Sentence-embedding model; it encodes both the corpus passages and the queries.
semb_model = SentenceTransformer(
    'all-mpnet-base-v2',
)

NameError: name '_C' is not defined

In [26]:
!pip uninstall torch torchvision torchaudio
!pip install torch torchvision torchaudio
!pip install sentence-transformers

Found existing installation: torch 2.2.1+cu121
Uninstalling torch-2.2.1+cu121:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/bin/torchrun
    /usr/local/lib/python3.10/dist-packages/functorch/*
    /usr/local/lib/python3.10/dist-packages/torch-2.2.1+cu121.dist-info/*
    /usr/local/lib/python3.10/dist-packages/torch/*
    /usr/local/lib/python3.10/dist-packages/torchgen/*
Proceed (Y/n)? 
Y
  Successfully uninstalled torch-2.2.1+cu121
Found existing installation: torchvision 0.17.1+cu121
Uninstalling torchvision-0.17.1+cu121:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/torchvision-0.17.1+cu121.dist-info/*
    /usr/local/lib/python3.10/dist-packages/torchvision.libs/libcudart.7ec1eba6.so.12
    /usr/local/lib/python3.10/dist-packages/torchvision.libs/libjpeg.ceea7512.so.62
    /usr/local/lib/python3.10/dist-packages/torchvision.libs/libnvjpeg.f00ca762.so.12
    /usr/local/lib/python3.10/dist-packages/tor



Embedding the corpus

In [None]:
# Encode every passage of the corpus; the result is requested as a tensor and a
# progress bar is shown while encoding.
corpus_embeddings = semb_model.encode(
    corpus,
    show_progress_bar=True,
    convert_to_tensor=True,
)

Saving the corpus embeddings

In [None]:
import pickle

# Pickle a CPU copy of the embeddings: a tensor pickled while on a GPU can only
# be unpickled on a runtime that has CUDA available. `corpus_embeddings` itself
# stays on its current device.
with open('/content/drive/MyDrive/proj-nlp-2024/data/corpus_embeddings.pkl', 'wb') as f:
    pickle.dump(corpus_embeddings.cpu(), f)

Loading the corpus embeddings

In [None]:
# Optional shortcut: uncomment the lines below to load previously saved
# embeddings instead of re-encoding the corpus.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# import pickle

# with open('/content/drive/MyDrive/proj-nlp-2024/data/corpus_embeddings.pkl', "rb") as f:
#     corpus_embeddings = pickle.load(f)

In [None]:
# Size of the second axis of the embeddings; used as the index dimensionality.
index_dim = corpus_embeddings.shape[1]
# Display the full shape of the embeddings tensor.
corpus_embeddings.shape

Indexing the corpus embeddings

In [None]:
import hnswlib

# Empty hnswlib index in cosine space, sized to the embedding dimensionality.
index = hnswlib.Index(dim=index_dim, space='cosine')

Building the index from the corpus embeddings and saving it

In [None]:
index_path = '/content/drive/MyDrive/proj-nlp-2024/data/corpus_hnswlib.index'

# Allocate room for exactly one entry per embedding row.
index.init_index(
    max_elements=corpus_embeddings.size(0),
    M=64,
    ef_construction=400,
)

# Labels are the row positions, so a label returned by a query can be used
# directly as an index into `corpus`.
item_ids = list(range(len(corpus_embeddings)))
index.add_items(corpus_embeddings.cpu(), item_ids)

# Persist the populated index to Drive.
index.save_index(index_path)

# Retriever

Loading the corpus embeddings index

In [None]:
# Optional shortcut: uncomment the lines below to load the saved index instead
# of rebuilding it. `index_dim` must already be defined; earlier in the notebook
# it is derived from `corpus_embeddings`.
# import hnswlib

# index_path = '/content/drive/MyDrive/proj-nlp-2024/data/corpus_hnswlib.index'
# index = hnswlib.Index(space='cosine', dim=index_dim)
# index.load_index(index_path)

Choosing the cross-encoder model used to rerank the top results retrieved by cosine similarity between the query and document embeddings

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder that scores (query, passage) pairs for re-ranking.
xenc_model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2',
)

Defining the query and running semantic search

In [None]:
import numpy as np

query = "Who was Napoleon?"
query_embedding = semb_model.encode(query, convert_to_tensor=True)

# Never request more neighbours than the index holds; hnswlib raises when it
# cannot return k results.
top_k = min(128, index.get_current_count())

# `ef` is the query-time candidate list size (accuracy/speed trade-off). It is
# not stored with a saved index and its default is small, so set it explicitly
# and comfortably above k before searching.
index.set_ef(max(2 * top_k, 10))

corpus_ids, distances = index.knn_query(query_embedding.cpu(), k=top_k)
# The index uses cosine distance; convert it to a similarity score.
scores = 1 - distances

Cross encoding the top results and reranking

In [None]:
# Pair the query with every retrieved passage and score each pair with the
# cross-encoder.
candidate_ids = corpus_ids[0]
model_inputs = [(query, corpus[doc_id]) for doc_id in candidate_ids]
cross_scores = xenc_model.predict(model_inputs)

print("Cross-encoder model re-ranking results")
print(f"Query: \"{query}\"")
print("---------------------------------------")

# Positions of the candidates ordered from highest to lowest cross-encoder
# score; only the three best are printed.
best_first = np.argsort(-cross_scores)
for position in best_first[:3]:
    score = cross_scores[position]
    document = corpus[candidate_ids[position]]
    print(f"Score: {score:.4f}\nDocument: \"{document}\"\n\n")

At this point we have a retriever that returns the most relevant documents in the corpus for a given query.

# Reader

# Reranker