In [1]:
# Colab: mount Google Drive, then make the project's data folder the working directory
import os
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/proj-nlp-2024/data')

# # Local
# os.chdir('c:\\Users\\green\\Documents\\GitHub\\proj-nlp-2024\\data')

Mounted at /content/drive


In [None]:
!pip install -U sentence-transformers
!pip install hnswlib
!pip -q install sentencepiece
!pip install accelerate
!pip install --upgrade transformers accelerate torch

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
import os
import json
from pandas.core.common import flatten
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch
import hnswlib
import pickle
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
import accelerate

# Corpus

Creating the corpus

In [None]:
# Loading the json file.
# The encoding is given explicitly so the file is decoded the same way on every
# platform (the platform default is not UTF-8 on Windows).
train_path = os.path.join(os.getcwd(), 'beerqa_train_v1.0.json')
with open(train_path, 'r', encoding='utf-8') as file:
    train_dataset = json.load(file)

# Creating the corpus out of the json file contexts
data_dict = train_dataset.get('data', {}) # Entries stored under the 'data' key
# Flatten each entry's 'context' into a flat list of strings (one list per entry)
context_list = [list(flatten(entry.get('context', ''))) for entry in data_dict] # List of list of strings
context_string = list(flatten(context_list)) # List of strings
# Keep every second string, starting from the second one.
# Assumes titles and passages strictly alternate (title first) -- TODO confirm against the dataset.
corpus = context_string[1::2] # Removing the strings corresponding to titles

In [None]:
# Display the first two corpus entries as a quick sanity check of the extraction
corpus[:2]

Preparing the corpus embeddings index

In [None]:
# # Embedding the corpus
# semb_model = SentenceTransformer('all-mpnet-base-v2')
# corpus_embeddings = semb_model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)
# index_dim = corpus_embeddings.size(1)

# # Indexing the embeddings
# index = hnswlib.Index(space='cosine', dim=index_dim)
# index.init_index(max_elements=corpus_embeddings.size(0), ef_construction=400, M=64)
# index.add_items(corpus_embeddings.cpu(), list(range(len(corpus_embeddings))))

Saving/loading the corpus embeddings and their index

In [None]:
# # Saving the corpus embeddings
# with open('/content/drive/MyDrive/proj-nlp-2024/data/corpus_embeddings.pkl', 'wb') as f:
#     pickle.dump(corpus_embeddings, f)

# # Saving the corpus embeddings index
# index_path = '/content/drive/MyDrive/proj-nlp-2024/data/corpus_hnswlib.index'
# index.save_index(index_path)

# Restore the previously saved HNSW index of the corpus embeddings from Drive
index_path = '/content/drive/MyDrive/proj-nlp-2024/data/corpus_hnswlib.index'

# The index object must be created with the same space and dimensionality
# before the saved data can be loaded into it (index_dim=768).
index = hnswlib.Index(dim=768, space='cosine')
index.load_index(index_path)

# TODO: Data exploration on embeddings

# Question Answering

Defining function parameters

In [None]:
# Sentence-embedding model used to embed the incoming question
# (same checkpoint name as in the commented-out corpus-embedding cell).
semb_model = SentenceTransformer('all-mpnet-base-v2')
# Cross-encoder used to re-score (question, passage) pairs
xenc_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
# Place the generative model on the detected device instead of a hard-coded "cuda",
# so that loading does not fail on a runtime without a GPU.
model = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-large",
    device_map=device.type,
    torch_dtype=torch.bfloat16,
)

In [None]:
def qa_pipeline(
    question,
    similarity_model=semb_model,
    embeddings_index=index,
    re_ranking_model=xenc_model,
    generative_model=model,
    device=device,
    top_k=128,
    max_new_tokens=32,
):
    """Answer a question by retrieving, re-ranking and reading a corpus passage.

    Steps: embed the question, query the embeddings index for the ``top_k``
    nearest corpus entries, re-score those candidates with the re-ranking
    model, and feed the best-scoring passage plus the question to the
    generative model.

    The passages are read from the notebook-level ``corpus`` list and the text
    is tokenized/decoded with the notebook-level ``tokenizer``.

    Args:
        question: Question text. Surrounding whitespace is stripped and a
            trailing '?' is appended when missing.
        similarity_model: Model whose ``encode`` embeds the question.
        embeddings_index: Index whose ``knn_query`` returns corpus ids.
        re_ranking_model: Model whose ``predict`` scores (question, passage) pairs.
        generative_model: Model whose ``generate`` produces the answer tokens.
        device: Device the tokenized input is moved to before generation.
        top_k: Number of candidates requested from the index.
        max_new_tokens: Generation length limit passed to ``generate``.

    Returns:
        A formatted string containing the selected passage, the question and
        the generated answer.
    """
    # Strip first so that "...? " is not turned into "...? ?"
    question = question.strip()
    if not question.endswith('?'):
        question = question + '?'
    # Embed question
    question_embedding = similarity_model.encode(question, convert_to_tensor=True)
    # Search documents similar to question in index (distances are not needed)
    corpus_ids, _ = embeddings_index.knn_query(question_embedding.cpu(), k=top_k)
    candidate_ids = corpus_ids[0]
    # Re-rank results
    xenc_model_inputs = [(question, corpus[idx]) for idx in candidate_ids]
    cross_scores = re_ranking_model.predict(xenc_model_inputs)
    # Get best matching passage (highest re-ranking score)
    passage_idx = int(np.argmax(cross_scores))
    passage = corpus[candidate_ids[passage_idx]]
    # Encode input
    input_text = f"Given the following passage, answer the related question.\n\nPassage:\n\n{passage}\n\nQ: {question}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    # Generate output
    output_ids = generative_model.generate(input_ids, max_new_tokens=max_new_tokens)
    # Decode output
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Return result
    return f"Passage:\n\n{passage}\n\nQ: {question}\n\nA: {output_text}"

In [None]:
# Read a question interactively and leave a blank line before the result
question = input("Ask a question >>> ")
print()

# Run the full retrieve / re-rank / generate pipeline and show its report
answer_report = qa_pipeline(question)
print(answer_report)