In [56]:
from datasets import load_dataset
import json
import os
# 1. Corpus location. Loading the entire dataset is left disabled (commented
#    out below); only the dev subset in step 2 is actually built.
corpus_name = 'kilt_wikipedia'
corpus_file_name = f'/data/tir/projects/tir6/general/afreens/dbqa/data/corpus_files/{corpus_name}/{corpus_name}_jsonl/{corpus_name}.jsonl'
# corpus_dataset = load_dataset('json', data_files=corpus_file_name)

# 2. For dev, load only a small subset of the corpus.
#    The loop stops after appending the line with index 50, so the subset holds
#    the first 51 lines of the file (not 50). Only the 'id' and 'contents'
#    fields of each record are kept.
passages = []
with open(corpus_file_name, 'r') as file:
    for i, line in enumerate(file):
        data = json.loads(line)
        passages.append({'id': data['id'], 'contents': data['contents']})
        if i == 50:
            break
# Wrap the list of records in a Dataset with the columns 'id' and 'contents'.
from datasets import Dataset
corpus_dataset = Dataset.from_list(passages)

In [24]:
# Show the dataset summary (column names and number of rows) as the cell output.
corpus_dataset

Dataset({
    features: ['id', 'contents'],
    num_rows: 51
})

In [54]:
def load_jsonl(filename, sort_by_id = True):
    """Read a JSON Lines file into a list of parsed records.

    Blank (whitespace-only) lines are skipped, so a trailing newline or an
    empty separator line no longer aborts the load. The file is decoded as
    UTF-8 regardless of the platform's default encoding.

    Args:
        filename: Path of the ``.jsonl`` file to read.
        sort_by_id: When true (the default), every record's ``"id"`` value is
            converted to ``str`` in place and the records are returned sorted
            by that string. The order is lexicographic, so ``"10"`` sorts
            before ``"2"``.

    Returns:
        list: The parsed JSON values, one per non-blank line, in file order
        when ``sort_by_id`` is false.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``sort_by_id`` is true and a record has no ``"id"`` key.
    """
    print('loading from', filename)
    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # json.loads("") raises, so tolerate empty lines explicitly.
                continue
            data.append(json.loads(line))
    if sort_by_id:
        for d in data:
            d["id"] = str(d["id"])
        data.sort(key=lambda x: x['id'])
    return data

def save_jsonl(data, filename):
    """Write an iterable of JSON-serialisable items to ``filename``, one per line.

    The parent directory of ``filename`` is created first when the path has a
    directory component. An existing file is overwritten.

    Args:
        data: Iterable of objects accepted by ``json.dump``.
        filename: Destination path of the ``.jsonl`` file.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    print('writing to', filename)
    with open(filename, "w") as outfile:
        for record in data:
            json.dump(record, outfile)
            outfile.write("\n")
            
def get_retriever_outputs(corpus_dataset, query_dataset, top_k):
    """Retrieve passages for every query and package them as prediction records.

    Calls the notebook-global ``retrieve(query, top_k)`` once per query, so the
    result depends on whichever ``retrieve`` definition was executed last.

    Args:
        corpus_dataset: Integer-indexable collection of passages; each passage
            has an ``'id'`` of the form ``"<page>_<paragraph>"`` and a
            ``'contents'`` text.
        query_dataset: Iterable of queries, each with ``'id'`` and ``'input'``.
        top_k: Number of passages requested from ``retrieve`` per query.

    Returns:
        list: One dict per query (an empty list when there are no queries),
        shaped as ``{"id", "input", "output": [{"provenance": [...]}]}``. Each
        provenance entry carries ``page_id``, ``start_par_id``, ``end_par_id``,
        ``text`` and ``score``.

    Raises:
        IndexError: If a passage id contains no ``'_'`` separator.
    """
    retrieved_output = []
    for q in query_dataset:
        indices, distances = retrieve(q['input'], top_k)
        provenance = []
        for idx, dist in zip(indices, distances):
            idx = int(idx)
            if idx < 0:
                # FAISS pads the result with -1 when it finds fewer than top_k
                # hits; indexing with it would silently return the last passage.
                continue
            passage = corpus_dataset[idx]
            id_parts = passage['id'].split('_')
            page_id = id_parts[0]
            # Each passage is a single paragraph, so start and end coincide.
            start_par_id = id_parts[1]
            end_par_id = start_par_id
            # NOTE(review): the indexes built in this notebook are
            # faiss.IndexFlatIP, whose search values are inner products (larger
            # means closer), so 1/d reverses their order. Kept as-is; confirm
            # what the consumer of "score" expects.
            score = float(1 / dist)

            provenance.append({"page_id": page_id,
                               "start_par_id": start_par_id,
                               "end_par_id": end_par_id,
                               "text": passage['contents'],
                               "score": score})

        retrieved_output.append({"id": q['id'],
                                 "input": q['input'],
                                 "output": [{"provenance": provenance}]})
    # Return only after every query is processed (the return used to sit inside
    # the loop, which ended the function after the first query).
    return retrieved_output

# DPR

In [2]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoder, DPRContextEncoderTokenizer
import faiss
import torch
import numpy as np

# Load the question encoder and tokenizer (used to embed queries).
# Both come from the same "single-nq-base" question-encoder checkpoint.
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# Load the context encoder and tokenizer (used to embed corpus passages),
# from the matching "single-nq-base" context-encoder checkpoint.
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")


Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

In [3]:
def encode_passages(examples, max_length=512):
    """Embed a batch of passages with the DPR context encoder.

    Written for ``Dataset.map(..., batched=True)``. Relies on the notebook
    globals ``context_tokenizer`` and ``context_encoder``.

    Args:
        examples: Batch mapping whose ``'contents'`` entry is a list of
            passage texts.
        max_length: Token limit handed to the tokenizer. ``truncation=True``
            on its own has no effect here: the tokenizer reports no predefined
            maximum length and falls back to no truncation (see the warning
            printed when this cell ran), so over-long passages would reach the
            encoder untruncated. 512 is the usual position limit of BERT-base
            encoders -- TODO confirm against
            ``context_encoder.config.max_position_embeddings``.

    Returns:
        dict: ``{'embeddings': ndarray}`` holding the encoder's
        ``pooler_output`` for the batch, converted to a NumPy array.
    """
    inputs = context_tokenizer(
        examples['contents'],
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
        padding=True,
    )
    # Inference only: no gradients are needed for building the index.
    with torch.no_grad():
        embeddings = context_encoder(**inputs).pooler_output.cpu().numpy()
    return {'embeddings': embeddings}

# Encode passages in bulk: encode_passages is applied to batches of 32 rows and
# the 'embeddings' entry it returns is stored alongside the existing columns.
encoded_corpus_dataset = corpus_dataset.map(encode_passages, batched=True, batch_size=32)


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
# Persist the embedded corpus under $DBQA/indexes/<retriever_name>/<corpus_name>.
# The DBQA environment variable must be set: os.getenv returns None when it is
# missing, and os.path.join then raises a TypeError.
retriever_name = 'dpr'
encoded_corpus_dataset.save_to_disk(os.path.join(os.getenv("DBQA"), 'indexes',  retriever_name, corpus_name))

Saving the dataset (0/1 shards):   0%|          | 0/51 [00:00<?, ? examples/s]

In [7]:
# Stack the per-passage embeddings into one float32 matrix (one row per passage).
context_embeddings = np.vstack(encoded_corpus_dataset['embeddings']).astype(np.float32)
# Flat inner-product index whose dimension is the embedding width.
index = faiss.IndexFlatIP(context_embeddings.shape[1]) 
index.add(context_embeddings)
# Passage ids, taken from the same dataset the embedding rows come from.
passage_ids = encoded_corpus_dataset['id']


In [18]:
def retrieve(query, top_k=5):
    """Search the global FAISS ``index`` with a DPR embedding of ``query``.

    Relies on the notebook globals ``question_tokenizer``,
    ``question_encoder`` and ``index``.

    Args:
        query: Question text to look up.
        top_k: Number of nearest passages to request from the index.

    Returns:
        tuple: ``(indices, scores)``, the first row of the ``I`` and ``D``
        arrays returned by ``index.search``.
    """
    # Encode the query using the question encoder. max_length is passed
    # explicitly because truncation=True alone is ignored by this tokenizer
    # (it logs "Default to no truncation"). 512 is the usual BERT-base limit --
    # TODO confirm against question_encoder.config.max_position_embeddings.
    inputs = question_tokenizer(
        query,
        return_tensors='pt',
        truncation=True,
        max_length=512,
        padding=True,
    )
    with torch.no_grad():
        # One forward pass only; the encoder used to be run twice per query
        # with the first result thrown away.
        query_embedding = question_encoder(**inputs).pooler_output.cpu().numpy()

    # Perform the retrieval
    D, I = index.search(query_embedding, top_k)
    return I[0], D[0]

# Example query: quick check of the DPR retriever. The two printed arrays are
# the passage indices and the matching values returned by the index search.
query = "What is the capital of France?"
top_k = 5
indices, distances = retrieve(query, top_k)
print(indices, distances)
# Optional: print the top-k retrieved passages (disabled)
# for idx, distance in zip(indices, distances):
#     print(f"ID: {passage_ids[idx]}\nPassage: {corpus_dataset[int(idx)]['contents']}\nScore: {distance}\n")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[ 7 25  3 45  8] [54.289043 54.14091  52.687218 51.875103 51.52078 ]


In [14]:
# Load the query dataset. load_jsonl is called with its default
# sort_by_id=True, so the ids become strings and the queries are sorted by id.
query_dataset_name = 'nq-dev-kilt'
query_file_name = f'/data/tir/projects/tir6/general/afreens/dbqa/data/{query_dataset_name}.jsonl'
query_dataset = load_jsonl(query_file_name)

loading from /data/tir/projects/tir6/general/afreens/dbqa/data/nq-dev-kilt.jsonl


In [51]:
# Number of passages to retrieve per query in the run below.
top_k = 5

In [52]:
# Run retrieval over the query dataset and write the predictions to
# $DBQA/retriever_results/predictions/<retriever_name>/<query_dataset_name>.jsonl.
# get_retriever_outputs calls the global `retrieve`, so this uses whichever
# retriever section was executed last; `retriever_name` must match it.
# The DBQA environment variable must be set (os.getenv returns None otherwise
# and os.path.join raises a TypeError).
retrieved_output = get_retriever_outputs(corpus_dataset, query_dataset, top_k)
save_jsonl(retrieved_output, os.path.join(os.getenv("DBQA"), 'retriever_results', 'predictions', retriever_name, query_dataset_name + '.jsonl'))

# GTR

In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import faiss
import numpy as np
import torch

# Load the GTR model. SentenceTransformer is given only the model name here;
# no separate tokenizer object is created in this notebook.
model_name = "sentence-transformers/gtr-t5-base"
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Embed every corpus passage with the GTR model.
# Reads from `corpus_dataset` (built in the first cell): the name `dataset`
# used here before is never defined in this notebook, so the cell only worked
# with leftover kernel state and failed after Restart & Run All.
passages = corpus_dataset['contents']
context_embeddings = model.encode(passages, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Build a flat inner-product FAISS index over the GTR passage embeddings.
context_embeddings = context_embeddings.astype(np.float32)
index = faiss.IndexFlatIP(context_embeddings.shape[1])
index.add(context_embeddings)
# Ids come from `corpus_dataset`, the dataset the passages belong to; the
# name `dataset` used here before is never defined in this notebook.
passage_ids = corpus_dataset['id']


In [None]:
def retrieve(query, top_k=5):
    """Search the global FAISS ``index`` with a GTR embedding of ``query``.

    Relies on the notebook globals ``model`` and ``index``.

    Args:
        query: Text to look up.
        top_k: Number of nearest passages to request from the index.

    Returns:
        tuple: ``(indices, scores)``, the first row of each array returned by
        ``index.search``.
    """
    # Embed the single query as a one-row float32 matrix.
    query_matrix = model.encode([query], convert_to_numpy=True)
    query_matrix = query_matrix.astype(np.float32)

    # Search, then unwrap the only result row.
    scores, hits = index.search(query_matrix, top_k)
    return hits[0], scores[0]

# Example query: quick check of the GTR retriever. The two printed arrays are
# the passage indices and the matching values returned by the index search.
query = "What is the capital of France?"
top_k = 5
indices, distances = retrieve(query, top_k)
print(indices, distances)
# Optional: print the top-k retrieved passages (disabled)
# for idx, distance in zip(indices, distances):
#     print(f"ID: {passage_ids[idx]}\nPassage: {corpus_dataset[int(idx)]['contents']}\nScore: {distance}\n")


[32  6  4 10  0] [0.56198996 0.5037815  0.48101538 0.480132   0.4723112 ]


# Contriever

In [12]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import faiss
import numpy as np
import torch

# Load the Contriever model and tokenizer.
# Note: this rebinds `model_name` and `model`, which the GTR section also uses.
model_name = "facebook/contriever"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [19]:
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the positions selected by ``mask``.

    Positions where ``mask`` is zero are zeroed before summing, and the sum is
    divided by the number of non-zero mask entries of each row.

    Args:
        token_embeddings: Tensor indexed as (batch, token, feature).
        mask: Tensor indexed as (batch, token); non-zero marks tokens to keep.

    Returns:
        Tensor with one pooled vector per batch row.
    """
    keep = mask.unsqueeze(-1).bool()
    kept_embeddings = token_embeddings.masked_fill(~keep, 0.)
    token_counts = mask.sum(dim=1).unsqueeze(-1)
    return kept_embeddings.sum(dim=1) / token_counts
    
def encode_passages(examples):
    """Embed a batch of passages with the Contriever model.

    Written for ``Dataset.map(..., batched=True)``. Relies on the notebook
    globals ``tokenizer``, ``model`` and ``mean_pooling``.

    Args:
        examples: Batch mapping whose ``'contents'`` entry is a list of
            passage texts.

    Returns:
        dict: ``{'embeddings': tensor}`` with the mask-weighted mean of the
        model's first output for each passage.
    """
    batch = tokenizer(examples['contents'], return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        # Masked mean over tokens is used here, rather than the pooler output
        # or a plain mean over all positions.
        token_states = model(**batch)[0]
        pooled = mean_pooling(token_states, batch['attention_mask'])
    return {'embeddings': pooled}

# Encode passages in bulk (batches of 32 rows).
# Maps over `corpus_dataset` (built in the first cell): the name `dataset`
# used here before is never defined in this notebook, so the cell only worked
# with leftover kernel state and failed after Restart & Run All.
encoded_dataset = corpus_dataset.map(encode_passages, batched=True, batch_size=32)


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [20]:
# Stack the per-passage embeddings into one float32 matrix (one row per passage).
context_embeddings = np.vstack(encoded_dataset['embeddings']).astype(np.float32)
# Flat inner-product index whose dimension is the embedding width.
# Note: this rebinds the global `index` that `retrieve` searches.
index = faiss.IndexFlatIP(context_embeddings.shape[1]) 
index.add(context_embeddings)
# Passage ids, taken from the same dataset the embedding rows come from.
passage_ids = encoded_dataset['id']


In [21]:
def retrieve(query, top_k=5):
    """Search the global FAISS ``index`` with a Contriever embedding of ``query``.

    Relies on the notebook globals ``tokenizer``, ``model``, ``mean_pooling``
    and ``index``.

    Args:
        query: Text to look up.
        top_k: Number of nearest passages to request from the index.

    Returns:
        tuple: ``(indices, scores)``, the first row of each array returned by
        ``index.search``.
    """
    encoded = tokenizer(query, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        # Same pooling as for the passages: masked mean of the first model
        # output (not the pooler output or an unmasked mean).
        token_states = model(**encoded)[0]
        pooled = mean_pooling(token_states, encoded['attention_mask'])
        query_embedding = pooled.cpu().numpy()

    # Search, then unwrap the only result row.
    scores, hits = index.search(query_embedding, top_k)
    return hits[0], scores[0]

# Example query: quick check of the Contriever retriever. The two printed
# arrays are the passage indices and the matching values returned by the
# index search.
query = "What is the capital of France?"
top_k = 5
indices, distances = retrieve(query, top_k)
print(indices, distances)
# Optional: print the top-k retrieved passages (disabled)
# for idx, distance in zip(indices, distances):
#     print(f"Passage: {passages[idx]}\nScore: {distance}\n")


[32 21 42 20 48] [0.703585   0.6459863  0.63183594 0.59237474 0.58883893]
