In [1]:
import numpy as np
import json
from rank_bm25 import BM25Okapi
import random
import csv
import torch
from torch.utils.data import Dataset, DataLoader
import gzip
from tqdm import tqdm
from transformers import DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoderTokenizer
from transformers import DPRQuestionEncoder
from transformers import DPRContextEncoder

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)
# Ask PyTorch to drop any cached CUDA memory before the heavy cells run.
torch.cuda.empty_cache()

cuda


In [3]:
# find the indices of the k max elements,
def kmax(k, nums, blocked):
    """Return the indices of the ``k`` largest entries of ``nums``, skipping ``blocked``.

    Args:
        k: Number of indices wanted.
        nums: 1-D sequence or array of scores. It is left unmodified.
        blocked: A single index that must never be returned.

    Returns:
        A list of at most ``k`` indices, ordered from the highest score to the
        lowest; ties go to the lower index. Fewer than ``k`` indices are
        returned when ``nums`` does not hold enough eligible entries.
    """
    scores = np.asarray(nums, dtype=float)
    # A stable sort of the negated scores gives the same order as repeatedly
    # taking argmax and masking the winner, without touching the caller's data.
    order = np.argsort(-scores, kind='stable')
    ix = []
    for w_ix in order:
        if len(ix) >= k:
            break
        if w_ix != blocked:
            ix.append(w_ix)
    return ix

def sample(j, nums, blocked):
    """Draw up to ``j`` distinct random indices into ``nums`` that are not blocked.

    Args:
        j: Number of indices wanted.
        nums: Sequence whose length defines the index range ``0 .. len(nums) - 1``.
        blocked: Iterable of indices that must not be returned.

    Returns:
        A list of distinct indices, none of which is in ``blocked``. It holds
        ``j`` entries, or every eligible index when fewer than ``j`` exist.
    """
    excluded = set(blocked)
    # Filter first, then draw: removing items from a list while iterating over
    # it skips the element after each removal and lets blocked indices through.
    candidates = [ind for ind in range(len(nums)) if ind not in excluded]
    return random.sample(candidates, min(j, len(candidates)))

def NQ_file_open(file_object, device):
    """Yield one parsed JSON value per line of ``file_object``.

    Args:
        file_object: Open file-like object whose ``readline()`` returns one
            JSON document per line (``str`` or ``bytes``).
        device: Unused; kept so existing call sites keep working.

    Yields:
        The decoded JSON value of each line. The string ``"NONE"`` is yielded
        for a line that cannot be parsed, and once more when the end of the
        file is reached, after which the generator finishes.

    Raises:
        Whatever ``file_object.readline()`` raises (for example on a corrupt
        archive); read errors are no longer reported as end of data.
    """
    while True:
        line = file_object.readline()
        if not line:
            # End of file: emit the sentinel callers look for, then stop.
            yield "NONE"
            return
        try:
            chunk = json.loads(line)
        except ValueError:
            # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            yield "NONE"
            continue
        # Yield outside the try block so closing the generator is never
        # intercepted by the error handler.
        yield chunk

def NQ_index_processing(f, file_out, device):
    """Write a CSV of positive/negative document indices for one gzipped NQ file.

    Every example read from ``f`` contributes one question (its
    ``question_text`` split on single spaces) and one document (its non-HTML
    ``document_tokens``). One CSV row is written per question ``i``:
    ``i``, then ``k_hard`` indices of the documents BM25 scores highest for
    the question (excluding ``i``), then ``j_rand`` randomly drawn indices
    that are neither ``i`` nor one of the hard indices.

    Args:
        f: Path to a gzip-compressed file with one JSON example per line.
        file_out: Path of the CSV file to create or overwrite.
        device: Passed through to ``NQ_file_open``; not otherwise used.
    """
    examples = {'docs': [], 'questions': []}
    count = 0
    # Reading stops once ``count`` exceeds this value, so up to cap + 1
    # examples are kept.
    cap = 13000
    with gzip.open(f, 'r') as file:
        for chunk in NQ_file_open(file, device):
            if count > cap or chunk == "NONE":
                break
            examples['questions'].append(chunk['question_text'].split(" "))
            examples['docs'].append(
                [elem['token'] for elem in chunk['document_tokens'] if not elem['html_token']]
            )
            count += 1

    k_hard = 10
    j_rand = 20
    # The context manager closes the CSV even if scoring fails part-way.
    with open(file_out, 'w+', newline='\n') as out_file:
        write = csv.writer(out_file)
        bm25 = BM25Okapi(examples['docs'])
        print(examples['questions'][0])
        for i, tokenized_query in enumerate(examples['questions']):
            doc_scores = bm25.get_scores(tokenized_query)
            hard_ix = kmax(k_hard, doc_scores, i)
            rand_ix = sample(j_rand, doc_scores, hard_ix + [i])
            # Row layout: positive index, hard negatives, random negatives.
            write.writerow([i] + hard_ix + rand_ix)
       

In [4]:
def embed_vals(elements, tokenizer, encoder):
    """Encode every item of ``elements`` on its own and collect the results.

    Each item is tokenized with padding and truncation to 512 tokens, its
    ``input_ids`` are sent to the module-level ``device`` as a batch of one,
    and the first element of the encoder's output is moved to the CPU.

    Args:
        elements: Iterable of inputs accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry.
        encoder: Callable applied to the ``input_ids`` tensor.

    Returns:
        A list with one CPU tensor per item, in input order.
    """
    print('embedding values.')
    outputs = []
    # Inference only, so no autograd graph is needed for any of the items.
    with torch.no_grad():
        for _, text in tqdm(enumerate(elements)):
            encoding = tokenizer(text, padding='max_length', max_length=512, truncation=True)
            ids = torch.tensor([encoding['input_ids']]).to(device)
            outputs.append(encoder(ids)[0].cpu())
    return outputs

def get_matrix(embeds):
    """Stack the first row of every entry of ``embeds`` into one 2-D tensor.

    Args:
        embeds: Iterable of tensors (or nested sequences) whose element ``[0]``
            is a 1-D vector; all vectors must have the same length.

    Returns:
        A tensor with one row per entry of ``embeds``. An empty input yields
        an empty 1-D tensor.
    """
    rows = [torch.as_tensor(vec[0]) for vec in embeds]
    if not rows:
        return torch.tensor([])
    # One stack call replaces rebuilding the matrix scalar by scalar through
    # nested Python lists; detach keeps the result free of any autograd graph.
    return torch.stack(rows).detach()


def store_vals(file_index, q_embeds, c_embeds, dev):
    """Save question and context embedding matrices for one data file.

    Both embedding lists are converted with ``get_matrix`` and written with
    ``torch.save`` as ``context-{file_index}-embeds`` and
    ``question-{file_index}-embeds``.

    Args:
        file_index: Value used in the output file names.
        q_embeds: Question embeddings, in the form ``get_matrix`` accepts.
        c_embeds: Context embeddings, in the form ``get_matrix`` accepts.
        dev: When truthy the files go to the ``dev`` directory, otherwise to
            ``train``.
    """
    file_q = get_matrix(q_embeds)
    file_c = get_matrix(c_embeds)
    split = "dev" if dev else "train"
    out_dir = f"/home/ubuntu/nlm/williamyang/DPR_Preprocess_Data/{split}"
    # get_matrix already returns tensors; wrapping them in torch.tensor()
    # again only makes another copy and emits a copy-construct warning.
    torch.save(file_c, f"{out_dir}/context-{file_index}-embeds")
    torch.save(file_q, f"{out_dir}/question-{file_index}-embeds")
    
def NQ_file_processing(f, file_index, device, dev):
    """Embed the questions and documents of one gzipped NQ file and store them.

    Each example contributes its ``question_text`` and a document string made
    of its non-HTML ``document_tokens``, every token preceded by one space.
    Questions and documents are encoded with the pretrained DPR question and
    context encoders and handed to ``store_vals``.

    Args:
        f: Path to a gzip-compressed file with one JSON example per line.
        file_index: Forwarded to ``store_vals`` for the output file names.
        device: Device the encoders are moved to; also passed to ``NQ_file_open``.
        dev: Forwarded to ``store_vals`` to select the output directory.
    """
    ctx_checkpoint = 'facebook/dpr-ctx_encoder-single-nq-base'
    question_checkpoint = 'facebook/dpr-question_encoder-single-nq-base'

    examples = {'docs': [], 'questions': []}
    count = 0
    cap = 13000  # reading stops once count exceeds this value
    with gzip.open(f, 'r') as file:
        for chunk in NQ_file_open(file, device):
            if count > cap or chunk == "NONE":
                break
            examples['questions'].append(chunk['question_text'])
            text_tokens = [
                elem['token'] for elem in chunk['document_tokens'] if not elem['html_token']
            ]
            # Every token is prefixed by a space, so the string starts with one.
            examples['docs'].append("".join(" " + token for token in text_tokens))
            count += 1
    print("FINAL COUNT: ", count)

    with torch.no_grad():
        context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(ctx_checkpoint)
        context_encoder = DPRContextEncoder.from_pretrained(ctx_checkpoint).to(device)
        question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_checkpoint)
        question_encoder = DPRQuestionEncoder.from_pretrained(question_checkpoint).to(device)

    question_vals = embed_vals(examples['questions'], question_tokenizer, question_encoder)
    context_vals = embed_vals(examples['docs'], context_tokenizer, context_encoder)
    store_vals(file_index, question_vals, context_vals, dev)



    

In [4]:
# Dev split: write the passage-ranking index CSV for each selected shard.
for i in range(1, 2):
    print("FILE NUMBER: ", i)
    # Single-digit shard numbers carry a leading zero in the input file name.
    file_name = (
        f'/home/ubuntu/nlm/williamyang/Data/NQ/dev/nq-dev-0{i}.jsonl.gz'
        if i < 10
        else f'/home/ubuntu/nlm/williamyang/Data/NQ/dev/nq-dev-{i}.jsonl.gz'
    )
    file_out = f'/home/ubuntu/nlm/williamyang/DPR_Preprocess_Data/dev/nq-dev-ix-{i}.csv'
    NQ_index_processing(file_name, file_out, device)

FILE NUMBER:  1
['what', 'do', 'the', '3', 'dots', 'mean', 'in', 'math']


In [12]:
# Train split: write the passage-ranking index CSV for each selected shard.
for i in range(0, 1):
    print("FILE NUMBER: ", i)
    # Single-digit shard numbers carry a leading zero in the input file name.
    file_name = (
        f'/home/ubuntu/nlm/williamyang/Data/NQ/train/nq-train-0{i}.jsonl.gz'
        if i < 10
        else f'/home/ubuntu/nlm/williamyang/Data/NQ/train/nq-train-{i}.jsonl.gz'
    )
    file_out = f'/home/ubuntu/nlm/williamyang/DPR_Preprocess_Data/train/nq-train-ix-{i}.csv'
    NQ_index_processing(file_name, file_out, device)

FILE NUMBER:  0
['when', 'is', 'the', 'last', 'episode', 'of', 'season', '8', 'of', 'the', 'walking', 'dead']


In [9]:
# Dev split: compute and store context/question embeddings per selected shard.
for i in range(1, 2):
    print("FILE NUMBER: ", i)
    # Single-digit shard numbers carry a leading zero in the input file name.
    file_name = (
        f'/home/ubuntu/nlm/williamyang/Data/NQ/dev/nq-dev-0{i}.jsonl.gz'
        if i < 10
        else f'/home/ubuntu/nlm/williamyang/Data/NQ/dev/nq-dev-{i}.jsonl.gz'
    )
    NQ_file_processing(file_name, i, device, dev=True)

FILE NUMBER:  1
FINAL COUNT:  1494


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

embedding values.


1494it [00:20, 74.69it/s]


embedding values.


1494it [03:59,  6.23it/s]


In [None]:
# Train split: compute and store context/question embeddings per selected shard.
for i in range(0, 2):
    print("FILE NUMBER: ", i)
    # Single-digit shard numbers carry a leading zero in the input file name.
    file_name = (
        f'/home/ubuntu/nlm/williamyang/Data/NQ/train/nq-train-0{i}.jsonl.gz'
        if i < 10
        else f'/home/ubuntu/nlm/williamyang/Data/NQ/train/nq-train-{i}.jsonl.gz'
    )
    NQ_file_processing(file_name, i, device, dev=False)

In [5]:
#pre-processing for generating phrase embeddings for testing
def NQ_file_open(file_object, device):
    """Yield one parsed JSON value per line of ``file_object``.

    Args:
        file_object: Open file-like object whose ``readline()`` returns one
            JSON document per line (``str`` or ``bytes``).
        device: Unused; kept so existing call sites keep working.

    Yields:
        The decoded JSON value of each line. The string ``"NONE"`` is yielded
        for a line that cannot be parsed, and once more when the end of the
        file is reached, after which the generator finishes.

    Raises:
        Whatever ``file_object.readline()`` raises (for example on a corrupt
        archive); read errors are no longer reported as end of data.
    """
    while True:
        line = file_object.readline()
        if not line:
            # End of file: emit the sentinel callers look for, then stop.
            yield "NONE"
            return
        try:
            chunk = json.loads(line)
        except ValueError:
            # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            yield "NONE"
            continue
        # Yield outside the try block so closing the generator is never
        # intercepted by the error handler.
        yield chunk

def pad_embeds(embeds, target):
    """Copy ``embeds`` into the leading rows of a clone of ``target``.

    Args:
        embeds: 2-D tensor of shape ``(x, y)``.
        target: 3-D tensor whose slice ``[:, :x, :]`` can receive ``embeds``
            by broadcasting. It is left unmodified.

    Returns:
        A clone of ``target`` in which ``[:, :x, :]`` holds ``embeds``; all
        other entries keep the values they have in ``target``.
    """
    # Unpacking also enforces that ``embeds`` is 2-D. No reshape is needed:
    # the slice assignment broadcasts ``embeds`` over the first dimension, and
    # torch.reshape returns a new tensor rather than changing its argument.
    x, _ = embeds.shape
    proper = torch.clone(target)
    proper[:, :x, :] = embeds
    return proper

def phrase_creator(contexts, phrase_len, batch_size, token_max_len, device):
    """Embed phrases in batches and tag every embedding with its passage number.

    Args:
        contexts: Sequence of ``(passage_number, text)`` pairs.
        phrase_len: Unused; kept so existing call sites keep working.
        batch_size: Number of phrases encoded per forward pass.
        token_max_len: Length every phrase is padded or truncated to by the
            tokenizer.
        device: Device the encoder and its inputs are placed on.

    Returns:
        A CPU tensor with one row per entry of ``contexts``: column 0 holds
        the passage number (stored in the embeddings' dtype), the remaining
        columns hold the context-encoder output for the phrase.
    """
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
    context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
    # Ceiling division, so the printed total matches the batches actually run.
    print("TOTAL ITERS:", (len(contexts) + batch_size - 1) // batch_size)
    batches = []
    with torch.no_grad():
        for i in tqdm(range(0, len(contexts), batch_size)):
            chunk = contexts[i:i + batch_size]
            batch = [item[1] for item in chunk]
            tokenized = context_tokenizer(batch, padding='max_length', max_length=token_max_len, truncation=True)
            batch_embeds = context_encoder(torch.tensor(tokenized['input_ids']).to(device))[0]
            passage_nums = torch.tensor(
                [[item[0]] for item in chunk],
                dtype=batch_embeds.dtype,
                device=batch_embeds.device,
            )
            # Park each finished batch on the CPU and join once at the end;
            # re-concatenating a growing tensor copies all earlier rows on
            # every step.
            batches.append(torch.cat((passage_nums, batch_embeds), 1).cpu())
    if not batches:
        # No input: keep the historical width of 1 id column plus 768 values.
        return torch.empty((0, 769), dtype=torch.float)
    return torch.cat(batches, 0)
            
def store_phrase_vals(file_index, c_embeds, phrase_len, dev):
    """Save phrase embeddings as ``context-{file_index}-{phrase_len}-embeds``.

    Args:
        file_index: Value used in the output file name.
        c_embeds: Tensor (or nested sequence convertible to one) to store.
        phrase_len: Value used in the output file name.
        dev: When truthy the file goes to the ``dev`` directory, otherwise to
            ``train``.
    """
    split = "dev" if dev else "train"
    # as_tensor accepts tensors and nested sequences alike; detach().clone()
    # gives the same independent copy torch.tensor() produced, without the
    # copy-construct warning it raises for tensor inputs.
    payload = torch.as_tensor(c_embeds).detach().clone()
    torch.save(
        payload,
        f"/home/ubuntu/nlm/williamyang/DPR_Preprocess_Data/{split}/context-{file_index}-{phrase_len}-embeds",
    )
            
def NQ_phrase_preprocessing(f, file_index, device, phrase_len, dev):
    """Split every document of a gzipped NQ file into fixed-length phrases.

    The non-HTML ``document_tokens`` of each example are cut into consecutive
    groups of ``phrase_len`` tokens (the last group may be shorter). Each
    phrase is a string in which every token is preceded by one space. Phrases
    never mix tokens from different documents, and no empty phrase is
    produced; a document without non-HTML tokens contributes nothing.

    Args:
        f: Path to a gzip-compressed file with one JSON example per line.
        file_index: Unused; kept so existing call sites keep working.
        device: Passed through to ``NQ_file_open``; not otherwise used.
        phrase_len: Number of tokens per phrase; must be positive.
        dev: Unused; kept so existing call sites keep working.

    Returns:
        A list of ``(document_number, phrase)`` tuples in reading order, where
        ``document_number`` is the zero-based position of the example in ``f``.
    """
    examples = []
    count = 0
    cap = 13000  # reading stops once count exceeds this value
    with gzip.open(f, 'r') as file:
        for chunk in NQ_file_open(file, device):
            if count > cap or chunk == "NONE":
                break
            words = [
                tok['token'] for tok in chunk['document_tokens'] if not tok['html_token']
            ]
            # Slicing a per-document token list keeps a document's trailing
            # partial phrase from leaking into the next document.
            for start in range(0, len(words), phrase_len):
                phrase = "".join(" " + word for word in words[start:start + phrase_len])
                examples.append((count, phrase))
            count += 1

    print("COUNT:", count)
    return examples
    
    
        
    

In [3]:
# Settings for phrase extraction on the first train shard.
phrase_len, dev, file_index = 500, False, 0
file_path = "/home/ubuntu/nlm/williamyang/Data/NQ/train/nq-train-00.jsonl.gz"
vals = NQ_phrase_preprocessing(
    f=file_path,
    file_index=file_index,
    device=device,
    phrase_len=phrase_len,
    dev=dev,
)

COUNT: 5961


In [6]:
# The tokenizer length is capped at 512.
# NOTE(review): phrase_len counts whitespace tokens while the tokenizer limit
# is presumably in sub-word tokens, so long phrases may be truncated - confirm.
token_max_len = phrase_len if phrase_len < 512 else 512
embeds = phrase_creator(
    contexts=vals,
    phrase_len=phrase_len,
    batch_size=20,
    token_max_len=token_max_len,
    device=device,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

TOTAL ITERS: 4533


4534it [23:31,  3.21it/s]


In [7]:
# Persist the phrase embeddings held in `embeds` to disk.
store_phrase_vals(
    file_index=file_index,
    c_embeds=embeds,
    phrase_len=phrase_len,
    dev=dev,
)




In [None]:
# Drop the references to the large intermediates so their memory can be reclaimed.
embeds = vals = None