In [115]:
import pyserini.search as pys
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd
import time
import math as m

In [113]:

# BM25 retriever over the prebuilt Lucene index of the sample collection.
searcher = pys.SimpleSearcher('indexes/sample_collection_jsonl')

# DistilBERT with a sequence-classification head, used below to score (query, sentence) pairs.
# NOTE(review): the load log for this checkpoint reports the classifier weights as newly
# initialized, i.e. the head is untrained -- its scores only become meaningful after
# fine-tuning or after switching to a checkpoint that ships a trained head.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Read at most the first NUM_QUERIES dev queries; each line is "<id>\t<text>".
NUM_QUERIES = 20
queries = []

# utf8 is given explicitly to match how findDoc opens the collection files.
with open("data/queries.dev.tsv", encoding="utf8") as f:
    for _ in range(NUM_QUERIES):
        line = f.readline()
        if not line:
            # The file has fewer than NUM_QUERIES lines: stop instead of failing on int("").
            break
        fields = line.split("\t")
        queries.append({"id": int(fields[0]), "content": fields[1].strip()})

print(queries[0])

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

{'id': 1048578, 'content': 'cost of endless pools/swim spa'}


In [4]:

# Sanity check: top-20 BM25 hits for the first query, one "rank docid score" line per hit.
hits = searcher.search(queries[0]['content'], k=20)

for rank, hit in enumerate(hits, start=1):
    print(f'{rank:2} {hit.docid:4} {hit.score:.5f}')

 1 7471198 21.35000
 2 7187236 20.33760
 3 5365326 19.68380
 4 7187234 19.53490
 5 7187242 18.88170
 6 2078221 18.84990
 7 7187241 18.66050
 8 6802210 18.12250
 9 6794083 17.85240
10 5365328 17.60240
11 6750054 17.18490
12 4332300 16.23630
13 6347088 16.22780
14 6347089 16.22780
15 6270168 16.07760
16 3982208 15.82690
17 7471199 15.75350
18 7313043 15.45380
19 8105762 15.30410
20 1139145 15.19000


In [5]:
def findDoc(id, mode="chunk"):
    """Return the raw collection line for a numeric document id.

    Parameters
    ----------
    id : int
        Numeric document id.
    mode : str
        "linear" -- read data/collection.tsv from the top and return the line at
        zero-based position `id`.
        "chunk" (default) -- open data/collection_chunks/<id - id % 10000>.txt,
        read `id % 10000` lines and return the last one read.

    Returns
    -------
    str
        The line exactly as readline() produced it (trailing newline included),
        or "" when the file ends before the requested line.

    Raises
    ------
    ValueError
        If `mode` is neither "linear" nor "chunk".
    LookupError
        In "chunk" mode when `id % 10000 == 0`: zero lines would be read, so
        there is no line to return.
    """
    if mode == "linear":
        line = ""
        with open("data/collection.tsv", encoding="utf8") as f:
            for _ in range(id + 1):
                line = f.readline()
        # Returned (not printed) so callers can post-process it like the chunk result.
        return line

    if mode == "chunk":
        offset = id % 10000
        chunk_start = id - offset
        if offset == 0:
            raise LookupError(
                f"document {id} sits at offset 0 of chunk {chunk_start}; "
                "the chunk lookup has no line to return for it"
            )
        # NOTE(review): this returns the line at zero-based position offset - 1 inside the
        # chunk, whereas "linear" returns zero-based position `id` of the full file.
        # Confirm this matches how the chunk files were written.
        with open(f"data/collection_chunks/{chunk_start}.txt", encoding="utf8") as f:
            for _ in range(offset):
                line = f.readline()
        return line

    raise ValueError(f"unknown mode {mode!r}; expected 'linear' or 'chunk'")


In [31]:
# Weights applied, in rank order, to a document's three highest sentence scores
# (used by calcBertScore and combine3); they sum to 1.
w = [2/3, 1/6, 1/6]
# Interpolation weight used in bertRankQuery:
# final = a * (BM25 / max BM25) + (1 - a) * (BERT / max BERT).
a = 0.5
# Note for the scoring functions below: the tokenizer automatically adds the model-specific
# special tokens (e.g. [CLS] and [SEP]) to the sequence and also computes the attention masks.

def getSim(query, doc):
    """Score a single (query, text) pair with the sequence-classification model.

    The pair is encoded as one two-segment input; the tokenizer adds the
    model-specific special tokens and builds the attention mask.

    Parameters
    ----------
    query : str
        The search query (first segment).
    doc : str
        The text to compare against the query (second segment).

    Returns
    -------
    float
        Softmax probability of class index 1 for the pair.
    """
    # truncation=True keeps over-long pairs within the model's maximum input length;
    # .to(device) puts the inputs on the same device the model was moved to.
    encoded = tokenizer(query, doc, truncation=True, return_tensors="pt").to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = torch.softmax(logits, dim=1).tolist()[0]
    return probabilities[1]
     
def calcBertScore(query, doc):
    """Score a document against a query from its best-matching sentences.

    The document is split on "."; every non-empty, stripped piece is scored with
    getSim. The (up to) three highest scores are combined as a weighted sum
    using the module-level weights `w`, highest score first.

    Returns 0 when the document yields no non-empty piece.
    """
    scored = []
    for fragment in doc.split("."):
        fragment = fragment.strip()
        if fragment == "":
            continue
        scored.append({"score": getSim(query, fragment), "sentence": fragment})

    # Best sentences first; only the top three contribute.
    scored.sort(key=lambda entry: entry["score"], reverse=True)
    best = scored[:3]

    total = 0
    for position, entry in enumerate(best):
        total = total + w[position] * entry["score"]
    return total



In [134]:
def combine3(x):
    """Collapse a group of sentence scores into one document score.

    `x` is a pandas Series of scores. Its values are ordered from highest to
    lowest and the first three (or fewer) are summed with the module-level
    weights `w`. An empty group gives 0.
    """
    ranked = list(x.to_numpy())
    ranked.sort(reverse=True)

    total = 0
    for position, value in enumerate(ranked[:3]):
        total = total + w[position] * value
    return total

def calcBertScores(query, docs, batch_size=2):
    """Score every sentence in `docs` against `query`, in batches.

    Parameters
    ----------
    query : str
        The search query; paired with each sentence as the first segment.
    docs : pandas.DataFrame
        Frame with a "sentence" column holding the texts to score.
    batch_size : int, optional
        Number of (query, sentence) pairs per forward pass. Defaults to 2.

    Returns
    -------
    list of float
        Softmax probability of class index 1 for each sentence, in row order.
        Empty when `docs` has no rows.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize the sentence list once rather than on every batch.
    sentences = docs["sentence"].to_numpy().tolist()

    scores = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # padding=True pads to the longest pair in the batch; truncation=True keeps
        # over-long pairs within the model's maximum input length.
        encoded_inputs = tokenizer(
            [query] * len(batch),
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            classification_logits = model(**encoded_inputs).logits

        probabilities = torch.softmax(classification_logits, dim=1).tolist()
        scores.extend(row[1] for row in probabilities)

    return scores


def expandSentences(query, hits):
    """Explode search hits into one row per sentence.

    For each hit the document line is fetched with findDoc, the text after the
    first tab is split on ".", and every non-empty stripped piece becomes a row
    carrying the hit's integer docid and its retrieval score.

    `query` is accepted but not used here.

    Returns a DataFrame with columns "docid", "sentence" and "bm-25".
    """
    doc_ids = []
    sentence_texts = []
    bm25_scores = []

    for hit in hits:
        doc_id = int(hit.docid)
        passage = findDoc(doc_id).split("\t")[1]

        pieces = [piece.strip() for piece in passage.split(".")]
        kept = [piece for piece in pieces if piece != ""]

        doc_ids.extend([doc_id] * len(kept))
        sentence_texts.extend(kept)
        bm25_scores.extend([hit.score] * len(kept))

    return pd.DataFrame({"docid": doc_ids, "sentence": sentence_texts, "bm-25": bm25_scores})

    


def bertRankQuery(query, k=20):
    """Retrieve candidates for `query` and re-rank them with the BERT scorer.

    Steps: fetch the top-`k` hits from `searcher`, explode them into sentences
    (expandSentences), score every sentence (calcBertScores), collapse to one
    row per document (first sentence and first BM25 value; BERT scores merged
    with combine3), then blend the two signals with weight `a` after dividing
    each by its maximum over the candidate set.

    Returns a DataFrame with columns docid (int32), sentence, bm-25, bert and
    final, sorted by `final` in descending order.
    """
    candidates = searcher.search(query, k=k)

    per_sentence = expandSentences(query, candidates)
    per_sentence["bert"] = calcBertScores(query, per_sentence)

    # One groupby feeds both aggregations; the combine3 result aligns on the docid index.
    by_doc = per_sentence.groupby("docid")
    ranked = by_doc.first()
    ranked["bert"] = by_doc["bert"].aggregate(combine3)
    ranked = ranked.reset_index()

    # Scale each signal by its maximum before mixing.
    top_bm25 = ranked["bm-25"].max()
    top_bert = ranked["bert"].max()
    bm25_part = a * ranked["bm-25"] / top_bm25
    bert_part = (1 - a) * ranked["bert"] / top_bert
    ranked["final"] = bm25_part + bert_part

    return ranked.astype({'docid': 'int32'}).sort_values("final", ascending=False)

# Placeholder binding only: the loop below rebinds `query` to each query dict in turn.
query = queries[0]["content"]
res = []

# Re-rank the first five loaded queries, pulling 100 candidates for each.
for query in queries[:5]:
    reranked = bertRankQuery(query["content"], 100)
    res.append({"query": query, "res": reranked})



In [135]:
def exportRes(results, filename = "res.txt"):
    """Write re-ranked results to `filename` in TREC run format.

    Parameters
    ----------
    results : list of dict
        One entry per query, each with "query" (a dict holding the query "id")
        and "res" (a DataFrame with "docid" and "final" columns, already in
        ranking order).
    filename : str, optional
        Output path; an existing file is overwritten. Defaults to "res.txt".

    Each output line is "<query id> Q0 <docid> <rank> <score> Bertserini ",
    with ranks starting at 1 in row order.
    """
    # The context manager guarantees the run file is flushed and closed, even on error.
    with open(filename, "w") as f:
        for r in results:
            ranking = r["res"]
            query_id = r["query"]["id"]
            for i in range(len(ranking)):
                docid = ranking["docid"].iat[i]
                score = ranking["final"].iat[i]
                f.write(f"{query_id} Q0 {docid} {i + 1} {score} Bertserini \n")
        
# Write the run file for every re-ranked query (default output path "res.txt").
exportRes(res)
        
# Dump the raw list of {"query", "res"} records for a quick visual check.
print(res)

[{'query': {'id': 1048578, 'content': 'cost of endless pools/swim spa'}, 'res':       docid                                           sentence      bm-25  \
89  7471198  Cal Spas is the leading maker of home resort p...  21.350000   
50  5365326                      How much does a swim spa cost  19.683800   
79  7187234  Endless pools and swim spas are available in a...  19.534901   
16  2078221  1 A number of value brands will sell swim spas...  18.849899   
85  7187242                                   Endless Pools vs  18.881701   
..      ...                                                ...        ...   
28  3279353  Our customers typically spend about twice the ...  13.975899   
58  5800643  He holds licenses for: Construction/ Renovatio...  13.494600   
83  7187240  An endless pool, sometimes referred to as a tr...  13.402500   
95  8105762  Pinch A Penny Pool Patio & Spa is your headqua...  15.304100   
12  1880841  What is a swimming pool autofill? Kona Labs de...  14.427700

In [83]:
# NOTE(review): scratch cell; its recorded run ended in the ValueError shown in the output below.
# It depends on kernel state: `scores` is not assigned at top level anywhere in the visible
# notebook, and `res` is presumably meant to be a per-sentence DataFrame rather than the
# list built by the re-ranking loop -- TODO confirm, then fix or remove this cell.
res["bert"] = scores
    
# Per-document view: first row of each docid group, with "bert" replaced by the combine3 score.
agr = res.groupby("docid").first()
agr["bert"] = res.groupby("docid")["bert"].aggregate(combine3)
agr

ValueError: Length of values (82) does not match length of index (20)