# Buscador TF-IDF no TREC-DL 2020

Aqui é implementado um buscador com vetorização TF-IDF, que leva em conta tanto a frequência dos termos em cada documento quanto a "raridade" de cada termo no corpus.

## Download do dataset

In [1]:
# Mount the user's Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Working directory on the mounted Drive; the other paths in this notebook
# are built from it.
main_path = '/content/drive/MyDrive/Unicamp-aula-2/'

# Create the directory on the first run; otherwise just report that it exists.
if os.path.exists(main_path):
  print('Diretório já existente')
else:
  os.makedirs(main_path)

Diretório já existente


## Download de ferramentas auxiliares

In [3]:
!pip install pyserini

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyserini
  Downloading pyserini-0.20.0-py3-none-any.whl (137.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.1/137.1 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyjnius>=1.4.0
  Downloading pyjnius-1.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnxruntime>=1.8.1
  Downloading onnxruntime-1.14.1-cp38-cp38-manylinux_2_27_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas>=1.4.0
  Downloading pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m97.8 MB/s[0

In [None]:
# Clone the pyserini repository (with submodules) under main_path; the cells
# below use its tools/eval and tools/topics-and-qrels directories.
!git clone https://github.com/castorini/pyserini.git --recurse-submodules {main_path}/pyserini

In [None]:
!cd {main_path}/pyserini/tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
!cd {main_path}/pyserini/tools/eval/ndeval && make && cd ../../..

trec_eval.9.0.4/
trec_eval.9.0.4/m_prefs_pair.c
trec_eval.9.0.4/m_ndcg_p.c
trec_eval.9.0.4/m_infap.c
trec_eval.9.0.4/m_num_q.c
trec_eval.9.0.4/m_iprec_at_recall.c
trec_eval.9.0.4/form_prefs_counts.c
trec_eval.9.0.4/m_prefs_num_prefs_ful_ret.c
trec_eval.9.0.4/utility_pool.c
trec_eval.9.0.4/m_binG.c
trec_eval.9.0.4/meas_avg.c
trec_eval.9.0.4/m_gm_bpref.c
trec_eval.9.0.4/m_runid.c
trec_eval.9.0.4/m_bpref.c
trec_eval.9.0.4/m_gm_map.c
trec_eval.9.0.4/trec_eval.h
trec_eval.9.0.4/m_yaap.c
trec_eval.9.0.4/m_relstring.c
trec_eval.9.0.4/m_Rprec.c
trec_eval.9.0.4/m_prefs_avgjg.c
trec_eval.9.0.4/m_success.c
trec_eval.9.0.4/m_ndcg.c
trec_eval.9.0.4/functions.h
trec_eval.9.0.4/m_P_avgjg.c
trec_eval.9.0.4/test/
trec_eval.9.0.4/test/qrels.rel_level
trec_eval.9.0.4/test/results.test
trec_eval.9.0.4/test/qrels.test
trec_eval.9.0.4/test/out.test.qrels_jg
trec_eval.9.0.4/test/out.test.meas_params
trec_eval.9.0.4/test/out.test.a
trec_eval.9.0.4/test/out.test.prefs
trec_eval.9.0.4/test/out.test.aqcM
trec_ev

## Construção do índice invertido

In [4]:
from pyserini.analysis import Analyzer, get_lucene_analyzer

In [5]:
from psutil import virtual_memory

# Total memory reported by psutil, converted to gigabytes (1 GB = 1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to recognise a high-RAM runtime.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [6]:
# Lucene analyzer with Porter stemming; used for both documents and queries.
analyzer: Analyzer = Analyzer(get_lucene_analyzer(stemmer='porter'))


def preprocess_and_tokenize(text):
  """Run `text` through the shared Lucene analyzer and return its tokens."""
  tokens = analyzer.analyze(text)
  return tokens

In [7]:

# Passage collection file (TSV) read by the index builder and by get_document_by_id.
collection_path = f'{main_path}/collections/msmarco-passage/collection.tsv'



In [8]:
import nltk
import string

from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Fetch the NLTK stopword corpus if it is not already downloaded.
nltk.download('stopwords')

from nltk.corpus import stopwords

# English stopwords, kept as a set for fast membership tests while indexing.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
import array
import pandas as pd
from collections import defaultdict
from collections import Counter
import pickle
import os

# Pickle file used to cache the inverted index between runs.
index_path = f"{main_path}/index-tf-idf.pickle"

def load_or_build_inverted_index():
  """Load the pickled inverted index from `index_path`, building it first if absent.

  When no cache file exists, the collection at `collection_path` (tab-separated:
  doc id, passage text) is streamed in chunks. Each passage is tokenized with
  `preprocess_and_tokenize`, and a posting is appended for every distinct token
  that is not in `stop_words`. The result is pickled to `index_path`.

  Returns:
    A dict. When freshly built it has the keys
      "inverted_index": token -> {"docs": array('L') of doc ids,
                                  "tf":   array('f') of count / doc_length},
                        where the two arrays are parallel;
      "n_documents": number of rows read from the collection.
    A cache file found on disk is returned exactly as it was pickled.
  """
  if os.path.exists(index_path):
    # NOTE: pickle.load can execute arbitrary code. Only load a cache file that
    # this notebook wrote itself.
    with open(index_path, "rb") as f:
      print("Loading index...")
      return pickle.load(f)

  print("Building inverted index and vocabulary...")
  chunk_size = 1000  # rows read from the TSV per pandas chunk
  inverted_index = dict()
  n_documents = 0

  def add_document(doc_id, text):
    """Append one posting per distinct non-stopword token of `text`."""
    tokens = preprocess_and_tokenize(text)
    # Stop words are not indexed but still count towards the document length
    # that normalises the term frequency.
    doc_length = len(tokens)
    for token, count in Counter(tokens).items():
      if token in stop_words:
        continue
      postings = inverted_index.get(token)
      if postings is None:
        # Two parallel arrays per token: document ids and term frequencies.
        postings = {"docs": array.array("L"), "tf": array.array("f")}
        inverted_index[token] = postings
      postings["docs"].append(int(doc_id))
      postings["tf"].append(count / doc_length)

  # Stream the file in chunks so the whole collection is never held in memory.
  reader = pd.read_csv(collection_path, sep='\t', header=None, chunksize=chunk_size)
  for chunk_id, chunk in enumerate(reader):
    if (chunk_id % 1000) == 0:
      print(f'Processing chunk {chunk_id}')
    # Column 0 holds the doc id, column 1 the passage text.
    for doc_id, text in zip(chunk[0], chunk[1]):
      add_document(doc_id, text)
    n_documents += len(chunk)

  index = {"inverted_index": inverted_index, "n_documents": n_documents}

  with open(index_path, "wb") as f:
    pickle.dump(index, f)

  return index

In [10]:
# Load (or build) the index and unpack the two pieces that search() reads.
index = load_or_build_inverted_index()
inverted_index = index["inverted_index"]
n_documents = index["n_documents"]
# Drop a 'vocab' entry if the loaded cache has one (presumably left over from
# an older cache format). The builder in this notebook never writes that key,
# so an unconditional `del` would raise KeyError on a freshly built index.
index.pop('vocab', None)

Loading index...


In [11]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')


In [12]:
# Number of distinct tokens (keys) in the inverted index.
len(inverted_index)

2660662

In [13]:
# Peek at the first lines of the collection: one "<doc id>\t<passage>" per line.
!head {collection_path}

0	The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.
1	The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.
2	Essay on The Manhattan Project - The Manhattan Project The Manhattan Project was to see if making an atomic bomb possible. The success of this project would forever change the world forever making it known that something this powerful can be manmade.
3	The Manhattan Project was the name for a project conducted during World War II, to develop the first atomic bomb. It refers specifically to the period of the project from 194 â¦ 2-1946 under the control of the U.S. Army Corps of Engineers

## Avaliação

In [29]:
# Query topics and relevance judgments shipped inside the cloned pyserini repo.
topics_file = f'{main_path}/pyserini/tools/topics-and-qrels/topics.dl20.txt'
qrels_eval = f'{main_path}/pyserini/tools/topics-and-qrels/qrels.dl20-passage.txt'

In [30]:
# Peek at the topics file: one "<query id>\t<query text>" per line.
!head {topics_file}

1030303	who is aziz hashim
1037496	who is rep scalise?
1043135	who killed nicholas ii of russia
1045109	who owns barnhart crane
1049519	who said no one can make you feel inferior
1051399	who sings monk theme song
1056416	who was the highest career passer  rating in the nfl
1064670	why do hunters pattern their shotguns?
1065636	why do some places on my scalp feel sore
1071750	why is pete rose banned from hall of fame


In [31]:
# Peek at the relevance judgments file.
!head {qrels_eval}

23849 0 1020327 2
23849 0 1034183 3
23849 0 1120730 0
23849 0 1139571 1
23849 0 1143724 0
23849 0 1147202 0
23849 0 1150311 0
23849 0 1158886 2
23849 0 1175024 1
23849 0 1201385 0


In [74]:
import math
import torch.nn.functional as F

def search(query):
  """Score documents against `query` with a TF-IDF cosine similarity.

  The query is tokenized with `preprocess_and_tokenize`. For every distinct
  query token present in `inverted_index`, the query weight is
  (count / number of query tokens) * idf and the document weight is
  stored tf * idf, with idf = log(n_documents / number of docs holding the token).

  NOTE: the document norm is accumulated only over the terms a document shares
  with the query (the index stores no full-document norms). A document that
  matches the query terms in the same proportions therefore scores ~1.0,
  whatever else it contains.

  Args:
    query: query text.

  Returns:
    defaultdict mapping doc id -> similarity score, with one entry for every
    document that contains at least one indexed query token. Documents whose
    norm (or the query's norm) is zero get a score of 0.0 instead of raising
    ZeroDivisionError.
  """
  doc_scores = defaultdict(int)   # doc id -> dot product, later the cosine score
  doc_norms = defaultdict(float)  # doc id -> squared norm, later the norm
  query_tokens = preprocess_and_tokenize(query)
  n_query_tokens = len(query_tokens)
  query_norm_sq = 0

  for token, count in Counter(query_tokens).items():
    postings = inverted_index.get(token)
    if postings is None:
      continue

    # TF-IDF weight of the (term, query) pair.
    doc_ids = postings["docs"]
    n_docs_contain_term = len(set(doc_ids))
    idf = math.log(n_documents / n_docs_contain_term)
    query_tf_idf = (count / n_query_tokens) * idf
    query_norm_sq += query_tf_idf**2

    # "docs" and "tf" are parallel arrays, so they can be walked together.
    for doc_id, doc_tf in zip(doc_ids, postings["tf"]):
      # TF-IDF weight of the (term, document) pair.
      doc_tf_idf = doc_tf * idf
      doc_norms[doc_id] += doc_tf_idf**2
      # Accumulate the dot product between the query and document vectors.
      doc_scores[doc_id] += query_tf_idf * doc_tf_idf

  query_norm = math.sqrt(query_norm_sq)

  for doc_id in doc_scores:
    doc_norm = math.sqrt(doc_norms[doc_id])
    doc_norms[doc_id] = doc_norm
    denominator = query_norm * doc_norm
    # A zero norm happens when every matched term has idf == 0 (the term occurs
    # in all documents); treat the similarity as 0 rather than dividing by zero.
    doc_scores[doc_id] = doc_scores[doc_id] / denominator if denominator else 0.0

  return doc_scores

In [51]:
# Try the ranker on one sample query taken from the topics file.
results = search('who is aziz hashim')

In [52]:
# Number of documents that received a score for the sample query.
len(results)

245

In [53]:
# Display the raw doc id -> score mapping (unsorted).
results
# NOTE(review): the ids below look like reference doc ids for this query —
# confirm where they come from.
#6989780, 1305521, 4358004, 1815707, 7508059

defaultdict(int,
            {22484: 0.6599182397808202,
             68532: 0.6599182397808202,
             119291: 0.6599182397808202,
             176182: 0.6599182397808201,
             176183: 0.6599182397808202,
             226464: 0.6599182397808201,
             279227: 0.6599182397808201,
             315873: 0.6599182397808201,
             480287: 0.6599182397808202,
             596813: 0.6599182397808202,
             705402: 0.6599182397808201,
             770274: 0.6599182397808201,
             794624: 0.6599182397808202,
             821997: 0.6599182397808202,
             848943: 0.6599182397808201,
             1038342: 0.6599182397808201,
             1154757: 0.6599182397808201,
             1161432: 0.6599182397808202,
             1161439: 0.6599182397808202,
             1358683: 0.6599182397808202,
             1376556: 0.6599182397808202,
             1376558: 0.6599182397808202,
             1451842: 0.6599182397808201,
             1451844: 0.6599182397

OBS.: A lista de stopwords do Lucene Analyzer parece ser muito restrita. Assim, para reduzir o tamanho do índice, uma alternativa é combiná-la com a lista de stopwords do NLTK — ou seja, só salvar um termo no índice invertido se ele não estiver na lista de stopwords do NLTK.

In [44]:
def get_document_by_id(id):
  """Return the passage text of document `id` from the collection file.

  The file at `collection_path` is scanned line by line until the first line
  whose first tab-separated field equals `id`.

  Args:
    id: document id, as str or int. search() yields int ids, so both forms are
      accepted and compared as strings.

  Returns:
    The second tab-separated field of the matching line ('' if the line has no
    second field), or None when no line matches.
  """
  wanted = str(id)
  with open(collection_path, 'r') as f:
    for line in f:
      fields = line.strip().split('\t')
      if fields[0] == wanted:
        # strip() also removes a trailing tab, so an empty passage leaves a
        # single field behind.
        return fields[1] if len(fields) > 1 else ''

  return None

In [54]:
# Keep the 10 highest-scoring (doc id, score) pairs, best first.
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]

In [60]:
# Show the top-10 list for the sample query.
sorted_results

[(7156982, 1.0000000000000002),
 (8726430, 1.0000000000000002),
 (8726433, 1.0000000000000002),
 (8726434, 1.0000000000000002),
 (8726436, 1.0000000000000002),
 (8726437, 1.0000000000000002),
 (8726435, 0.9532731492403059),
 (8726429, 0.9532731492403058),
 (309441, 0.751337418743792),
 (1292819, 0.751337418743792)]

In [66]:
# Read the text of one of the retrieved passages to sanity-check the ranking.
get_document_by_id('1292819')

"His name means highly praised.. Muhammad's full name was Abu al-Qasim Muhammad Ibn Abd Allah Ibn Abd al-Muttalib Ibn Hashim. He was the last prophet of the religion of Islam. Muhammad's father, Abdallah, died several weeks before his birth and his mother, Aminah, died when he was six years old."

In [None]:
# Run every topic through search() and keep the 10 best hits per query.
query_to_results = dict()

with open(topics_file, 'r') as f:
  for line in f:
      fields = line.strip().split('\t')
      if len(fields) < 2:
        continue  # skip blank or malformed lines instead of crashing
      query_id, query_text = fields[0], fields[1]
      results = search(query_text)
      query_to_results[int(query_id)] = sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]

# Write the run file in TREC format: query id, Q0, doc id, rank, score, run tag.
# NOTE: the file name and run tag still say "boolean" although the ranking is
# TF-IDF; they are kept because the cells below read this exact file name.
with open('run.dl20.boolean.trec', 'w') as f:
  for query_id, results in query_to_results.items():
    for rank, (doc_id, score) in enumerate(results, start=1):
      f.write(f'{query_id}\tQ0\t{doc_id}\t{rank}\t{score}\tboolean\n')

In [69]:
# Peek at the first lines of the generated run file.
!head run.dl20.boolean.trec

1030303	Q0	7156982	1	1.0000000000000002	boolean
1030303	Q0	8726430	2	1.0000000000000002	boolean
1030303	Q0	8726433	3	1.0000000000000002	boolean
1030303	Q0	8726434	4	1.0000000000000002	boolean
1030303	Q0	8726436	5	1.0000000000000002	boolean
1030303	Q0	8726437	6	1.0000000000000002	boolean
1030303	Q0	8726435	7	0.9532731492403059	boolean
1030303	Q0	8726429	8	0.9532731492403058	boolean
1030303	Q0	309441	9	0.751337418743792	boolean
1030303	Q0	1292819	10	0.751337418743792	boolean


In [70]:
# Run pyserini's MS MARCO -> TREC qrels conversion script on the qrels file,
# writing the result to qrels.dl20.trec in the current directory.
!python {main_path}/pyserini/tools/scripts/msmarco/convert_msmarco_to_trec_qrels.py \
   --input {qrels_eval} \
   --output qrels.dl20.trec

Done!


In [71]:
# Peek at the first lines of the converted qrels file.
!head qrels.dl20.trec

23849 0 1020327 2
23849 0 1034183 3
23849 0 1120730 0
23849 0 1139571 1
23849 0 1143724 0
23849 0 1147202 0
23849 0 1150311 0
23849 0 1158886 2
23849 0 1175024 1
23849 0 1201385 0


In [72]:
# Make sure the compiled trec_eval binary is executable.
!chmod 755 {main_path}/pyserini/tools/eval/trec_eval.9.0.4/trec_eval

In [73]:
# Score the run against the qrels with trec_eval, reporting MAP and nDCG@10.
# NOTE(review): -l 2 presumably sets the minimum relevance grade counted as
# relevant, and -c presumably averages over all queries in the qrels — confirm
# against trec_eval's help text.
!{main_path}/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -m map -m ndcg_cut.10 -l 2 \
   qrels.dl20.trec run.dl20.boolean.trec

map                   	all	0.1309
ndcg_cut_10           	all	0.3469
