# Biblioteca Rank-BM25

In [1]:
!pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


Leitura e processamento do arquivo CISI.ALL, que contém os documentos a serem pesquisados/recuperados.

In [3]:
with open('CISI.ALL', 'r') as file:
    content = file.read()

In [4]:
documents = content.split('.I ')

Para cada documento, o título, autor e texto são concatenados, para ajudar na recuperação das informações.

In [5]:
docs = []
for i, document in enumerate(documents):
  title = document[document.find('\n.T') + 3:document.find('\n.A')].strip()
  author = document[document.find('\n.A') + 3: document.find('\n.W')].strip()
  text = document[document.find('\n.W') + 3: document.find('\n.X')].strip()
  doc = title + ' ' + author + ' ' + text
  docs.append(doc)

Leitura e processamento do arquivo CISI.QRY, que contém as consultas (queries).

In [6]:
with open('CISI.QRY', 'r') as file:
    content = file.read()

In [7]:
queries = content.split('.I ')

In [8]:
query_docs = []
for query in queries:
  text = query[query.find('.W\n') + 3:].strip()
  query_docs.append(text)

Tokenização dos documentos

In [9]:
from rank_bm25 import BM25Okapi, BM25L, BM25Plus, BM25

corpus = docs

tokenized_corpus = [doc.split(" ") for doc in corpus]

Leitura e processamento do arquivo CISI.REL, que contém os valores-alvo de relevância que relacionam as consultas (queries) aos documentos.

In [10]:
from collections import defaultdict

map_query_to_docs = defaultdict(list)
total = 0

with open('CISI.REL', 'r') as file:
    for line in file:
        cols = line.split()
        query_id = cols[0]
        doc_id = cols[1]
        map_query_to_docs[int(query_id)].append(int(doc_id))
        total += 1

A seguir, o código responsável por executar o algoritmo a partir da base de documentos.

In [11]:
import numpy as np

def calculate_bm25(bm25, k1, b):
  """
  Função repsponsável por testar o algoritmo BM25 sobre a base de documentos.

  Parameters
  -----------
  bm25: instância do modelo (implementação/variante específica do algoritmo 
        BM25), previamente instanciada sobre a base de documentos.
  Returns
  -------
  Tupla contendo MAP (mean average precision) e recall obtidos pelo modelo.
  """
  threshold = 0
  query_precision = defaultdict(list)
  query_recall = defaultdict(list)

  for i in range(1, len(query_docs)):
      query = query_docs[i]
      tokenized_query = query.split(" ")
      doc_scores = bm25.get_scores(tokenized_query)

      #quanto maior este valor, melhor o MAP
      threshold = np.percentile(doc_scores, 95)
      #threshold = np.percentile(doc_scores, 75)
      
      relevant_docs = map_query_to_docs[i]
      if len(relevant_docs) > 0:
        retrieved_docs = [i for i, doc_score in enumerate(doc_scores) 
                            if doc_score >= threshold]
        relevant_retrieved = 0
        for doc in relevant_docs:
          if doc in retrieved_docs:
            relevant_retrieved += 1
        recall = relevant_retrieved/len(relevant_docs)
        query_recall[i].append(recall)

        retrieved_relevant = 0
        for doc in retrieved_docs:
          if doc in relevant_docs:
            retrieved_relevant += 1
        precision = retrieved_relevant/len(retrieved_docs)
        query_precision[i].append(precision)

  map = 0
  recall = 0
  n = 0
  for q in query_precision:
    mean_query_precision = sum(query_precision[q])/len(query_precision[q])
    map += mean_query_precision
    mean_query_recall = sum(query_recall[q])/len(query_recall[q])
    recall += mean_query_recall
    n += 1

  print('Model = ', bm25)
  print('k1 = ', k1)
  print('b = ', b)
  map = map/n
  recall = recall/n
  print('MAP: ', map)
  print('Mean recall: ', recall)
  return map, recall
        

A seguir, grid search que varia:

1.   a implementação (variante) específica do algoritmo BM25, 
2.   o hiperparâmetro k1
3.   o hiperparâmetro b

In [12]:
best_map = 0
best_k1 = None
best_b = None
best_model = None

for k1 in [0.5, 1.0, 1.2, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0]:
  for b in [0.25, 0.5, 0.75, 1.0]:
    for model in [BM25Okapi(tokenized_corpus, k1=k1, b=b), 
                  BM25L(tokenized_corpus, k1=k1, b=b), 
                  BM25Plus(tokenized_corpus, k1=k1, b=b)]:
      map, _ = calculate_bm25(model, k1, b)
      if map > best_map:
        best_map = map
        best_k1 = k1
        best_b = b
        best_model = model

print('Best MAP = ', best_map)
print('Best k1 = ', best_k1)
print('Best b = ', best_b)
print('Best model = ', best_model)

Model =  <rank_bm25.BM25Okapi object at 0x7fea2e452880>
k1 =  0.5
b =  0.25
MAP:  0.09139402560455191
Mean recall:  0.22151304592269935
Model =  <rank_bm25.BM25L object at 0x7fea4ca65ee0>
k1 =  0.5
b =  0.25
MAP:  0.08054765291607398
Mean recall:  0.17484754474238623
Model =  <rank_bm25.BM25Plus object at 0x7fea4ca65400>
k1 =  0.5
b =  0.25
MAP:  0.09530583214793738
Mean recall:  0.2353273171823646
Model =  <rank_bm25.BM25Okapi object at 0x7fea4ca65c40>
k1 =  0.5
b =  0.5
MAP:  0.09174964438122331
Mean recall:  0.22317101776125883
Model =  <rank_bm25.BM25L object at 0x7fea4ca65ee0>
k1 =  0.5
b =  0.5
MAP:  0.07983641536273116
Mean recall:  0.17204563830439418
Model =  <rank_bm25.BM25Plus object at 0x7fea6e84c400>
k1 =  0.5
b =  0.5
MAP:  0.09512802275960168
Mean recall:  0.2367998131948259
Model =  <rank_bm25.BM25Okapi object at 0x7fea4ca65c40>
k1 =  0.5
b =  0.75
MAP:  0.09246088193456614
Mean recall:  0.22401217413229554
Model =  <rank_bm25.BM25L object at 0x7fea2ef832b0>
k1 =  0.5
b

In [13]:
map, recall = calculate_bm25(BM25Plus(tokenized_corpus, k1=best_k1, b=best_b), best_k1, best_b)
print('MAP = ', map)
print('Recall = ', recall)
print('F-1 = ', 2*map*recall/(map + recall))

Model =  <rank_bm25.BM25Plus object at 0x7fea2e4521f0>
k1 =  2.0
b =  1.0
MAP:  0.10170697012802275
Mean recall:  0.2435124663194599
MAP =  0.10170697012802275
Recall =  0.2435124663194599
F-1 =  0.14348505630286076


OBS.: Com o limiar (threshhold) utilizando o 75 percentil, foram obtidos os resultados abaixo:

* Model =  <rank_bm25.BM25Plus object at 0x7f9f8e7af430>
* k1 =  1.2
* b =  0.75
* MAP:  0.05265104946437059
* Mean recall:  0.5351813646824921
* MAP =  0.05265104946437059
* Recall =  0.5351813646824921
* F-1 =  0.09587038695442662