<a href="https://colab.research.google.com/github/pedrogengo/CISI_BM25/blob/main/notebooks/CISI_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Download and parse CISI dataset

In [1]:
!wget http://ir.dcs.gla.ac.uk/resources/test_collections/cisi/cisi.tar.gz
!tar -xvzf cisi.tar.gz

--2023-02-21 01:24:45--  http://ir.dcs.gla.ac.uk/resources/test_collections/cisi/cisi.tar.gz
Resolving ir.dcs.gla.ac.uk (ir.dcs.gla.ac.uk)... 130.209.240.253
Connecting to ir.dcs.gla.ac.uk (ir.dcs.gla.ac.uk)|130.209.240.253|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 775144 (757K) [application/gzip]
Saving to: ‘cisi.tar.gz’


2023-02-21 01:24:46 (1.38 MB/s) - ‘cisi.tar.gz’ saved [775144/775144]

CISI.ALL
CISI.BLN
CISI.QRY
CISI.REL


In [2]:
import re
import math
from collections import defaultdict


def load_collection(path):
    """Load the CISI collection from a file."""
    with open(path, 'r') as f:
        collection = f.read()
    return collection

def parse_documents(collection):
    """Parse the documents in the CISI collection."""
    document_pattern = re.compile(r'\.W\s+(.*?)\s+\.[A-Z]', re.DOTALL)
    documents = document_pattern.findall(collection)
    documents = [doc.replace("\n", " ").strip() for doc in documents]
    return documents

def parse_queries(path):
    """Parse the queries in the CISI queries file."""
    with open(path, 'r') as f:
        queries = f.read()
    query_pattern = re.compile(r'\.W\s*(.*?)\n+\.[A-Z]', re.DOTALL)
    queries = query_pattern.findall(queries)
    queries = [query.replace("\n", " ").strip() for query in queries]
    return queries

def parse_judgments(path):
    """Parse the relevance judgments in the CISI relevance judgments file."""
    with open(path, 'r') as f:
        judgments = f.read()
    judgment_pattern = re.compile(r'\s+(\d+)\s+(\d+)\s+', re.DOTALL)
    judgments = judgment_pattern.findall(judgments)
    judgments_dict = defaultdict(lambda: [])
    for query, document in judgments:
      judgments_dict[int(query)].append(int(document))
    return judgments_dict

collection = load_collection("CISI.ALL")
documents = parse_documents(collection)
queries = parse_queries("CISI.QRY")
judgments = parse_judgments("CISI.REL")

## 2. BM25

In [3]:
import math
import re

def tokenize(text):
    """Tokenize a document or query."""
    words = re.findall(r'\w+', text.lower())
    return words

def build_index(documents):
    """Build an inverted index from the documents."""
    index = {}
    doc_term_freqs = []

    for i, document in enumerate(documents):
        # Tokenize the document
        terms = tokenize(document)

        # Count the term frequencies
        term_freqs = {}
        for term in terms:
            term_freqs[term] = term_freqs.get(term, 0) + 1

        doc_term_freqs.append(term_freqs)

        # Add the document to the index for each term it contains
        for term in term_freqs:
            if term not in index:
                index[term] = []
            index[term].append((i, term_freqs[term]))

    # Calculate the inverse document frequencies
    N = len(documents)
    idfs = {term: math.log(1 + ((N - len(postings) + 0.5) / (len(postings) + 0.5))) for term, postings in index.items()}

    # Return the inverted index and document term frequencies
    return {"doc_term_freqs": doc_term_freqs, "idfs": idfs}

class BM25():

  def __init__(self, index, k, b, tokenizer):
    self.doc_term_freqs = index["doc_term_freqs"]
    self.idfs = index["idfs"]
    self.k = k
    self.b = b
    self.tokenizer = tokenizer
    self.documents_lengths = [self._count_tokens(doc) for doc in self.doc_term_freqs]
    self.avg_doc_len = sum(self.documents_lengths) / len(self.doc_term_freqs)

  def _count_tokens(self, document):
    """Counts the amount of token in a document"""
    total = 0
    for token_count in document.values():
      total += token_count
    return total

  def search(self, query: str, k: int = 10):
    """Returns the top k documents related to the query"""
    scores = []
    tokenized_query = self.tokenizer(query)
    for i in range(len(self.doc_term_freqs)):
      scores.append((self.score(tokenized_query, i), i))
    scores.sort(reverse=True)
    return scores[:k]

  def score(self, tokenized_query, doc_id: int):
    """Calculates bm25 score for a query and a document"""
    score = 0.
    norm = self.documents_lengths[doc_id] / self.avg_doc_len
    for token in tokenized_query:
      if token not in self.doc_term_freqs[doc_id].keys():
        score += 0.
      else:
        term_freq = self.doc_term_freqs[doc_id][token]
        idf = self.idfs[token]

        numerator = idf * term_freq * (self.k + 1)
        denominator = (term_freq * self.k) / (term_freq + (self.k * (1 - self.b + self.b * norm)))

        score += (numerator / denominator)
    
    return score

## 3. Building index

In [62]:
index = build_index(documents)

## 4. Evaluation

In [55]:
import numpy as np

results = []
for k in np.arange(0.5, 3.5, 0.5):
  for b in np.arange(0., 0.4, 0.2):
    bm25 = BM25(index, k, b, tokenize)
    acc = 0.
    for query_id, doc_similars in judgments.items():
      query_id -= 1  # Os julgamentos estao indexados comecando em 1
      top_k = 10 #len(doc_similars)
      doc_similars = set([doc - 1 for doc in doc_similars]) # Os documentos estao indexados comecando em 1
      scores = bm25.search(queries[query_id], top_k)
      returned_docs = set([doc for score, doc in scores])
      tp = len(doc_similars.intersection(returned_docs))
      fp = len(returned_docs - doc_similars)
      fn = len(doc_similars - returned_docs)
      acc += (tp / top_k)

    results.append([k, b, acc / len(judgments)])

In [61]:
import pandas as pd

pd.DataFrame(results, columns=["k", "b", "P@10"]).sort_values("P@10", ascending=False).reset_index()

Unnamed: 0,index,k,b,P@10
0,4,1.5,0.0,0.227632
1,6,2.0,0.0,0.226316
2,10,3.0,0.0,0.225
3,8,2.5,0.0,0.222368
4,2,1.0,0.0,0.217105
5,5,1.5,0.2,0.214474
6,1,0.5,0.2,0.214474
7,0,0.5,0.0,0.213158
8,3,1.0,0.2,0.207895
9,7,2.0,0.2,0.207895
