# Imports

In [1]:
import functools
import itertools
import os
import re
from collections import defaultdict, namedtuple
from pathlib import Path
import toolz
from dataclasses import dataclass

import more_itertools

# Data preprocessing

## Download dos dados

Para fazer download dos dados, vá até a pasta raiz e digite:
```bash
make get_raw_data

```
Para gerar esse Makefile, eu dei a seguinte query ao chat GPT:
```
write a makefile to do the following:
- download file from http://ir.dcs.gla.ac.uk/resources/test_collections/cisi/cisi.tar.gz
- unzip it to the folder data/raw
- it should be on step "get_raw_data"
- operation should not be repeated if files already exist
- write help for the steps
```


RAW_DATA_BASEDIR = Path('../data/raw')
!ls -lht {RAW_DATA_BASEDIR}

raw_files_path = {
    'query': RAW_DATA_BASEDIR / 'CISI.QRY',
    'all': RAW_DATA_BASEDIR / 'CISI.ALL',
    'rel': RAW_DATA_BASEDIR / 'CISI.REL',
}


# Conversão dos dados para um formato mais fácil de trabalhar

In [2]:
RAW_DATA_BASEDIR = Path('../data/raw')
!ls -lht {RAW_DATA_BASEDIR}

raw_files_path = {
    'query': RAW_DATA_BASEDIR / 'CISI.QRY',
    'all': RAW_DATA_BASEDIR / 'CISI.ALL',
    'rel': RAW_DATA_BASEDIR / 'CISI.REL',
}


total 6200
-rw-r--r--  1 marcospiau  staff   757K Feb 22 19:27 cisi.tar.gz
-rw-r--r--  1 marcospiau  staff    79K Feb 28  1994 CISI.REL
-rw-r--r--  1 marcospiau  staff    67K Feb 28  1994 CISI.QRY
-rw-r--r--  1 marcospiau  staff   4.5K Feb 28  1994 CISI.BLN
-rw-r--r--  1 marcospiau  staff   2.1M Feb 28  1994 CISI.ALL


Arquivos QRY e ALL são mais complicados e precisam de processamento. Tentei deixar processamento semelhante pra conseguir fazer os dois casos com uma mesma funcao.

Usei bastante o chatgpt, principalmente para escrever docstrings e typehints. Uma coisa curiosa é que constantemente ele removia a dataclass e utilizava classes padrão, de forma que precisei constamente instruir ele a a manter as dataclasses.

Utilizei bastante a bibliioteca toolz para deixar o código com uma abordagem mais funcional.

In [3]:
# (.I) ID
# (.T) Title
# (.W) Abstract
# (.B) Publication date of the article
# (.A) Author list
# (.N) Information when entry was added
# (.X) List of cross-references to other documents

renames_docs_all = {
    '.I': 'id',
    '.T': 'title',
    '.W': 'abstract',
    '.B': 'publication_date',
    '.A': 'author_list',
    '.N': 'added_date',
    '.X': 'cross_references'
}

process_doc_all = toolz.compose_left(
    # for each tag, get renamed tag, and join texts for tag
    functools.partial(more_itertools.map_reduce,
                      keyfunc=lambda x: renames_docs_all[x.tag],
                      valuefunc=lambda x: x.text,
                      reducefunc=lambda x: ' '.join(x)),
    # keeps only desired keys
    toolz.curried.keyfilter(lambda x: x in {'id', 'title', 'abstract'}),
    # convert id to int
    toolz.curried.update_in(keys=['id'], func=int)
)

# (.I) ID
# (.W) Query
# (.A) Author list
# (.N) Authors name and some keywords on what the query searches for

renames_docs_queries = {
    '.I': 'id',
    '.W': 'query',
    '.T': 'title',
    '.A': 'author_list',
    # '.N': 'other_query_infos',
    '.B': 'publication_date',
    # '.X': 'cross_references'
}

process_doc_qry = toolz.compose_left(
    functools.partial(more_itertools.map_reduce,
                      keyfunc=lambda x: renames_docs_queries[x.tag],
                      valuefunc=lambda x: x.text,
                      reducefunc=lambda x: ' '.join(x)),
    toolz.curried.keyfilter(lambda x: x in {'id', 'query'}),
    toolz.curried.update_in(keys=['id'], func=int))

In [4]:
import itertools
import re
from typing import Any, Callable, Dict, Iterable, List, Tuple, Union


@dataclass
class IdTagText:
    id: str
    tag: str
    text: str


def parse_cisi_all_or_qry(
    path: str,
    process_doc_fn: Callable[[Iterable[IdTagText]], Dict[str, Any]],
    return_dict: bool = False,
) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
    """Parses a CISI all or query file.

    Args:
        path: A string representing the path to the file to parse.
        process_doc_fn: A function that receives a list of IdTagText or an iterable of
          IdTagText as input, and returns a dictionary.
        return_dict: A boolean indicating whether to return a dictionary or a list of
          dictionaries.

    Returns:
        If return_dict is True, a dictionary with the document IDs as keys and
        processed documents as values. Otherwise, a list of processed documents.

    Raises:
        AssertionError: If the document IDs are not unique.
    """
    markers = [r'^\.I\s(\d+)$', '\.T$', '\.A$', '\.B$', '\.W$', '\.X$', '\.N$']
    marker_pattern = re.compile('|'.join(markers))

    def gen_items() -> Iterable[IdTagText]:
        with open(path, 'r') as f:
            for line in map(str.strip, f):
                match = marker_pattern.match(line)
                # match occurs for lines with tags
                if match:
                    # if match.group(1) is not None, it means that the tag is .I
                    # and the group contains the ID
                    if match.group(1):
                        id_ = match.group(1)
                        yield IdTagText(id_, '.I', id_)
                    else:
                        tag = match.group(0).strip()
                else:
                    # if match is None, it means that the line is a text
                    # just propagate tag and id
                    yield IdTagText(id_, tag, line)

    out = [
        process_doc_fn(group)
        for _, group in itertools.groupby(gen_items(), key=lambda x: x.id)
    ]
    assert len(out) == len({x['id'] for x in out}), 'IDs are not unique'
    return {x['id']: x for x in out} if return_dict else out


process_cisi_all = functools.partial(parse_cisi_all_or_qry,
                                     process_doc_fn=process_doc_all)
process_cisi_qry = functools.partial(parse_cisi_all_or_qry,
                                     process_doc_fn=process_doc_qry)

In [5]:
corpus = process_cisi_all(raw_files_path['all'])
corpus[0]

{'id': 1,
 'title': '18 Editions of the Dewey Decimal Classifications',
 'abstract': "The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad."}

In [6]:
queries = process_cisi_qry(raw_files_path['query'])
queries[0]

{'id': 1,
 'query': 'What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?'}

O arquivo qrels é mais simples e pode ser lido de forma mais simples. O resultado é um dicionário com a query como chave e uma lista de documentos relevantes para aquela query.

In [10]:
def load_qrels(path):
    out = defaultdict(set)
    with open(path, 'r') as f:
        for line in map(str.strip, f):
            qid, docid, _, _ = line.split()
            out[int(qid)].add(int(docid))
    out.default_factory = None

    return out

qrels = load_qrels(raw_files_path['rel'])
list(itertools.islice(qrels.items(), 5))

[(1,
  {28,
   35,
   38,
   42,
   43,
   52,
   65,
   76,
   86,
   150,
   189,
   192,
   193,
   195,
   215,
   269,
   291,
   320,
   429,
   465,
   466,
   482,
   483,
   510,
   524,
   541,
   576,
   582,
   589,
   603,
   650,
   680,
   711,
   722,
   726,
   783,
   813,
   820,
   868,
   869,
   894,
   1162,
   1164,
   1195,
   1196,
   1281}),
 (2,
  {29,
   68,
   197,
   213,
   214,
   309,
   319,
   324,
   429,
   499,
   636,
   669,
   670,
   674,
   690,
   692,
   695,
   700,
   704,
   709,
   720,
   731,
   733,
   738,
   740,
   1136}),
 (3,
  {60,
   85,
   114,
   123,
   126,
   131,
   133,
   136,
   138,
   140,
   346,
   359,
   363,
   372,
   412,
   445,
   454,
   461,
   463,
   469,
   532,
   537,
   540,
   553,
   554,
   555,
   585,
   590,
   599,
   640,
   660,
   664,
   803,
   901,
   909,
   911,
   1027,
   1053,
   1169,
   1179,
   1181,
   1190,
   1191,
   1326}),
 (4, {310, 315, 321, 329, 332, 420, 601, 980}),
 (

Abaixo, vemos que todos os qrels possuem relevância igual a zero. Por isso, vamos interpretar que caso um documento esteja relacionado a uma query, ele é relevante para essa query.

In [16]:
import pandas as pd
df = pd.read_csv(raw_files_path['rel'], delim_whitespace=True, header=None, names=['qid', 'docid', 'nao', 'sei']).astype(str)
display(df.describe())
del df

Unnamed: 0,qid,docid,nao,sei
count,3114,3114,3114,3114.0
unique,76,1162,1,1.0
top,44,375,0,0.0
freq,155,15,3114,3114.0


# Classe BM25

Utilizei bastante o ChatGPT para desevolver essa classe, comecei com uma classe mais simples (loops for) e fui modificando para conseguir deixar um pouco mais vetorizado. Estamos usando o CountVectorizer para facilitar a tokenização dos documentos e queries, o que também ajuda pois estão incluídos os stop words.

Novamente, docstrings e typehints foram gerados pelo chatgpt.

Utilizamos numpy e matrizes esparsas scipy para facilitar vetorização e operações matriciais.

Valores default de k1 e b foram escolhidos pelo ChatGPT.

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from tqdm.auto import tqdm

class BM25:

    def __init__(self, corpus: List[dict], k1: float = 1.2, b: float = 0.75):
        self.k1 = k1
        self.b = b

        self.corpus = {doc['id']: doc['text'] for doc in corpus}
        self.docids_to_idx = {
            docid: idx
            for idx, docid in enumerate(self.corpus.keys())
        }

        # self.vectorizer = CountVectorizer(lowercase=True,
        #                                   max_features=None,
        #                                   stop_words=None,
        #                                   tokenizer=self.tokenize,
        #                                   preprocessor=lambda x: x)
        self.vectorizer = CountVectorizer(lowercase=True,
                                          max_features=None,
                                          stop_words='english')
        self.doc_term_matrix = self.vectorizer.fit_transform(
            self.corpus.values())
        self.corpus_lengths = self.doc_term_matrix.sum(axis=1).A1
        self.avgdl = self.corpus_lengths.mean()
        self.idf = self._calculate_idf()
        # self.idf = self._calculate_idf()
        # self.avgdl = sum(map(len, (doc['text'] for doc in self.corpus))) / len(self.corpus)

    def tokenize(self, text):
        raise NotImplementedError
        # return re.sub(r'[^\w\s]', '', text).lower().split()


    def _calculate_idf(self) -> np.ndarray:
        """Calculate the inverse document frequency for each term in the corpus using the BM25 formula."""
        # documents with term
        df = self.doc_term_matrix.getnnz(axis=0)
        N = self.doc_term_matrix.shape[0]
        idf = np.log((N - df + 0.5) / (df + 0.5))
        return idf

    def score(self, query: str, doc_id: int) -> float:
        """Calculate the relevance score of a document for a given query.

        Args:
            query: Query text.
            doc_id: ID of the document in the corpus.

        Returns:
            Relevance score of the document for the query.
        """
        doc_tf = self.doc_term_matrix.getrow(self.docids_to_idx[doc_id])
        doc_length = doc_tf.sum()
        query_tf = self.vectorizer.transform([query])
        score = 0
        for weight, term in zip(query_tf.data, query_tf.indices):
            idf = self.idf[term]
            tf = doc_tf[0, term]
            score += idf * weight * (self.k1 + 1) / (
                tf + self.k1 * (1 - self.b + self.b * doc_length / self.avgdl))
        return score

    def retrieve(self, query: str, k: int = 10) -> List[dict]:
        """Retrieve the top-k documents for a given query.

        Args:
            query: Query text.
            k: Number of documents to retrieve.

        Returns:
            List of document dictionaries with 'id' and 'score' keys.
        """
        scores = [
            self.score(query, doc_id)
            for doc_id in self.corpus.keys()
        ]
        docids = np.argsort(scores)[::-1][:k]
        return [
            {'id': docid, 'score': scores[docid]}
            for docid in docids
        ]

    def get_results_for_all_queries(self, queries: List[dict], k: int = 10) -> Dict[int, List[dict]]:
        """Retrieve the top-k documents for all queries.

        Args:
            queries: List of query dictionaries with 'id' and 'text' keys.
            k: Number of documents to retrieve.

        Returns:
            Dictionary mapping query IDs to a list of document dictionaries with 'id' and 'score' keys.
        """
        return {
            query['id']: self.retrieve(query['query'], k)
            for query in tqdm(queries, desc='Retrieving')
        }

Exemplo de uso:

In [51]:
corpus_abstracts = [{'id': x['id'], 'text': x['abstract']} for x in corpus]
bm25 = BM25(corpus=corpus_abstracts)


In [52]:
bm25.score(query=corpus[0]['abstract'], doc_id=1)

175.0623179775004

In [53]:
bm25.retrieve(query=queries[0]['query'], k=10)

[{'id': 1287, 'score': 366.937721998151},
 {'id': 1295, 'score': 366.937721998151},
 {'id': 1283, 'score': 353.04613444162817},
 {'id': 1085, 'score': 340.1679993689292},
 {'id': 1301, 'score': 340.1679993689292},
 {'id': 1288, 'score': 340.1679993689292},
 {'id': 930, 'score': 340.1679993689292},
 {'id': 1311, 'score': 340.1679993689292},
 {'id': 1300, 'score': 328.1963161321295},
 {'id': 1278, 'score': 328.1963161321295}]

In [56]:
results_top10 = bm25.get_results_for_all_queries(queries, k=10)

Retrieving: 100%|██████████| 112/112 [00:59<00:00,  1.89it/s]


# Avaliacao das métricas

Para facilitar avaliacão das métricas e ter um script confiável, vamos utilizar o script trec_eval. Ele está instalado na pasta `bin`, dentro da raiz do projeto.

## Convertendo dados para format trec_eval

## QRELS

In [67]:
from typing import Set

def convert_qrels_to_trec_format(qrels: Dict[int, Set[int]], output_path: str):
    """Convert qrels to TREC format.

    Args:
        qrels: Dictionary of qrels.
        output_path: Path to the output file.
    """
    with open(output_path, 'w') as f:
        for qid, docids in qrels.items():
            for docid in docids:
                f.write(f'{qid} 0 {docid} 1\n')

!mkdir -pv '../data/processed/cisi'
convert_qrels_to_trec_format(qrels, '../data/processed/cisi/qrels.txt')
!head '../data/processed/cisi/qrels.txt'

1 0 1281 1
1 0 650 1
1 0 1162 1
1 0 524 1
1 0 269 1
1 0 1164 1
1 0 783 1
1 0 894 1
1 0 150 1
1 0 28 1


## Results

In [62]:
results_top10[1] = [{'id': 1287, 'score': 366.937721998151},
 {'id': 1295, 'score': 366.937721998151},
 {'id': 1283, 'score': 353.04613444162817},
 {'id': 1085, 'score': 340.1679993689292},
 {'id': 1301, 'score': 340.1679993689292},
 {'id': 1288, 'score': 340.1679993689292},
 {'id': 930, 'score': 340.1679993689292},
 {'id': 1311, 'score': 340.1679993689292},
 {'id': 1300, 'score': 328.1963161321295},
 {'id': 1278, 'score': 328.1963161321295}]

[{'id': 1287, 'score': 366.937721998151},
 {'id': 1295, 'score': 366.937721998151},
 {'id': 1283, 'score': 353.04613444162817},
 {'id': 1085, 'score': 340.1679993689292},
 {'id': 1301, 'score': 340.1679993689292},
 {'id': 1288, 'score': 340.1679993689292},
 {'id': 930, 'score': 340.1679993689292},
 {'id': 1311, 'score': 340.1679993689292},
 {'id': 1300, 'score': 328.1963161321295},
 {'id': 1278, 'score': 328.1963161321295}]

In [68]:
def convert_results_to_trec_eval_format(results: Dict[str, List[str]], output_path: str):
    """Converts results to trec_eval format.

    Args:
        results: Dictionary of results.
        output_path: Path to the output file.
    """
    with open(output_path, 'w') as f:
        for query_id, docs in results.items():
            for rank, doc in enumerate(docs, start=1):
                doc_id = doc['id']
                score = doc['score']
                line = f"{query_id} Q0 {doc_id} {rank} {score} RUN\n"
                f.write(line)

convert_results_to_trec_eval_format(results_top10, '../data/processed/cisi/results_abstract_top10.txt')
!head '../data/processed/cisi/results_abstract_top10.txt'

1 Q0 1287 1 366.937721998151 RUN
1 Q0 1295 2 366.937721998151 RUN
1 Q0 1283 3 353.04613444162817 RUN
1 Q0 1085 4 340.1679993689292 RUN
1 Q0 1301 5 340.1679993689292 RUN
1 Q0 1288 6 340.1679993689292 RUN
1 Q0 930 7 340.1679993689292 RUN
1 Q0 1311 8 340.1679993689292 RUN
1 Q0 1300 9 328.1963161321295 RUN
1 Q0 1278 10 328.1963161321295 RUN
