In [1]:
#!git clone https://github.com/usnistgov/trec_eval.git && cd trec_eval && make

In [2]:
import os
import json
import numpy as np
from typing import Dict
from tqdm import tqdm
from re import compile
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
#import unicodedata
import contractions
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Sentence-embedding model; later cells call model.encode() on both the passages and the queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [4]:
# Pre-compiled regexes shared by the cleaning and query helpers.
pattern_multiple_spaces = compile(r' +')           # runs of one or more spaces
pattern_non_alphanumeric = compile(r'[^a-z0-9]')   # anything that is not a lowercase ASCII letter or digit
pattern_newline = compile(r'[\n\t\u200e]')         # newline, tab, and U+200E

# NLTK resources for English: Snowball stemmer and the stop-word list as a set.
stemmer = SnowballStemmer(language="english")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: expand contractions, lowercase, replace newline/tab/U+200E
    and then every character outside ``[a-z0-9]`` with a space, tokenize with
    NLTK, drop English stop words, stem the remaining tokens, and join them
    with single spaces.
    """
    # Expand contractions, then lowercase.
    normalized = contractions.fix(text).lower()

    # Blank out line breaks/tabs first, then everything non-alphanumeric.
    for pattern in (pattern_newline, pattern_non_alphanumeric):
        normalized = pattern.sub(' ', normalized)

    # Tokenize, skip stop words, and stem the survivors.
    stems = []
    for word in word_tokenize(normalized):
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))

    # Collapse repeated spaces and trim both ends.
    return pattern_multiple_spaces.sub(' ', ' '.join(stems)).strip()

def tokenizer(text):
    """Split ``text`` on whitespace and return its unigrams followed by its bigrams.

    Each bigram is a pair of adjacent tokens joined by a single space.
    """
    words = text.split()
    pairs = [" ".join(pair) for pair in zip(words, words[1:])]
    return [*words, *pairs]


def load_qrels(docs_dir: str, fqrels: str, ndocs: int = 40) -> Dict[str, Dict[str, int]]:
    """Build relevance judgments keyed by question ID and global passage ID.

    Args:
        docs_dir: Directory holding the document files ``1.json`` .. ``{ndocs}.json``.
            Each file is a JSON list of passages with the keys ``ID``,
            ``DocumentID`` and ``PassageID``.
        fqrels: Path to a JSON list of questions, each with ``QuestionID`` and a
            ``Passages`` list whose entries carry ``DocumentID`` and ``PassageID``.
        ndocs: Number of consecutively numbered document files to read
            (defaults to 40, the value that used to be hard-coded).

    Returns:
        ``{question_id: {passage_ID: 1}}`` for every passage listed under a question.

    Raises:
        KeyError: If a question references a (DocumentID, PassageID) pair that is
            not present in the document files.
    """
    # (DocumentID -> PassageID -> ID) lookup used to resolve the question references.
    did2pid2id: Dict[str, Dict[str, str]] = {}
    for i in range(1, ndocs + 1):
        # JSON is read as UTF-8 explicitly rather than with the platform default.
        with open(os.path.join(docs_dir, f"{i}.json"), encoding="utf-8") as f:
            doc = json.load(f)
        for psg in doc:
            pid2id = did2pid2id.setdefault(psg["DocumentID"], {})
            # NOTE(review): this compares the global "ID" against a mapping keyed by
            # "PassageID", so it does not detect duplicate PassageIDs. Kept unchanged
            # because a stricter check could fail on existing data -- confirm intent.
            assert psg["ID"] not in pid2id
            # First occurrence wins when a PassageID repeats within a document.
            pid2id.setdefault(psg["PassageID"], psg["ID"])

    with open(fqrels, encoding="utf-8") as f:
        data = json.load(f)

    qrels: Dict[str, Dict[str, int]] = {}
    for e in data:
        qid = e["QuestionID"]
        for psg in e["Passages"]:
            pid = did2pid2id[psg["DocumentID"]][psg["PassageID"]]
            # Every listed passage is recorded with relevance 1.
            qrels.setdefault(qid, {})[pid] = 1
    return qrels

In [5]:
qrels = load_qrels("ObliQADataset/StructuredRegulatoryDocuments", "ObliQADataset/ObliQA_test.json")

# Dump the judgments, one "<qid> Q0 <pid> <rel>" line per (question, passage) pair.
with open("qrels", "w") as f:
    f.writelines(
        f"{qid} Q0 {pid} {rel}\n"
        for qid, rels in qrels.items()
        for pid, rel in rels.items()
    )

In [6]:
# Flatten the passages of the numbered document files into a single list of records.
ndocs = 40
docs_dir = "ObliQADataset/StructuredRegulatoryDocuments"
collection = []
for doc_number in range(1, ndocs + 1):
    with open(os.path.join(docs_dir, f"{doc_number}.json")) as f:
        passages = json.load(f)
    collection.extend(
        {
            # The passage identifier is prepended to the passage text.
            "text": psg["PassageID"] + " " + psg["Passage"],
            "ID": psg["ID"],
            # NOTE(review): key is spelled "DcoumentId"; kept as-is because it is a runtime key.
            "DcoumentId": psg["DocumentID"],
            "PassageId": psg["PassageID"],
        }
        for psg in passages
    )

In [7]:
# Sparse lexical representation: BM25 over the cleaned unigram + bigram tokens of each passage.
tokenized_corpus = []
for doc in collection:
    tokenized_corpus.append(tokenizer(clean_text(doc['text'])))

bm25 = BM25Okapi(tokenized_corpus, k1=1.5, b=0.75)

# NumPy view of the records so they can be selected with an array of positions later on.
collection_array = np.array(collection)

# Displayed as the cell output: number of indexed passages.
len(tokenized_corpus)

13732

In [8]:
# Dense semantic representation: one normalized embedding per passage text.
#
# No explicit `device` is passed, so encode() uses the device the model was loaded
# on; this keeps the cell runnable on machines without CUDA.
#
# NOTE(review): `max_length` is not a documented `SentenceTransformer.encode`
# keyword, so it is not passed here. Input truncation is presumably governed by
# `model.max_seq_length`; set that attribute if longer inputs are intended -- confirm.
embeddings = model.encode(
    [psg['text'] for psg in collection_array],
    normalize_embeddings=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/430 [00:00<?, ?it/s]

In [9]:
def sintactic_query(query:str):
    """Score the indexed passages against ``query`` with the BM25 index.

    The query is passed through ``clean_text`` and ``tokenizer`` and the result
    of ``bm25.get_scores`` is returned unchanged.
    """
    query_terms = tokenizer(clean_text(query))
    return bm25.get_scores(query_terms)
    
def semantic_query(query: str):
    """Score the passages against ``query`` using the dense embeddings.

    Args:
        query: Raw question text.

    Returns:
        The first row of ``query_emb @ embeddings.T``, i.e. one similarity score
        per row of the global ``embeddings`` matrix. The query is encoded with
        ``normalize_embeddings=True``.
    """
    # Only whitespace is normalized here; no stemming or stop-word removal is applied.
    cln_query = pattern_newline.sub(' ', query)
    cln_query = pattern_multiple_spaces.sub(' ', cln_query).strip()

    # No explicit device: encode() falls back to the device the model lives on,
    # so this also works when CUDA is unavailable.
    query_emb = model.encode([cln_query], normalize_embeddings=True)

    # A single query was encoded, so row 0 holds its scores against every passage.
    return (query_emb @ embeddings.T)[0]


def hybrid_query_rrf(query: str, k: int = 60):
    """Combine the BM25 and embedding rankings with reciprocal rank fusion.

    For each scorer the passages are ranked by descending score (best passage
    gets rank 1) and contribute ``1 / (k + rank)``; the two contributions are
    summed per passage.
    """
    def reciprocal(scores):
        # argsort()[::-1] lists positions from best to worst; a second argsort
        # inverts that ordering into a 0-based rank per passage, shifted to start at 1.
        ranks = scores.argsort()[::-1].argsort() + 1
        return 1 / (k + ranks)

    lexical_part = reciprocal(sintactic_query(query))
    dense_part = reciprocal(semantic_query(query))
    return lexical_part + dense_part

    
def _min_max_normalize(scores):
    """Rescale ``scores`` linearly to [0, 1]; return zeros when all scores are equal."""
    lowest = scores.min()
    span = scores.max() - lowest
    if span == 0:
        # Constant scores carry no ranking signal; dividing would yield 0/0 = NaN
        # and poison the combined score for every passage.
        return np.zeros_like(scores, dtype=float)
    return (scores - lowest) / span


def hybrid_query_avg(query: str, alpha: float = 0.5):
    """Blend min-max normalized BM25 and embedding scores for ``query``.

    Args:
        query: Raw question text.
        alpha: Weight of the embedding scores; the BM25 scores get ``1 - alpha``.

    Returns:
        ``alpha * semantic + (1 - alpha) * sintactic`` with each score vector
        rescaled to [0, 1] first. A scorer whose scores are all identical
        contributes zeros instead of NaN.
    """
    sintactic_scores = _min_max_normalize(sintactic_query(query))
    semantic_scores = _min_max_normalize(semantic_query(query))

    return alpha * semantic_scores + (1 - alpha) * sintactic_scores

In [10]:
top_n = 10
with open("ObliQADataset/ObliQA_test.json") as f:
    data = json.load(f)

# QuestionID -> list of the top_n passage records, each extended with its score.
retrieved = {}
for e in tqdm(data):
    scores = hybrid_query_avg(e['Question'])

    # argpartition picks the top_n best positions in no particular order ...
    candidates = np.argpartition(-scores, top_n)[:top_n]
    # ... which are then sorted by descending score.
    top_k = candidates[np.argsort(-scores[candidates])]

    # Copy each selected record and attach the score it was retrieved with.
    retrieved[e["QuestionID"]] = [
        {**collection_array[idx], 'score': scores[idx]} for idx in top_k
    ]

100%|██████████| 2786/2786 [04:57<00:00,  9.35it/s]


In [11]:
# Write the run file, one "<qid> 0 <passage ID> <rank> <score> bm25" line per hit;
# ranks start at 1 and follow the order of each hit list.
with open("rankings.trec", "w") as f:
    for qid, hits in retrieved.items():
        for rank, hit in enumerate(hits, start=1):
            f.write(f"{qid} 0 {hit['ID']} {rank} {hit['score']} bm25\n")

In [12]:
# Run trec_eval with the recall.10 and map_cut.10 measures on ./qrels and ./rankings.trec.
!trec_eval/trec_eval -m recall.10 -m map_cut.10 ./qrels ./rankings.trec

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


recall_10             	all	0.8024
map_cut_10            	all	0.6637
