In [1]:
from json import load
from glob import glob
import pandas as pd
from re import sub
from tqdm import tqdm

# import nltk for stopword removal
from nltk.corpus import stopwords

In [2]:
# English stop-word list from NLTK, held in a set so membership tests are hash lookups.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a string before it is indexed or used as a query.

    The text is lower-cased; newlines, tabs and the left-to-right mark
    (U+200E) are turned into spaces; tokens found in ``stop_words`` are
    dropped; the remaining tokens are joined by single spaces.
    Punctuation is not removed and stays attached to its token.

    Args:
        text: Raw string (a passage or a question).

    Returns:
        The cleaned, lower-cased string without leading or trailing spaces.
    """
    lowered = text.lower()

    # Newline, tab and U+200E become plain spaces so they act as token separators
    spaced = sub(r'[\n\t\u200e]', ' ', lowered)

    # Keep only the whitespace-separated tokens that are not stop words
    kept_tokens = (token for token in spaced.split() if token not in stop_words)
    joined = ' '.join(kept_tokens)

    # Collapse any run of spaces and trim both ends
    return sub(r' +', ' ', joined).strip()

# Paths of the structured regulatory documents. glob() returns matches in
# arbitrary order, so they are sorted to make the corpus insertion order
# reproducible from run to run.
docs_corpus = sorted(glob('./ObliQADataset/StructuredRegulatoryDocuments/*'))

# Maps "<DocumentID>-<PassageID>" to a copy of the passage record whose
# 'Passage' text has been passed through clean_text.
corpus = {}
for doc_path in docs_corpus:
    # Keep the path and the open handle under different names
    with open(doc_path, 'r', encoding='utf-8') as doc_file:
        data = load(doc_file)
    for doc in data:
        local_id = f"{doc['DocumentID']}-{doc['PassageID']}"
        # Copy the record so the loaded JSON is not modified; only 'Passage' is replaced
        corpus[local_id] = {**doc, 'Passage': clean_text(doc['Passage'])}
            
            
def create_dataset(data):
    """Flatten question records into one row per (question, passage) pair.

    Args:
        data: Iterable of question records, each with the keys 'QuestionID',
            'Question' and 'Passages'; every entry of 'Passages' has the keys
            'Passage', 'DocumentID' and 'PassageID'.

    Returns:
        A list of dicts with the keys 'query_id', 'query', 'passage',
        'doc_id', 'passage_id', 'doc_global_id', 'doc_local_id' and 'serial'.
        'query' and 'passage' are cleaned with clean_text, 'doc_global_id' is
        the 'ID' of the matching entry in the module-level ``corpus``, and
        'serial' is the row's 0-based position in the returned list.

    Raises:
        KeyError: If a passage's "<DocumentID>-<PassageID>" key is missing
            from ``corpus``.
    """
    queries = []

    for q in data:
        q_id = q['QuestionID']
        # The cleaned question is identical for every passage of this question
        query_text = clean_text(q['Question'])
        for p in q['Passages']:
            local_id = f"{p['DocumentID']}-{p['PassageID']}"
            queries.append({
                'query_id': q_id,
                'query': query_text,
                'passage': clean_text(p['Passage']),
                'doc_id': p['DocumentID'],
                'passage_id': p['PassageID'],
                'doc_global_id': corpus[local_id]["ID"],
                'doc_local_id': local_id,
                # Running row number across all questions (evaluated before the append)
                'serial': len(queries),
            })

    return queries

In [3]:
# Read the three ObliQA splits (dev, test, train). The encoding is stated
# explicitly so decoding does not depend on the platform's default codec.
with open('./ObliQADataset/ObliQA_dev.json', 'r', encoding='utf-8') as f:
    data_val = load(f)

with open('./ObliQADataset/ObliQA_test.json', 'r', encoding='utf-8') as f:
    data_test = load(f)

with open('./ObliQADataset/ObliQA_train.json', 'r', encoding='utf-8') as f:
    data_train = load(f)

#### Train Set

In [4]:
# Flatten the training split into one row per (question, passage) pair.
# NOTE(review): this rebinds data_train from the raw records to the flattened
# rows, which carry 'query_id' instead of 'QuestionID', so running this cell a
# second time without reloading the split raises KeyError.
data_train = create_dataset(data_train)

In [5]:
# Persist the flattened training rows as a pickle.
# NOTE(review): nothing visible in this notebook creates the 'data' directory — confirm it exists.
pd.to_pickle(data_train, 'data/qa_training.pkl')

#### Test & Validation Set

In [6]:
# One row per (question, passage) pair for the held-out test split.
test_queries = create_dataset(data_test)

# Validation rows are built from the dev split (data_val), not from the test split.
validation_queries = create_dataset(data_val)

In [7]:
# Show only the first rows; the full list has one entry per (question, passage) pair.
test_queries[:2]

[{'query_id': '777e7a14-fea3-4c37-a0e6-9ffb50024d5c',
  'query': 'adgm provide clarity level detail documentation accompany report suspicious activity ensure meets regulatory standards?',
  'passage': 'relevant persons comply guidance issued eocn regard identifying reporting suspicious activity transactions relating money laundering, terrorist financing proliferation financing.',
  'doc_id': 1,
  'passage_id': '14.2.3.Guidance.10.',
  'doc_global_id': 'cbe01c7d-746e-4822-a789-e69e97077ff9',
  'doc_local_id': '1-14.2.3.Guidance.10.',
  'serial': 0},
 {'query_id': '0eb99ea8-3810-492c-9986-7739006b5708',
  'query': 'exceptions specific circumstances real-time reporting requirements virtual asset transactions may waived modified?',
  'passage': 'regulatory requirements authorised persons engaged regulated activities relation virtual assets market abuse, transaction reporting misleading impressions (fsmr) similar reporting requirements imposed recognised investment exchanges mtfs relation f

In [8]:
# Evaluation bundle: query texts, one relevant corpus ID per query, and the cleaned corpus.
# NOTE(review): built from test_queries although the name says "vali" — confirm the intended split.
# NOTE(review): a query with several passages keeps only the last doc_global_id,
# because later rows overwrite earlier ones that share the same query_id.
qa_vali = {
    'queries': {row['query_id']: row['query'] for row in test_queries},
    'relevant_docs': {row['query_id']: row['doc_global_id'] for row in test_queries},
    'corpus': {entry['ID']: entry['Passage'] for entry in corpus.values()},
}

In [9]:
# Persist the evaluation bundle (queries, relevant_docs, corpus) as a pickle.
pd.to_pickle(qa_vali,'./data/qa_evaluation.pkl')

## BM25 testing

In [10]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

In [11]:
# BM25 retriever over the corpus, returning the 10 best passages per query.
# Every passage becomes a Document whose metadata carries its identifiers;
# 'global_id' ("<DocumentID>-<PassageID>") is the key the evaluation matches on.
# No preprocess_func is passed, so the retriever's default tokenisation applies.
retriever = BM25Retriever.from_documents(
    [
        Document(
            page_content=entry['Passage'],
            metadata={
                'ID': entry['ID'],
                'DcoumentId': entry['DocumentID'],
                'PassageId': entry['PassageID'],
                'global_id': f"{entry['DocumentID']}-{entry['PassageID']}",
            },
        )
        for entry in corpus.values()
    ],
    k=10,
)

In [12]:
# Per-question score: share of the question's gold passages found among the retrieved documents.
res_metrics = {}
for qt in tqdm(data_test):
    retrieved = retriever.invoke(clean_text(qt['Question']))

    # Gold passages, identified as "<DocumentID>-<PassageID>"
    gold_ids = [f"{p['DocumentID']}-{p['PassageID']}" for p in qt['Passages']]

    # Number of retrieved documents that are gold passages
    hits = sum(1 for doc in retrieved if doc.metadata['global_id'] in gold_ids)

    res_metrics[qt['QuestionID']] = hits / len(gold_ids)

100%|██████████| 2786/2786 [01:07<00:00, 41.29it/s]


In [15]:
# Mean of the per-question scores stored in res_metrics.
sum(res_metrics.values())/len(res_metrics)

0.6793491265853076

## TF-IDF testing

In [16]:
from langchain_community.retrievers import TFIDFRetriever

In [17]:
# TF-IDF retriever over the corpus, returning the 10 best passages per query.
# Passages are used as stored: they were already passed through clean_text
# when the corpus was built, and cleaning them again leaves them unchanged.
retriever = TFIDFRetriever.from_documents(
    [
        Document(
            page_content=entry['Passage'],
            metadata=dict(
                ID=entry['ID'],
                DcoumentId=entry['DocumentID'],
                PassageId=entry['PassageID'],
                # Kept on one line: a line break inside an f-string
                # replacement field is a syntax error before Python 3.12.
                global_id=f"{entry['DocumentID']}-{entry['PassageID']}",
            ),
        )
        for entry in corpus.values()
    ],
    k=10,
    tfidf_params={
        'ngram_range': (1, 2),  # unigrams and bigrams
        'max_features': 15000,  # keep at most the 15,000 most frequent terms
        'stop_words': 'english',  # drop English stop words
        'max_df': 0.8,  # drop terms that appear in more than 80% of the documents
        'min_df': 3,  # drop terms that appear in fewer than 3 documents
    }
)

In [18]:
# Per-question score: share of the question's gold passages found among the retrieved documents.
res_metrics = {}
for qt in tqdm(data_test):
    retrieved = retriever.invoke(clean_text(qt['Question']))

    # Gold passages, identified as "<DocumentID>-<PassageID>"
    gold_ids = [f"{p['DocumentID']}-{p['PassageID']}" for p in qt['Passages']]

    # Number of retrieved documents that are gold passages
    hits = sum(1 for doc in retrieved if doc.metadata['global_id'] in gold_ids)

    res_metrics[qt['QuestionID']] = hits / len(gold_ids)

100%|██████████| 2786/2786 [00:08<00:00, 317.69it/s]


In [19]:
# Mean of the per-question scores stored in res_metrics.
sum(res_metrics.values())/len(res_metrics)

0.6601040918880114

## Embeddings

In [32]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter


# Embedding model handed to the FAISS vector store.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Vector index over the corpus. Passages are used as stored: they were already
# passed through clean_text when the corpus was built, and cleaning them again
# leaves them unchanged.
db = FAISS.from_documents(
    [
        Document(
            page_content=entry['Passage'],
            metadata=dict(
                ID=entry['ID'],
                DcoumentId=entry['DocumentID'],
                PassageId=entry['PassageID'],
                # Kept on one line: a line break inside an f-string
                # replacement field is a syntax error before Python 3.12.
                global_id=f"{entry['DocumentID']}-{entry['PassageID']}",
            ),
        )
        for entry in corpus.values()
    ],
    embeddings)

# Similarity search returning 10 passages per query.
retriever = db.as_retriever(search_type='similarity', search_kwargs={"k": 10})

In [33]:
# Per-question score: share of the question's gold passages found among the retrieved documents.
res_metrics = {}
for qt in tqdm(data_test):
    retrieved = retriever.invoke(clean_text(qt['Question']))

    # Gold passages, identified as "<DocumentID>-<PassageID>"
    gold_ids = [f"{p['DocumentID']}-{p['PassageID']}" for p in qt['Passages']]

    # Number of retrieved documents that are gold passages
    hits = sum(1 for doc in retrieved if doc.metadata['global_id'] in gold_ids)

    res_metrics[qt['QuestionID']] = hits / len(gold_ids)

100%|██████████| 2786/2786 [00:11<00:00, 238.02it/s]


In [34]:
# Mean of the per-question scores stored in res_metrics.
sum(res_metrics.values())/len(res_metrics)

0.6379576453697057