In [2]:
from datasets import load_from_disk

SEED = 123
FOLDER = "../data/processed/legal-qa-v1"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
dataset = load_from_disk(FOLDER)

In [None]:
#check the dataset
print(dataset)
print(dataset['train'][0])
print(dataset['validation'][0])
print(dataset['test'][0])

In [None]:
dataset["answer"]

In [None]:
def transform(example):
    # ejemplo: poner preguntas en mayúsculas y respuestas en minúsculas
    new_question = example["question"].replace("Q:", "").strip()
    new_answer = example["answer"].replace("A:", "").strip()

    return {
        "question": new_question,
        "answer": new_answer
    }

In [None]:
transformed_dataset = dataset.map(transform)

In [None]:
print(transformed_dataset["question"][0])
print(transformed_dataset["answer"][0])

In [None]:
# split the dataset into train and validation
train_test = transformed_dataset.train_test_split(test_size=0.2, seed=SEED)
print(train_test)
train_val = train_test["test"].train_test_split(test_size=0.5, seed=SEED)
transformed_dataset = {
    "train": train_test["train"],
    "val": train_val["test"],
    "test": train_val["train"]
}

In [None]:
transformed_dataset

## Embeddings

In [1]:
from typing import List
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer


class SentenceTransformerEmbeddings(Embeddings):

    def __init__(self, model: str, device: str = 'cuda'):
        self.model = SentenceTransformer(model, device=device)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs."""
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed query text."""
        return self.model.encode_query(text).tolist()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"device": "cuda"}
)

loaded_vectorstore=FAISS.load_local(
    "../data/db/parliament_db/parliament_all_docs_embeddings_Qwen_Qwen3-Embedding-0.6B_chunked_max_length-512",
    model,
    allow_dangerous_deserialization=True
)

print(f"Loaded vector store contains {loaded_vectorstore.index.ntotal} vectors")

Loaded vector store contains 128518 vectors


In [5]:
# get all embeddings in numpy array
import numpy as np
all_embeddings = np.array([emb for emb in loaded_vectorstore.index.reconstruct_n(0, loaded_vectorstore.index.ntotal)])
print(f"All embeddings shape: {all_embeddings.shape}")

All embeddings shape: (128518, 1024)


In [None]:
# plot histogram of the embeddings cosine similarity distribution

import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarities = cosine_similarity(all_embeddings)
# get the upper triangle of the cosine similarities matrix, without the diagonal
upper_triangle_indices = np.triu_indices_from(cosine_similarities, k=1)
upper_triangle_values = cosine_similarities[upper_triangle_indices]

plt.hist(upper_triangle_values, bins=50)
plt.grid()
plt.title("Cosine Similarity Distribution of Embeddings")
plt.xlabel("Cosine Similarity")
plt.ylabel("Frequency")
plt.show()

MemoryError: Unable to allocate 61.5 GiB for an array with shape (128518, 128518) and data type float32

: 

In [8]:
# print the mean and standard deviation of the cosine similarities
mean_cosine_similarity = np.mean(upper_triangle_values)
std_cosine_similarity = np.std(upper_triangle_values)
# print the results rounded to 4 decimal places
print(f"Mean Cosine Similarity: {mean_cosine_similarity:.4f}")
print(f"Standard Deviation of Cosine Similarity: {std_cosine_similarity:.4f}")

Mean Cosine Similarity: 0.7246
Standard Deviation of Cosine Similarity: 0.1009


In [None]:
from datasets import load_dataset

# load train/validation/test splits of individual subset
ragbench_hotpotqa = load_dataset("rungalileo/ragbench", "hotpotqa")
print(ragbench_hotpotqa)

In [None]:
ragbench = {}
for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']:
  ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)

In [None]:
ragbench

In [None]:
ragbench["covidqa"]["train"][0]

In [None]:
## quiero solo estas columnas
columns_to_keep = ["id", "question", "documents", "response"]
for subset in ragbench:
    ragbench[subset] = ragbench[subset].remove_columns([col for col in ragbench[subset]['train'].column_names if col not in columns_to_keep])

In [None]:
document = []
for split in ragbench["covidqa"]:
    for docs in ragbench["covidqa"][split]["documents"]:
        for doc in docs:
            document.append(doc)

In [None]:
len(document)

In [None]:
# check unique documents in alphabetical order
len(set(document))
unique_documents = list(set(document))
unique_documents.sort()

In [None]:
# create new column with the idx of the document in the list of unique documents, the idx should be a list of integers and the name of the column should be "document_ids"
unique_documents = list(set(document))
unique_documents.sort()
document_idx_map = {doc: idx for idx, doc in enumerate(unique_documents)}
for split in ragbench["covidqa"]:
    ragbench["covidqa"][split] = ragbench["covidqa"][split].add_column(
        "document_ids",
        [[document_idx_map[doc] for doc in docs] for docs in ragbench["covidqa"][split]["documents"]]
    )
ragbench["covidqa"]["train"][0]

In [None]:
# save to disk the list of unique documents in jsonl format
import os
import json

with open("../data/processed/ragbench/covidqa_unique_documents.jsonl", "w") as f:
    for doc in unique_documents:
        f.write(json.dumps({"document": doc}) + "\n")

# Parliamentary data

In [None]:
from datasets import load_from_disk
FOLDER_AUTORE = "../data/raw/ORDERS_PARLIAMENT" # Entrenamiento autoregresivo

In [None]:
dataset_aut = load_from_disk(FOLDER_AUTORE) 

In [None]:
dataset_aut["train"][0]

In [None]:
print(dataset_aut['train'][0])

In [3]:
import json
FOLDER_QA = "../data/raw/QA_PARLIAMENT_TRAIN"

dataset_qa = load_from_disk(FOLDER_QA)
print(dataset_qa)
print(dataset_qa["train"])
print(json.dumps(dataset_qa['train'][3], indent=2))

DatasetDict({
    train: Dataset({
        features: ['PK', 'question', 'answer', 'cost', 'context', 'type', 'retrieved_pks', 'oracle_context', 'formatted_context'],
        num_rows: 614
    })
    validation: Dataset({
        features: ['PK', 'question', 'answer', 'cost', 'context', 'type', 'retrieved_pks', 'oracle_context', 'formatted_context'],
        num_rows: 161
    })
})
Dataset({
    features: ['PK', 'question', 'answer', 'cost', 'context', 'type', 'retrieved_pks', 'oracle_context', 'formatted_context'],
    num_rows: 614
})
{
  "PK": "6521_1",
  "question": "\u00bfQu\u00e9 argumentos expuso el grupo parlamentario que se opuso a la propuesta de modificaci\u00f3n del orden del d\u00eda en la sesi\u00f3n del 26 de septiembre de 2023, que implicaba la convalidaci\u00f3n del decreto relativo al impuesto de sucesiones y donaciones?",
  "answer": "El fragmento del Diario de Sesiones no proporciona argumentos espec\u00edficos de ning\u00fan grupo parlamentario que se oponga a la pr

In [None]:
doc = dataset_qa["train"][10]["context"]
print(doc)

In [None]:
doc.replace("[Documento]:", "").strip()

In [None]:
import json
FOLDER_QA = "../data/raw/QA_PARLIAMENT_TEST"

dataset_qa = load_from_disk(FOLDER_QA)
print(dataset_qa["test"])
print(json.dumps(dataset_qa['test'][3], indent=2))

In [None]:
doc = dataset_qa["test"][3]["formatted_context"]
print(doc)

In [None]:
pks = dataset_qa["test"][:]['PK']
print(pks)
# hay algunos pk vacíos?
pks.count("")

pk_1 = pks[1]

# buscar en dataset_aut 



In [None]:
dataset = load_from_disk("../data/processed/parliament_all_docs")
print(dataset)

In [None]:
for i in dataset:
    print(i)