# Flujo completo RAG


In [1]:
import os
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
import re
from typing import List, Dict
from langchain.document_loaders import PyPDFLoader

from dotenv import load_dotenv
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_qdrant import QdrantVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import RetrievalMode
from langchain_huggingface import HuggingFaceEmbeddings


import os
from langchain.chat_models import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredFileLoader
from PyPDF2 import PdfReader
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

## Funciones

In [5]:
def load_pdf_all_documents(directory_path: str) -> List[str]:
    """
    Load PDF documents and return the text of every page.

    Args:
        directory_path (str): Path of the folder that contains the PDF files.
            For convenience, the path of a single file is also accepted; that
            file is then read directly, whatever its extension.

    Returns:
        List[str]: One string per page, for every PDF found. Files inside a
            folder are visited in sorted name order so the result is
            reproducible from run to run.

    Raises:
        FileNotFoundError: If directory_path does not exist.
    """
    if os.path.isfile(directory_path):
        # A single document was given instead of a folder.
        pdf_paths = [directory_path]
    else:
        # os.listdir returns names in arbitrary order; sort them so the page
        # order (and therefore the downstream chunk order) is deterministic.
        pdf_paths = [
            os.path.join(directory_path, filename)
            for filename in sorted(os.listdir(directory_path))
            if filename.lower().endswith('.pdf')  # Keep PDF files only.
        ]

    all_texts = []
    for file_path in pdf_paths:
        print(f"Cargando archivo: {file_path}")
        reader = PdfReader(file_path)
        # Defensive: keep one entry per page and make sure it is a string even
        # when extraction yields nothing, so callers can safely join the list.
        all_texts.extend(page.extract_text() or "" for page in reader.pages)
    return all_texts



def clean_text_and_exclude_sections(text: str) -> str:
    """
    Normalize the whitespace of a text extracted from a PDF.

    Every run of whitespace (spaces, tabs, line breaks) is collapsed into a
    single space and whitespace at both ends is removed. Despite the name of
    the function, no sections are excluded.

    Args:
        text (str): Text extracted from the PDF.
    Returns:
        str: The cleaned text.
    """
    import re

    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text).strip()


# Split a text into sentences
def split_text_into_sentences(text: str) -> List[Dict[str, str]]:
    """
    Split a text into sentences after '.', '?' and '!' and return them as dictionaries.

    Args:
        text (str): The text to split.
    Returns:
        List[Dict[str, str]]: One dictionary per sentence with the keys
            'sentence' (the sentence text) and 'index' (its integer position).
            An empty or whitespace-only text yields an empty list.
    """
    stripped = text.strip()
    if not stripped:
        # re.split on an empty string returns [''], which would otherwise
        # produce a meaningless empty "sentence".
        return []

    # Split on the whitespace that follows sentence-ending punctuation; the
    # look-behind keeps the punctuation attached to its sentence.
    return [
        {'sentence': sentence, 'index': position}
        for position, sentence in enumerate(re.split(r'(?<=[.?!])\s+', stripped))
    ]

# Combine each sentence with its neighbours
def combine_sentences(sentences: List[Dict[str, str]], buffer_size: int = 1) -> List[Dict[str, str]]:
    """
    Attach to every sentence a context window made of its neighbouring sentences.

    The dictionaries are updated in place: each one receives a
    'combined_sentence' key holding the sentence joined (with single spaces)
    to up to `buffer_size` sentences before and after it.

    Args:
        sentences (List[Dict[str, str]]): Sentence dictionaries with a 'sentence' key.
        buffer_size (int): Number of sentences before and after to include.
    Returns:
        List[Dict[str, str]]: The same list that was passed in, now enriched.
    """
    # A negative buffer contributes no neighbours at all.
    reach = max(buffer_size, 0)

    for position, entry in enumerate(sentences):
        first = max(position - reach, 0)
        window = sentences[first:position + reach + 1]
        joined = ' '.join(item['sentence'] for item in window)
        entry['combined_sentence'] = joined.strip()

    return sentences

# Compute cosine distances between neighbouring sentences
def calculate_cosine_distances(sentences: List[Dict[str, str]], model_name: str) -> List[float]:
    """
    Embed the combined sentences and measure the cosine distance between neighbours.

    Each dictionary is updated in place with an 'embedding' key holding the
    vector computed for its 'combined_sentence'.

    Args:
        sentences (List[Dict[str, str]]): Sentence dictionaries that carry a
            'combined_sentence' key.
        model_name (str): Name of the embedding model.

    Returns:
        List[float]: For every pair of consecutive sentences, 1 minus their
            cosine similarity (one value fewer than there are sentences).
    """
    # Embed all combined sentences in a single call
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    texts = [entry['combined_sentence'] for entry in sentences]
    vectors = embedder.embed_documents(texts)

    # Store each vector next to the sentence it belongs to
    for position in range(len(sentences)):
        sentences[position]['embedding'] = vectors[position]

    # Distance between each sentence and the one that follows it
    return [
        1 - cosine_similarity([current['embedding']], [following['embedding']])[0][0]
        for current, following in zip(sentences, sentences[1:])
    ]

# Group sentences into chunks
def split_into_chunks(sentences: List[Dict[str, str]], distances: List[float], threshold: float) -> List[str]:
    """
    Group consecutive sentences into chunks, cutting where the cosine distance is large.

    A cut is placed after sentence i whenever distances[i] is strictly greater
    than the threshold. The sentences of each chunk are joined with a space.

    Args:
        sentences (List[Dict[str, str]]): Sentence dictionaries with a 'sentence' key.
        distances (List[float]): Distances between consecutive sentences.
        threshold (float): Distance above which a new chunk is started.

    Returns:
        List[str]: The text chunks, in document order.
    """
    # Positions (exclusive end indices) at which a chunk is closed
    cut_points = [position + 1 for position, gap in enumerate(distances) if gap > threshold]

    pieces = []
    cursor = 0
    for cut in cut_points:
        pieces.append(' '.join(entry['sentence'] for entry in sentences[cursor:cut]))
        cursor = cut

    # Whatever remains after the last cut forms the final chunk
    if cursor < len(sentences):
        pieces.append(' '.join(entry['sentence'] for entry in sentences[cursor:]))

    return pieces

# Embeddings

def create_qdrant_store(model_name: str, chunks: List[str]) -> QdrantVectorStore:
    """
    Build a QdrantVectorStore from a list of text chunks.

    Each chunk becomes a Document whose metadata records its position under
    'chunk_index'. The store is created with location ":memory:", in the
    collection "my_documents", using dense retrieval.

    Args:
        model_name (str): Name of the embedding model.
        chunks (List[str]): Text chunks to index.

    Returns:
        QdrantVectorStore: The populated vector store.
    """
    # Embedding model used to vectorize the chunks
    embedder = HuggingFaceEmbeddings(model_name=model_name)

    # Wrap every chunk in a Document that remembers its position
    documents = []
    for position, text in enumerate(chunks):
        documents.append(Document(page_content=text, metadata={"chunk_index": position}))

    # Create the store; ":memory:" is the location used here
    return QdrantVectorStore.from_documents(
        documents,
        embedding=embedder,
        location=":memory:",
        collection_name="my_documents",
        retrieval_mode=RetrievalMode.DENSE,
    )





def create_rag_chain(model, openai_api_key, qdrant):
    """
    Build a RAG (Retrieval-Augmented Generation) chain with LangChain.

    The chain retrieves documents from the vector store, joins their contents
    into a context string, fills the "rlm/rag-prompt" prompt pulled from
    LangChain Hub, sends it to the chat model and parses the reply to a string.

    Args:
        model (str): Name of the OpenAI chat model.
        openai_api_key (str): OpenAI API key.
        qdrant: QdrantVectorStore used as the retriever source.

    Returns:
        The composed chain; call `.invoke(question)` on it to get an answer.
    """
    # Chat model (temperature 0.7)
    chat_model = ChatOpenAI(
        model=model,
        temperature=0.7,
        openai_api_key=openai_api_key,
    )

    # Prompt template fetched from LangChain Hub
    rag_prompt = hub.pull("rlm/rag-prompt")

    def format_docs(docs):
        # Concatenate the retrieved documents, separated by a blank line
        contents = [doc.page_content for doc in docs]
        return "\n\n".join(contents)

    # Retriever backed by the vector store
    doc_retriever = qdrant.as_retriever()

    # The question is passed through unchanged while the context is retrieved
    chain_inputs = {"context": doc_retriever | format_docs, "question": RunnablePassthrough()}
    return chain_inputs | rag_prompt | chat_model | StrOutputParser()


In [None]:
# Parameters
model = "gpt-3.5-turbo"
openai_api_key = os.getenv("OPENAI_API_KEY")
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
#directory_path = "../practicos-rag/data/USA/"
# NOTE(review): this path points at a single PDF file, while the variable name
# and the loader's docstring describe a folder — confirm the loader accepts it.
directory_path = "/Users/v0a02bg/practicos-rag/otros/usa2/CFR-2024-vol8.pdf"
buffer_size = 2  # Number of sentences before and after to combine
threshold = 0.5  # Threshold used to split chunks

# 1. Load the text of the PDF documents
pdf_texts = load_pdf_all_documents(directory_path)

# 2. Join the text of all pages into a single string
full_text = " ".join(pdf_texts)

# 3. Clean the text (collapses whitespace; no sections are actually excluded)
cleaned_text = clean_text_and_exclude_sections(full_text)

# 4. Process the text to split it into chunks
# 4.1 Split the text into sentences
sentences = split_text_into_sentences(cleaned_text)

# 4.2 Combine sentences with a buffer of neighbours
combined_sentences = combine_sentences(sentences, buffer_size)

# 4.3 Compute cosine distances between combined sentences
distances = calculate_cosine_distances(combined_sentences, model_name)

# 4.4 Split the text into chunks based on the threshold
chunks = split_into_chunks(combined_sentences, distances, threshold)

FileNotFoundError: [Errno 2] No such file or directory: '../practicos-rag/otros/usa2/CFR-2024-vol8.pdf'

In [7]:
/Users/v0a02bg/practicos-rag/otros/usa2/CFR-2024-vol8.pdf

 flujo_completo3.ipynb
01_intro.ipynb
02_chunking.ipynb
03_embedding.ipynb
04_vector_databases.ipynb
CRM-Q2-FY25-Earnings-Press-Release-w-financials.pdf
Prueba2.ipynb
USA/
cacao/
chunk.py
data.txt
document_splitter.py
embedding.py
example.md
flujo_completo.ipynb
flujo_completo2.ipynb
load.py
main.ipynb
main_app.py
mit.txt
note.txt
usa2/
visual_instruction_tunning.pdf
vstore.py


In [9]:
# Parameters
model = "gpt-3.5-turbo"
openai_api_key = os.getenv("OPENAI_API_KEY")
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
#directory_path = "../practicos-rag/data/USA/"
directory_path = "../practicos-rag/data/cacao"
buffer_size = 2  # Number of sentences before and after to combine
threshold = 0.5  # Threshold used to split chunks

# 1. Load the text of the PDF documents
pdf_texts = load_pdf_all_documents(directory_path)

# 2. Join the text of all pages into a single string
full_text = " ".join(pdf_texts)

# 3. Clean the text (collapses whitespace; no sections are actually excluded)
cleaned_text = clean_text_and_exclude_sections(full_text)

# 4. Process the text to split it into chunks
# 4.1 Split the text into sentences
sentences = split_text_into_sentences(cleaned_text)

# 4.2 Combine sentences with a buffer of neighbours
combined_sentences = combine_sentences(sentences, buffer_size)

# 4.3 Compute cosine distances between combined sentences
distances = calculate_cosine_distances(combined_sentences, model_name)

# 4.4 Split the text into chunks based on the threshold
chunks = split_into_chunks(combined_sentences, distances, threshold)

# 5. Create the Qdrant store from the processed chunks
qdrant_store = create_qdrant_store(model_name, chunks)

# 6. Configure and create the RAG chain
rag_chain = create_rag_chain(model, openai_api_key, qdrant_store)

# 7. Run inference with a specific question
# NOTE(review): this question is about FDA/ACE while directory_path points at
# the cacao data; the recorded run answered "I don't know".
question = "What are the mandatory data elements that must be submitted in the Automated Commercial Environment (ACE) for articles regulated by the FDA?"
response = rag_chain.invoke(question)

# 8. Print the answer
print("Respuesta:", response)


Cargando archivo: ../practicos-rag/data/cacao/Regulaciones cacao y chocolate 2003.pdf
Respuesta: I don't know.


In [10]:
# Display the vector store object built in the previous cell
qdrant_store

<langchain_qdrant.qdrant.QdrantVectorStore at 0x7ff64c67d710>

In [11]:
# Query the existing RAG chain about cocoa butter proportions and print the answer
question = "Can you tell me what are the proportions of cocoa butter in the chocolate?"
response = rag_chain.invoke(question)
print("Respuesta:", response)

Respuesta: The proportions of cocoa butter in chocolate must be calculated according to the weight of the dry matter, with not less than 20 percent cocoa butter. The regulations specify that the chocolate product must contain not less than 18 percent cocoa butter. The total dry cocoa solids content must be not less than 35 percent, including at least 18 percent cocoa butter.


In [12]:
# Query the existing RAG chain about authorized vegetable fats and print the answer
question = "Which vegetable fats, apart from cocoa butter, are authorized to be used in chocolate products according to the regulations?"
response = rag_chain.invoke(question)
print("Respuesta:", response)

Respuesta: Apart from cocoa butter, vegetable fats such as Illipe, Palm-oil, Sal, Shea, Kokum gurgi, and Mango kernel can be authorized for use in chocolate products according to the regulations. These vegetable fats must comply with specific criteria, including being non-lauric vegetable fats rich in certain types of triglycerides and obtained through specific processes like refining or fractionation. Coconut oil can also be used in chocolate for the manufacture of ice cream and similar frozen products.


In [21]:
import pickle
# NOTE: this import rebinds create_rag_chain, replacing the function defined
# earlier in this notebook with the packaged version from src.retrievers.
from src.retrievers.rag_retriever import create_rag_chain

# Reload a previously pickled vector store from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that you created yourself.
with open('qdrant_store.pkl', 'rb') as f:
    qdrant_loaded = pickle.load(f)

# Rebuild the RAG chain on top of the reloaded store (openai_api_key comes
# from an earlier cell).
rag_chain = create_rag_chain("gpt-3.5-turbo", openai_api_key, qdrant_loaded)

# Ask a question about the ACE submission requirements and print the answer
question = "What information must be submitted to the ACE system for electronic entry of FDA-regulated products according to Section 1.72?"
response = rag_chain.invoke(question)
print("Respuesta:", response)

Respuesta: To submit electronic entry of FDA-regulated products to the ACE system according to Section 1.72, the following information must be provided: FDA Country of Production, Complete FDA Product Code consistent with the invoice description, and Full Intended Use Code. Additionally, Importer of record contact information such as telephone and email address must be included in the submission.
