# Flujo completo RAG


In [1]:
import os
import re
import logging
from tqdm import tqdm
import random

from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform

from PyPDF2 import PdfReader
from dotenv import load_dotenv
from datasets import Dataset
from typing import List, Dict, Tuple, Optional

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_qdrant import QdrantVectorStore

#from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

from qdrant_client.http.models import Distance, VectorParams
from qdrant_client import QdrantClient
from uuid import uuid4

## Funciones

In [2]:

# Configure the root logger so INFO-level messages emitted by the pipeline are shown
logging.basicConfig(level=logging.INFO)

# --- PDF Processing ---
def load_pdf_all_documents(directory_path: str) -> List[str]:
    """
    Load every PDF found in a folder and return the text of each page.

    Files are visited in sorted name order so the page order of the result is
    reproducible (``os.listdir`` makes no ordering guarantee). A file that
    cannot be read is logged and skipped; pages already extracted from it
    before the failure are kept (best effort).

    Args:
        directory_path (str): Folder that contains the PDF files. Only entries
            whose name ends in ``.pdf`` (case-insensitive) are read.

    Returns:
        List[str]: One string per PDF page. A page for which no text could be
        extracted contributes an empty string, never ``None``, so the result
        can always be joined into a single string.
    """
    all_texts = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            logging.info(f"Cargando archivo: {file_path}")
            # PdfReader comes from the module-level PyPDF2 import.
            reader = PdfReader(file_path)
            for page in reader.pages:
                # Normalise a missing text layer to "" so callers can join safely.
                all_texts.append(page.extract_text() or "")
        except Exception as e:
            # Deliberate best effort: one broken PDF must not abort the whole load.
            logging.error(f"Error leyendo el archivo PDF: {file_path}. Detalle: {e}")
    return all_texts

# --- Text Processing ---
def clean_text_and_exclude_sections(text: str) -> str:
    """
    Normalise the whitespace of a text.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space, and leading/trailing whitespace is removed. No other
    characters are changed and no sections are dropped.

    Args:
        text (str): Text to clean.

    Returns:
        str: The text with redundant whitespace removed.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

# Función para dividir texto en oraciones
def split_text_into_sentences(text: str) -> List[Dict[str, str]]:
    """
    Split a text into sentences.

    A sentence boundary is any run of whitespace that directly follows '.',
    '?' or '!'. The terminating punctuation stays with its sentence.

    Args:
        text (str): Text to split; surrounding whitespace is ignored.

    Returns:
        List[Dict[str, str]]: One dict per sentence with the keys 'sentence'
        (the sentence text) and 'index' (its 0-based position, an int).
        An empty text yields a single entry holding an empty sentence.
    """
    boundary = re.compile(r'(?<=[.?!])\s+')
    sentences = []
    for position, fragment in enumerate(boundary.split(text.strip())):
        sentences.append({'sentence': fragment, 'index': position})
    return sentences

# Función para combinar oraciones
def combine_sentences(sentences: List[Dict[str, str]], buffer_size: int = 1) -> List[Dict[str, str]]:
    """
    Attach to every sentence a context window made of its neighbours.

    For each entry, the texts of up to ``buffer_size`` preceding sentences,
    the sentence itself and up to ``buffer_size`` following sentences are
    joined with single spaces and stored under 'combined_sentence'.

    Args:
        sentences (List[Dict[str, str]]): Entries carrying a 'sentence' key.
            The dicts are updated in place.
        buffer_size (int): Number of neighbours taken on each side.

    Returns:
        List[Dict[str, str]]: The same list object that was passed in.
    """
    # A negative buffer contributes no neighbours, exactly like an empty range.
    reach = max(buffer_size, 0)
    for position, entry in enumerate(sentences):
        first = max(position - reach, 0)
        window = sentences[first:position + reach + 1]
        joined = ' '.join(neighbour['sentence'] for neighbour in window)
        entry['combined_sentence'] = joined.strip()
    return sentences

# Función para calcular distancias coseno
def calculate_cosine_distances(sentences: List[Dict[str, str]], model_name: str) -> List[float]:
    """
    Embed every combined sentence and measure how far apart neighbours are.

    Args:
        sentences (List[Dict[str, str]]): Entries carrying a
            'combined_sentence' key. Each dict is updated in place with an
            'embedding' key holding the vector computed for it.
        model_name (str): Name of the HuggingFace embedding model to load.

    Returns:
        List[float]: ``1 - cosine_similarity`` for each pair of consecutive
        entries, i.e. one value fewer than there are sentences.
    """
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    vectors = embedder.embed_documents([entry['combined_sentence'] for entry in sentences])

    # Keep the vector next to the sentence it belongs to.
    for position, entry in enumerate(sentences):
        entry['embedding'] = vectors[position]

    return [
        1 - cosine_similarity([current['embedding']], [following['embedding']])[0][0]
        for current, following in zip(sentences, sentences[1:])
    ]

# Función para dividir en fragmentos
def split_into_chunks(sentences: List[Dict[str, str]], distances: List[float], threshold: float) -> List[str]:
    """
    Group consecutive sentences into chunks, cutting where the distance jumps.

    A cut is placed after sentence ``i`` whenever ``distances[i]`` is strictly
    greater than ``threshold``. Sentences inside a chunk are joined with a
    single space.

    Args:
        sentences (List[Dict[str, str]]): Entries carrying a 'sentence' key.
        distances (List[float]): Distance between sentence ``i`` and ``i + 1``.
        threshold (float): Distance above which a new chunk is started.

    Returns:
        List[str]: The chunk texts, in document order.
    """
    # Position (exclusive end) of every cut, derived from the distances alone.
    cut_points = [position + 1 for position, gap in enumerate(distances) if gap > threshold]

    pieces = []
    begin = 0
    for end in cut_points:
        pieces.append(' '.join(entry['sentence'] for entry in sentences[begin:end]))
        begin = end

    # Whatever follows the last cut forms the final chunk.
    if begin < len(sentences):
        pieces.append(' '.join(entry['sentence'] for entry in sentences[begin:]))

    return pieces


def extract_metadata(text_chunk: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find the first title, subtitle and sub-subtitle heading in a text chunk.

    Headings are recognised by three fixed patterns: "PART <n>-..." (title),
    "Subpart <letter>—..." (subtitle) and "§ <n>.<n> ..." (sub-subtitle).

    Args:
        text_chunk (str): Text to scan.

    Returns:
        Tuple[Optional[str], Optional[str], Optional[str]]: The stripped text
        of the first match of each pattern, in the order (title, subtitle,
        sub-subtitle); ``None`` for a pattern that does not occur.
    """
    heading_patterns = (
        re.compile(r"PART \d+[-—]\s*[A-Za-z0-9 ,.\-]+"),
        # NOTE(review): unlike the other two, this class has "\\-" (accepts a
        # literal backslash) and no "." - possibly unintended; confirm.
        re.compile(r"Subpart [A-Z]—[A-Za-z0-9 ,\\-]+"),
        re.compile(r"§\s*\d+\.\d+\s+[A-Za-z0-9 ,.\-]+"),
    )

    found = []
    for pattern in heading_patterns:
        hit = pattern.search(text_chunk)
        if hit:
            found.append(hit.group(0).strip())
        else:
            found.append(None)
    return tuple(found)

def assign_metadata_to_chunks_with_context(chunks: List[str], max_previous_chunks: int = 100) -> List[Dict[str, str]]:
    """
    Annotate every chunk with the headings that were in effect before it.

    For chunk ``i`` each metadata field (title, subtitle, sub-subtitle) is the
    most recent heading of that kind found in the preceding chunks, looking
    back at most ``max_previous_chunks`` chunks. Headings inside chunk ``i``
    itself are not applied to chunk ``i``; they only affect later chunks.

    Each chunk is scanned for headings exactly once, so the work grows
    linearly with the number of chunks instead of re-scanning the whole
    look-back window for every chunk.

    Args:
        chunks (List[str]): Chunk texts in document order.
        max_previous_chunks (int): How many preceding chunks may contribute
            metadata. Zero or a negative value means no chunk contributes.

    Returns:
        List[Dict[str, str]]: One dict per chunk with the keys 'chunk_text'
        and 'metadata'; 'metadata' maps 'title', 'subtitle' and 'sub_subtitle'
        to the heading text, or ``None`` when none is in range.
    """
    fields = ("title", "subtitle", "sub_subtitle")
    # Last heading seen for each field and the index of the chunk it came from.
    latest_value = dict.fromkeys(fields)
    latest_index = dict.fromkeys(fields, -1)

    annotated_chunks = []
    for i, chunk in enumerate(chunks):
        window_start = max(0, i - max_previous_chunks)
        # The latest heading is the last one in the window if it is inside the
        # window; if it is not, nothing newer exists, so the field is empty.
        metadata = {
            field: latest_value[field] if latest_index[field] >= window_start else None
            for field in fields
        }
        annotated_chunks.append({"chunk_text": chunk, "metadata": metadata})

        # Record this chunk's own headings only now, so they apply to later chunks.
        for field, value in zip(fields, extract_metadata(chunk)):
            if value:
                latest_value[field] = value
                latest_index[field] = i
    return annotated_chunks

# --- Qdrant Vector Store ---
def create_qdrant_store(model_name: str, chunks: List[str]) -> QdrantVectorStore:
    """
    Build an in-memory hybrid (dense + sparse) Qdrant store from annotated chunks.

    Args:
        model_name (str): Name of the HuggingFace model used for the dense
            embeddings. Sparse embeddings always use "Qdrant/bm25".
        chunks (List[str]): Despite the annotation, each item is indexed as a
            mapping with a 'chunk_text' entry and a 'metadata' mapping that is
            read for 'title', 'subtitle' and 'sub_subtitle'.

    Returns:
        QdrantVectorStore: Store holding one document per chunk in the
        "my_documents" collection.
    """
    dense_embeddings = HuggingFaceEmbeddings(model_name=model_name)
    bm25_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

    metadata_fields = ("title", "subtitle", "sub_subtitle")
    documents_for_qdrant = []
    for item in chunks:
        # NOTE(review): the '' default only applies when a key is absent; a key
        # present with value None is stored as None - confirm that is intended.
        documents_for_qdrant.append(
            Document(
                page_content=item['chunk_text'],
                metadata={field: item['metadata'].get(field, '') for field in metadata_fields},
            )
        )

    # location=":memory:" keeps the collection in RAM only.
    return QdrantVectorStore.from_documents(
        documents_for_qdrant,
        embedding=dense_embeddings,
        sparse_embedding=bm25_embeddings,
        location=":memory:",
        collection_name="my_documents",
        retrieval_mode=RetrievalMode.HYBRID,
    )

def create_llm(model_name: str, temperature: float, openai_api_key: str) -> ChatOpenAI:
    """
    Build a ChatOpenAI client from the given settings.

    Args:
        model_name (str): Name of the OpenAI chat model.
        temperature (float): Sampling temperature passed to the model.
        openai_api_key (str): API key used to authenticate against OpenAI.

    Returns:
        ChatOpenAI: The configured chat model instance.
    """
    settings = {
        "model": model_name,
        "temperature": temperature,
        "openai_api_key": openai_api_key,
    }
    return ChatOpenAI(**settings)

def create_rag_chain(qdrant: QdrantVectorStore, llm: ChatOpenAI) -> Tuple:
    """
    Build a Retrieval-Augmented Generation chain on top of a Qdrant store.

    The chain retrieves documents for the incoming question, joins their
    contents with blank lines into the prompt's "context", forwards the
    question unchanged, runs the "rlm/rag-prompt" prompt pulled from
    LangChain Hub through the LLM and parses the reply as a string.

    Args:
        qdrant (QdrantVectorStore): Vector store used to retrieve the context
            documents.
        llm (ChatOpenAI): Chat model that generates the answer.

    Returns:
        Tuple: ``(rag_chain, retriever)`` - the runnable chain and the
        retriever obtained from ``qdrant.as_retriever()`` that it uses.
    """
    # The prompt is fetched from LangChain Hub on every call.
    prompt = hub.pull("rlm/rag-prompt")

    def format_docs(docs):
        # Collapse the retrieved documents into one context string.
        return "\n\n".join(doc.page_content for doc in docs)

    retriever = qdrant.as_retriever()

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain, retriever

In [None]:
def load_pdf(file_path):
    """
    Load a PDF file with PyPDFLoader and return everything it yields.

    Environment variables from a .env file are loaded first via
    ``load_dotenv()``.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        list: The documents produced by ``PyPDFLoader(file_path).load()``.
    """
    load_dotenv()
    return PyPDFLoader(file_path).load()

def split_pdf_documents(docs, chunk_size=1000, chunk_overlap=200):
    """
    Split loaded documents into overlapping text fragments.

    Args:
        docs (list): Documents previously loaded from a PDF.
        chunk_size (int): Size of each fragment, passed to
            RecursiveCharacterTextSplitter. Defaults to 1000.
        chunk_overlap (int): Overlap between consecutive fragments.
            Defaults to 200.

    Returns:
        list: The fragments returned by the splitter's ``split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

# --- Qdrant Vector Store ---
def create_qdrant_store_naive(model_name: str, chunks: List[str]) -> QdrantVectorStore:
    """
    Build a dense-only Qdrant store persisted on local disk and fill it.

    The "naive_documents" collection under /tmp/langchain_qdrant5 is reused
    when it already exists and created otherwise. When it is created, its
    vector size is taken from the embedding model itself, so any
    ``model_name`` works (not only 384-dimensional models).

    Every call inserts ``chunks`` under freshly generated UUIDs, so calling
    this repeatedly against the same on-disk collection adds the documents
    again.

    Args:
        model_name (str): Name of the HuggingFace embedding model.
        chunks (List[str]): Items to index. They are handed to
            ``add_documents`` unchanged - presumably LangChain Document
            objects such as those produced by a text splitter; verify against
            the caller.

    Returns:
        QdrantVectorStore: A store that is ready for retrieval. Its client is
        deliberately left open, because a store whose client has been closed
        cannot serve queries; the caller owns the client and should close it
        (``store.client.close()``) when the store is no longer needed.
    """
    collection_name = "naive_documents"
    open_source_embeddings = HuggingFaceEmbeddings(model_name=model_name)
    client = QdrantClient(path="/tmp/langchain_qdrant5")

    try:
        client.get_collection(collection_name)
    except ValueError:
        # Collection does not exist yet: size it from a real embedding
        # instead of assuming a fixed dimensionality.
        vector_size = len(open_source_embeddings.embed_query("dimension probe"))
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size,
                                        distance=Distance.COSINE),
        )

    qdrant = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=open_source_embeddings
    )
    uuids = [str(uuid4()) for _ in chunks]
    qdrant.add_documents(documents=chunks, ids=uuids)
    return qdrant

In [3]:
def generate_factoid_qa_prompt():
    """
    Build the prompt used to generate one factoid question/answer pair.

    The template has a single input variable, ``context``, and instructs the
    model to reply with a "Factoid question:" line followed by an "Answer:"
    line.

    Returns:
        ChatPromptTemplate: Prompt template for QA generation.
    """
    template_text = """
    Your task is to generate a *factoid question* and its corresponding *answer* based on the given context.

    Here are the rules:
    1. The *factoid question* must be directly answerable with a specific and concise piece of factual information from the context.
    2. Avoid using phrases like "according to the passage" or "based on the context" in your question.
    3. The question should resemble the style of queries typically entered in a search engine, focusing on clarity and relevance.
    4. The context provided will have a maximum token limit of 200 to 300 tokens.                                                        

    Please provide your response in the following format:

    Output:::
    Factoid question: (Your factoid question here)
    Answer: (The answer to the factoid question here)

    Here is the context:

    Context: {context}

    Output:::
    """
    return ChatPromptTemplate.from_template(template_text)


# Corregir la implementación del Passthrough
class SimplePassthrough:
    """Callable that hands back whatever it is given, untouched."""

    def __call__(self, inputs):
        """Return ``inputs`` itself (the same object, not a copy)."""
        return inputs


# Definimos una función para manejar un solo contexto
def question_chain(context, prompt, config):
    """
    Ask the LLM to produce a question/answer pair for one context.

    Args:
        context: Text inserted into the prompt's ``context`` variable.
        prompt: Prompt template; its ``format`` method is called with the
            context.
        config: Mapping providing "model", "temperature" and
            "openai_api_key", used to build the LLM through ``create_llm``.

    Returns:
        The value returned by ``StrOutputParser().parse`` for the LLM reply.
        NOTE(review): extract_questions_and_answers reads ``.content`` from
        these results, so the reply presumably still carries the chat message
        object - confirm before changing the parsing step.
    """
    # Fill the template; the context is the only variable supplied.
    rendered_prompt = prompt.format(context=context)

    # A new LLM client is built on every call from the shared configuration.
    model = create_llm(config["model"], config["temperature"], config["openai_api_key"])
    raw_reply = model.invoke(rendered_prompt)

    return StrOutputParser().parse(raw_reply)

# Proceso para manejar varios documentos
def process_multiple_docs(docs, prompt, config, num_samples=15):
    """
    Generate a question/answer output for a random sample of documents.

    Args:
        docs: Sequence of documents exposing a ``page_content`` attribute.
        prompt: Prompt template forwarded to ``question_chain``.
        config: LLM configuration forwarded to ``question_chain``.
        num_samples (int): Desired sample size. When fewer documents are
            available, all of them are used instead of raising an error.

    Returns:
        list: One ``question_chain`` result per sampled document, in sampling
        order; an empty list when there is nothing to sample.
    """
    # random.sample() raises ValueError if asked for more items than exist.
    sample_size = min(num_samples, len(docs))
    sampled_docs = random.sample(docs, sample_size)
    if not sampled_docs:
        return []

    sampled_contexts = [doc.page_content for doc in sampled_docs]
    return [
        question_chain(sampled_context, prompt, config)
        for sampled_context in tqdm(sampled_contexts, desc="Processing questions")
    ]

def extract_questions_and_answers(data):
    """
    Pull the generated question and answer out of each LLM reply.

    A reply is expected to contain "Factoid question:" followed later by
    "Answer:". The question is the text between the two markers and the
    answer is everything after the "Answer:" marker; both are stripped. The
    markers may be on the same line or on separate lines.

    Args:
        data: Iterable of replies. Each item is either a plain string or an
            object whose ``content`` attribute is a string.

    Returns:
        tuple: ``(questions, answers)`` - two lists of equal length, aligned
        by position. Replies that are not text, or that lack either marker in
        the expected order, are skipped.
    """
    question_marker = "Factoid question:"
    answer_marker = "Answer:"

    questions = []
    answers = []
    for message in data:
        content = message if isinstance(message, str) else getattr(message, "content", None)
        if not isinstance(content, str):
            continue

        _, found_question, remainder = content.partition(question_marker)
        if not found_question:
            continue

        # Only an "Answer:" that comes AFTER the question marker counts.
        question, found_answer, answer = remainder.partition(answer_marker)
        if not found_answer:
            continue

        questions.append(question.strip())
        answers.append(answer.strip())
    return questions, answers



def evaluate_rag_pipeline(rag_chain, retriever, questions, ground_truths):
    """
    Run the RAG pipeline on a set of questions and score it with RAGAs.

    For every question the chain's answer and the page contents of the
    documents returned by the retriever are collected; the resulting dataset
    is evaluated with context precision, context recall, faithfulness and
    answer relevancy.

    Args:
        rag_chain: Chain invoked with each question to obtain the answer.
        retriever: Retriever invoked with each question to obtain the contexts.
        questions (list): Questions to evaluate.
        ground_truths (list): Expected answers, aligned with ``questions``.
            Each item may be a string or a sequence of strings; for a
            sequence the first element is used as the reference.

    Returns:
        pandas.DataFrame: Evaluation results as returned by
        ``result.to_pandas()``.
    """
    answers = []
    contexts = []

    for query in questions:
        answers.append(rag_chain.invoke(query))
        relevant_docs = retriever.invoke(query)
        contexts.append([doc.page_content for doc in relevant_docs])

    # Indexing a plain string with [0] would keep only its first character,
    # so whole strings are used as-is and only sequences are unwrapped.
    references = [
        gt if isinstance(gt, str) else (gt[0] if gt else "")
        for gt in ground_truths
    ]

    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths,
        "reference": references,
    })

    result = evaluate(
        dataset=dataset,
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
    )

    return result.to_pandas()

In [13]:
# --- Flujo Principal ---
def main():
    """
    Run the full pipeline: build the semantic-chunking RAG, evaluate it, query it.

    Steps:
      1. Load every PDF in ./data, clean the text, split it into sentences,
         group them into chunks by embedding distance and annotate the
         chunks with heading metadata.
      2. Index the chunks in Qdrant and build the RAG chain.
      3. Generate question/answer pairs from randomly sampled pages of the
         reference PDF, evaluate the chain with RAGAs and write the scores
         to results.csv ('|'-separated).
      4. Log the answer to one example question.

    Paths are built with os.path.join so the script is not tied to the
    Windows path separator.
    """
    os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
    load_dotenv()
    data_dir = os.path.join(os.getcwd(), "data")
    config = {
        "model": "gpt-3.5-turbo",
        "openai_api_key": os.getenv("OPENAI_API_KEY"),
        "model_name": "sentence-transformers/paraphrase-MiniLM-L6-v2",
        # Joining with "" keeps the trailing separator of the directory path.
        "directory_path": os.path.join(data_dir, ""),
        "buffer_size": 2,
        "threshold": 0.3,
        "max_previous_chunks": 400,
        "temperature": 0.7,
        "file_path": os.path.join(data_dir, "CFR-2024-vol8.pdf"),
        "num_samples": 15,
    }

    # Semantic-chunking RAG
    pdf_texts = load_pdf_all_documents(config["directory_path"])
    cleaned_text = clean_text_and_exclude_sections(" ".join(pdf_texts))
    sentences = split_text_into_sentences(cleaned_text)
    combined_sentences = combine_sentences(sentences, config["buffer_size"])
    distances = calculate_cosine_distances(combined_sentences, config["model_name"])
    chunks = split_into_chunks(combined_sentences, distances, config["threshold"])
    annotated_chunks = assign_metadata_to_chunks_with_context(chunks, config["max_previous_chunks"])
    qdrant_store = create_qdrant_store(config["model_name"], annotated_chunks)
    llm = create_llm(config["model"], config["temperature"], config["openai_api_key"])
    rag_chain, retriever = create_rag_chain(qdrant_store, llm)

    # RAGAs evaluation on generated question/answer pairs
    loader = PyPDFLoader(config["file_path"])
    docs = loader.load()
    prompt = generate_factoid_qa_prompt()
    new_questions = process_multiple_docs(docs, prompt, config, config["num_samples"])
    questions, ground_truths = extract_questions_and_answers(new_questions)
    df_raga = evaluate_rag_pipeline(rag_chain, retriever, questions, ground_truths)
    df_raga.to_csv("results.csv", encoding="utf-8", sep="|")

    # Example query against the finished chain
    question = "What are the procedures for milk pasteurization according to the regulations mentioned in the document?"
    response = rag_chain.invoke(question)
    logging.info(f"Respuesta: {response}")


if __name__ == "__main__":
    main()

INFO:root:Cargando archivo: c:\Users\jomunozf\Documents\GitHub\practicos-rag\data\CFR-2024-vol8.pdf
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/paraphrase-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/paraphrase-MiniLM-L6-v2


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

  llm_output = llm(prompt_output)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing questions:   7%|▋         | 1/15 [00:01<00:20,  1.47s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing questions:  13%|█▎        | 2/15 [00:02<00:16,  1.24s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing questions:  20%|██        | 3/15 [00:03<00:13,  1.09s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing questions:  27%|██▋       | 4/15 [00:04<00:13,  1.23s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing questions:  33%|███▎      | 5/15 [00:06<00:13,  1.30s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing questions:  40%|████      | 6/15 [00:07<00:10,  1.14s/it]INFO:httpx:H

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [None]:


# The top-of-file import of OpenAIEmbeddings is commented out, so bring the
# name into scope here; without it the line below fails with NameError.
from langchain_openai import OpenAIEmbeddings

# API key read from the environment (None when the variable is not set).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)

## Naive RAG

In [2]:
# Settings for the naive RAG run. Paths are built with os.path.join so they
# are valid on every platform, not only with the Windows "\" separator.
config = {
    "model": "gpt-3.5-turbo",
    "openai_api_key": os.getenv("OPENAI_API_KEY"),
    "model_name": "sentence-transformers/paraphrase-MiniLM-L6-v2",
    # Joining with "" keeps the trailing separator of the directory path.
    "directory_path": os.path.join(os.getcwd(), "data", ""),
    "buffer_size": 2,
    "threshold": 0.3,
    "max_previous_chunks": 400,
    "temperature": 0.7,
    "file_path": os.path.join(os.getcwd(), "data", "CFR-2024-vol8.pdf"),
    "num_samples": 15,
}

In [33]:
# Naive RAG pipeline: load the PDF, split it into fragments with the
# splitter's default sizes, index the fragments in the on-disk Qdrant
# collection and build a RAG chain on top of that store.
docs = load_pdf(config["file_path"])
naive_chunks = split_pdf_documents(docs)
naive_qdrant = create_qdrant_store_naive(config["model_name"], naive_chunks)
llm = create_llm(config["model"], config["temperature"], config["openai_api_key"])
# create_rag_chain returns both the chain and the retriever it uses.
naive_rag_chain, naive_retriever = create_rag_chain(naive_qdrant, llm )

In [36]:
# Ask the naive chain the same example question that main() sends to the semantic-chunking chain.
naive_rag_chain.invoke("What are the procedures for milk pasteurization according to the regulations mentioned in the document?")

'According to the regulations mentioned in the document, the procedures for milk pasteurization are as follows:\n\n1. Milk must be heated to a minimum temperature of 161°F (71.7°C) for at least 15 seconds for batch pasteurization or to a minimum temperature of 145°F (62.8°C) for at least 30 minutes for vat pasteurization.\n\n2. The milk must be cooled rapidly to a temperature of 45°F (7.2°C) or lower after pasteurization to prevent the growth of harmful bacteria.\n\n3. Proper sanitation procedures must be followed to prevent contamination of the milk during pasteurization.\n\n4. Records of pasteurization times, temperatures, and cooling procedures must be maintained for inspection by regulatory authorities.\n\n5. Pasteurized milk must be stored and transported in clean, sanitized containers to prevent contamination.\n\n6. Pasteurized milk must be labeled as such to indicate that it has been properly pasteurized according to regulations.\n\n7. Any deviations from the pasteurization proc

The procedures for milk pasteurization according to the regulations mentioned in the document involve heating every particle of milk or cream to a specific temperature and holding it there continuously for a specified time. The temperatures given in the document are 145 °F (63 °C) for 30 minutes or at least 161 °F for at least 15 seconds. These procedures are mandatory for all milk and milk products intended for direct human consumption.

'According to the regulations mentioned in the document, the procedures for milk pasteurization are as follows:\n\n1. Milk must be heated to a minimum temperature of 161°F (71.7°C) for at least 15 seconds for batch pasteurization or to a minimum temperature of 145°F (62.8°C) for at least 30 minutes for vat pasteurization.\n\n2. The milk must be cooled rapidly to a temperature of 45°F (7.2°C) or lower after pasteurization to prevent the growth of harmful bacteria.\n\n3. Proper sanitation procedures must be followed to prevent contamination of the milk during pasteurization.\n\n4. Records of pasteurization times, temperatures, and cooling procedures must be maintained for inspection by regulatory authorities.\n\n5. Pasteurized milk must be stored and transported in clean, sanitized containers to prevent contamination.\n\n6. Pasteurized milk must be labeled as such to indicate that it has been properly pasteurized according to regulations.\n\n7. Any deviations from the pasteurization procedures must be documented and corrective actions taken to prevent future occurrences.\n\nOverall, the goal of milk pasteurization is to ensure the safety and quality of the milk by destroying harmful bacteria while preserving its nutritional value.'

In [None]:
# import pandas as pd
# import os

# pd.read_csv(f'{os.getcwd()}\\results.csv', sep = '|').to_excel('results.xlsx')