<small>


#### Step 0: Set environment

</small>


In [1]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd

In [2]:
# Pull settings from a local .env file (if any) into the environment, then
# read the values used by this notebook. Each one is None when the variable is unset.
load_dotenv()
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
LANGCHAIN_KEY = os.environ.get("LANGCHAIN_API_KEY")
FOLDER_PATH = os.environ.get("FOLDER_PATH")

<small>


#### Step 1: Split text, create/embed chunks and load chunks

</small>


In [3]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#variable to split text
from numpy.core.defchararray import endswith


# Splitter used for every document: chunk_size 500 and chunk_overlap 200,
# both measured with len (i.e. in characters).
split_text = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=200, length_function=len
)

#function to load_documents
def load_documents(FOLDER_PATH):
    """Load every supported file found directly inside ``FOLDER_PATH``.

    ``.pdf`` files are read with ``PyPDFLoader`` and ``.docx`` files with
    ``Docx2txtLoader``; the extension check is case-insensitive. Any other
    entry is reported with a printed message and skipped.

    Args:
        FOLDER_PATH: Directory whose entries are scanned (not recursive).

    Returns:
        A flat list with everything the loaders returned, in file-name order.
    """
    documents = []
    # Sort the listing so the resulting document order is reproducible
    # between runs (os.listdir gives no ordering guarantee).
    for file_name in sorted(os.listdir(FOLDER_PATH)):
        file_path = os.path.join(FOLDER_PATH, file_name)
        lowered = file_name.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".docx"):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"The document {file_name} is not supported")
            # Skip it: without this, the code below would either fail because
            # no loader exists yet or re-load the previous file a second time.
            continue

        documents.extend(loader.load())
    return documents

# Load documents: read every supported file under FOLDER_PATH.
# NOTE(review): the count printed below is the number of objects returned by
# the loaders, which may be larger than the number of files — confirm.
documents = load_documents(FOLDER_PATH)
print(f"{len(documents)} documents loaded")

# Split the text of the loaded documents into overlapping chunks.
chunks = split_text.split_documents(documents)
print(f"{len(chunks)} chunks in total")

    

179 documents loaded
1261 chunks in total


In [5]:
# Embedding model from OpenAI, created with the library defaults.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment
# loaded earlier — confirm.
embeddings = OpenAIEmbeddings()

In [6]:
#load embeddings
from langchain_chroma import Chroma

# Embed the chunks and store them in the 'collahuasi_pdfs' collection,
# persisted under ./cllh_db. from_documents is called on the class itself:
# going through Chroma() first would build an extra, unused default store.
# NOTE(review): re-running this cell probably adds the same chunks to the
# persisted collection again — confirm and guard if duplicates matter.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name='collahuasi_pdfs',
    persist_directory='./cllh_db',
)

In [7]:
# Create a retriever on top of the vector store; "k": 3 requests three
# results per query.
retriever = vector_db.as_retriever(search_kwargs={"k":3})

<small>

#### Step 2: Start to create the chain

</small>

In [11]:
#Call to model
from langchain_openai import ChatOpenAI
# Chat model shared by every chain in this notebook.
llm = ChatOpenAI(model="gpt-4o")

In [8]:
#Simple parse answer of the model
from langchain_core.output_parsers import StrOutputParser
# Last step of each chain: turns the model reply into a plain string.
parser =  StrOutputParser()

In [9]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt for the basic RAG chain. It exposes two placeholders, {context}
# and {question}, which the chain fills in before calling the model.
template = """
You are an expert in environmental consulting projects in the north of Chile. 
Always answer the question based only on the following context:
{context}

Question: {question}

Answer: ""
"""
prompt = ChatPromptTemplate.from_template(template)

<small>

#### ⚙️ Step-by-step flow of the `rag_chain`

1. **Input**  
   `"tell me the height of the Collahuasi campament"`

2. **Branch mapping**  
   - **context** → input goes to the `retriever`, which returns `docs`.  
     The lambda joins all document texts into one string using `"\n\n"`.  
   - **question** → `RunnablePassthrough()` passes the original input unchanged.

3. **Prompt**  
   The `prompt` fills its template with `{context}` and `{question}`.

4. **LLM**  
   The `llm` generates an answer based on the formatted prompt.

5. **Parser**  
   The `parser` (`StrOutputParser`) returns the model’s reply as a plain string.

**Result:**  
A final, parsed answer based on the retrieved context and user question.

</small>

In [19]:
from langchain.schema.runnable import RunnablePassthrough

# The retriever outputs a list of documents; their texts are joined into a
# single context string, while the question itself is passed through as-is.
rag_chain = (
    {
        "context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | parser
)

#rag_chain.invoke('tell me the height of the collahuasi campament')

<small>

#### Step 3 ⚙️ Make the chat history-aware: improve the retriever by using chat history

1. **Input**  
   User asks a question.

2. **History retriever**  
   Reformulates the question using `chat_history`, queries the retriever, and returns relevant docs.

3. **Context branch**  
   Joins all `page_content` from docs with `"\n\n"` → becomes `{context}`.

4. **Prompt**  
   Combines `{chat_history}`, `{context}`, and `{question}` into the `answer_prompt`.


**Result:**  
A context-aware answer built from retrieved documents and chat history.

</small>


In [20]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import MessagesPlaceholder

# Conversation so far, kept in memory as alternating human / AI messages.
chat_history: list = []

# First turn, answered by the plain RAG chain.
question = 'tell me the height of the collahuasi campament'
answer = rag_chain.invoke(question)

# Record the exchange so that later turns can refer back to it.
chat_history.append(HumanMessage(content=question))
chat_history.append(AIMessage(content=answer))

In [25]:
from langchain.chains import create_history_aware_retriever

# Prompt that asks the model to rewrite the latest question so it can be
# understood without the chat history.
contextualize_history_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Given the chat history and the latest user question, rewrite it self-contained."),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

# Retriever that takes {"input", "chat_history"} and uses the prompt above
# together with the LLM before querying the underlying retriever.
# NOTE(review): presumably the rewrite step is skipped when chat_history is
# empty — confirm against the create_history_aware_retriever docs.
history_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_history_prompt
)

# Final answering prompt: instructions, the earlier conversation, the
# retrieved context and finally the user's question.
answer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Use the provided context to answer the question. If the answer is not present, say you don't know.",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("system", "Context:\n{context}"),
        ("human", "{question}"),
    ]
)

# Turns a raw question into one context string: build the input expected by
# history_retriever, run it, then join the texts of the returned documents.
context_branch = (
    {
        "input": RunnablePassthrough(),
        # Looked up at call time, so the current module-level history is used.
        "chat_history": lambda _: chat_history,
    }
    | history_retriever
    | (lambda docs: "\n\n".join([doc.page_content for doc in docs]))
)

In [None]:
# History-aware chain. NOTE: this rebinds rag_chain, replacing the basic
# chain defined earlier in the notebook.
rag_chain = (
    {
        # Retrieved text, produced from the question plus the chat history.
        "context":  context_branch,
        # The user's question, forwarded unchanged.
        "question": RunnablePassthrough(),
        "chat_history": lambda _: chat_history,  # read at call time
    }
    | answer_prompt
    | llm
    | parser
)

In [29]:
# Follow-up question that relies on the earlier exchange stored in chat_history.
rag_chain.invoke('of which project are we talking about')

'The context refers to the Collahuasi mining project, which is located in the altiplano of the Atacama Desert.'

In [None]:
# Disabled alternative, kept as a bare string so it does not run: the same
# pipeline built from LangChain's prebuilt create_stuff_documents_chain and
# create_retrieval_chain helpers.
''''
#another way to simplify the final part of the chain by usin prebuild chain

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

qa_prompt = ChatPromptTemplate.from_messages([
    ('system', 'You are an environmental professional expert with specialization in Chile environment. '
    'Use the following context to answer the users question'), 
    ('system','Context: {context}' ),
    MessagesPlaceholder(variable_name='chat_history'),
    ('human','{input}')
])

qa_chain = create_stuff_documents_chain(llm,qa_prompt)
rag_chain_two = create_retrieval_chain(history_retriever,qa_chain)
rag_chain_two.invoke({'input':'of which project are we talking about','chat_history':chat_history})
'''

{'input': 'of which project are we talking about',
 'chat_history': [HumanMessage(content='tell me the height of the collahuasi campament', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The height of the Collahuasi camp is 4,400 meters above sea level (msnm).', additional_kwargs={}, response_metadata={})],
 'context': [Document(id='6057b28f-93cd-416b-a9d8-c3d1999685c0', metadata={'comments': '', 'moddate': '2025-03-26T15:04:12-03:00', 'subject': '', 'author': 'Fernanda Muñoz', 'page': 5, 'title': '', 'source': 'c:\\Users\\olcay\\LuisOlcay20\\python\\IA\\RAG\\PDFs\\7343-Apendice Bases Técnicas Mitigación_lic_2025VF.pdf', 'page_label': '6', 'creationdate': '2025-03-26T15:03:51-03:00', 'keywords': '', 'creator': 'Acrobat PDFMaker 20 para Word', 'producer': 'Adobe PDF Library 20.5.43', 'sourcemodified': '', 'company': 'Microsoft', 'total_pages': 33}, page_content='integrante de un equipo en una Compañía de vanguardia. \n2.2 UBICACIÓN \nLos yacimientos de Collahuasi se e

<small>

#### ⚙️ Step 4 — Add persistent chat memory with SQLite

1. **Database initialization**  
   The function `app_logs()` creates a local SQLite file `little_rag_app.db` and ensures the table `app_logs` exists.  
   Each row logs: `session_id`, `user_query`, `gpt_response`, `model`, and a timestamp.

2. **Insert chat records**  
   Every message–response pair is saved using `insert_app_logs()`, allowing each user session to have its own record.

3. **Retrieve conversation history**  
   `get_chat_history(session_id)` loads all previous messages from the same session in chronological order as  
   `[{role: "human", content: ...}, {role: "ai", content: ...}]`.

4. **Session handling**  
   A new `session_id` is generated with `uuid.uuid4()` for each user, enabling multi-user persistence and context continuity.

5. **Integration with RAG**  
   The retrieved history is passed to `history_retriever` and `rag_chain` so the assistant maintains memory across interactions.

**Result:**  
A lightweight local database providing persistent, multi-session conversational memory for your RAG application.

</small>


In [44]:
import sqlite3
from datetime import datetime

# SQLite file that holds the chat logs.
DB_NAME = 'little_rag_app.db'

def db_connection():
    """Open a new connection to ``DB_NAME`` whose rows allow access by column name."""
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn

def app_logs():
    """Create the ``app_logs`` table if it does not exist yet.

    Columns: ``id`` (autoincrement primary key), ``session_id``,
    ``user_query``, ``gpt_response``, ``model`` and ``created_at``
    (defaults to the current timestamp). Calling it again is a no-op.
    """
    connect = db_connection()
    try:
        connect.execute(
            '''
            CREATE TABLE IF NOT EXISTS app_logs
                (id INTEGER PRIMARY KEY AUTOINCREMENT,
                 session_id TEXT,
                 user_query TEXT,
                 gpt_response TEXT,
                 model TEXT,
                 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)
            '''
        )
        connect.commit()
    finally:
        # Release the connection even when the statement fails.
        connect.close()

def insert_app_logs(session_id, user_query, gpt_response, model):
    """Store one question/answer pair in the ``app_logs`` table.

    Args:
        session_id: Identifier of the conversation the pair belongs to.
        user_query: Text sent by the user.
        gpt_response: Text returned by the model.
        model: Name of the model that produced the response.
    """
    connect = db_connection()
    try:
        # Values are bound as parameters, never formatted into the SQL text.
        connect.execute(
            '''
            INSERT INTO app_logs (session_id, user_query, gpt_response, model)
                        VALUES (?,?,?,?)
            ''',
            (session_id, user_query, gpt_response, model),
        )
        connect.commit()
    finally:
        # Release the connection even when the insert fails.
        connect.close()

def get_chat_history(session_id):
    """Return the logged conversation of one session, oldest first.

    Args:
        session_id: Identifier of the conversation to load.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts; every stored row
        contributes a ``'human'`` entry followed by an ``'ai'`` entry. The
        list is empty when the session has no rows.
    """
    connect = db_connection()
    try:
        rows = connect.execute(
            '''
            SELECT
                user_query,
                gpt_response
            FROM
                app_logs
            WHERE
                session_id = ?
            ORDER BY created_at, id
            ''',
            (session_id,),
        ).fetchall()
    finally:
        # Release the connection even when the query fails.
        connect.close()

    # created_at only has one-second resolution, hence the id tie-breaker
    # above: rows logged within the same second keep their insertion order.
    messages = []
    for row in rows:
        messages.append({'role': 'human', 'content': row['user_query']})
        messages.append({'role': 'ai', 'content': row['gpt_response']})
    return messages

# Start db: make sure the app_logs table exists before it is used.

app_logs()

In [45]:
import uuid
# Fresh random identifier for this conversation.
session_id = str(uuid.uuid4())
# NOTE(review): this rebinds chat_history from the in-memory message objects
# to the list of role/content dicts read from SQLite; for a brand-new
# session_id that list is empty — confirm this is intended.
chat_history = get_chat_history(session_id)
print(chat_history)

# Log the first question/answer pair under the new session.
insert_app_logs(session_id, question,answer,'gpt-4o')
print(f'Human: {question}')
print(f'AI: {answer}')

[]
Human: tell me the height of the collahuasi campament
AI: The height of the Collahuasi camp is 4,400 meters above sea level (msnm).
