<small>


#### Step 0: Set environment

</small>


In [None]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Run python-dotenv's loader first, then read the settings this notebook
# refers to from the process environment. Each value is None when the
# corresponding variable is not set.
load_dotenv()
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
LANGCHAIN_KEY = os.environ.get("LANGCHAIN_API_KEY")
FOLDER_PATH = os.environ.get("FOLDER_PATH")

<small>


#### Step 1: Load documents, split them into chunks, then embed and store the chunks

</small>


In [7]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

In [11]:
# Text splitter shared by the cells below: chunk_size=500 and
# chunk_overlap=200, with lengths measured by the built-in len.
# NOTE(review): the import below looks like an accidental IDE auto-import --
# `endswith` is not called anywhere in the visible cells; confirm and drop it.
from numpy.core.defchararray import endswith

split_text = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200, length_function=len)

def load_documents(FOLDER_PATH):
    """Load every supported document found directly inside a folder.

    Files ending in ``.pdf`` are read with ``PyPDFLoader`` and files ending in
    ``.docx`` with ``Docx2txtLoader``; the extension check ignores case. Any
    other entry is reported on stdout and skipped.

    Args:
        FOLDER_PATH: Path of the directory to scan (not recursive).

    Returns:
        A list with all objects returned by the loaders' ``load()`` calls, in
        the order the directory entries were listed. Empty when the folder
        holds no supported file.
    """
    documents = []
    for file_name in os.listdir(FOLDER_PATH):
        file_path = os.path.join(FOLDER_PATH, file_name)
        lowered_name = file_name.lower()
        if lowered_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith(".docx"):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"The document {file_name} is not supported")
            # Skip this entry: without the `continue`, `loader` would be
            # either undefined (first file) or left over from the previous
            # file, whose documents would then be loaded a second time.
            continue

        documents.extend(loader.load())
    return documents

# Read every supported file under FOLDER_PATH and report how many loader
# results were collected (the count is of loaded objects, not of files).
documents = load_documents(FOLDER_PATH)
print(f"{len(documents)} documents loaded")

# Break the loaded documents into chunks with the shared splitter and
# report the total number of chunks produced.
chunks = split_text.split_documents(documents)
print(f"{len(chunks)} chunks in total")

    

179 documents loaded
1261 chunks in total


In [12]:
# Embedding client used to vectorise the chunks. No model name or API key is
# passed here, so library defaults apply -- presumably the key is picked up
# from the environment populated by load_dotenv(); confirm.
embeddings = OpenAIEmbeddings()

In [13]:
# Embed the chunks and store them in a Chroma collection on disk.
from langchain_chroma import Chroma

# from_documents is a classmethod, so it is called on the class itself;
# instantiating Chroma() first would only build an extra, unused default
# store before the real one is created.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again -- confirm before executing it repeatedly.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name='collahuasi_pdfs',
    persist_directory='./cllh_db',
)

In [26]:
# Expose the vector store as a retriever for the chain below.
# search_kwargs={"k":3} presumably limits each query to the 3 closest
# chunks -- confirm against the vector store's search options.
retriever = vector_db.as_retriever(search_kwargs={"k":3})

<small>

#### Step 2: Build the chain components (model, parser and prompt)

</small>

In [None]:
# Chat model that generates the final answer in the chain. Only the model
# name is set here; every other option is left at the library default.
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o")

In [5]:
# Output parser placed at the end of the chain, after the model call, so the
# chain result is a plain string (see the string output shown further below).
from langchain_core.output_parsers import StrOutputParser
parser =  StrOutputParser()

In [15]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt text with two placeholders: {context} receives the joined text of
# the retrieved chunks and {question} receives the user's question. Both are
# filled in by the rag_chain mapping defined later in the notebook.
template = """
You are an expert in environmental consulting projects in the north of Chile. 
Always answer the question based only on the following context:
{context}

Question: {question}

Answer: ""
"""
# Build the prompt object that the chain pipes into the model.
prompt = ChatPromptTemplate.from_template(template)

<small>

### ⚙️ Step-by-step flow of the `rag_chain`

1. **Input**  
   `"tell me the height of the Collahuasi campament"`

2. **Branch mapping**  
   - **context** → input goes to the `retriever`, which returns `docs`.  
     The lambda joins all document texts into one string using `"\n\n"`.  
   - **question** → `RunnablePassthrough()` passes the original input unchanged.

3. **Prompt**  
   The `prompt` fills its template with `{context}` and `{question}`.

4. **LLM**  
   The `llm` generates an answer based on the formatted prompt.

5. **Parser**  
   The `parser` (a `StrOutputParser`) extracts the model’s reply as a plain string.

**Result:**  
A final, parsed answer based on the retrieved context and user question.

</small>

In [34]:
# Imported from langchain_core, like the other core imports in this notebook,
# instead of the legacy langchain.schema.runnable re-export.
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Join the page_content of the retrieved documents with blank lines."""
    return "\n\n".join(d.page_content for d in docs)


# The mapping sends the input question down two branches:
#   - "context": the retriever returns a list of documents, which
#     format_docs collapses into one string;
#   - "question": the original input, passed through unchanged.
# The resulting dict fills the prompt, which is sent to the model and then
# to the output parser.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | parser
)

rag_chain.invoke('tell me the height of the collahuasi campament')

'The Collahuasi campament is located at an altitude of 4,400 meters above sea level.'