In [40]:
import os
import pprint
from dotenv import load_dotenv

# Load the environment variables
load_dotenv()

# Import the necessary modules for models
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain.llms import HuggingFaceEndpoint

# 1. The Langchain Ecosystem

In [None]:
from langchain.llms import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    # repo_id="tiiuae/falcon-7b-instruct", # larger, but right now unsupported model (from course)
    repo_id="google/flan-t5-small",  # smaller, supported model
    task="text2text-generation",
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN")
)

question = "Can you still have fun?"
output = llm.invoke(question)
print(output)



ValueError: Model 'google/flan-t5-small' doesn't support task 'text2text-generation'.

In [None]:
from langchain_openai import OpenAI

llm = OpenAI(
    # model="gpt-4.1-2025-04-14",
    model="gpt-3.5-turbo-instruct",
    api_key=os.getenv("OPENAI_API_KEY")
)

question = "Can you still have fun"
output = llm.invoke(question)
print(output)

 without alcohol?

Yes, there are many ways to have fun without alcohol. Some examples include spending time with friends and family, engaging in hobbies or activities, trying new things, traveling, and exploring new places. Additionally, there are many alcohol-free activities and events such as concerts, sports games, festivals, and art exhibitions that can be enjoyed without the need for alcohol. It is also possible to have a good time and socialize without drinking by finding other non-alcoholic drinks to enjoy, participating in non-drinking games or activities, or simply enjoying good conversation and company.


One thing to note is that the models used need to support the chat completion for this approach. Which is why when i used the newest 4.1 model, it will return this error message:

`NotFoundError: Error code: 404 - {'error': {'message': 'This is a chat model and not supported in the v1/completions endpoint. Did you mean to use v1/chat/completions?', 'type': 'invalid_request_error', 'param': 'model', 'code': None}}`

# 2. Prompting Strategies For Chatbots

In [4]:
# Prompt templates
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("Translate the following into {language}: {text}")
prompt.format(language="French", text="I love programming in Python")

messages=[HumanMessage(content='Translate the following into French: I love programming in Python', additional_kwargs={}, response_metadata={})]


In [20]:
# Prompt templates
from langchain.prompts import PromptTemplate

template = """
You are an artificial intelligence assistant, answer the question. {question}
"""

prompt_template = PromptTemplate(template=template, input_variables=["question"])
print("Version 1:")
print(prompt_template.invoke({"question" : "What is Langchain?"}))

# Another approach
# Prompt template to format question into a string
prompt_template = PromptTemplate.from_template("Answer the following question:\n{question}")
print("\nVersion 2:")
print(prompt_template.invoke({"question" : "What is Langchain?"}))

Version 1:
text='\nYou are an artificial intelligence assistant, answer the question. What is Langchain?\n'

Version 2:
text='Answer the following question:\nWhat is Langchain?'


> **Important thing to notice is that in the .invoke() method, the variable in the prompt template must be specified**

In [14]:
from langchain.llms import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="google/flan-t5-xl",
    task="text-generation",
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN")
)

# Chain the prompt template to the model using langchain expression language (LCEL)
# Chain: connect calls to LLMs and other components
llm_chain = prompt_template | llm

# Run the chain
question = "What is Langchain?"
response = llm_chain.invoke({"question": question})
print(response)



HfHubHTTPError: (Request ID: Root=1-6814c4e7-35758dc85fac35c905356813;143bda33-9356-4927-9987-0c45ab5f8617)

403 Forbidden: None.
Cannot access content at: https://router.huggingface.co/hf-inference/models/google/flan-t5-xl.
Make sure your token has the correct permissions.
The model google/flan-t5-xl is too large to be loaded automatically (11GB > 10GB).

In [2]:
# Chat models
from langchain_core.prompts import ChatPromptTemplate

# Define the prompt template
prompt_template = ChatPromptTemplate.from_messages(
    [
        # System message: Sets the behavior/role of the AI assistant
        ("system", "You are soto zen master Roshi"),
        
        # Human message: Represents a message from the user in the conversation history
        ("human", "What is the essence of Zen"),
        
        # AI message: Represents a previous response from the AI in the conversation history
        # This acts as an example of how the AI should respond
        ("ai", "When you are hungry, eat. When you are thirsty, drink. When you are tired, sleep."),
        
        # Human message with a variable: This is where the actual user question will be inserted
        # The {question} placeholder will be replaced with the actual question at runtime
        ("human", "Respond to the question: {question}")
    ]
)

In [9]:
# Integrating ChatPromptTemplate to LLMs
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.getenv("OPENAI_API_KEY")
)

# Chain the prompt template to the model
llm_chain = prompt_template | llm
question = "What should I do when I'm angry?"

response = llm_chain.invoke({"question": question})
print(response.content)

When anger arises, pause and breathe. Observe the sensations in your body and the thoughts in your mind without judgment. Allow the energy of the anger to flow through you. Remember that anger is a natural emotion; it is a response, not a directive. You can acknowledge it, but do not let it control your actions. Find a quiet space if you can, and return to your breath. With each exhale, let go of the intensity. In that stillness, clarity will arise. Respond with wisdom rather than reflex.


In [10]:
from langchain_openai import ChatOpenAI

# Another approach is by format the prompt question directly
# Notice that in here we don't need to make the chain
formatted_prompt = prompt_template.format_messages(question="What should I do when I'm angry?")

# Initialize the chat model
llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))

# Run the model
response = llm.invoke(formatted_prompt)

# Print the output
print(response.content)

When anger arises, acknowledge it without judgment. Sit quietly and breathe deeply. Allow the energy of anger to be present, but do not let it control you. Observe the sensations in your body, the thoughts in your mind, and the feelings in your heart. 

Let it come and let it go, like clouds passing across the sky. Instead of reacting, cultivate awareness. Find a space of stillness within. You might then choose to act from compassion, understanding, or simply allow the moment to pass. In the midst of anger, the practice is to return to the breath and find your center.


# 3. Few-Show Prompting

Few-Shot Prompting: Providing multiple examples to guide model behavior

    - Zero-shot: No examples, just instructions (e.g., "Translate this to French")
    - One-shot: Single example before the actual task
    - Few-shot: Multiple examples (like we're doing here)
 
FewShotPromptTemplate is superior to regular PromptTemplate because it:

    1. Handles formatting of multiple examples automatically
    2. Maintains consistent structure across examples
    3. Separates example logic from the main prompt
    4. Allows dynamic selection of examples based on context

In [2]:
# Formatting the examples
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

example_prompt = PromptTemplate.from_template("Question: {question}\n{answer}")
prompt = example_prompt.invoke({"question": "What is the capital of Italy?", "answer": "Rome"})
print(prompt.text)

Question: What is the capital of Italy?
Rome


In [4]:
# Building an examples set
examples = [
    {"question": "What is the capital of Italy?", "answer": "Rome"},
    {"question": "What is the capital of France?", "answer": "Paris"},
    {"question": "What is the capital of Germany?", "answer": "Berlin"},
    {"question": "What is the capital of Spain?", "answer": "Madrid"},
    {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
    {"question": "What is the capital of Greece?", "answer": "Athens"},
    {"question": "What is the capital of Turkey?", "answer": "Ankara"},
]

# Convert Dataframe to list of dicts
import pandas as pd
df = pd.DataFrame(examples)
examples = df.to_dict(orient="records")

# Using FewShotPromptTemplate
prompt_template = FewShotPromptTemplate(
    examples=examples,                      # examples: the list of dicts
    example_prompt=example_prompt,          # example_prompt: formatted template
    suffix="Question: {input}\nAnswer:",    # suffix: suffix to add to the input
    input_variables=["input"]               # input_variables: the input variables
)

# Invoking the few shot prompt template
response = prompt_template.invoke({"input": "What is the capital of the United States?"})
print(response.text)

Question: What is the capital of Italy?
Rome

Question: What is the capital of France?
Paris

Question: What is the capital of Germany?
Berlin

Question: What is the capital of Spain?
Madrid

Question: What is the capital of Portugal?
Lisbon

Question: What is the capital of Greece?
Athens

Question: What is the capital of Turkey?
Ankara

Question: What is the capital of the United States?
Answer:


In [13]:
# Integration with a chain
llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))

chain = prompt_template | llm

response = chain.invoke({"input": "What is the capital of the United States?"})
print(response.content)

Washington, D.C.


# 4. Sequential Chains

- Some probems can only be solved sequentially, for example we can ask a model about where are the places to visit in Paris and then the model will give several recommendations, and then we as the user will tell the model which activities to select and make it plan for us. Notice that this requires more than one user input (sequential), first is to specify the destination and another one to select activities.

- The output from one chain becomes the input to another chain.

In [None]:
# Create 2 prompt template
destination_prompt = PromptTemplate(
    input_variables=["destination"],
    template="""
    I am planning a trip to {destination}. Can you suggest some activities to do there?
    """
)

activity_prompt = PromptTemplate(
    input_variables=["activities"],
    template="""
    I only have one day, so can you create an itinerary from your top three recommendations: {activities}?
    """
)

# Initialize the LLM
llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))

# Import the StrOutputParser
from langchain.schema.output_parser import StrOutputParser

# Create the sequential chain using the StrOutputParser
sequential_chain = ({"activities": destination_prompt | llm | StrOutputParser()}
                    | activity_prompt | llm | StrOutputParser()
)

# Run the chain
response = sequential_chain.invoke({"destination": "Jakarta"})
print(response)

Certainly! Here’s a one-day itinerary that covers three top recommendations in Jakarta, giving you a blend of cultural experiences, sightseeing, and local flavors.

### One-Day Jakarta Itinerary

#### Morning
**Start at Monas (National Monument)**
- **Time**: 8:00 AM - 9:30 AM
- **Description**: Begin your day by visiting Monas, the iconic symbol of Indonesia's independence. Take the elevator to the top for panoramic views of the bustling city. Be sure to stroll around the surrounding park for some fresh air and beautiful gardens.

**Head to Istiqlal Mosque**
- **Time**: 9:45 AM - 10:30 AM
- **Description**: A short distance away, visit the Istiqlal Mosque, Southeast Asia’s largest mosque. Admire its remarkable architecture and serene atmosphere. If you’re interested, you can partake in a guided tour to learn more about the mosque's significance.

**Visit Jakarta Cathedral**
- **Time**: 10:45 AM - 11:30 AM
- **Description**: Right next to Istiqlal, explore the Jakarta Cathedral, a stun

In this context, `StrOutputParser()` is used to **convert the output of the language model (LLM) into a plain Python string**. Here's a breakdown of what it's doing:

- Purpose of `StrOutputParser()`:

    - It **parses the raw output** from the LLM and **returns it as a standard string**, which is necessary because LLM output might be in a more complex format (e.g. a JSON-like structure, or a response object).
    - It ensures that the next stage in the chain receives a **simple, clean string** as input.

- Example of its role in the chain:

```python
{"activities": destination_prompt | llm | StrOutputParser()}
```
    - `destination_prompt` generates a prompt.
    - `llm` (e.g., a model like GPT-4) responds to the prompt.
    - `StrOutputParser()` takes the response and extracts the actual text output (as a string).
    - The resulting string is assigned to the key `"activities"`.

This parsed `"activities"` string is then fed into the second half of the chain:

```python
| activity_prompt | llm | StrOutputParser()
```

— which generates the final output.

# 5. Introduction to Langchain Agents

- Agents: use LLMs to take actions
- Tools: functions called by the agent

In this tutorial we will be using ReAct(reason + act) agents.

Example would be like:

user: What is the weather like in Kingston, Jamaica?
agent: 

    - Thought: I should call Weather() to find the weather in Kingston, Jamaica
    - Act: Weather("Kingston, Jamaica")
    - Observe: The weather is mostly sunny with temperatures of 82F

-----------------------------------

To implement agents, we will be using LangGraph, which is a branch of Langchain ecosystem **centered around designing agent system.**

In [18]:
# pip install --upgrade langgraph

In [3]:
# ReAct agent
from langchain.chains.llm_math.base import LLMMathChain
from langgraph.prebuilt import create_react_agent
from langchain_community.agent_toolkits.load_tools import load_tools
import pprint

# First rebuild the model
LLMMathChain.model_rebuild()

# Then create tools
llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
tools = load_tools(["llm-math"], llm=llm)

# Create the agent
agent = create_react_agent(llm, tools)

messages = agent.invoke({"messages": [("human", "What is the square root of 101?")]})
pprint.pprint(messages)

{'messages': [HumanMessage(content='What is the square root of 101?', additional_kwargs={}, response_metadata={}, id='cdb811fd-6a16-4e65-855a-b724adb8b26a'),
              AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_wRsMFdhXHSI7RBc2ZCh3WaQF', 'function': {'arguments': '{"__arg1":"sqrt(101)"}', 'name': 'Calculator'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 63, 'total_tokens': 83, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_129a36352a', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-f64fcf55-28e5-4f82-b13d-5ee53875276d-0', tool_calls=[{'name': 'Calculator', 'args': {'__arg1': 'sqrt(101)'}, 'id': 'call_wRsMFdhXHSI7RBc2ZCh3WaQF', 'type': 'tool_call'}], us

In this example, you can see how the agent calls the tools. Another example below is by utilizing the memory so that the ReAct agent can remember previous interactions.

In [None]:
from langchain.chains.llm_math.base import LLMMathChain
from langgraph.prebuilt import create_react_agent
from langchain_community.agent_toolkits.load_tools import load_tools
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI

# First rebuild the model
LLMMathChain.model_rebuild()

# Create the tools
llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
tools = load_tools(["llm-math"], llm=llm)

# Create a conversation memory
memory = ConversationBufferMemory(return_messages=True)

# Create the agent with memory
agent = create_react_agent(llm, tools)

# Initialize conversation with empty list
conversation_history = []

# Function to interact with the agent while maintaining memory
def interact_with_agent(user_input):
    global conversation_history
    
    # Add the new user message to history
    conversation_history.append(("human", user_input))
    
    # Create the input with all history
    agent_input = {"messages": conversation_history}
    
    # Get the agent's response
    response = agent.invoke(agent_input)
    
    # Extract the latest assistant message and add it to history
    latest_assistant_message = response["messages"][-1]
    conversation_history.append(latest_assistant_message)
    
    # Return the content of the message, not trying to index it
    return latest_assistant_message.content

# Example usage
response1 = interact_with_agent("What is the square root of 101?")
print("First response:")
print(response1)
print("\n")

# The agent now remembers the previous interaction
response2 = interact_with_agent("And what is that number multiplied by 15?")
print("Second response:")
print(response2)

First response:
The square root of 101 is approximately 10.05.


Second response:
The number 10.05 multiplied by 15 is 150.75.


The agent remember the answer from previous conversation.

# 6. Custom Tools For Agents

Tools must be formatted in a specific way to be compatible with agent.

In [3]:
from langchain_community.agent_toolkits.load_tools import load_tools

llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
tools = load_tools(["llm-math"], llm=llm)

In [6]:
# First they must have a name accessible via the name attribute
print(tools[0].name)

# A description which is used by the LLM/agent as contenxt to determine when to call it
print(tools[0].description)

# Return direct parameter idicate whether the agent should stop after invoking this tool
print(tools[0].return_direct)

Calculator
Useful for when you need to answer questions about math.
False


In [15]:
# Defining a custom function
def financial_report(company_name: str, revenue:int, expenses:int) -> str:
    """
    Generate a financial report for a company that calculates new income.
    """

    net_income = revenue - expenses

    # Using string concatenation
    report = f"Financial Report for {company_name}: \n"
    report += f"Revenue: ${revenue}\n"
    report += f"Expenses: ${expenses}\n"
    report += f"Net: ${net_income}\n"

#     # Using triple-quoted string
#     report = f"""Financial Report for {company_name}:
# Revenue: ${revenue}
# Expenses: ${expenses}
# Net: ${net_income}
# """
    return report

In [16]:
print(financial_report(company_name="Lemonade Stand", revenue=100, expenses=50))

Financial Report for Lemonade Stand: 
Revenue: $100
Expenses: $50
Net: $50



Now let's convert this function into a tool that our agent can call, which will be done using the tool decorator. The @tool modifies the function so that it is in the correct format to be used by tool.

In [17]:
from langchain_core.tools import tool

@tool
def financial_report(company_name: str, revenue:int, expenses:int) -> str:
    """
    Generate a financial report for a company that calculates new income.
    """

    net_income = revenue - expenses

    # Using string concatenation
    report = f"Financial Report for {company_name}: \n"
    report += f"Revenue: ${revenue}\n"
    report += f"Expenses: ${expenses}\n"
    report += f"Net: ${net_income}\n"
    return report

In [22]:
# Examining our new tool
print(financial_report.name)
print(financial_report.description)
print(financial_report.return_direct)

# or we can access the .args to print the arguments and expected data types 
print(financial_report.args)

financial_report
Generate a financial report for a company that calculates new income.
False
{'company_name': {'title': 'Company Name', 'type': 'string'}, 'revenue': {'title': 'Revenue', 'type': 'integer'}, 'expenses': {'title': 'Expenses', 'type': 'integer'}}


In [28]:
# Integrating the custom tool
from langgraph.prebuilt import create_react_agent
import pprint

llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))

# Create the agent with custom tool
agent = create_react_agent(llm, [financial_report])

messages = agent.invoke({"messages": [("human", "TechStack generated made $10 million with $8 million of costs. Generate a financial report.")]})
pprint.pprint(messages)

{'messages': [HumanMessage(content='TechStack generated made $10 million with $8 million of costs. Generate a financial report.', additional_kwargs={}, response_metadata={}, id='af6c8209-bced-4c0f-b8c5-9ed636ccf550'),
              AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_TcmMAbPff4rZjN4qB08Wowbk', 'function': {'arguments': '{"company_name":"TechStack","revenue":10000000,"expenses":8000000}', 'name': 'financial_report'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 31, 'prompt_tokens': 77, 'total_tokens': 108, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_129a36352a', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-ddb4ed38-6375-41dc-b2b4-15d6c612f662-0', tool_calls=[{'name': 

In [29]:
# Get only the ai response
final_response = messages["messages"][-1].content
print(final_response)

**Financial Report for TechStack:**

- **Revenue:** $10,000,000
- **Expenses:** $8,000,000
- **Net Income:** $2,000,000


# 7. Integrating Document Loaders

> Note to self: this is a Retrival Augmented Generation(RAG) implementation. For more detailed please later refer to the RAG with Langchain course in this repo.

- Pre-trained language models don't have access to external data sources. Their understanding comes purely from training data. This means that if we require the model to have knowledge that goes beyond its training data, which could be company data or more recent world events, you need a way to integrating that data.

- In RAG, user query is embedded and used to retrieve the relevant document fromt he database. Then these documents are added to the models prompt, so that the modelhave extra context to inform its response.

- There are 3 RAG development steps in Langchain

    1. Document Loader (load into langchain)
    2. Splitting (into chunks -> unit of information that we can index and process individually)
    3. Storage + Retrival

Langchain document loaders:

    - Classes designed to load and configure documents for system integration
    - Dcoument loader for common file types: pdf and csv
    - Additional loaders provided by 3rd parties for managing unique document formats: amazon S3 files, ipynb, wav, etc

In [31]:
# pip install pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.4.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# PDF document loader (require installation of the pypdf package)
from langchain_community.document_loaders import PyPDFLoader

# loader = PyPDFLoader("path/to/your'file.pdf")
loader = PyPDFLoader(r"E:\NLP Learning\NLP-Learning\Datacamp Courses\Developing LLM Applications with LangChain\Document Example.pdf")
data = loader.load()

print(data[0])

page_content='Understanding the Rise of AI-Powered Tools in Modern Workflows 
In recent years, artificial intelligence (AI) has transformed how organizations handle data, 
automate processes, and deliver insights. From natural language processing to computer 
vision, AI models are now integrated into everyday business tools, enhancing productivity and 
decision-making. Companies are rapidly adopting frameworks like LangChain to build 
applications that interact with documents, answer questions, and even summarize content in 
real time. 
One popular use case is Retrieval -Augmented Generation (RAG), which improves the 
accuracy and relevance of AI-generated responses by grounding them in real-world data. By 
combining a language model with a vector store and document loader, RAG systems can 
retrieve context-specific information from a knowledge base, such as PDFs, to answer user 
queries more effectively. This approach is especially valuable in domains like legal, healthcare, 
and fina

In [21]:
# CSV document loader
from langchain_community.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(r"E:\NLP Learning\NLP-Learning\Datacamp Courses\Developing LLM Applications with LangChain\sample_employee_data.csv")
data = loader.load()

# To get just the page_content for a single document
print(data[0])

page_content='Name: Michelle Rose
Email: carriehall@yahoo.com
Company: Wood, Foley and Terry
Job Title: Engineer, land
Date Joined: 2023-10-29
Country: Niger
Salary (USD): 129547.35' metadata={'source': 'E:\\NLP Learning\\NLP-Learning\\Datacamp Courses\\Developing LLM Applications with LangChain\\sample_employee_data.csv', 'row': 0}


I'm actually kinda curious about this document loader of csv, can it directly extract insights from here or not (like you actually need other components first like splitting it into chunks first), we will find out below.

In [3]:
import pandas as pd
df = pd.read_csv("sample_employee_data.csv")
df.head()

Unnamed: 0,Name,Email,Company,Job Title,Date Joined,Country,Salary (USD)
0,Michelle Rose,carriehall@yahoo.com,"Wood, Foley and Terry","Engineer, land",2023-10-29,Niger,129547.35
1,Paul Anderson,alicia56@gmail.com,Evans Ltd,Field seismologist,2024-03-26,Swaziland,114199.87
2,Terri Chang,lambertmax@hotmail.com,Watson-Mccoy,Health and safety adviser,2021-01-29,Guernsey,116572.83
3,Larry Williams,crossbrandon@gmail.com,Taylor-Taylor,"Solicitor, Scotland",2022-06-18,Martinique,46575.15
4,Gabriel Beck,meganball@yahoo.com,Osborn LLC,"Designer, television/film set",2020-07-23,Lebanon,100704.51


Let's try 2 simple aggregations and 1 data lookup. As for the agregation we will find the sum of salary and then the count of Albania, while for the lookup we will find the country of empoyee name Gabriel Beck.

In [36]:
# Check the values that we want the LLMs to find
salary_sum = df['Salary (USD)'].sum()
albania_count = len(df['Country'].loc[df['Country'] == "Albania"])
lookup_employee_country = df.loc[df['Name'] == "Gabriel Beck", "Country"].values[0]


# Print the result
print(f"Sum of employee salary: {salary_sum}")
print(f"Number of employee from Albania: {albania_count}")
print(f"Country of employee Gabriel Beck: {lookup_employee_country}")

Sum of employee salary: 7113389.759999999
Number of employee from Albania: 2
Country of employee Gabriel Beck: Lebanon


After I created the data that I want to check above, I just realized that the retrival should actually be done in the section 9. So in here I will just try to pass it directly into available LLM in cursor to see.

In [32]:
# To get page_content for all documents in the list
all_contents = [doc.page_content for doc in data]

# Combine them into a single text
combined_text = "\n\n".join(all_contents)
print(combined_text)

Name: Michelle Rose
Email: carriehall@yahoo.com
Company: Wood, Foley and Terry
Job Title: Engineer, land
Date Joined: 2023-10-29
Country: Niger
Salary (USD): 129547.35

Name: Paul Anderson
Email: alicia56@gmail.com
Company: Evans Ltd
Job Title: Field seismologist
Date Joined: 2024-03-26
Country: Swaziland
Salary (USD): 114199.87

Name: Terri Chang
Email: lambertmax@hotmail.com
Company: Watson-Mccoy
Job Title: Health and safety adviser
Date Joined: 2021-01-29
Country: Guernsey
Salary (USD): 116572.83

Name: Larry Williams
Email: crossbrandon@gmail.com
Company: Taylor-Taylor
Job Title: Solicitor, Scotland
Date Joined: 2022-06-18
Country: Martinique
Salary (USD): 46575.15

Name: Gabriel Beck
Email: meganball@yahoo.com
Company: Osborn LLC
Job Title: Designer, television/film set
Date Joined: 2020-07-23
Country: Lebanon
Salary (USD): 100704.51

Name: Emily Ford
Email: csanchez@gmail.com
Company: Davis and Sons
Job Title: Lighting technician, broadcasting/film/video
Date Joined: 2021-09-19
C

In [42]:
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain.prompts import ChatPromptTemplate

# Create LLM instances
llms = [
    ("OpenAI GPT-4.1", ChatOpenAI(model="gpt-4.1", api_key=os.getenv("OPENAI_API_KEY"))), 
    ("Anthropic Claude Sonnet", ChatAnthropic(model="claude-3-7-sonnet-20250219", api_key=os.getenv("ANTHROPIC_API_KEY")))
]

# Create a prompt template
prompt = ChatPromptTemplate.from_template("""
You are a data analyst. Please analyze the following employee data and provide insights:

{employee_data}

Please provide with the values only:
1. Sum of employee salary
2. Number of employee from Albania
3. Country of employee Gabriel Beck
""")

# Loop through each LLM
for model_name, llm in llms:
    print(f"\n=== Results from {model_name} ===")
    
    # Create the chain for this specific model
    chain = prompt | llm
    
    # Execute the chain with the combined employee data
    try:
        response = chain.invoke({"employee_data": combined_text})
        print(response.content)
    except Exception as e:
        print(f"Error with {model_name}: {str(e)}")


=== Results from OpenAI GPT-4.1 ===
Here are the requested values:

1. **Sum of employee salary**: **$6,269,257.23**

2. **Number of employees from Albania**: **2**

3. **Country of employee Gabriel Beck**: **Lebanon**

=== Results from Anthropic Claude Sonnet ===
Based on my analysis of the employee data, here are the values you requested:

1. Sum of employee salary: 7,345,656.37
2. Number of employees from Albania: 2
3. Country of employee Gabriel Beck: Lebanon


Interesting result, because as we can see here that they both get the number of employees and the lookup country right but missed on the sum of the salary where the result from Anthropic model closer to the actual result than OpenAI model.

In [None]:
# pip install unstructured

In [None]:
# HTML document loader (require installation of the unstructured package)
from langchain_community.document_loaders import UnstructuredHTMLLoader

loader = UnstructuredHTMLLoader("white_house_executive_order_nov_2023.html")
data = loader.load()

print(data[0])
print(data[0].metadata)

# 8. Splitting External Data for Retrival

- Document splitting: split document into chunks
- Break documents up to fit within an LLMs `context window`

- Splitting a document can be done per line, this will be simple to implement but because sentenecs are split between multiple lines and those lines are procesed separately here, key context may be lost. To handle this issue, chunk overlap often implemented where having an extra overlap exist in both chunks help retain context.

- There are 2 document splitting methods that will be discussed here they are:
    - CharacterTextSplitter
    - RecursiveCharacterTextSplitter
    - (there are many others, and emerging methods)

In [4]:
# Let's start with a simple document
quote = """One machine can do the work of fifty ordinary humans. \nNo machine can do the work of one extraordinary human."""
print(len(quote))

109


In [9]:
# CharacterTextSplitter: Splits based on separator first, then evaluate chunk size and chunk overlap
from langchain_text_splitters import CharacterTextSplitter 

chunk_size=24
chunk_overlap=3

ct_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
)

docs = ct_splitter.split_text(quote)
print(docs)
print([len(doc) for doc in docs])

Created a chunk of size 52, which is longer than the specified 24


['One machine can do the work of fifty ordinary humans', 'No machine can do the work of one extraordinary human']
[52, 53]


This method split on separator so it could result to smaller than chunk_size, but may not always succeed. The RecursiveCharacterTextSplitter method is considered more robust.

In [11]:
# RecursiveCharacterTextSplitter: Takes a list of separators to split on, and works through the list from left to right, splitting the doc using each separator in turn
#                                 and see if these chunks can be combined, while remaining in the chunk_size
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size=24
chunk_overlap=3

rc_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
)

docs = rc_splitter.split_text(quote)
print(docs)
print([len(doc) for doc in docs])

['One machine can do the', 'work of fifty ordinary', 'humans.', 'No machine can do the', 'work of one', 'extraordinary human.']
[22, 22, 7, 21, 11, 20]


- Notice how the lenght of each chunk varies here, the class splits by paragraph first, and found that the chunk size was too big. Likewise for sentences, and then we move on to the third separator (space), splitting words using the space separator and found that words can be combined into chunks while remaining under the chunk size limit.

- However some of these chunks are too small to contain meaningful context, but this recursive implementation may works better on larger documents.

In [None]:
# Additional: RecursiveCharacterTextSplitter with HTML
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = UnstructuredHTMLLoader("white_house_executive_order_nov_2023.html")
data = loader.load()

rc_splitter = RecursiveCharacterTextSplitter(
        separators=["."],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
)

docs = rc_splitter.split_documents(data)
print(docs[0])

# 9. RAG Storage and Retrival Using Vector Databases