In [1]:
#pip install --upgrade openai
#pip install --upgrade transformers

### Understanding LangChain: A Modular Framework for LLMs

* LangChain is fundamentally a framework designed for Large Language Models (LLMs).

* It enables the development of various applications such as chatbots, Generative Question-Answering (GQA), content summarization, and beyond.

* The essence of the framework lies in its ability to "chain" diverse components, facilitating the creation of sophisticated functionalities utilizing LLMs.
  * Chains are composed of various elements across different modules, including:

* These are pre-designed templates tailored for specific interactions, ranging from chatbot dialogues to Explain Like I'm Five (ELI5) question-responding formats.

* This encompasses a range of Large Language Models such as ChatGPT, Bard, Claude, etc.
* Agents leverage LLMs to determine necessary actions. They can employ tools like web search or calculators, integrated into a cohesive operational loop.
* Incorporating both short-term and long-term memory functionalities.

* Our primary aim here is to delve into the functionality that enables the transformation of unstructured text into structured data, extracting valuable insights.

### Core Components of LangChain

* Chains are composed of various modules that can be combined to enhance the capabilities of LLMs.

Key Modules Include:

  * Prompt Templates: Customizable templates suited for different interaction styles, including chatbot  conversations.
  * LLMs: Incorporation of various Large Language Models such as ChatGPT, Bard, Claude, etc.
  *  Agents: Agents utilize LLMs to determine the necessary actions, employing tools like web searches or calculators within a logical operational loop.
  * Memory Modules: These include both short-term and long-term memory functionalities.



In [64]:
from openai import OpenAI
client = OpenAI()


prompt = """ What is the most populated city in the state of Hawaii. 
Provide city name and no additional information."""


response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": prompt
        }
      ]
    },
    {
      "role": "assistant",
      "content": [
        {
          "type": "text",
          "text": "Honolulu"
        }
      ]
    }
  ],
  temperature=0.5,
  max_tokens=2048,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0,
  response_format={
    "type": "text"
  }
)
print(response)

ChatCompletion(id='chatcmpl-AR3IVHLklkzZ6l35psylQJdb9KWTV', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Honolulu', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1731010895, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_159d8341cc', usage=CompletionUsage(completion_tokens=2, prompt_tokens=34, total_tokens=36, completion_tokens_details=CompletionTokensDetails(audio_tokens=0, reasoning_tokens=0, accepted_prediction_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


In [16]:
response.choices[0].message.content

'Honolulu'

In [65]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
model = ChatOpenAI()

In [18]:
prompt_str = """What is the most populated city in the state of Hawaii. 
Provide city name and no additional information."""

prompt = PromptTemplate.from_template(prompt_str)


In [19]:
chain = prompt | model

In [20]:
chain.invoke({})

AIMessage(content='Honolulu', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 2, 'prompt_tokens': 28, 'total_tokens': 30, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-4a0a8fd9-2bdc-4555-8d10-76f1eaf08a2d-0', usage_metadata={'input_tokens': 28, 'output_tokens': 2, 'total_tokens': 30})

### Prompts Are First Class objects in LangChain

* Prompts can be easily tailored to incorporate runtime variables.
* They can also be customized with examples for more precise and context-relevant responses.

In [21]:
prompt_str = """What is the most populated city in the state of {state}.

Provide city name and no additional information."""

prompt = PromptTemplate.from_template(prompt_str)

In [22]:
chain = prompt | model

In [14]:
response = chain.invoke({"state": "Hawaii"})
response.content

'Honolulu'

In [12]:
response = chain.invoke({"state": "California"})
response.content

'Los Angeles'

In [23]:
response = chain.invoke({"state": "Georgia"})
response.content

'Atlanta'

In [25]:
prompt_str = """What is the most populated city in the state provided below.

Provide city name and no additional information. 

Examples:

State: Hawaii
City: Honolulu

State: California
City: Los Angeles

State: {state}
"""

prompt = PromptTemplate.from_template(prompt_str)

chain = prompt | model


In [26]:
response = chain.invoke({"state": "Georgia"})

response

AIMessage(content='City: Atlanta', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 3, 'prompt_tokens': 51, 'total_tokens': 54, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-9f4f4107-fcfa-480b-a704-2adc248e9ede-0', usage_metadata={'input_tokens': 51, 'output_tokens': 3, 'total_tokens': 54})

In [27]:
response.content

'City: Atlanta'

In [28]:
prompt_str = """What is the most populated city in the state provided below.

Provide city name and no additional information. 

Examples:

State: Hawaii
{{"City": "Honolulu"}}

State: California
{{"City": "Los Angeles"}}

State: {state}
"""

prompt = PromptTemplate.from_template(prompt_str)

chain = prompt | model


In [29]:
response = chain.invoke({"state": "Georgia"})

response

AIMessage(content='{"City": "Atlanta"}', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 56, 'total_tokens': 62, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-c19ed859-2ebd-4190-8af6-ef56ac3a107d-0', usage_metadata={'input_tokens': 56, 'output_tokens': 6, 'total_tokens': 62})

In [30]:
response.content

'{"City": "Atlanta"}'

In [31]:
import json
data = json.loads(response.content)
data

{'City': 'Atlanta'}

In [32]:
data["City"]

'Atlanta'

In [33]:
prompt_prefix = """What is the most populated city in the state provided below. 
Provide city name and no additional information. """


In [34]:
prompt_examples = [
    {"ExampleState": "Hawaii", "ExampleCity": "Honolulu"},
    {"ExampleState": "California", "ExampleCity": "Los Angeles"}   
]
prompt_examples

[{'ExampleState': 'Hawaii', 'ExampleCity': 'Honolulu'},
 {'ExampleState': 'California', 'ExampleCity': 'Los Angeles'}]

In [35]:
example_prompt_str ="State: {ExampleState}\nCity: {ExampleCity}"
print(example_prompt_str)

State: {ExampleState}
City: {ExampleCity}


In [37]:
example_prompt = PromptTemplate(input_variables=["ExampleState", "ExampleCity"], template = example_prompt_str)

example_prompt


PromptTemplate(input_variables=['ExampleCity', 'ExampleState'], input_types={}, partial_variables={}, template='State: {ExampleState}\nCity: {ExampleCity}')

In [38]:
print(example_prompt.format(**prompt_examples[0]))

State: Hawaii
City: Honolulu


In [39]:
print(example_prompt.format(**prompt_examples[1]))

State: California
City: Los Angeles


In [40]:
from langchain.prompts.few_shot import FewShotPromptTemplate

execute_fewshot_prompt = FewShotPromptTemplate(
    prefix = prompt_prefix,
    input_variables=["state"],
    examples= prompt_examples,
    example_prompt = example_prompt,
    example_separator="\n\n",
    suffix = "State: {state}"
)

In [41]:
data = {"state": "Georgia"}
print(execute_fewshot_prompt.format(**data))

What is the most populated city in the state provided below. 
Provide city name and no additional information. 

State: Hawaii
City: Honolulu

State: California
City: Los Angeles

State: Georgia


In [42]:
chain = execute_fewshot_prompt | model
chain.invoke(data)

AIMessage(content='City: Atlanta', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 3, 'prompt_tokens': 49, 'total_tokens': 52, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-d80f6e18-e2d2-4581-bc8b-5445415c859f-0', usage_metadata={'input_tokens': 49, 'output_tokens': 3, 'total_tokens': 52})

In [43]:
from pydantic import BaseModel, Field


In [77]:
class CityParser(BaseModel):
    """
    this object holds information about the most populated city in the 
    given state.
    """
    City: str = Field(..., description="The name of the most populous city") 

In [78]:
from langchain.output_parsers import PydanticOutputParser

cityParser = PydanticOutputParser(pydantic_object=CityParser)


In [79]:
output = cityParser.parse("""{"City": "Atlanta"}""")
output


CityParser(City='Atlanta')

In [80]:
output_parser = PydanticOutputParser(pydantic_object=CityParser)
print(output_parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "this object holds information about the most populated city in the \ngiven state.", "properties": {"City": {"description": "The name of the most populous city", "title": "City", "type": "string"}}, "required": ["City"]}
```


In [81]:
prompt_prefix = """ What is the most populated city for {state}. 
{format_instructions}
"""




In [82]:

prompt = PromptTemplate(
    template=prompt_prefix,
    input_variables=["state"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()}
)

In [83]:
print(prompt.format(**data))

 What is the most populated city for Georgia. 
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "this object holds information about the most populated city in the \ngiven state.", "properties": {"City": {"description": "The name of the most populous city", "title": "City", "type": "string"}}, "required": ["City"]}
```



In [84]:
chain = prompt | model
output = chain.invoke(data)

In [85]:
output.content

'{\n    "City": "Atlanta"\n}'

In [86]:
cityParser.parse(output.content)

CityParser(City='Atlanta')

In [3]:

from pydantic import BaseModel, Field

class PersonDetails(BaseModel):
    """
    this object holds information about a person including their preferred activity, name and location (city).
    """
    name: str = Field(..., description="The name of the person") 
    hobby: str = Field(..., description="The preferred activity ") 
    location: str = Field(..., description="The city where the person lives") 

In [7]:
from langchain.output_parsers import PydanticOutputParser

person_idetail_parser = PydanticOutputParser(pydantic_object=PersonDetails)
print(person_idetail_parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "this object holds information about a person including their preferred activity, name and location (city).", "properties": {"name": {"description": "The name of the person", "title": "Name", "type": "string"}, "hobby": {"description": "The preferred activity ", "title": "Hobby", "type": "string"}, "location": {"description": "The city where the person lives", "title": "Location", "type": "string"}}, "required": ["name", "hobby", "location"]}
```


In [16]:
prompt_prefix = """
The person said something below about themselves. Please parse only the fields of interests; those are
name, preferred activity and location.
{what_person_said}
{format_instructions}
"""

In [15]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
model = ChatOpenAI()

prompt = PromptTemplate(
    template=prompt_prefix,
    input_variables=["what_person_said"],
    partial_variables={"format_instructions": person_idetail_parser.get_format_instructions()}
)
print(prompt.format(what_person_said= "Hello there"))



The person said something below about themselves. Please parse only the fields of interests; those are
name, preferred and location.
Hello there
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "this object holds information about a person including their preferred activity, name and location (city).", "properties": {"name": {"description": "The name of the person", "title": "Name", "type": "string"}, "hobby": {"description": "The preferred activity ", "title": "Hobby", "type": "string"}, "location": {"description": "The city where the person lives", "title": "Location", "ty

In [20]:
chain = prompt | model
response = chain.invoke({"what_person_said": "Hi, my name is Mahdi, I am in Honolulu, and I like surfing"})

In [22]:
print(response.content)

{
    "name": "Mahdi",
    "hobby": "surfing",
    "location": "Honolulu"
}


In [24]:
person_idetail_parser.parse(response.content)

PersonDetails(name='Mahdi', hobby='surfing', location='Honolulu')

### Current Preferred Way 
- Leverage the tool usage capabilties

In [87]:
structured_llm = model.with_structured_output(CityParser)
structured_chain = prompt | structured_llm
structured_chain.invoke({"state": "Georgia"})


CityParser(City='Atlanta')