In [1]:
from langchain_ollama import OllamaLLM
# Configure the Ollama-served model shared by every chain in this notebook.
# OllamaLLM limits the length of a generation through `num_predict`; it does
# not declare a `max_tokens` field, so the previous `max_tokens=1024` keyword
# was presumably not applied as a length cap (verify against the installed
# langchain_ollama version).
llm = OllamaLLM(
    model="rnj-1:8b-cloud",
    num_predict=1024,   # upper bound on tokens generated per call
    temperature=0.7,    # sampling temperature
)

### Output parsers
Output parsers take the output from an LLM and transform that output to a more suitable format. Parsing the output is very useful when you are using LLMs to generate any form of structured data, or to normalize output from chat models and other LLMs.

LangChain has lots of different types of output parsers. This is a [list](https://python.langchain.com/v0.2/docs/concepts/#output-parsers) of output parsers LangChain supports. In this lab, you will use the following two output parsers as examples:

- `JSON`: Returns a JSON object as specified. You can specify a Pydantic model and it will return JSON for that model. Probably the most reliable output parser for getting structured data that does NOT use function calling.
- `CSV`: Returns a list of comma separated values.


In [35]:
# Import the JsonOutputParser from langchain_core to convert LLM responses into structured JSON
from langchain_core.output_parsers import JsonOutputParser
# Import BaseModel and Field from pydantic
from pydantic import BaseModel, Field

In [36]:
# Target data structure for the JSON parser: JsonOutputParser is given this
# model below and derives its format instructions from it.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since a docstring may be emitted into the generated JSON schema -- confirm
# before adding one.
class Netflix(BaseModel):
    # Title of the show or movie.
    name: str = Field(
        description="Name of the Netflix show or movie",
    )
    # Release year, as an integer.
    release_year: int = Field(
        description="Year the show or movie was released",
    )
    # Genre label.
    genre: str = Field(
        description="Genre of the show or movie",
    )
    # "mactor" = main male actor.
    mactor: str = Field(
        description="Main male actor in the show or movie",
    )
    # "factor" = main female actor.
    factor: str = Field(
        description="Main female actor in the show or movie",
    )

In [37]:
from langchain_core.prompts import PromptTemplate
from pprint import pprint

# Query intended to prompt the language model to populate the Netflix data structure.
netflix_query = "Tell me about a Netflix show or movie top 1 in Thailand."

# Set up a JSON parser bound to the Netflix model; its instructions are injected into the prompt below.
output_parser = JsonOutputParser(pydantic_object=Netflix)

# Get the formatting instructions for the output parser.
# This is guidance text (including the Netflix schema) that tells the LLM how to format its response.
format_instructions = output_parser.get_format_instructions()
# NOTE: pprint on a str shows its repr (quotes and "\n" escapes), wrapped across lines.
pprint(f"format_instructions: {format_instructions}")

# Create a prompt template that includes:
# 1. An instruction for the LLM to answer the user's query
# 2. The parser's format instructions, so the LLM returns properly structured data
# 3. The {query} placeholder for the actual user query
prompt = PromptTemplate(
    template="Answer the user query.\nFormat Instructions:{format_instructions}\n{query}\n",
    input_variables=["query"],  # Dynamic variables that will be provided when invoking the chain
    partial_variables={"format_instructions": format_instructions},  # Static variables set once when creating the prompt
)

# Create a processing chain that:
# 1. Formats the prompt using the template
# 2. Sends the formatted prompt to the LLM configured at the top of the notebook (`llm`)
# 3. Parses the LLM's response with the JSON output parser to extract structured data
chain = prompt | llm | output_parser

# Invoke the chain with the Netflix query.
# This will:
# 1. Format the prompt with the query text
# 2. Send it to the LLM
# 3. Parse the response into the structure described by the Netflix model
# 4. Return the structured result, displayed as the cell output
chain.invoke({"query": netflix_query })

('format_instructions: STRICT OUTPUT FORMAT:\n'
 '- Return only the JSON value that conforms to the schema. Do not include any '
 'additional text, explanations, headings, or separators.\n'
 '- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).\n'
 '- Do not prepend or append any text (e.g., do not write "Here is the '
 'JSON:").\n'
 '- The response must be a single top-level JSON value exactly as required by '
 'the schema (object/array/etc.), with no trailing commas or comments.\n'
 '\n'
 'The output should be formatted as a JSON instance that conforms to the JSON '
 'schema below.\n'
 '\n'
 'As an example, for the schema {"properties": {"foo": {"title": "Foo", '
 '"description": "a list of strings", "type": "array", "items": {"type": '
 '"string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a '
 'well-formatted instance of the schema. The object {"properties": {"foo": '
 '["bar", "baz"]}} is not well-formatted.\n'
 '\n'
 'Here is the output schema

{'name': 'The Kissing Booth',
 'release_year': 2018,
 'genre': 'Romance',
 'mactor': 'Jojo Siwa',
 'factor': 'Taylor Zakhar Perez'}

In [38]:
from langchain_core.output_parsers import CommaSeparatedListOutputParser

# Parser that turns a comma-separated reply from the model into a Python list.
output_parser = CommaSeparatedListOutputParser()

netflix_query = "Tell me about a Netflix show or movie top 10 in Thailand."

# Guidance text that tells the LLM how to format its response for this parser.
format_instructions = output_parser.get_format_instructions()

# Prompt = task + parser format instructions + the wanted columns + user query.
# The template is assembled from adjacent string literals so that neither the
# notebook's source indentation nor doubled newlines end up in the text sent to
# the model, and the "colomn list" typo is corrected to a clearly delimited
# "Column list:" line.
prompt = PromptTemplate(
    template=(
        "Answer the user query.\n"
        "Format Instructions: {format_instructions}\n"
        "Column list: name, release_year, genre, "
        "mactor (main male actor in the show or movie), "
        "factor (main female actor in the show or movie)\n"
        "{query}\n"
    ),
    input_variables=["query"],  # supplied when the chain is invoked
    partial_variables={"format_instructions": format_instructions},  # fixed when the prompt is created
)

# Processing chain: format the prompt -> call the configured LLM (`llm`) ->
# split the reply into a list with the comma-separated-list parser.
chain = prompt | llm | output_parser

# Run the chain on the Netflix query; the parsed list is the cell's output.
chain.invoke({"query": netflix_query})

["I'm unable to directly access real-time or specific regional data like Netflix's top 10 shows or movies in Thailand",
 "as my training only goes up until April 2023 and I don't have live internet access. However",
 'I can guide you on how to find this information:',
 '1. Visit the Netflix website or app.',
 '2. Set the language and region to Thailand.',
 '3. Browse or search for the top 10 shows or movies in Thailand.',
 "If you have a specific list of shows or movies you're interested in",
 'I can provide more detailed information about them. Please let me know the names of the shows or movies',
 "and I'll do my best to assist you."]

In [39]:
from langchain_core.documents import Document

In [40]:

# Build a Document by hand:
# - page_content holds the text itself (two lines about Python)
# - metadata holds a dictionary of additional information about the document
Document(
    page_content=(
        "Python is an interpreted high-level general-purpose programming language.\n"
        " Python's design philosophy emphasizes code readability with its notable use of significant indentation."
    ),
    metadata={
        "my_document_id": 234234,               # identifier for this document
        "my_document_source": "About Python",   # source or title information
        "my_document_create_time": 1680013019,  # Unix timestamp (March 28, 2023)
    },
)

Document(metadata={'my_document_id': 234234, 'my_document_source': 'About Python', 'my_document_create_time': 1680013019}, page_content="Python is an interpreted high-level general-purpose programming language.\n Python's design philosophy emphasizes code readability with its notable use of significant indentation.")

In [41]:
from langchain_community.document_loaders import PyPDFLoader

In [42]:
# Load the arXiv paper straight from its URL.
loader = PyPDFLoader("https://arxiv.org/pdf/2310.05421")
# NOTE: despite the singular name, `document` is a list of Document objects
# (it is indexed and passed to split_documents further down).
document = loader.load()

In [43]:
# Show the metadata dictionary of the second loaded Document (index 1).
document[1].metadata

{'producer': 'Microsoft® Word 2016',
 'creator': 'Microsoft® Word 2016',
 'creationdate': '2023-10-09T10:46:36+05:30',
 'title': 'Paper Title (use style: paper title)',
 'author': 'Keivalya Pandya;Dr Mehfuza Holia',
 'moddate': '2023-10-09T10:46:36+05:30',
 'source': 'https://arxiv.org/pdf/2310.05421',
 'total_pages': 4,
 'page': 1,
 'page_label': '2'}

In [44]:
# Preview the first 500 characters of the second loaded Document's text.
pprint(document[1].page_content[:500])

('Submitted to the 3rd International Conference on “Women in Science & '
 'Technology: Creating Sustainable Career”  \n'
 '28 -30 December, 2023 \n'
 'III. METHODOLOGY \n'
 'This sect ion covers the data c ollection, details about the \n'
 'selected model, fine -tuning, and integration with the Gradio \n'
 'APIs for web deployment. \n'
 'A. Data Collection \n'
 'To gather the necessary data for our project, we employed \n'
 'BeautifulSoup web scraping techniques to retri eve publicly \n'
 'accessible information from an organization’s homepage. ')


In [45]:
# Import the WebBaseLoader class from langchain_community's document_loaders module.
# This loader is used here to fetch a web page and extract its text content.
from langchain_community.document_loaders import WebBaseLoader  # uses the beautifulsoup4 library for dealing with HTML

# Create a WebBaseLoader instance by passing the URL of the web page to load.
# The URL is the LangChain v0.2 documentation introduction page.
# NOTE(review): the recorded output below is titled "LangChain overview", so this
# URL appears to redirect to the current docs site -- confirm it is still the intended page.
# NOTE: this rebinds `loader`, which previously held the PyPDFLoader.
loader = WebBaseLoader("https://python.langchain.com/v0.2/docs/introduction/")

# Call the load() method to:
# 1. Send an HTTP request to the specified URL
# 2. Download the HTML content
# 3. Parse the HTML to extract its text
# 4. Return a list of Document objects containing the extracted content
web_data = loader.load()

# Print a preview of the loaded content:
# web_data[0] is the first Document in the list,
# .page_content is its text, and
# [:1000] keeps only the first 1000 characters.
print(web_data[0].page_content[:1000])

LangChain overview - Docs by LangChainSkip to main contentDocs by LangChain home pageLangChain + LangGraphSearch...⌘KSupportGitHubTry LangSmithTry LangSmithSearch...NavigationLangChain overviewLangChainLangGraphDeep AgentsIntegrationsLearnReferenceContributePythonOverviewGet startedInstallQuickstartChangelogPhilosophyCore componentsAgentsModelsMessagesToolsShort-term memoryStreamingStructured outputMiddlewareOverviewBuilt-in middlewareCustom middlewareAdvanced usageGuardrailsRuntimeContext engineeringModel Context Protocol (MCP)Human-in-the-loopMulti-agentRetrievalLong-term memoryAgent developmentLangSmith StudioTestAgent Chat UIDeploy with LangSmithDeploymentObservabilityOn this page Create an agent Core benefitsLangChain overviewCopy pageLangChain is an open source framework with a pre-built agent architecture and integrations for any model or tool — so you can build agents that adapt as fast as the ecosystem evolvesCopy pageLangChain is the easiest way to start building agents and a

#### Text splitters

After you load documents, you will often want to transform those documents to better suit your application.

One of the simplest examples of making documents better suit your application is splitting a long document into smaller chunks that can fit into your model's context window. LangChain has built-in document transformers that ease the process of splitting, combining, filtering, and otherwise manipulating documents.

At a high level, here is how text splitters work:

1. They split the text into small, semantically meaningful chunks (often sentences).
2. They start combining these small chunks of text into a larger chunk until you reach a certain size (as measured by a specific function).
3. After the combined text reaches the new chunk's size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap to keep context between chunks.

For a list of types of text splitters LangChain supports, see [LangChain Text Splitters](https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/).


`CharacterTextSplitter`
- Straightforward implementation
- Consistent chunk sizes
- Easily adaptable to different model requirements

In [47]:
# Import the CharacterTextSplitter class from langchain_text_splitters module
# Text splitters are used to divide large texts into smaller, manageable chunks
from langchain_text_splitters import CharacterTextSplitter

# Splitter configuration:
# - separator="\n":   split the text at newline characters
# - chunk_size=200:   target size of each chunk, in characters
# - chunk_overlap=20: characters shared between consecutive chunks to keep context
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=20,
)

# Split the previously loaded PDF pages. split_documents takes the list of
# Document objects and returns a new list of smaller Document chunks; the
# recorded output of the next cell shows the source metadata carried over.
chunks = text_splitter.split_documents(document)

# Report how many chunks were produced.
print(f"count of chunks: {len(chunks)}")

count of chunks: 88


In [48]:
# Inspect the first chunk: its metadata, then its text.
pprint(f"the metadata: {chunks[0].metadata}")
pprint(f"the page content: {chunks[0].page_content}")

("the metadata: {'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® "
 "Word 2016', 'creationdate': '2023-10-09T10:46:36+05:30', 'title': 'Paper "
 "Title (use style: paper title)', 'author': 'Keivalya Pandya;Dr Mehfuza "
 "Holia', 'moddate': '2023-10-09T10:46:36+05:30', 'source': "
 "'https://arxiv.org/pdf/2310.05421', 'total_pages': 4, 'page': 0, "
 "'page_label': '1'}")
('the page content: Submitted to the 3rd International Conference on “Women in '
 'Science & Technology: Creating Sustainable Career”  \n'
 '28 -30 December, 2023 \n'
 'Automating Customer Service using LangChain')


`RecursiveCharacterTextSplitter` splits text hierarchically, trying a list of separators in order:
- It first attempts to keep larger units (e.g., paragraphs) intact.
- If a unit exceeds the chunk size, it moves to the next level (e.g., sentences).
- This process continues down to the word level if necessary.

In [63]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Use a multi-level separator list (paragraph -> line -> sentence -> word ->
# character). With only ["\n"] there is a single level, so the splitter can
# never fall back to a finer separator when a line is longer than chunk_size,
# and the recursive behaviour described above is not exercised. This list
# matches the one used for the splitter comparison later in the notebook.
text_splitter_recursive = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separators=["\n\n", "\n", ". ", " ", ""],
)
chunks_recursive = text_splitter_recursive.split_documents(document)

# Inspect the result: metadata of the first chunk, total count, first chunk's text.
pprint(f"the metadata: {chunks_recursive[0].metadata}")
print(f"count of chunks: {len(chunks_recursive)} \n")
pprint(f"the page content: {chunks_recursive[0].page_content} ")


("the metadata: {'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® "
 "Word 2016', 'creationdate': '2023-10-09T10:46:36+05:30', 'title': 'Paper "
 "Title (use style: paper title)', 'author': 'Keivalya Pandya;Dr Mehfuza "
 "Holia', 'moddate': '2023-10-09T10:46:36+05:30', 'source': "
 "'https://arxiv.org/pdf/2310.05421', 'total_pages': 4, 'page': 0, "
 "'page_label': '1'}")
count of chunks: 89 

('the page content: Submitted to the 3rd International Conference on “Women in '
 'Science & Technology: Creating Sustainable Career”  \n'
 '28 -30 December, 2023 \n'
 'Automating Customer Service using LangChain ')


In [None]:
def display_chunks_splitter(text_splitter: "RecursiveCharacterTextSplitter | CharacterTextSplitter",
                            document: "list[Document]", name: str) -> None:
    """Split ``document`` with ``text_splitter`` and print summary statistics.

    Prints the chunk count, the average chunk length, the metadata keys found
    across the chunks, one example chunk, and the min/max chunk length.

    Args:
        text_splitter: Splitter to evaluate; only its ``split_documents``
            method is used.
        document: The loaded documents to split. This is passed straight to
            ``split_documents``, so it is the list of Document objects, not a
            single Document.
        name: Label for this splitter, used in the printed messages.

    Returns:
        None. All results are printed.
    """
    # Annotations are quoted so defining the function does not depend on the
    # splitter/Document imports having been executed first.
    chunks = text_splitter.split_documents(document)

    print(f"the number of chunks created by {name}: {len(chunks)} \n")
    if not chunks:
        # Nothing to average or sample. Returning here avoids the division by
        # zero and the chunks[0] lookup that previously ran before the
        # emptiness check.
        return

    lengths = [len(chunk.page_content) for chunk in chunks]
    avg_characters = sum(lengths) / len(lengths)
    print(f"the average characters in all chunks created by {name}: {int(avg_characters)}\n")

    # Union of metadata keys over every chunk, in first-seen order, so the
    # "all chunks" wording is accurate (previously only chunk 0 was inspected).
    all_metadata_keys = dict.fromkeys(key for chunk in chunks for key in chunk.metadata)
    print(f"the metadata keys in all chunks created by {name}: {all_metadata_keys.keys()}\n")

    print("Example chunk:")
    example_doc = chunks[min(5, len(chunks) - 1)]  # chunk at index 5 (the 6th), or the last one if fewer
    print(f"Content (first 150 chars): {example_doc.page_content[:150]}...")
    print(f"Metadata: {example_doc.metadata}")

    # Length distribution
    print(f"Min chunk size: {min(lengths)} characters")
    print(f"Max chunk size: {max(lengths)} characters\n")

# Two splitters with the same 500-character budget, compared side by side.
splitter_1 = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=20,
)
splitter_2 = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=500,
    chunk_overlap=0,
)

# Report on the recursive splitter first, then the character splitter.
for label, splitter in (
    ("Recursive Splitter", splitter_2),
    ("Character Splitter", splitter_1),
):
    banner = "=" * 30
    print(banner, label, banner)
    display_chunks_splitter(text_splitter=splitter, document=document, name=label)

the number of chunks created by Recursive Splitter: 33 

the average characters in all chunks created by Recursive Splitter: 467

the metadata keys in all chunks created by Recursive Splitter: dict_keys(['producer', 'creator', 'creationdate', 'title', 'author', 'moddate', 'source', 'total_pages', 'page', 'page_label'])

Example chunk:
Content (first 150 chars): where the rhythms of mode rn life are guided by the pu lse of 
technology, the realm of customer service stands as the 
frontline of engagement betwee...
Metadata: {'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2023-10-09T10:46:36+05:30', 'title': 'Paper Title (use style: paper title)', 'author': 'Keivalya Pandya;Dr Mehfuza Holia', 'moddate': '2023-10-09T10:46:36+05:30', 'source': 'https://arxiv.org/pdf/2310.05421', 'total_pages': 4, 'page': 0, 'page_label': '1'}
Min chunk size: 436 characters
Max chunk size: 498 characters

the number of chunks created by Character Splitter: 33 

the av