In [None]:
# `os` is used by the next cell to read the API key from the environment.
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

In [None]:
# Read the Groq API key from the environment (populated by load_dotenv()).
groq_api_key = os.getenv("GROQ_API_KEY")
# Never echo the key itself: cell outputs are saved inside the notebook file and
# would leak the secret when the notebook is shared or committed.
print("GROQ_API_KEY loaded" if groq_api_key else "GROQ_API_KEY is not set")

In [None]:
!pip install langchain_groq

In [None]:
# Build the Groq-hosted chat model that later cells invoke and chain.
from langchain_groq import ChatGroq

model = ChatGroq(
    model="Gemma2-9b-It",
    groq_api_key=groq_api_key,
)
model

In [None]:
# Call the chat model directly with a system instruction and one user message.
from langchain_core.messages import HumanMessage, SystemMessage

messages = [
    SystemMessage(content="Translate the following from English to French"),
    HumanMessage(content="Hello How are you?"),
]

result = model.invoke(messages)
result

In [None]:
# Run the model's response through a string output parser.
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()
parser.invoke(result)

In [7]:
### Prompt Templates
# The system message is parameterised by {language}; the user message carries {text}.
from langchain_core.prompts import ChatPromptTemplate

generic_template = "Translate the following into {language}:"

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", generic_template),
        ("user", "{text}"),
    ]
)

In [None]:
## Chaining components together with LCEL: prompt -> model -> parser
chain = prompt | model | parser
chain.invoke(
    {
        "language": "French",
        "text": "Hello my name is Nakul",
    }
)

In [None]:
## Text loader: load speech.txt as documents
from langchain_community.document_loaders import TextLoader

loader = TextLoader("speech.txt")
text_documents = loader.load()
text_documents

In [None]:
# PDF loader: load attention.pdf as documents
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("attention.pdf")
docs = loader.load()
docs

In [None]:
## Arxiv loader: fetch up to 3 documents matching the query
from langchain_community.document_loaders import ArxivLoader

docs = ArxivLoader(
    query="Generative AI",
    load_max_docs=3,
).load()
docs

In [None]:
## Wikipedia loader: fetch at most one page matching the query
from langchain_community.document_loaders import WikipediaLoader

docs = WikipediaLoader(query="Generative AI", load_max_docs=1).load()
# A bare `len(docs)` in the middle of a cell is evaluated and thrown away;
# print it so the document count actually appears in the output.
print(len(docs))
print(docs)

In [None]:
# Split the loaded documents into chunks of up to 500 characters with a 50-character overlap.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
final_documents = text_splitter.split_documents(docs)
final_documents

The example below shows that when the splitter cannot find the separator, it can produce chunks longer than the specified chunk size.

In [None]:
## Now use the character text splitter.
## Observed warning: "Created a chunk of size 470, which is longer than the specified 100"
## -- the splitter could not find the separator inside that span.
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=100,
    chunk_overlap=20,
)
text_splitter.split_documents(docs)

In [None]:
## HTML text splitter: split an HTML string on its header tags
from langchain_text_splitters import HTMLHeaderTextSplitter

## Header tags to split on, each paired with the metadata label for that level
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>Foo</h1>
        <p>Some intro text about Foo.</p>
        <div>
            <h2>Bar main section</h2>
            <p>Some intro text about Bar.</p>
            <h3>Bar subsection 1</h3>
            <p>Some text about the first subtopic of Bar.</p>
            <h3>Bar subsection 2</h3>
            <p>Some text about the second subtopic of Bar.</p>
        </div>
        <div>
            <h2>Baz</h2>
            <p>Some text about Baz</p>
        </div>
        <br>
        <p>Some concluding text about Foo</p>
    </div>
</body>
</html>
"""

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)
html_header_splits

In [None]:
## Split content fetched from an HTML URL (requires network access)
url = "https://plato.stanford.edu/entries/goedel/"

# Same idea as splitting an HTML string, with one extra header level (h4).
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h4", "Header 4"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text_from_url(url)
html_header_splits

Below is another example, this time splitting JSON data.

In [38]:
## Split JSON data: download the OpenAPI document that will be split below
import json
import requests

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error body.
openapi_response = requests.get(
    "https://api.smith.langchain.com/openapi.json",
    timeout=30,
)
openapi_response.raise_for_status()
json_data = openapi_response.json()

In [None]:
# Display the parsed JSON document downloaded above.
json_data

In [40]:
## RecursiveJsonSplitter is used because the JSON may contain nested elements.
from langchain_text_splitters import RecursiveJsonSplitter

json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = json_splitter.split_json(json_data)

In [None]:
# Display the chunks produced by the JSON splitter.
json_chunks

----------------------------------------------------------XX--------------------XX--------------------------------------------------------------------------

Embedding techniques: converting text to vectors. We are using Ollama.

In [21]:
from langchain_community.embeddings import OllamaEmbeddings

In [None]:
# Request gemma:2b explicitly (per the original note, the default would be llama2).
embeddings = OllamaEmbeddings(model="gemma:2b")

In [None]:
# Display the configured embeddings object.
embeddings

In [24]:
## Embed a small batch of specific sentences in one call
r1 = embeddings.embed_documents(
    [
        "Alpha is the first letter of Greek alphabet",
        "Beta is the second letter of Greek alphabet",
    ]
)

In [None]:
# Inspect the embed_documents result rather than re-displaying the embeddings
# object: (number of vectors, length of the first vector).
len(r1), len(r1[0])

In [None]:
### Other embedding models: https://ollama.com/blog/embedding-models
# Alternative model: OllamaEmbeddings(model="mxbai-embed-large")
embeddings = OllamaEmbeddings(model="gemma:2b")

text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result

In [27]:
## First load speech.txt and split it into chunks for the vector store
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

loader = TextLoader("speech.txt")
documents = loader.load()

text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=30,
)
docs = text_splitter.split_documents(documents)

In [None]:
# Display the chunks produced by the splitter.
docs

In [None]:
## Build a FAISS vector store from the chunks using Ollama embeddings
# No model name is passed here, so the library default model is used.
embeddings = OllamaEmbeddings()
db = FAISS.from_documents(docs, embeddings)
db

In [None]:
## now lets use ollama embedding and create vector store DB
# NOTE(review): this cell repeats the preceding cell verbatim; running it
# embeds the documents and rebuilds the FAISS index a second time.
embeddings=OllamaEmbeddings()
db=FAISS.from_documents(docs,embeddings)
db

In [None]:
## Retriever interface: convert the vector store DB to a retriever first, then
## query it. A retriever is the form used when plugging the store into different LLMs.
# `query` has to be defined here: no earlier cell assigns it, so on a fresh
# kernel (Restart & Run All) this cell previously failed with NameError.
query = "What does the speaker believe is the main reason the United States should enter the war?"

retriever = db.as_retriever()
docs = retriever.invoke(query)
# print() is needed because a bare expression in the middle of a cell is never displayed.
print(docs[0].page_content)

docs_and_score = db.similarity_search_with_score(query)
docs_and_score
 

In [None]:
## Can we pass a vector instead of a sentence? First embed the query text.
embedding_vector = embeddings.embed_query(query)
embedding_vector

In [None]:
## Search the vector store with the precomputed query vector
docs_score = db.similarity_search_by_vector(embedding_vector)
docs_score

In [None]:
### Saving and loading
## Store the vector DB locally under the "faiss_index1" folder
db.save_local("faiss_index1")

In [39]:
## building a  vectordb
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Load speech.txt as the source documents for the Chroma vector store.
loader = TextLoader("speech.txt")
data = loader.load()
data

ChromaDB

In [69]:
## building a  vectordb
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# NOTE(review): the same loader/data statements appear in an earlier cell of
# this notebook; this cell reloads speech.txt into `data` again.
loader = TextLoader("speech.txt")
data = loader.load()
data

In [71]:
# Split the loaded text into chunks of up to 500 characters with no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
splits = text_splitter.split_documents(data)

In [None]:
# Display the chunks produced by the splitter.
splits

In [None]:
## Create the Chroma vector store from the chunks, embedding them with gemma:2b
embedding = OllamaEmbeddings(model="gemma:2b")
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
)
vectordb

In [None]:
## Query the vector store and show the text of the top match
query = "What does the speaker believe is the main reason the United States should enter the war?"

docs = vectordb.similarity_search(query)
docs[0].page_content

In [82]:
## Saving to disk: rebuild the store with a persist directory
# NOTE(review): re-running this cell presumably adds the same chunks to
# ./chroma_db1 again -- confirm before relying on result counts.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory="./chroma_db1",
)

In [None]:
# Load the persisted store from disk and run the same query against it.
db2 = Chroma(
    persist_directory="./chroma_db1",
    embedding_function=embedding,
)
docs = db2.similarity_search(query)
print(docs[0].page_content)

In [None]:
## Similarity search with score
# NOTE(review): presumably the score is a distance (lower means closer) -- confirm.
docs = vectordb.similarity_search_with_score(query)
docs

-------------------------------------------------------------------------------------xx----------------------------------------------------xx-----------------------------------------------------------------

Streamlit

In [32]:
import streamlit as st

Chat History

In [33]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
 
# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``, creating it on first use."""
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history


# Wrap the chat model so each call is tied to a per-session history.
with_message_history = RunnableWithMessageHistory(model, get_session_history)

In [34]:
# Run config that selects the "chat1" session for the history-aware model.
config={"configurable":{"session_id":"chat1"}}

In [35]:
# First turn in session "chat1": introduce the user.
response = with_message_history.invoke(
    [
        HumanMessage(content="Hi , My name is Nakul and I am an  Engineer"),
    ],
    config=config,
)

In [None]:
# Show the text of the model's reply.
response.content
 

In [None]:
# Follow-up in the same session ("chat1"): ask for the name given earlier.
with_message_history.invoke(
    [
        HumanMessage(content="What's my name?"),
    ],
    config=config,
)

In [None]:
## Change the config --> a different session id ("chat2") with its own history
config1 = {"configurable": {"session_id": "chat2"}}

response = with_message_history.invoke(
    [
        HumanMessage(content="Whats my name"),
    ],
    config=config1,
)
response.content

In [None]:
# Give session "chat2" a name to remember.
response = with_message_history.invoke(
    [
        HumanMessage(content="Hey My name is John"),
    ],
    config=config1,
)
response.content

In [None]:
# Ask session "chat2" for the name again, now that one has been given.
response = with_message_history.invoke(
    [
        HumanMessage(content="Whats my name"),
    ],
    config=config1,
)
response.content