In [1]:
# Read key=value pairs from a local .env file (if present) into the process
# environment before anything below looks up API keys.
from dotenv import load_dotenv
load_dotenv()
import os
# Re-export the Hugging Face API key only when it is actually configured.
# os.environ only accepts str values, so assigning the None that os.getenv()
# returns for a missing variable would raise a TypeError.
hf_api_key = os.getenv("HF_API_KEY")
if hf_api_key is not None:
    os.environ['HF_API_KEY'] = hf_api_key
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize the HuggingFaceEmbeddings with a specific model.
# The resulting `embeddings` object is reused by the cells below for both
# direct embedding calls and as the vector stores' embedding function.
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Embed a single query string; the result is displayed as a plain list of floats
embeddings.embed_query("hello AI")

[-0.033388182520866394,
 0.03453972190618515,
 0.059474531561136246,
 0.05928609147667885,
 -0.0635354220867157,
 -0.06819586455821991,
 0.08823323994874954,
 0.03444080427289009,
 -0.03278516232967377,
 -0.015814989805221558,
 0.02098178118467331,
 -0.01834029331803322,
 -0.03983215242624283,
 -0.0804707482457161,
 -0.014469144865870476,
 0.0332648828625679,
 0.014259284362196922,
 -0.03404996916651726,
 -0.142915740609169,
 -0.023083344101905823,
 -0.021380102261900902,
 0.002633501309901476,
 -0.047292742878198624,
 -0.010752756148576736,
 -0.06866798549890518,
 0.031125057488679886,
 0.0759458914399147,
 0.0011283254716545343,
 0.011631987057626247,
 -0.03603919595479965,
 0.04483763128519058,
 0.018390750512480736,
 0.12672801315784454,
 -0.0013597895158454776,
 0.008206663653254509,
 0.06909968703985214,
 -0.08076353371143341,
 -0.05841314047574997,
 0.053754497319459915,
 0.026227595284581184,
 -0.006828607991337776,
 -0.056358352303504944,
 0.0032930178567767143,
 -0.0725017860

In [3]:
# Importing cosine_similarity from sklearn.metrics.pairwise
# This is used to compute the cosine similarity between two vectors
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Tiny corpus of questions that the query below is compared against
documents = [
    "what is a capital of USA?",
    "Who is a president of USA?",
    "Who is a prime minister of India?",
]

# Query whose similarity to each corpus entry is measured in later cells
my_query = "Narendra modi is prime minister of india?"

# Embed the whole corpus in one call (one vector per document)
document_embedding = embeddings.embed_documents(documents)

In [5]:
# Display the raw document embeddings (one list of floats per document)
document_embedding

[[0.11998696625232697,
  -0.021302605047822,
  -0.04288087412714958,
  0.06645582616329193,
  -0.0643523558974266,
  -0.04424864798784256,
  0.022408470511436462,
  -0.049873027950525284,
  -0.023437663912773132,
  -0.03397207707166672,
  -0.0140480762347579,
  -0.06065931171178818,
  -0.003906761296093464,
  -0.017782077193260193,
  -0.047971006482839584,
  -0.0666816309094429,
  0.004103219136595726,
  -0.013092794455587864,
  0.0443977415561676,
  0.022350674495100975,
  0.009459568187594414,
  -0.02056453935801983,
  -0.00033560290466994047,
  -0.005685777403414249,
  0.05558697134256363,
  0.025123219937086105,
  -0.0028171155136078596,
  0.008759002201259136,
  0.003255249932408333,
  -0.015963444486260414,
  0.014263702556490898,
  -0.11220847815275192,
  0.0896855890750885,
  -0.031083744019269943,
  -0.02422383241355419,
  0.006152077112346888,
  0.08058709651231766,
  0.018250005319714546,
  0.05568312108516693,
  0.016702702268958092,
  0.01589597389101982,
  0.0003410273930

In [6]:
# Embed the query with the same embedding model that was used for the documents
query_embedding=embeddings.embed_query(my_query)

In [7]:
# Number of dimensions in the query embedding
len(query_embedding)

384

In [8]:
# Compute cosine similarity between the query and document embeddings.
# The query is wrapped in a list so both arguments are 2-D; the result has a
# single row (the query) holding one score per document.
cosine_similarity([query_embedding],document_embedding)

array([[0.11756668, 0.3432456 , 0.81413237]])

In [9]:
# Euclidean (L2) distance between the query and each document embedding.
# Unlike cosine similarity, a smaller value here means a closer match.
from sklearn.metrics.pairwise import euclidean_distances
euclidean_distances([query_embedding], document_embedding)

array([[1.32848283, 1.14608416, 0.60970098]])

| Metric            | Similarity Score Range | Behavior                              |
| ----------------- | ---------------------- | ------------------------------------- |
| Cosine Similarity | \[-1, 1]               | Focuses on **angle (direction) only**; higher means more similar |
| L2 Distance       | \[0, ∞)                | Focuses on **magnitude + direction**; lower means more similar |


## FAISS (Facebook AI Similarity Search) is a library for efficient similarity search and clustering

In [10]:
## Importing FAISS for vector storage and retrieval
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# Create an empty flat L2 index for the document embeddings.
# The index dimension must match the embedding size: 384 for all-MiniLM-L6-v2
# (the same value len(query_embedding) reported earlier).
index=faiss.IndexFlatL2(384)
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x00000195AC4C8360> >

In [11]:
# Wrap the empty FAISS index in a LangChain vector store. The store embeds
# text with `embeddings`, keeps the documents in an in-memory docstore, and
# starts with no index-position -> docstore-ID entries.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

# Store three short texts; the generated IDs are the cell's displayed result
sample_texts = ["AI is future", "AI is powerful", "Dogs are cute"]
vector_store.add_texts(sample_texts)

['da31f07d-f211-4dbf-b698-95f176b02959',
 '354adf07-85ed-48f0-b342-b7c78f748a21',
 'd02f54cb-341c-4f36-975f-b5174639f393']

In [12]:
# Inspect the mapping from FAISS index position to docstore ID
# (one entry per stored text, so its size is also the number of stored texts)
vector_store.index_to_docstore_id

{0: 'da31f07d-f211-4dbf-b698-95f176b02959',
 1: '354adf07-85ed-48f0-b342-b7c78f748a21',
 2: 'd02f54cb-341c-4f36-975f-b5174639f393'}

In [13]:
# Perform a similarity search in the vector store.
# With k=3 and only three stored texts, every text comes back, ordered by how
# closely it matches the query "Tell me about AI".
results = vector_store.similarity_search("Tell me about AI", k=3)

In [14]:
# Show the retrieved documents (best match first)
results


[Document(id='354adf07-85ed-48f0-b342-b7c78f748a21', metadata={}, page_content='AI is powerful'),
 Document(id='da31f07d-f211-4dbf-b698-95f176b02959', metadata={}, page_content='AI is future'),
 Document(id='d02f54cb-341c-4f36-975f-b5174639f393', metadata={}, page_content='Dogs are cute')]

| Feature               | `Flat`                | `IVF` (Inverted File Index)        | `HNSW` (Graph-based Index)          |
| --------------------- | --------------------- | ---------------------------------- | ----------------------------------- |
| Type of Search     | Exact                 | Approximate (cluster-based)        | Approximate (graph-based traversal) |
| Speed               | Slow (linear scan)    | Fast (search only in top clusters) | Very Fast (graph walk)              |


| Dataset Size              | Recommended Index                 |
| ------------------------- | --------------------------------- |
| Up to 100K (1 lakh)       | `IndexFlatL2` or `IndexFlatIP`    |
| Up to 1M                  | `IndexIVFFlat` or `IndexHNSWFlat` |
| > 1M                      | `IndexIVFPQ` or `IndexHNSWFlat`   |


In [15]:
# Sample corpus for the FAISS vector store. Every entry is a short text plus
# the kind of source it came from, and each one becomes a langchain_core
# Document whose metadata is {"source": <source>}.
from langchain_core.documents import Document

_SAMPLE_TEXTS = [
    ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

# Build the Document objects in corpus order
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in _SAMPLE_TEXTS
]

# Keep the individual document_N names bound to the same objects
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = documents

In [16]:
from langchain_community.vectorstores.utils import DistanceStrategy

# Create a FAISS inner-product index for the document embeddings
# (384 is the embedding size of all-MiniLM-L6-v2)
index=faiss.IndexFlatIP(384)

# Create a FAISS vector store using the embeddings and the index.
# distance_strategy tells LangChain that the index returns inner products
# (higher = more similar). Its default assumes Euclidean distance
# (lower = more similar), which would invert relevance-score and
# score-threshold handling for this index type.
vector_store=FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [17]:
# Add documents to the vector store; the displayed result is the list of
# IDs generated for the stored documents
vector_store.add_documents(documents=documents)

['44cd348e-b0a0-41ce-9a09-22dbc10fc0de',
 '47773a60-fdaf-41a2-ab85-54197be6214c',
 '276fe791-15cd-4d4e-b6b1-0b6a61d987f2',
 'b79c62ff-42d5-4c58-97b5-b240e3f4e8c4',
 '3b678eb7-96f9-485a-9061-33f38bafb64e',
 '577a82e1-0aa2-4486-bf07-c1e24a18f92a',
 '942e0f38-ec80-4ec4-82e0-54347bf27170',
 '139ad9f9-fc57-47a6-adbd-f146fd4d3598',
 '1fd530b3-59de-4526-b164-07aa48369eeb',
 'dff27f8a-e69d-4a88-8275-ac77ccb3679b']

In [18]:
# Look up the documents closest to the query text.
# k is a tunable hyperparameter: the number of matches to return.
vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
)

[Document(id='276fe791-15cd-4d4e-b6b1-0b6a61d987f2', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='139ad9f9-fc57-47a6-adbd-f146fd4d3598', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [19]:
# Restrict the search to documents whose "source" metadata equals "tweet"
# (operator form of the filter). No k is passed, so the default number of
# results applies.
vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    filter={"source": {"$eq": "tweet"}},
)

[Document(id='276fe791-15cd-4d4e-b6b1-0b6a61d987f2', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='139ad9f9-fc57-47a6-adbd-f146fd4d3598', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='dff27f8a-e69d-4a88-8275-ac77ccb3679b', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(id='44cd348e-b0a0-41ce-9a09-22dbc10fc0de', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')]

In [20]:
# Same query, limited to documents whose "source" metadata is "news"
# (plain-value form of the filter; k is left at its default).
result = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    filter={"source": "news"},
)

In [21]:
# Metadata of the first (best-ranked) document in the filtered result
result[0].metadata

{'source': 'news'}

In [22]:
# Text of the first (best-ranked) document in the filtered result
result[0].page_content

'Robbers broke into the city bank and stole $1 million in cash.'

In [23]:
# Create a retriever from the vector store.
# A retriever is an object that can be used to retrieve documents based on a query;
# search_kwargs={"k": 3} limits each retrieval to three documents.
retriever=vector_store.as_retriever(search_kwargs={"k": 3})

In [24]:
# Use the retriever to find documents similar to a query.
# invoke() takes the query string and returns the matching Document objects.
retriever.invoke("LangChain provides abstractions to make working with LLMs easy")

[Document(id='276fe791-15cd-4d4e-b6b1-0b6a61d987f2', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='139ad9f9-fc57-47a6-adbd-f146fd4d3598', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='dff27f8a-e69d-4a88-8275-ac77ccb3679b', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [25]:
# Save the vector store to a local directory.
# This persists the vector store so it can be loaded later without needing to
# re-index the documents.
vector_store.save_local("faiss index")

In [28]:
# Load the vector store from a local directory.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the saved
# docstore, which can execute arbitrary code. Only load index folders that you
# created yourself or otherwise trust.
new_vector_store = FAISS.load_local(
    "faiss index", embeddings, allow_dangerous_deserialization=True
)

In [29]:
# Perform a similarity search on the newly loaded vector store to confirm the
# persisted documents are searchable (no k given, so the default count is returned)
new_vector_store.similarity_search("langchain")

[Document(id='276fe791-15cd-4d4e-b6b1-0b6a61d987f2', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='139ad9f9-fc57-47a6-adbd-f146fd4d3598', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='3b678eb7-96f9-485a-9061-33f38bafb64e', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(id='942e0f38-ec80-4ec4-82e0-54347bf27170', metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.')]

## Add PDF documents to the vector store

In [32]:
# Load a PDF document using PyPDFLoader from langchain_community.document_loaders
# This loader is used to read PDF files and convert them into langchain_core Document objects
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# NOTE(review): absolute, machine-specific path. Point this at your own copy of
# the PDF (ideally via a path relative to the notebook) before running.
FILE_PATH=r"C:\Users\niles\Agentic-AI2.0\2-Langchain Basics\2.4-VectorDatabase\FAISS\llama2-bf0a30209b224e26e31087559688ce81.pdf"
loader=PyPDFLoader(FILE_PATH)

# Parse the PDF exactly once and reuse the result for both the page count and
# the later splitting step, instead of re-reading the file for each of them.
# A plain synchronous load also keeps the cell valid outside IPython, where a
# top-level `async for` is a syntax error.
pages=loader.load()
print(len(pages))

77


In [33]:
# Break the loaded pages into smaller, slightly overlapping chunks so each
# embedded piece stays focused. Both sizes are tunable hyperparameters.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,  # hyperparameter: overlap shared by neighbouring chunks
    chunk_size=500,  # hyperparameter: upper bound on the size of one chunk
)

# Apply the splitter to every page and report how many chunks were produced
split_docs = splitter.split_documents(pages)
len(split_docs)

615

In [34]:
from langchain_community.vectorstores.utils import DistanceStrategy

# Create a FAISS inner-product index for the document embeddings
# (384 is the embedding size of all-MiniLM-L6-v2)
index=faiss.IndexFlatIP(384)
# Create a FAISS vector store using the embeddings and the index.
# distance_strategy tells LangChain that the index returns inner products
# (higher = more similar). Its default assumes Euclidean distance
# (lower = more similar), which would invert relevance-score and
# score-threshold handling for this index type.
vector_store=FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [35]:
# Add the split documents to the vector store; the displayed result is the
# list of IDs generated for the stored chunks
vector_store.add_documents(documents=split_docs)

['8dfca0f2-3681-433b-b7a9-6dff78db91de',
 '8ffe7526-3482-4cc9-9403-13f2ffc6047e',
 '1b1bb68d-7209-442d-9133-6201e3f62c97',
 '19083fe6-ff0c-4889-bd43-aaa1d736c11c',
 '83e3d8b3-fd6e-4f22-b327-7c9bee0ba1f7',
 'bfba1124-fec9-4b1c-99ae-efa0a3d00de0',
 'cfdc8d6d-db7c-4616-82c7-19f737d63ce2',
 '617f2f82-29de-4d94-a442-2a9c75001fdf',
 'fc6b2a5b-8dc5-4275-95d9-41432dc26242',
 '04ee3c2b-ce3b-417b-aab0-a3e1f64f8938',
 '14c46daf-bbd7-451f-b0b4-b712adfc743a',
 '3034bda7-0058-4e5f-a27f-dec54278a790',
 '7826d1aa-6b65-4089-a5b8-c61345e9e0fa',
 'f13393c0-84bb-457f-9db2-f662e73c60f8',
 '96cefe20-ef4a-4958-9482-eb66ed7f1a9d',
 '3aa35bbc-8a08-47fb-8605-f493d237ac60',
 'caf7b060-3b15-4aae-b56c-0ac3ff51012e',
 'eadacd09-a074-446f-92fc-8443f695eeb0',
 '390d9f1d-8fcd-4d87-9a97-b5f590301c85',
 '2e56f68d-263f-4b5a-bb8d-3b66cc35ccf0',
 '29b3ce1f-5232-40b2-adf5-06bc1fcd5b98',
 '7caafa16-dd6c-4791-97fc-6159376649f8',
 'e85d7bc7-3469-4bc7-ab25-c10d4912ee45',
 '634efe4c-e3df-4904-90fe-d0900077d108',
 'fba1ba86-a5ba-

In [36]:
# A retriever is an object that can be used to retrieve documents based on a query.
# This one is built on the PDF vector store and returns up to 10 chunks per query.
retriever=vector_store.as_retriever(
    search_kwargs={"k": 10} # hyperparameter: number of chunks to retrieve
)

In [37]:
# Use the retriever to find the PDF chunks most similar to a query
retriever.invoke("what is llama model?")

[Document(id='fba1ba86-a5ba-4f8a-a524-e52a4279441d', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'C:\\Users\\niles\\Agentic-AI2.0\\2-Langchain Basics\\2.4-VectorDatabase\\FAISS\\llama2-bf0a30209b224e26e31087559688ce81.pdf', 'total_pages': 77, 'page': 3, 'page_label': '4'}, page_content='work (Section 6), and conclusions (Section 7).\n‡https://ai.meta.com/resources/models-and-libraries/llama/\n§We are delaying the release of the 34B model due to a lack of time to sufficiently red team.\n¶https://ai.meta.com/llama\n‖https://github.com/facebookresearch/llama\n4'),
 Document(id='687cc2d1-0ab7-418c-a9b4-e932cd7b9660', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyp

In [38]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
import pprint

# Initialize the ChatGoogleGenerativeAI model with a specific model name
model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')
# Pull the shared "rlm/rag-prompt" template from the LangChain hub; as the
# printed output shows, it expects `context` and `question` inputs
prompt = hub.pull("rlm/rag-prompt")
# Pretty-print the prompt's message templates for inspection
pprint.pprint(prompt.messages)



[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]

In [39]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in input order, separated by one blank
        line; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)
    

In [40]:
# Assemble the RAG pipeline:
#   1. build the prompt inputs: "context" comes from the retriever's documents
#      flattened by format_docs, "question" is the input passed through as-is
#   2. fill the prompt template
#   3. call the chat model
#   4. reduce the model's reply to a plain string
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = rag_inputs | prompt | model | StrOutputParser()

In [41]:
# Invoke the rag_chain with a question: relevant chunks are retrieved, placed
# into the prompt, and the model's answer is returned as a string
rag_chain.invoke("what is llama model?")

'Llama is a large language model developed by Meta.  There are different versions, including Llama 1 and Llama 2, with varying parameter sizes.  Llama 2 is designed for commercial and research use and has been fine-tuned for helpfulness and safety.'