In [1]:
from dotenv import load_dotenv
# Read the project's .env file so later cells can look up settings such as
# HF_TOKEN through the process environment. The recorded output of this call
# is True.
load_dotenv()

True

In [2]:
import os
import warnings

# Export the Hugging Face token for downstream libraries.
# os.environ only accepts str values, so assigning the result of os.getenv()
# directly raises TypeError whenever HF_TOKEN is missing. Guard the assignment
# and warn instead of crashing the notebook.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token
else:
    warnings.warn(
        "HF_TOKEN is not set; continuing without Hugging Face authentication.",
        stacklevel=2,
    )

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by every cell below. A later cell shows that its
# query vectors have 384 components.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Smoke test: embed one short string and display the resulting list of floats.
embeddings.embed_query(text="hello AI")

[-0.03338824212551117,
 0.03453981503844261,
 0.05947450175881386,
 0.059286098927259445,
 -0.06353531032800674,
 -0.06819586455821991,
 0.08823321014642715,
 0.0344407856464386,
 -0.03278516232967377,
 -0.01581495814025402,
 0.020981721580028534,
 -0.018340323120355606,
 -0.03983224928379059,
 -0.0804707333445549,
 -0.014469229616224766,
 0.03326485678553581,
 0.014259278774261475,
 -0.03404996171593666,
 -0.142915740609169,
 -0.023083431646227837,
 -0.021380223333835602,
 0.002633583964779973,
 -0.04729269817471504,
 -0.010752756148576736,
 -0.06866802275180817,
 0.031124936416745186,
 0.07594586908817291,
 0.0011282607447355986,
 0.011632048524916172,
 -0.036039240658283234,
 0.04483754187822342,
 0.018390731886029243,
 0.12672801315784454,
 -0.0013597395736724138,
 0.00820669624954462,
 0.06909967958927155,
 -0.08076362311840057,
 -0.05841310694813728,
 0.0537545382976532,
 0.02622750587761402,
 -0.0068285781890153885,
 -0.05635844171047211,
 0.0032929808367043734,
 -0.072501882910

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Three candidate questions that the query below is compared against.
documents = [
    "what is a capital of USA?",
    "Who is a president of USA?",
    "Who is a prime minister of India?",
]

In [7]:
# Query text whose embedding is scored against each entry in `documents`.
my_query = "Narendra modi is prime minister of india?"

In [8]:
# One embedding vector per entry in `documents`, in the same order.
document_embedding = embeddings.embed_documents(texts=documents)

In [9]:
# Display the raw document embeddings computed in the previous cell.
# NOTE(review): this prints every float of every vector; a slice such as
# document_embedding[0][:5] would keep the notebook output readable.
document_embedding

[[0.11998702585697174,
  -0.02130262926220894,
  -0.042880889028310776,
  0.06645582616329193,
  -0.0643523707985878,
  -0.04424867033958435,
  0.0224084984511137,
  -0.04987304285168648,
  -0.02343769185245037,
  -0.03397204354405403,
  -0.014047956094145775,
  -0.06065933406352997,
  -0.003906781319528818,
  -0.017782054841518402,
  -0.047971002757549286,
  -0.06668156385421753,
  0.004103164188563824,
  -0.013092752546072006,
  0.0443977490067482,
  0.022350700572133064,
  0.009459548629820347,
  -0.020564574748277664,
  -0.00033560910378582776,
  -0.005685771815478802,
  0.05558698996901512,
  0.025123195722699165,
  -0.0028170919977128506,
  0.008758911862969398,
  0.003255274845287204,
  -0.015963375568389893,
  0.014263669960200787,
  -0.11220848560333252,
  0.08968566358089447,
  -0.03108374960720539,
  -0.02422380819916725,
  0.006152117159217596,
  0.08058716356754303,
  0.0182499997317791,
  0.05568310618400574,
  0.016702676191926003,
  0.015895962715148926,
  0.00034109121

In [10]:
# Embed the query with the same model that produced the document vectors.
query_embedding = embeddings.embed_query(text=my_query)

In [11]:
# Number of components in the query vector (recorded output: 384). The FAISS
# indexes created further down are built with this same dimension.
len(query_embedding)

384

In [12]:
# Score the single query row against all document rows; higher means more
# similar. The query is wrapped in a list so it forms a one-row matrix.
cosine_similarity(X=[query_embedding], Y=document_embedding)

array([[0.11756667, 0.34324566, 0.81413235]])

In [13]:
from sklearn.metrics.pairwise import euclidean_distances

In [14]:
# Same comparison using L2 distance; here lower means more similar.
euclidean_distances(X=[query_embedding], Y=document_embedding)

array([[1.32848281, 1.14608409, 0.60970098]])

| Metric            | Similarity Score Range | Behavior                              |
| ----------------- | ---------------------- | ------------------------------------- |
| Cosine Similarity | \[-1, 1]               | Focuses on angle only |
| L2 Distance       | \[0, ∞)                | Focuses on **magnitude + direction**  |


In [15]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

| Feature               | `Flat`                | `IVF` (Inverted File Index)        | `HNSW` (Graph-based Index)          |
| --------------------- | --------------------- | ---------------------------------- | ----------------------------------- |
| Type of Search     | Exact                 | Approximate (cluster-based)        | Approximate (graph-based traversal) |
| Speed               | Slow (linear scan)    | Fast (search only in top clusters) | Very Fast (graph walk)              |


| Dataset Size              | Recommended Index                 |
| ------------------------- | --------------------------------- |
| Up to 100K (1 lakh) vectors | `IndexFlatL2` or `IndexFlatIP`    |
| Up to 1M vectors            | `IndexIVFFlat` or `IndexHNSWFlat` |
| > 1M                      | `IndexIVFPQ` or `IndexHNSWFlat`   |


In [16]:
# Size the index from the embedding model instead of hard-coding 384, so that
# swapping the model cannot silently produce a dimension mismatch.
embedding_dim = len(embeddings.embed_query("hello world"))

# Exact (brute-force) L2 index over vectors of that dimension.
index = faiss.IndexFlatL2(embedding_dim)

In [17]:
# Display the repr of the FAISS index object created in the previous cell.
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002C6DC3F89F0> >

In [18]:
# Wrap the empty FAISS index in a LangChain vector store. Documents live in an
# in-memory docstore, and the position-to-ID mapping starts out empty.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)


In [19]:
# Embed and store three short texts; the call returns the generated IDs.
vector_store.add_texts(
    texts=[
        "AI is future",
        "AI is powerful",
        "Dogs are cute",
    ]
)

['fc2c4b0c-d977-48dc-8152-693e585645f4',
 'ffd445f7-c093-41ec-b492-d9ce8738d0ff',
 'd1f0893d-80a2-4857-8a89-e754d9d4f970']

In [20]:
# Mapping from FAISS row position to docstore ID. In the recorded output, keys
# 0-2 carry the IDs returned by add_texts, in insertion order.
vector_store.index_to_docstore_id

{0: 'fc2c4b0c-d977-48dc-8152-693e585645f4',
 1: 'ffd445f7-c093-41ec-b492-d9ce8738d0ff',
 2: 'd1f0893d-80a2-4857-8a89-e754d9d4f970'}

In [22]:
# Ask for the three nearest stored texts; with only three texts stored, every
# one of them is returned, ordered by closeness to the query.
results = vector_store.similarity_search(query="Tell me about AI", k=3)
results


[Document(id='ffd445f7-c093-41ec-b492-d9ce8738d0ff', metadata={}, page_content='AI is powerful'),
 Document(id='fc2c4b0c-d977-48dc-8152-693e585645f4', metadata={}, page_content='AI is future'),
 Document(id='d1f0893d-80a2-4857-8a89-e754d9d4f970', metadata={}, page_content='Dogs are cute')]

In [23]:
from langchain_core.documents import Document

# Ten short sample posts, each tagged with the kind of source it came from
# ("tweet", "news" or "website"). The tag is used later for metadata filtering.
# NOTE(review): this rebinds `documents`, which earlier held plain strings.
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in (
        ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
        ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
        ("Building an exciting new project with LangChain - come check it out!", "tweet"),
        ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
        ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
        ("Is the new iPhone worth the price? Read this review to find out.", "website"),
        ("The top 10 soccer players in the world right now.", "website"),
        ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
        ("The stock market is down 500 points today due to fears of a recession.", "news"),
        ("I have a bad feeling I am going to get deleted :(", "tweet"),
    )
]

# Keep the individual document_N names bound to the same objects that sit in
# `documents`, in the same order.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = documents

In [24]:
from langchain_community.vectorstores.utils import DistanceStrategy

# Size the index from the embedding model instead of hard-coding 384.
embedding_dim = len(embeddings.embed_query("hello world"))

# Exact inner-product index: larger scores mean closer matches.
index = faiss.IndexFlatIP(embedding_dim)

# The wrapper defaults to Euclidean-distance semantics (smaller is better).
# Tell it the index is inner-product so relevance scores and score_threshold
# filtering are interpreted in the right direction. Plain similarity_search
# results are unaffected by this setting.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [25]:
# Embed and store the sample documents; the call returns one generated ID per
# document.
vector_store.add_documents(documents)

['e6e9fc70-2cd9-4b2f-91e0-9b566e627d71',
 '9598e439-5134-4c4c-a15c-baa6a7ad19f9',
 '649ef31d-d543-45b3-a516-932c1ac20e63',
 '6fbb09db-97fe-4cd6-80b6-f297a9f2050e',
 'b92ccbed-6d82-4491-917b-ab85cbad7950',
 '938247ba-083f-4fb0-9ce2-ca02a2f9ab5f',
 '07d7010b-a802-4820-b4a0-74520ee88b5f',
 '04e7ef38-718f-4129-9ab0-9bba6d641119',
 'c11082f9-3523-475a-9a76-654bae4f7636',
 '13ce35d8-f532-42d3-9f61-37be87940db8']

In [26]:
# k is the number of hits to return and is a tunable hyperparameter.
vector_store.similarity_search(
    query="LangChain provides abstractions to make working with LLMs easy",
    k=2,
)

[Document(id='649ef31d-d543-45b3-a516-932c1ac20e63', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='04e7ef38-718f-4129-9ab0-9bba6d641119', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [27]:
# Restrict hits to documents whose metadata "source" equals "tweet".
# k is not passed, so the default applies (the recorded output has four hits).
vector_store.similarity_search(
    query="LangChain provides abstractions to make working with LLMs easy",
    filter={"source": {"$eq": "tweet"}},
)

[Document(id='649ef31d-d543-45b3-a516-932c1ac20e63', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='04e7ef38-718f-4129-9ab0-9bba6d641119', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='13ce35d8-f532-42d3-9f61-37be87940db8', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(id='e6e9fc70-2cd9-4b2f-91e0-9b566e627d71', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')]

In [28]:
# Same query, limited to documents tagged as "news" using the shorthand
# equality form of the metadata filter. k is left at its default.
result = vector_store.similarity_search(
    query="LangChain provides abstractions to make working with LLMs easy",
    filter={"source": "news"},
)

In [29]:
# Metadata of the top hit from the news-filtered search, confirming that the
# filter was applied.
result[0].metadata

{'source': 'news'}

In [30]:
# Text of that same top hit.
result[0].page_content

'Robbers broke into the city bank and stole $1 million in cash.'

In [31]:
# Expose the store through the retriever interface, returning three hits per
# query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 3},
)

In [32]:
# Run the retriever on the same query used in the direct searches above.
retriever.invoke(
    input="LangChain provides abstractions to make working with LLMs easy"
)

[Document(id='649ef31d-d543-45b3-a516-932c1ac20e63', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='04e7ef38-718f-4129-9ab0-9bba6d641119', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='13ce35d8-f532-42d3-9f61-37be87940db8', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [33]:
# Persist the store to a folder with this name so it can be reloaded later.
vector_store.save_local(folder_path="today's class faiss index")

In [34]:
from langchain_community.vectorstores.utils import DistanceStrategy

# Reload the store saved above. The saved index is an inner-product index, and
# the distance strategy is not stored on disk, so it is passed again here.
# Otherwise the reloaded store falls back to Euclidean semantics for relevance
# scores and score thresholds.
#
# SECURITY: allow_dangerous_deserialization=True unpickles the docstore, which
# can execute arbitrary code. It is acceptable here only because the folder
# was written by this notebook; never enable it for untrusted index files.
new_vector_store = FAISS.load_local(
    "today's class faiss index",
    embeddings,
    allow_dangerous_deserialization=True,
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [35]:
# Check that the reloaded store answers queries. k is left at its default, and
# the recorded output has four hits.
new_vector_store.similarity_search(query="langchain")

[Document(id='649ef31d-d543-45b3-a516-932c1ac20e63', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='04e7ef38-718f-4129-9ab0-9bba6d641119', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='b92ccbed-6d82-4491-917b-ab85cbad7950', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(id='07d7010b-a802-4820-b4a0-74520ee88b5f', metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.')]