In [15]:
from dotenv import load_dotenv
load_dotenv()
import os
os.environ['GEMINI_API_KEY']=os.getenv("GEMINI_API_KEY")


In [17]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings


embeddings_model = GoogleGenerativeAIEmbeddings(
	model="models/embedding-001",
	google_api_key=os.environ['GEMINI_API_KEY']
)

embeddings=embeddings_model.embed_query("Hello world")  # Example query
len(embeddings)

# embeddings = embeddings_model.embed_documents(["Hello world", "Goodbye world"])
# print(embeddings)
# len(embeddings)  # Should print 2, as we have two documents


768

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

documents=["what is a capital of USA?",
           "Who is a president of USA?",
           "Who is a prime minister of India?"]
document_embedding = embeddings_model.embed_documents(documents)

my_query = "Narendra modi is prime minister of india?"
query_embedding = embeddings_model.embed_query(my_query)

# Calculate cosine similarity between the query embedding and each document embedding
similarities = cosine_similarity([query_embedding], document_embedding)
print(similarities)
# To get the similarity score for each document with respect to the query
similarities[0]  # This will give you the similarity scores for each document with respect to the query
# To get the index of the most similar document
most_similar_index = similarities[0].argmax()
print(f"The most similar document is at index: {most_similar_index}")
# To get the most similar document
most_similar_document = documents[most_similar_index]
print(f"The most similar document is: {most_similar_document}")

[[0.54779149 0.63220972 0.75589552]]
The most similar document is at index: 2
The most similar document is: Who is a prime minister of India?


In [19]:
from sklearn.metrics.pairwise import euclidean_distances

# Calculate Euclidean distances between the query embedding and each document embedding
euclidean_distances_result = euclidean_distances([query_embedding], document_embedding)
print(euclidean_distances_result)
# To get the distance for each document with respect to the query
euclidean_distances_result[0]  # This will give you the distances for each document with respect to the query   
# To get the index of the closest document
closest_index = euclidean_distances_result[0].argmin()
print(f"The closest document is at index: {closest_index}")
# To get the closest document
closest_document = documents[closest_index]
print(f"The closest document is: {closest_document}")

[[0.95100778 0.8576593  0.698719  ]]
The closest document is at index: 2
The closest document is: Who is a prime minister of India?


| Metric            | Similarity Score Range | Behavior                              |
| ----------------- | ---------------------- | ------------------------------------- |
| Cosine Similarity | \[-1, 1]               | Focuses on angle only |
| L2 Distance       | \[0, ∞)                | Focuses on **magnitude + direction**  |

In [20]:
# Additional examples to illustrate similarity metrics

# New set of documents and a query
more_documents = [
    "The sun rises in the east.",
    "Python is a popular programming language.",
    "The capital of France is Paris.",
    "Dogs are loyal animals."
]
more_query = "What is the capital city of France?"

# Get embeddings for new documents and query
more_doc_embeddings = embeddings_model.embed_documents(more_documents)
more_query_embedding = embeddings_model.embed_query(more_query)

# Cosine similarity
cos_sim = cosine_similarity([more_query_embedding], more_doc_embeddings)
print("Cosine Similarities:", cos_sim[0])

# Euclidean (L2) distance
l2_dist = euclidean_distances([more_query_embedding], more_doc_embeddings)
print("Euclidean Distances:", l2_dist[0])

# Find most similar document by cosine similarity
most_sim_cos_idx = cos_sim[0].argmax()
print(f"Most similar (cosine): '{more_documents[most_sim_cos_idx]}'")

# Find closest document by Euclidean distance
closest_l2_idx = l2_dist[0].argmin()
print(f"Closest (L2): '{more_documents[closest_l2_idx]}'")


Cosine Similarities: [0.4735272  0.51139907 0.76754131 0.46933159]
Euclidean Distances: [1.02613069 0.98853459 0.68184802 1.03021138]
Most similar (cosine): 'The capital of France is Paris.'
Closest (L2): 'The capital of France is Paris.'


In [21]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np


# Clear explanation of pros and cons
print("\nCosine Similarity:")
print("  Pros:")
print("    - Measures the angle between vectors, focusing on their direction (semantic meaning).")
print("    - Ignores the magnitude, so it's robust to differences in document/query length or scale.")
print("    - Well-suited for text similarity tasks where the length of text varies.")
print("  Cons:")
print("    - Ignores magnitude, so it may miss important differences if vector length carries meaning.")

print("\nL2 (Euclidean) Distance:")
print("  Pros:")
print("    - Considers both the direction and magnitude of vectors.")
print("    - Useful when absolute values or the scale of embeddings are important.")
print("  Cons:")
print("    - Sensitive to the length and scale of vectors, which can be affected by document/query length.")
print("    - May not reflect semantic similarity if magnitude is not meaningful in the context.")


# Example 1: Cosine similarity ignores magnitude (length)
vec_a = [1, 2, 3]
vec_b = [2, 4, 6]  # Same direction as vec_a, but twice the magnitude

cos_sim_example = cosine_similarity([vec_a], [vec_b])[0][0]
l2_dist_example = euclidean_distances([vec_a], [vec_b])[0][0]

print("\nExample 1: Same direction, different magnitude")
print(f"Cosine similarity: {cos_sim_example:.2f} (should be 1.0)")
print(f"L2 (Euclidean) distance: {l2_dist_example:.2f} (should be > 0)")

# Example 2: L2 distance sensitive to scale
vec_c = [100, 200, 300]
cos_sim_example2 = cosine_similarity([vec_a], [vec_c])[0][0]
l2_dist_example2 = euclidean_distances([vec_a], [vec_c])[0][0]

print("\nExample 2: Large magnitude difference")
print(f"Cosine similarity: {cos_sim_example2:.2f} (should be 1.0)")
print(f"L2 (Euclidean) distance: {l2_dist_example2:.2f} (should be large)")

# Example 3: Different direction, same magnitude
vec_d = [-1, -2, -3]
cos_sim_example3 = cosine_similarity([vec_a], [vec_d])[0][0]
l2_dist_example3 = euclidean_distances([vec_a], [vec_d])[0][0]

print("\nExample 3: Opposite direction, same magnitude")
print(f"Cosine similarity: {cos_sim_example3:.2f} (should be -1.0)")
print(f"L2 (Euclidean) distance: {l2_dist_example3:.2f} (should be large)")

# Example 4: Text embeddings (semantic similarity)
print("\nExample 4: Text embeddings (semantic similarity)")
print(f"Cosine similarity between query and most similar document: {similarities[0][most_similar_index]:.2f}")
print(f"L2 distance between query and closest document: {euclidean_distances_result[0][closest_index]:.2f}")
print(f"Most similar document (cosine): '{most_similar_document}'")
print(f"Closest document (L2): '{closest_document}'")
#


Cosine Similarity:
  Pros:
    - Measures the angle between vectors, focusing on their direction (semantic meaning).
    - Ignores the magnitude, so it's robust to differences in document/query length or scale.
    - Well-suited for text similarity tasks where the length of text varies.
  Cons:
    - Ignores magnitude, so it may miss important differences if vector length carries meaning.

L2 (Euclidean) Distance:
  Pros:
    - Considers both the direction and magnitude of vectors.
    - Useful when absolute values or the scale of embeddings are important.
  Cons:
    - Sensitive to the length and scale of vectors, which can be affected by document/query length.
    - May not reflect semantic similarity if magnitude is not meaningful in the context.

Example 1: Same direction, different magnitude
Cosine similarity: 1.00 (should be 1.0)
L2 (Euclidean) distance: 3.74 (should be > 0)

Example 2: Large magnitude difference
Cosine similarity: 1.00 (should be 1.0)
L2 (Euclidean) distance: 3

In [22]:
# Refined examples to clearly illustrate cosine similarity vs L2 distance

example_texts = [
    "The quick brown fox jumps over the lazy dog.",  # Reference
    "The quick brown fox jumps over the lazy dog.",  # Identical
    "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.",  # Same direction, double length
    "Completely unrelated sentence about machine learning."  # Different topic
]

# Get embeddings for the example texts
example_text_embeddings = embeddings_model.embed_documents(example_texts)

# Use the first sentence as the query
example_query_text = example_texts[0]
example_query_text_embedding = embeddings_model.embed_query(example_query_text)

# Cosine similarity
example_cosine_sim = cosine_similarity([example_query_text_embedding], example_text_embeddings)[0]
# L2 distance
example_l2_dist = euclidean_distances([example_query_text_embedding], example_text_embeddings)[0]

# Display results
for i, text in enumerate(example_texts):
    print(f"\nExample {i+1}: '{text}'")
    print(f"Cosine similarity: {example_cosine_sim[i]:.2f}")
    print(f"L2 (Euclidean) distance: {example_l2_dist[i]:.2f}")

print("\nInterpretation:")
print("- Example 1 & 2: Identical sentences — cosine similarity = 1, L2 distance = 0.")
print("- Example 3: Same sentence repeated — cosine similarity ≈ 1 (same direction), L2 distance increases (greater magnitude).")
print("- Example 4: Unrelated sentence — cosine similarity low, L2 distance high.")



Example 1: 'The quick brown fox jumps over the lazy dog.'
Cosine similarity: 0.84
L2 (Euclidean) distance: 0.57

Example 2: 'The quick brown fox jumps over the lazy dog.'
Cosine similarity: 0.84
L2 (Euclidean) distance: 0.57

Example 3: 'The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.'
Cosine similarity: 0.83
L2 (Euclidean) distance: 0.58

Example 4: 'Completely unrelated sentence about machine learning.'
Cosine similarity: 0.56
L2 (Euclidean) distance: 0.94

Interpretation:
- Example 1 & 2: Identical sentences — cosine similarity = 1, L2 distance = 0.
- Example 3: Same sentence repeated — cosine similarity ≈ 1 (same direction), L2 distance increases (greater magnitude).
- Example 4: Unrelated sentence — cosine similarity low, L2 distance high.


## Using more advanced similarity searches using vector stores

### Local In Memory

In [23]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document

# create embeddings model
embeddings_model = GoogleGenerativeAIEmbeddings(
	model="models/embedding-001",
	google_api_key=os.environ['GEMINI_API_KEY']
)

# Data set 1
documents=["what is a capital of USA?",
           "Who is a president of USA?",
           "Who is a prime minister of India?"]
document_embedding = embeddings_model.embed_documents(documents)

# Step 1: Create an index for FAISS
index=faiss.IndexFlatL2(768) # 768 is the dimension of the embeddings
import numpy as np

# Step 2: Add the document embeddings to the index
index.add(np.array(document_embedding, dtype=np.float32))  # Add the document embeddings to the index

# Step 3: Create a FAISS vector store
index_to_docstore_id = {i: i for i in range(len(documents))}
docstore_dict = {i: Document(page_content=doc, metadata={"id": i}) for i, doc in enumerate(documents)}
vectorstore = FAISS(
    index=index,
    embedding_function=embeddings_model,
    docstore=InMemoryDocstore(docstore_dict),
    index_to_docstore_id=index_to_docstore_id
)

# No need to add documents again, as they are already in the docstore

# Step 5: Perform a similarity search with scores
my_query = "Narendra modi is prime minister of india?"
results_with_score = vectorstore.similarity_search_with_score(my_query, k=3)  # Get top 3 similar documents with scores
print("\nFAISS Similarity Search Results (with scores):")
for doc, score in results_with_score:
    print(f"Query: {my_query}, Document: {doc.page_content}, Score: {score}")

# Step 6: Perform a similarity search with a different query and show scores
new_query = "What is the capital of the USA?"
new_results_with_score = vectorstore.similarity_search_with_score(new_query, k=3)
print("\nFAISS Similarity Search Results for New Query (with scores):")
for doc, score in new_results_with_score:
    print(f"Query: {new_query}, Document: {doc.page_content}, Score: {score}")

# Step 7: Save the FAISS index to disk
vectorstore.save_local("faiss_index")  # Save the index to disk

# Step 8: Load the FAISS index from disk
loaded_vectorstore = FAISS.load_local("faiss_index", embeddings_model, allow_dangerous_deserialization=True)

# Step 9: Perform a similarity search on the loaded index with scores
loaded_results_with_score = loaded_vectorstore.similarity_search_with_score(my_query, k=3)
print("\nLoaded FAISS Similarity Search Results (with scores):")
for doc, score in loaded_results_with_score:
    print(f"Query: {my_query}, Document: {doc.page_content}, Score: {score}")

# Step 10: Perform a similarity search with a different query on the loaded index and show scores
new_loaded_results_with_score = loaded_vectorstore.similarity_search_with_score(new_query, k=3)
print("\nLoaded FAISS Similarity Search Results for New Query (with scores):")
for doc, score in new_loaded_results_with_score:
    print(f"Query: {new_query}, Document: {doc.page_content}, Score: {score}")



FAISS Similarity Search Results (with scores):
Query: Narendra modi is prime minister of india?, Document: Who is a prime minister of India?, Score: 0.48820820450782776
Query: Narendra modi is prime minister of india?, Document: Who is a president of USA?, Score: 0.7355794906616211
Query: Narendra modi is prime minister of india?, Document: what is a capital of USA?, Score: 0.9044157266616821

FAISS Similarity Search Results for New Query (with scores):
Query: What is the capital of the USA?, Document: what is a capital of USA?, Score: 0.3721003234386444
Query: What is the capital of the USA?, Document: Who is a president of USA?, Score: 0.5979712009429932
Query: What is the capital of the USA?, Document: Who is a prime minister of India?, Score: 0.8431156873703003

Loaded FAISS Similarity Search Results (with scores):
Query: Narendra modi is prime minister of india?, Document: Who is a prime minister of India?, Score: 0.48820820450782776
Query: Narendra modi is prime minister of indi

In [24]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
from dotenv import load_dotenv
import os

load_dotenv()
os.environ['GEMINI_API_KEY']=os.getenv("GEMINI_API_KEY")
os.environ['LANGCHAIN_API_KEY']=os.getenv("LANGCHAIN_API_KEY")

# Create a prompt template for the question-answering task
from langchain import hub
prompt = hub.pull("rlm/rag-prompt", api_key=os.environ['LANGCHAIN_API_KEY'])

# define the model for question-answering
model = ChatGoogleGenerativeAI(
	model='gemini-1.5-flash',
	google_api_key=os.environ['GEMINI_API_KEY']
)

# create embeddings model
embeddings_model = GoogleGenerativeAIEmbeddings(
	model="models/embedding-001",
	google_api_key=os.environ['GEMINI_API_KEY']
)

# Create an index for FAISS
index=faiss.IndexFlatL2(768) # 768 is the dimension of the embeddings

# Create a FAISS vector store
vectorstore = FAISS(
    index=index,
    embedding_function=embeddings_model,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={}
)

retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3} #hyperparameter
)

# ------------------------------------ #


docs = [
        Document(page_content="The capital of the USA is Washington, D.C."),
        Document(page_content="The president of the USA is Donald Trump."),
        Document(page_content="The prime minister of India is Narendra Modi."),
        Document(page_content="The capital of France is Paris."),
        Document(page_content="The capital of Japan is Tokyo."),
        Document(page_content="The capital of India is New Delhi."),
        Document(page_content="The prime minister of India is Narendra Modi."),
        Document(page_content="Narendra Modi is the prime minister of India."),
        Document(page_content="The capital of the UK is London."),
        Document(page_content="The capital of Germany is Berlin."),
        Document(page_content="The capital of Canada is Ottawa."),
        Document(page_content="The capital of Australia is Canberra."),
        Document(page_content="The capital of Russia is Moscow."),
        Document(page_content="The capital of China is Beijing."),
        Document(page_content="The capital of Brazil is Brasília."),
        Document(page_content="The capital of South Africa is Pretoria."),
        Document(page_content="The capital of Italy is Rome."),
        Document(page_content="The capital of Spain is Madrid."),
        Document(page_content="The capital of Mexico is Mexico City."),
        Document(page_content="The capital of Argentina is Buenos Aires."),
    ] 

def format_docs(docs):
    return "\n".join(doc.page_content for doc in docs)

print(f"Formatted documents: {format_docs(docs)}")  # Print the formatted documents for debugging
print()
print("-----------------------------------------------")

rag_chain = (
    {"context": retriever | (lambda docs: format_docs(docs)), "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

result = rag_chain.invoke("Who is the president of the USA?")  # Example question
print(f"Result: {result}")  # Print the result of the question-answering task

result = rag_chain.invoke("What is the capital of the USA?")  # Another example question
print(f"Result: {result}")  # Print the result of the question-answering task

result = rag_chain.invoke("What is the capital of France?")  # Another example question
print(f"Result: {result}")  # Print the result of the question-answering task

result = rag_chain.invoke("What is the capital of Japan?")  # Another example question
print(f"Result: {result}")  # Print the result of the question-answering task

result = rag_chain.invoke("What is the capital of India?")  # Another example question
print(f"Result: {result}")  # Print the result of the question-answering task

result = rag_chain.invoke("Who is the prime minister of India?")  # Another example question
print(f"Result: {result}")  # Print the result of the question-answering task

Formatted documents: The capital of the USA is Washington, D.C.
The president of the USA is Donald Trump.
The prime minister of India is Narendra Modi.
The capital of France is Paris.
The capital of Japan is Tokyo.
The capital of India is New Delhi.
The prime minister of India is Narendra Modi.
Narendra Modi is the prime minister of India.
The capital of the UK is London.
The capital of Germany is Berlin.
The capital of Canada is Ottawa.
The capital of Australia is Canberra.
The capital of Russia is Moscow.
The capital of China is Beijing.
The capital of Brazil is Brasília.
The capital of South Africa is Pretoria.
The capital of Italy is Rome.
The capital of Spain is Madrid.
The capital of Mexico is Mexico City.
The capital of Argentina is Buenos Aires.

-----------------------------------------------
Result: The current president of the USA is Joe Biden.  He assumed office on January 20, 2021.
Result: The capital of the USA is Washington, D.C.  It's located on the Potomac River.  This