### Open AI Embeddings

In [18]:
# Environment setup: keep secrets such as API keys in a .env file, not in code.
#
# PURPOSE OF THIS CELL:
# 1. Security: API keys stay out of the notebook (never hardcode them!)
# 2. Flexibility: each environment (dev, prod) can use its own .env file
# 3. Best practice: configuration is separated from code, so the notebook
#    can be shared safely

# os gives access to environment variables via os.environ / os.getenv.
import os

# load_dotenv reads a .env file and loads its variables into os.environ.
from dotenv import load_dotenv

# Load the .env file. For example, if .env contains
#     OPENAI_API_KEY=sk-abc123
# the value is afterwards available as os.environ["OPENAI_API_KEY"] or
# os.getenv("OPENAI_API_KEY").
load_dotenv()

True

In [19]:
# Make sure the OpenAI API key is configured before any client is created.
#
# load_dotenv() has already copied the entries of .env into os.environ, so the
# key does not need to be assigned again; it only needs to be checked.
#
# Why check explicitly: os.getenv() returns None for a missing variable, and
# writing None into os.environ fails with "TypeError: str expected, not
# NoneType", which hides the real problem (the key is simply not configured).
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file (or export it in "
        "your shell) and re-run this notebook from the top."
    )

In [20]:
# Create the OpenAI embeddings client used by the rest of the notebook.
#
# WHAT ARE EMBEDDINGS?
# Embeddings are numerical representations (vectors) of text that capture
# semantic meaning. Texts with similar meaning get similar vectors, which lets
# a program measure how closely two pieces of text are related.
#
# STEP 1: Build the embeddings object
# This cell creates the 'embeddings' variable with LangChain's OpenAIEmbeddings
# wrapper, using the "text-embedding-3-small" model (1536-dimensional vectors).
#
# STEP 2: Understand what happens when we embed text
# When embed_query() or embed_documents() is called later:
# 1. The text is sent to OpenAI's API
# 2. The model processes it using neural networks
# 3. A list of 1536 floating-point numbers (a vector) is returned
# 4. Those numbers capture the semantic meaning of the text

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [21]:
# Bare expression: the notebook displays the client's repr, which confirms the
# configured model (the API key is shown masked).
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000019FCA5D3E50>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000019FCA5F5C50>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [None]:
# Embedding a single text and analyzing the output
#
# WHAT THIS CELL DOES:
# - Takes a single text string about LangChain and RAG
# - Converts it into a 1536-dimensional vector using OpenAI's text-embedding-3-small model
# - Each number in the vector captures different semantic aspects of the text
#
# WHY THIS IS USEFUL:
# - These vectors can be compared mathematically (e.g., cosine similarity)
# - Similar texts will have similar vectors, enabling semantic search
# - Vectors can be stored in vector databases for efficient retrieval
# - Foundation for RAG (Retrieval Augmented Generation) systems
#
# KEY OBSERVATIONS:
# - Input: Human-readable text (64 characters)
# - Output: 1536 floating-point numbers representing semantic meaning
# - The model understands "LangChain" and "RAG" as technical terms
# - Values range roughly from -0.06 to +0.06 in this example

## Single text embedding
# Embed one sentence, then print the vector's length followed by its values.
single_text = "Langchain and Rag are amazing frameworks and projects to work on"
single_embeddings = embeddings.embed_query(single_text)
print(len(single_embeddings), single_embeddings, sep="\n")

1536
[-0.050007786601781845, -0.031081510707736015, -0.0034399048890918493, -0.003286944702267647, 0.03265869989991188, -0.03132624924182892, -0.014466634951531887, 0.001493061427026987, -0.01057804748415947, -0.033909574151039124, 0.018178468570113182, 0.004568411037325859, -0.03817886486649513, 0.05003497749567032, 0.005989241413772106, 0.01439865306019783, -8.14194354461506e-05, -0.06227179616689682, 0.04057184234261513, 0.06520862877368927, -0.001464169006794691, -0.006070820149034262, -0.018246451392769814, 0.03295782208442688, -0.006907002534717321, -0.008334631100296974, -0.00758002744987607, 0.06330512464046478, 0.01789294369518757, -0.019796447828412056, 8.497788803651929e-05, -0.033936768770217896, -0.006420928984880447, 0.033719226717948914, 0.014942510984838009, 0.021835917606949806, -0.00574450520798564, 0.004095933865755796, -0.008096693083643913, 0.022556530311703682, 0.009626294486224651, 0.03804289922118187, 0.004422249272465706, -0.0075188432820141315, 0.0178929436951

In [23]:
# Summarise the single-text embedding: the input, the vector size and a
# preview of the first five values (instead of all of them).
print("📝 Single Text Embedding:")
print(f"Input: {single_text}")
print(f"Output: Vector of {len(single_embeddings)} dimensions")
print(f"Sample values: {single_embeddings[:5]}")

📝 Single Text Embedding:
Input: Langchain and Rag are amazing frameworks and projects to work on
Output: Vector of 1536 dimensions
Sample values: [-0.050007786601781845, -0.031081510707736015, -0.0034399048890918493, -0.003286944702267647, 0.03265869989991188]


In [24]:
# Example 2: a small batch of texts to embed together.
multiple_texts = [
    "Python is a programming language",
    "LangChain is a framework for LLM applications",
    "Embeddings convert text to numbers",
    "Vectors can be compared for similarity",
]

In [25]:
# Embed the whole batch with one embed_documents() call; the result holds one
# vector per input text, in the same order.
multiple_embeddings = embeddings.embed_documents(multiple_texts)

In [26]:
# Bare expression: the notebook displays the result, a list of vectors where
# each vector is a list of floats.
# NOTE(review): this renders every value of every vector; consider a slice
# such as multiple_embeddings[0][:5] to keep the output readable.
multiple_embeddings

[[-0.010911782272160053,
  -0.020436646416783333,
  0.018866166472434998,
  -0.0028477711603045464,
  0.015623224899172783,
  -0.0265961941331625,
  0.0005207316717132926,
  0.03720203787088394,
  -0.001690306467935443,
  0.013022753410041332,
  0.021578814834356308,
  -0.024740172550082207,
  -0.00941778626292944,
  0.0018598469905555248,
  0.0039313011802732944,
  0.015521246008574963,
  -0.032959699630737305,
  0.029798343777656555,
  -0.02718767523765564,
  0.010417182929813862,
  -0.0014353583101183176,
  -0.009912385605275631,
  -0.053845055401325226,
  0.01538867224007845,
  0.0368553102016449,
  -0.04287208616733551,
  0.0055323755368590355,
  0.036263830959796906,
  -0.019549427554011345,
  0.0011020135134458542,
  0.012951367534697056,
  -0.032327428460121155,
  -0.036528974771499634,
  0.05123438686132431,
  -0.03118526190519333,
  -0.04499325528740883,
  0.04580909013748169,
  -0.010452875867486,
  0.06828532367944717,
  -0.015072536654770374,
  0.004043478053063154,
  -0.0

In [27]:
# Summarise the batch: how many texts went in, how many vectors came back,
# and the size of one vector.
print("\n📚 Multiple Text Embeddings:")
print(f"Number of texts: {len(multiple_texts)}")
print(f"Number of embeddings: {len(multiple_embeddings)}")
print(f"Each embedding size: {len(multiple_embeddings[0])}")


📚 Multiple Text Embeddings:
Number of texts: 4
Number of embeddings: 4
Each embedding size: 1536


In [28]:
# Bare expression: the notebook displays the full vector for the first text
# in multiple_texts.
multiple_embeddings[0]

[-0.010911782272160053,
 -0.020436646416783333,
 0.018866166472434998,
 -0.0028477711603045464,
 0.015623224899172783,
 -0.0265961941331625,
 0.0005207316717132926,
 0.03720203787088394,
 -0.001690306467935443,
 0.013022753410041332,
 0.021578814834356308,
 -0.024740172550082207,
 -0.00941778626292944,
 0.0018598469905555248,
 0.0039313011802732944,
 0.015521246008574963,
 -0.032959699630737305,
 0.029798343777656555,
 -0.02718767523765564,
 0.010417182929813862,
 -0.0014353583101183176,
 -0.009912385605275631,
 -0.053845055401325226,
 0.01538867224007845,
 0.0368553102016449,
 -0.04287208616733551,
 0.0055323755368590355,
 0.036263830959796906,
 -0.019549427554011345,
 0.0011020135134458542,
 0.012951367534697056,
 -0.032327428460121155,
 -0.036528974771499634,
 0.05123438686132431,
 -0.03118526190519333,
 -0.04499325528740883,
 0.04580909013748169,
 -0.010452875867486,
 0.06828532367944717,
 -0.015072536654770374,
 0.004043478053063154,
 -0.039180438965559006,
 0.03132803365588188,
 

In [29]:

from langchain_openai import OpenAIEmbeddings

# Reference table of OpenAI embedding models, keyed by model name.
# Each entry records the vector size, a short description, the cost per
# one million tokens, and the typical use case.
models_comparison = {
    "text-embedding-3-small": dict(
        dimensions=1536,
        description="Good balance of performance and cost",
        cost_per_1m_tokens=0.02,
        use_case="General purpose, cost-effective",
    ),
    "text-embedding-3-large": dict(
        dimensions=3072,
        description="Highest quality embeddings",
        cost_per_1m_tokens=0.13,
        use_case="When accuracy is critical",
    ),
    "text-embedding-ada-002": dict(
        dimensions=1536,
        description="Previous generation model",
        cost_per_1m_tokens=0.10,
        use_case="Legacy applications",
    ),
}

# Display comparison: one indented summary per model.
# Costs are formatted with two decimal places so every price reads as
# currency (0.10 is shown as $0.10 rather than $0.1).
print("📊 OpenAI Embedding Models Comparison:\n")
for model_name, details in models_comparison.items():
    print(f"Model: {model_name}")
    print(f"  📏 Dimensions: {details['dimensions']}")
    print(f"  💰 Cost: ${details['cost_per_1m_tokens']:.2f}/1M tokens")
    print(f"  📝 Description: {details['description']}")
    print(f"  🎯 Use case: {details['use_case']}\n")

📊 OpenAI Embedding Models Comparison:

Model: text-embedding-3-small
  📏 Dimensions: 1536
  💰 Cost: $0.02/1M tokens
  📝 Description: Good balance of performance and cost
  🎯 Use case: General purpose, cost-effective

Model: text-embedding-3-large
  📏 Dimensions: 3072
  💰 Cost: $0.13/1M tokens
  📝 Description: Highest quality embeddings
  🎯 Use case: When accuracy is critical

Model: text-embedding-ada-002
  📏 Dimensions: 1536
  💰 Cost: $0.1/1M tokens
  📝 Description: Previous generation model
  🎯 Use case: Legacy applications



### Cosine Similarity With OpenAI Embeddings

In [30]:
# Example 1: Finding similar sentences
# The list mixes paraphrases and unrelated topics so the similarity scores
# have something to separate.
sentences = [
    # Two ways of describing a cat on a floor covering
    "The cat sat on the mat",
    "A feline rested on the rug",
    # A different animal doing something else
    "The dog played in the yard",
    # Two statements about programming in Python
    "I love programming in Python",
    "Python is my favorite programming language",
]

In [None]:
# -----------------------------------------------
# COSINE SIMILARITY - SIMPLE EXPLANATION
# -----------------------------------------------

# Think of every vector as an arrow.
# An arrow has:
# 1. Length  -> how big the arrow is
# 2. Direction -> which way the arrow points

# -----------------------------------------------
# WHAT IS NORM?
# -----------------------------------------------

# Norm means the LENGTH of the arrow.
# - Small arrow  -> small norm
# - Big arrow    -> big norm
# Norm tells us "how big" the vector is.

# In real life:
# Norm is like:
# - How tall someone is
# - How loud a voice is
# - How long a stick is

# We calculate norm so we can IGNORE size later
# and focus only on direction (meaning).

# -----------------------------------------------
# WHAT IS COSINE?
# -----------------------------------------------

# Cosine checks the DIRECTION of two arrows.
# It asks:
# "Are these two arrows pointing the same way?"

# If arrows point:
# - Same direction     -> cosine value is close to 1
# - At right angles (unrelated) -> cosine value is close to 0
# - Opposite direction  -> cosine value is close to -1

# -----------------------------------------------
# WHY DO WE USE COSINE SIMILARITY?
# -----------------------------------------------

# In embeddings:
# - Length of vector does NOT matter
# - Direction of vector represents meaning

# Two sentences can mean the same thing
# even if their vectors have different lengths.

# Cosine similarity:
# - Removes the effect of length (using norm)
# - Keeps only direction (meaning)

# -----------------------------------------------
# HOW COSINE SIMILARITY WORKS
# -----------------------------------------------

# Step 1: Dot product
# - Checks how much two arrows point in the same direction

# Step 2: Norm of each vector
# - Finds the length of each arrow

# Step 3: Divide dot product by (norm1 * norm2)
# - This removes size and keeps only direction

# -----------------------------------------------
# FINAL RESULT INTERPRETATION
# -----------------------------------------------

# Result close to 1   -> very similar meaning
# Result close to 0   -> not related
# Result close to -1  -> opposite meaning

# This is why cosine similarity is used in:
# - Embeddings
# - Semantic search
# - Vector databases
# - RAG pipelines


import numpy as np
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity measures the angle between two vectors.
    - Result close to 1: Very similar
    - Result close to 0: Not related
    - Result close to -1: Opposite meanings

    Args:
        vec1: First vector, a 1-D sequence of numbers (e.g. a list of floats).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        norms. If either vector has norm 0 its direction is undefined, so
        0.0 is returned instead of dividing by zero and producing ``nan``.

    Raises:
        ValueError: If the two vectors have different lengths (raised by
            ``np.dot``).
    """
    dot_product = np.dot(vec1, vec2)
    norm_a = np.linalg.norm(vec1)  # linalg means linear algebra
    norm_b = np.linalg.norm(vec2)

    denominator = norm_a * norm_b
    if denominator == 0:
        # A zero-length arrow points nowhere, so there is no angle to measure.
        return 0.0

    return dot_product / denominator

In [None]:
# -----------------------------------------------
# OPENAI EMBEDDINGS - SIMPLE EXPLANATION
# -----------------------------------------------

# This line imports the OpenAI embedding class from LangChain.
# LangChain is a framework that helps us work easily with LLMs and embeddings.
# OpenAIEmbeddings is a ready-made wrapper to talk to OpenAI embedding models.
#from langchain_openai import OpenAIEmbeddings


# Here we create an embeddings object.
# Think of this as a "machine" that converts text into numbers (vectors).
# These vectors are also called embeddings.
#embeddings = OpenAIEmbeddings(
#    model="text-embedding-3-small"
#)


# "text-embedding-3-small" is an OpenAI embedding model.
# What this model does:
# - Takes text as input (sentence, paragraph, document)
# - Converts it into a list of numbers (a vector)
# - Similar meanings -> vectors point in the same direction

# Why this model is used:
# - Fast
# - Cheap
# - Very good for semantic search and RAG
# - Commonly used in production systems


# When we write just `embeddings`,
# Jupyter Notebook shows the object information.
# This confirms that the embedding model is loaded and ready to use.
#embeddings


# -----------------------------------------------
# WHAT HAPPENS NEXT (MENTAL MODEL)
# -----------------------------------------------

# Later, when we call:
# embeddings.embed_query("some text")

# The flow is:
# 1. Text is sent to OpenAI
# 2. OpenAI converts text into a vector (numbers)
# 3. The vector is returned to Python
# 4. We store or compare it using cosine similarity

# These vectors are later used in:
# - Vector databases
# - Similarity search
# - RAG pipelines

# NOTE(review): this repeats the import and client construction from the
# earlier embeddings cell; presumably kept so this section can be run on its
# own - confirm before removing.
from langchain_openai import OpenAIEmbeddings
embeddings=OpenAIEmbeddings(model="text-embedding-3-small")
# Bare expression: the notebook displays the client's repr.
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000019FC8EB3ED0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000019FC8EDE190>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [None]:
# This line takes a list of sentences as input
# Each sentence is sent to the embedding model
# The embedding model converts each sentence into a numeric vector
# Each vector represents the semantic meaning of one sentence
# All sentences are processed together in a single call for efficiency
# The result is a list of embedding vectors, one per sentence
# Each embedding vector has a fixed dimension (for example, 1536 values)
# These embeddings can later be used for similarity search or stored in a vector database
# sentence_embeddings = embeddings.embed_documents(sentences)

# This line outputs the generated sentence embeddings
# It allows you to verify that embeddings were successfully created
# The output will be a list of numeric arrays corresponding to each sentence
# This is mainly useful for learning, inspection, or debugging
# sentence_embeddings


# Embed all sentences with one embed_documents() call; the result is a list
# with one vector per sentence.
sentence_embeddings=embeddings.embed_documents(sentences)
# Bare expression: the notebook displays the full list of vectors.
sentence_embeddings

[[-0.03074316307902336,
  -0.04954070597887039,
  -0.005032286047935486,
  -0.0014980505220592022,
  0.036250557750463486,
  -0.0020749696996062994,
  -0.008868717588484287,
  0.027200847864151,
  0.007110487669706345,
  -0.011906835250556469,
  0.04160281643271446,
  -0.0013857370940968394,
  0.0451192781329155,
  0.05274689197540283,
  0.03206183388829231,
  0.03244968131184578,
  -0.012417497113347054,
  0.003046197583898902,
  -0.06603703647851944,
  0.047446344047784805,
  0.025869246572256088,
  -0.04540369659662247,
  -0.003451818600296974,
  0.014621748588979244,
  0.009101424366235733,
  0.01482859905809164,
  -0.011208713985979557,
  -0.012049044482409954,
  0.010762692429125309,
  0.01282473374158144,
  0.012288215570151806,
  -0.036069564521312714,
  -0.02650272659957409,
  -0.04535198211669922,
  -0.034595753997564316,
  0.00479957927018404,
  -0.019844723865389824,
  -0.011745233088731766,
  -0.0420682318508625,
  -0.02290869876742363,
  -0.03637984022498131,
  -0.0050581

In [None]:
## Calculate the similarity between all pairs
# This outer loop goes through each sentence index one by one
# The variable i represents the index of the current sentence
# range(len(sentences)) means it starts from the first sentence (index 0)
# and goes up to the last sentence index
# for i in range(len(sentences)):

    # This inner loop compares the current sentence with the next sentences only
    # j starts from i + 1 to avoid comparing a sentence with itself
    # This also avoids duplicate comparisons (A vs B and B vs A)
    # for j in range(i + 1, len(sentences)):

        # This line calculates the cosine similarity between two sentence embeddings
        # sentence_embeddings[i] is the embedding vector for sentence at index i
        # sentence_embeddings[j] is the embedding vector for sentence at index j
        # cosine_similarity returns a number that represents how similar the meanings are
        # similarity = cosine_similarity(sentence_embeddings[i], sentence_embeddings[j])

        # This line prints which two sentences are being compared
        # It helps you clearly see the pair of sentences involved in the comparison
        # print(f"'{sentences[i]}' vs '{sentences[j]}'")

        # This line prints the similarity score
        # :.3f formats the similarity value to 3 decimal places
        # \n adds a blank line to make the output easier to read
        # print(f"Similarity: {similarity:.3f}\n")


# Compare every unordered pair of sentences exactly once (i < j), so no
# sentence is compared with itself and no pair is reported twice.
index_pairs = [
    (i, j)
    for i in range(len(sentences))
    for j in range(i + 1, len(sentences))
]
for i, j in index_pairs:
    similarity = cosine_similarity(sentence_embeddings[i], sentence_embeddings[j])

    # One print call emits both lines; the trailing "\n" in the second string
    # leaves a blank line between pairs.
    print(
        f"'{sentences[i]}' vs '{sentences[j]}'",
        f"Similarity: {similarity:.3f}\n",
        sep="\n",
    )


'The cat sat on the mat' vs 'A feline rested on the rug'
Similarity: 0.655

'The cat sat on the mat' vs 'The dog played in the yard'
Similarity: 0.324

'The cat sat on the mat' vs 'I love programming in Python'
Similarity: 0.089

'The cat sat on the mat' vs 'Python is my favorite programming language'
Similarity: 0.120

'A feline rested on the rug' vs 'The dog played in the yard'
Similarity: 0.296

'A feline rested on the rug' vs 'I love programming in Python'
Similarity: 0.055

'A feline rested on the rug' vs 'Python is my favorite programming language'
Similarity: 0.103

'The dog played in the yard' vs 'I love programming in Python'
Similarity: 0.126

'The dog played in the yard' vs 'Python is my favorite programming language'
Similarity: 0.085

'I love programming in Python' vs 'Python is my favorite programming language'
Similarity: 0.708



In [None]:
### Example - Semantic Search - Retrieve the most similar sentence
# This example demonstrates a basic semantic search setup
# The goal is to retrieve the sentence that is most similar in meaning to a given query
# Semantic search focuses on understanding meaning rather than exact word matching

# This list represents a small collection of documents
# Each item in the list is treated as an individual document
# In semantic search, a document can be as small as a single sentence
# The documents intentionally cover different topics to show how semantic similarity works
# documents = [
#     "LangChain is a framework for developing applications powered by language models",
#     "Python is a high-level programming language",
#     "Machine learning is a subset of artificial intelligence",
#     "Embeddings convert text into numerical vectors",
#     "The weather today is sunny and warm"
# ]

# This variable represents the user query or question
# It is written in natural language
# The query does not need to exactly match the wording of any document
# Semantic search will later compare the meaning of this query with each document
# query = "What is Langchain?"

# Test data for semantic search: a tiny corpus covering deliberately
# different topics, plus a natural-language query to match against it.
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm",
]
query = "What is Langchain?"

In [42]:
# This function performs a simple semantic search.
# It compares a query sentence with a list of documents based on meaning
# and returns the top K most similar documents.
def semantic_search(query, documents, embeddings_models, top_k=3):
    """Simple semantic search implementation.

    Args:
        query: The query text, written in natural language.
        documents: List of document texts to search through.
        embeddings_models: Object that provides ``embed_query(text)`` and
            ``embed_documents(texts)``, such as the notebook's ``embeddings``.
        top_k: Maximum number of results to return (default 3).

    Returns:
        A list of ``(similarity_score, document_text)`` tuples, highest score
        first, containing at most ``top_k`` entries. Documents with equal
        scores keep their original order. An empty list is returned when
        ``documents`` is empty or ``top_k`` is not positive.
    """
    # Nothing to rank: return early and skip the embedding calls.
    if len(documents) == 0 or top_k <= 0:
        return []

    # Convert the query text into an embedding vector.
    # The embedding represents the semantic meaning of the query.
    query_embedding = embeddings_models.embed_query(query)

    # Convert each document into an embedding vector.
    # Each document gets its own numerical representation, and the order of
    # the embeddings matches the order of the documents.
    doc_embeddings = embeddings_models.embed_documents(documents)

    # Pair every document with its similarity to the query.
    # Cosine similarity compares the meaning of two embedding vectors; each
    # entry is a tuple: (similarity_score, document_text).
    similarities = [
        (cosine_similarity(query_embedding, doc_emb), doc)
        for doc, doc_emb in zip(documents, doc_embeddings)
    ]

    # Sort by similarity score only; reverse=True puts the highest score
    # first. Using the score as the key means tied scores never fall back to
    # comparing the documents themselves.
    similarities.sort(key=lambda pair: pair[0], reverse=True)

    # Return only the top K most similar documents.
    return similarities[:top_k]



In [43]:
# Run the search for the current query with the default top_k of 3.
results=semantic_search(query,documents,embeddings)
# Bare expression: the notebook displays the (score, document) tuples.
results

[(np.float64(0.6227387139613368),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.25206899523723963),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.2291919996321144),
  'LangChain is a framework for developing applications powered by language models')]

In [47]:
# Print the ranked results, one line per document, with the score rounded to
# three decimal places.
print(f"\n🔎 Semantic Search Results for: '{query}'")
for score, doc in results:
    print(f"Score: {score:.3f} | {doc}")


🔎 Semantic Search Results for: 'What is Embeddings?'
Score: 0.623 | Embeddings convert text into numerical vectors
Score: 0.252 | Machine learning is a subset of artificial intelligence
Score: 0.229 | LangChain is a framework for developing applications powered by language models


In [46]:
# Search again with a different query; `query` and `results` are overwritten.
query="What is Embeddings?"
results=semantic_search(query,documents,embeddings)
# Bare expression: the notebook displays the (score, document) tuples.
results

[(np.float64(0.6227387139613368),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.25206899523723963),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.2291919996321144),
  'LangChain is a framework for developing applications powered by language models')]