# Embedding Models

In [6]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables (load_dotenv comes from python-dotenv; it is
# expected to populate os.environ from a local .env file)
load_dotenv()

# Initialize client. os.getenv returns None when OPENAI_API_KEY is not set,
# so a missing key is passed through to the OpenAI constructor as None.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Call the embeddings endpoint with a single input string
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="Embed this string, please."
)

# Extract the vector from the response: the first (index 0) entry of
# response.data carries the embedding for our one input
embedding = response.data[0].embedding

# len() returns the number of dimensions of the embedding vector
print(f"Embedding dimensions: {len(embedding)}")


Embedding dimensions: 1536
[('data', [Embedding(embedding=[0.017261570319533348, -0.008031242527067661, -0.025707880035042763, -0.018460657447576523, -0.011470379307866096, -0.018855959177017212, -0.017749110236763954, -0.014560332521796227, -0.03499750420451164, -0.016431434080004692, 0.024337496608495712, -0.053076036274433136, -0.0032760747708380222, -0.02126730978488922, 0.0337061807513237, 0.0049676429480314255, -0.02092471346259117, 0.00603166688233614, -0.011793210171163082, 0.06229977309703827, -0.0217153187841177, -0.04408947750926018, -0.0033139579463750124, 0.04237649589776993, 0.008802083320915699, -0.013769726268947124, -0.021873440593481064, 0.0034523140639066696, 0.02166261151432991, 0.006311673671007156, -0.04569704458117485, -0.0385025255382061, 0.042112961411476135, -0.05871569365262985, -0.016088837757706642, 0.03125530108809471, 0.024798683822155, 1.048093781719217e-05, -0.0254838764667511, 0.017788641154766083, 0.04374688118696213, -0.0077149998396635056, -0.001602

# Building a Custom Vector Database

In [29]:
import numpy as np
from typing import List, Tuple

class SimpleVectorDB:
    """Minimal in-memory vector store with brute-force cosine-similarity search.

    Vectors and metadata are kept in two parallel lists: the entry at index
    ``i`` of ``vectors`` is described by the entry at index ``i`` of
    ``metadata``.
    """

    def __init__(self):
        self.vectors = []   # np.ndarray per stored item, in insertion order
        self.metadata = []  # dict per stored item, parallel to self.vectors

    def add(self, vector: List[float], metadata: dict):
        """Store ``vector`` together with its ``metadata``.

        Args:
            vector: Embedding values; copied into a new NumPy array.
            metadata: Payload returned alongside the score by ``search``.
        """
        self.vectors.append(np.array(vector))
        self.metadata.append(metadata)

    def search(self, query_vector: List[float], k: int = 5) -> List[Tuple[dict, float]]:
        """Return the ``k`` stored items most similar to ``query_vector``.

        Every stored vector is scored against the query with cosine
        similarity, so the cost grows linearly with the number of stored
        vectors.

        Args:
            query_vector: Query embedding; must have the same length as the
                stored vectors.
            k: Maximum number of results. Values ``<= 0`` yield an empty list.

        Returns:
            ``(metadata, score)`` tuples sorted by descending score. Ties keep
            insertion order. Fewer than ``k`` tuples are returned when the
            store holds fewer than ``k`` vectors.
        """
        # A negative k would otherwise slice `[:k]` and silently drop the
        # *last* hits instead of returning nothing.
        if k <= 0:
            return []

        query = np.array(query_vector)

        scored = [
            (idx, self._cosine_similarity(query, vec))
            for idx, vec in enumerate(self.vectors)
        ]

        # list.sort is stable, so equal scores keep insertion order
        scored.sort(key=lambda pair: pair[1], reverse=True)

        return [(self.metadata[idx], score) for idx, score in scored[:k]]

    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Return the cosine of the angle between ``vec1`` and ``vec2``.

        The direction of a zero-length vector is undefined; in that case the
        similarity is reported as ``0.0`` instead of dividing by zero, which
        would produce NaN scores and make the ranking in ``search``
        unreliable.
        """
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return np.dot(vec1, vec2) / denominator


# Distance Metrics
## Cosine Similarity

In [7]:
import numpy as np

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        A value in ``[-1, 1]`` (up to floating-point rounding). If either
        vector has zero length its direction is undefined, so ``0.0`` is
        returned instead of dividing by zero and yielding NaN.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Example usage: vec_b is exactly 2 * vec_a, so both vectors point in the same
# direction and differ only in magnitude, which cosine similarity ignores.
vec_a = np.array([1.0, 2.0, 3.0])
vec_b = np.array([2.0, 4.0, 6.0])
similarity = cosine_similarity(vec_a, vec_b)
print(f"Cosine similarity: {similarity:.4f}")  # Output: 1.0000


Cosine similarity: 1.0000


## Euclidean Distance

In [17]:
def euclidean_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the Euclidean (L2) distance between ``vec1`` and ``vec2``.

    Computed as the square root of the sum of squared element-wise
    differences.
    """
    delta = vec1 - vec2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

# Example usage: every component differs by 3, so the distance is
# sqrt(3 * 3**2) = sqrt(27)
vec_a = np.array([1.0, 2.0, 3.0])
vec_b = np.array([4.0, 5.0, 6.0])
distance = euclidean_distance(vec_a, vec_b)
print(f"Euclidean distance: {distance:.4f}")

# Calculate similarity: 1 / (1 + d) maps a distance d >= 0 into (0, 1],
# with identical vectors (d = 0) scoring exactly 1
similarity = 1 / (1 + distance)
print(f"Similarity: {similarity:.4f}")


Euclidean distance: 5.1962
Similarity: 0.1614


## Dot Product

In [20]:
def dot_product(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the dot product of ``vec1`` and ``vec2`` as computed by ``np.dot``."""
    result = np.dot(vec1, vec2)
    return result

# Example usage: the raw dot product depends on magnitude as well as
# direction (1*2 + 2*4 + 3*6 = 28)
vec_a = np.array([1.0, 2.0, 3.0])
vec_b = np.array([2.0, 4.0, 6.0])
product = dot_product(vec_a, vec_b)
print(f"Dot product: {product:.4f}")

# For normalized vectors: divide each vector by its L2 norm to get unit length
normalized_a = vec_a / np.linalg.norm(vec_a)
normalized_b = vec_b / np.linalg.norm(vec_b)
# The dot product of unit-length vectors equals their cosine similarity
product_normalized = dot_product(normalized_a, normalized_b)
print(f"Normalized dot product: {product_normalized:.4f}")

Dot product: 28.0000
Normalized dot product: 1.0000


## Comparison

In [21]:
# Compare all three metrics on the same pair of vectors.
# vec_b is vec_a shifted by 0.5 in every component, so the Euclidean distance
# is sqrt(4 * 0.5**2) = 1 while the directions are nearly identical.
vec_a = np.array([1.0, 2.0, 3.0, 4.0])
vec_b = np.array([1.5, 2.5, 3.5, 4.5])

# Relies on cosine_similarity, euclidean_distance and dot_product having been
# defined by the earlier cells
cos_sim = cosine_similarity(vec_a, vec_b)
euc_dist = euclidean_distance(vec_a, vec_b)
dot_prod = dot_product(vec_a, vec_b)

print(f"Cosine Similarity: {cos_sim:.4f}")
print(f"Euclidean Distance: {euc_dist:.4f}")
print(f"Dot Product: {dot_prod:.4f}")


Cosine Similarity: 0.9980
Euclidean Distance: 1.0000
Dot Product: 35.0000


# Fixed-Size vs Semantic Chunking
## Fixed-Size Chunking

In [None]:
def fixed_size_chunking(text: str, chunk_size: int = 500,
                        overlap: int = 50) -> List[str]:
    """Split ``text`` into word chunks of at most ``chunk_size`` words.

    Consecutive chunks share ``overlap`` words. Words are obtained with
    ``str.split()`` and re-joined with single spaces, so the original
    whitespace is not preserved.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated at the start of the next chunk;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list for text without words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a zero step crashes ``range`` with an unrelated
            message, a negative step silently yields no chunks, and a
            negative overlap silently skips words between chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further window would only
        # repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks


## Semantic Chunking (Paragraph Boundaries)

In [27]:
import numpy as np
from typing import List, Tuple

def semantic_chunking(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Group paragraphs into chunks of roughly ``max_chunk_size`` words.

    Paragraphs are delimited by double newlines and are never split: a
    paragraph that is longer than ``max_chunk_size`` on its own still ends up
    in a single (oversized) chunk. Paragraphs inside a chunk are re-joined
    with a blank line.
    """
    chunks: List[str] = []
    pending: List[str] = []   # paragraphs collected for the chunk in progress
    pending_words = 0         # word count of everything in `pending`

    for raw_paragraph in text.split('\n\n'):
        para = raw_paragraph.strip()
        if not para:
            # Blank or whitespace-only fragments carry no content
            continue

        word_count = len(para.split())
        would_overflow = pending_words + word_count > max_chunk_size

        # Close the chunk in progress before it grows past the budget; an
        # empty `pending` always accepts the paragraph, whatever its size.
        if would_overflow and pending:
            chunks.append('\n\n'.join(pending))
            pending = []
            pending_words = 0

        pending.append(para)
        pending_words += word_count

    # Flush whatever is left after the final paragraph
    if pending:
        chunks.append('\n\n'.join(pending))

    return chunks

# Example usage: six short paragraphs (5-6 words each) separated by blank lines
document = """First paragraph about topic A.

Second paragraph also about topic A.

Third paragraph about topic B. 

Fourth paragraph about topic A.

Fifth paragraph also about topic A.

Sixth paragraph about topic B."""

# With a 20-word budget the first three paragraphs (16 words) fit together;
# adding the fourth would reach 21 words, so it starts a second chunk
chunks = semantic_chunking(document, max_chunk_size=20)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {len(chunk.split())} words")


Chunk 1: 16 words
Chunk 2: 16 words


## Semantic Chunking (Section Boundaries)

In [None]:
def chunk_by_sections(text: str, header_pattern: str = r'^#{1,3} ') -> List[str]:
    """Chunk a markdown document at its section headers.

    A new chunk starts at every line matching ``header_pattern``; the header
    line is the first line of its chunk. Text before the first header forms a
    chunk of its own.

    Args:
        text: Markdown source.
        header_pattern: Regular expression matched against the start of each
            line (``re.match``). The default accepts ``#`` to ``###``
            followed by a space.

    Returns:
        The chunks in document order. Chunks containing only whitespace are
        omitted, so empty input yields ``[]`` rather than ``['']`` and blank
        lines before the first header do not become a chunk of their own.
    """
    import re

    # Compile once instead of re-resolving the pattern for every line
    header = re.compile(header_pattern)

    sections = []   # each entry is the list of lines of one section
    current = []

    for line in text.split('\n'):
        # A header closes the section in progress (if any) and opens a new one
        if current and header.match(line):
            sections.append(current)
            current = []
        current.append(line)

    if current:
        sections.append(current)

    joined = ('\n'.join(section_lines) for section_lines in sections)
    return [chunk for chunk in joined if chunk.strip()]


# Building RAG From First Principles

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
from typing import List, Dict

# Load environment variables
load_dotenv()

class RAGSystem:
    """Minimal retrieval-augmented generation (RAG) pipeline.

    Documents are split into overlapping word chunks, every chunk is embedded
    through the OpenAI embeddings endpoint and stored in a ``SimpleVectorDB``.
    At query time the question is embedded, the most similar chunks are
    retrieved, and they are handed to the LLM as context for the answer.
    """

    def __init__(self):
        # os.getenv returns None when OPENAI_API_KEY is not set; the value is
        # passed to the client constructor unchanged.
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.vector_db = SimpleVectorDB()
        self.embedding_model = "text-embedding-3-small"
        self.llm_model = "gpt-5-mini"

    def index_documents(self, documents: List[str], chunk_size: int = 500,
                        overlap: int = 50):
        """Chunk, embed and store every document.

        One embeddings request is issued per chunk. Each stored entry carries
        the metadata keys ``doc_id`` (position in ``documents``),
        ``chunk_id`` (position within the document) and ``text``.

        Args:
            documents: Raw document texts.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size``/``overlap`` are rejected by
                ``_chunk_text``.
        """
        print(f"Indexing {len(documents)} documents...")

        for doc_id, document in enumerate(documents):
            # Chunk the document
            chunks = self._chunk_text(document, chunk_size, overlap)

            # Embed each chunk
            for chunk_id, chunk in enumerate(chunks):
                embedding = self._get_embedding(chunk)

                # Store in vector database
                metadata = {
                    'doc_id': doc_id,
                    'chunk_id': chunk_id,
                    'text': chunk
                }
                self.vector_db.add(embedding, metadata)

        print(f"Indexed {len(self.vector_db.vectors)} chunks")

    def query(self, question: str, k: int = 5) -> str:
        """Answer ``question`` using the ``k`` most similar indexed chunks.

        Args:
            question: Natural-language question.
            k: Number of chunks to retrieve as context.

        Returns:
            The text of the LLM response.
        """
        # Get embedding for the question
        question_embedding = self._get_embedding(question)

        # Retrieve most similar chunks
        results = self.vector_db.search(question_embedding, k=k)

        # Build context from retrieved chunks (each result is (metadata, score))
        context = "\n\n".join([result[0]['text'] for result in results])

        # Create prompt
        prompt = f"""Answer the question based on the context below.

Context:
{context}

Question: {question}

Answer:"""

        # Get response from LLM
        response = self.client.responses.create(
            model=self.llm_model,
            input=prompt,
        )

        return response.output_text

    def _get_embedding(self, text: str) -> List[float]:
        """Get the embedding of ``text`` from OpenAI (one request per call)."""
        response = self.client.embeddings.create(
            model=self.embedding_model,
            input=text
        )
        return response.data[0].embedding

    def _chunk_text(self, text: str, chunk_size: int,
                    overlap: int) -> List[str]:
        """Split ``text`` into overlapping chunks of at most ``chunk_size`` words.

        Words come from ``str.split()`` and are re-joined with single spaces.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                outside ``[0, chunk_size)``. Without this check a zero step
                crashes ``range`` with an unrelated message and a negative
                step silently indexes nothing.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = text.split()
        step = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            chunk = ' '.join(words[start:start + chunk_size])
            if chunk:  # Only add non-empty chunks
                chunks.append(chunk)
            # Once a chunk reaches the final word, a further window would only
            # repeat words already covered and cost an extra embedding request.
            if start + chunk_size >= len(words):
                break

        return chunks
    
# Initialize the system: the constructor builds an OpenAI client from
# OPENAI_API_KEY and starts with an empty SimpleVectorDB
rag = RAGSystem()

# Index some documents
documents = [
    """Python is a high-level, general-purpose programming language used for a vast range of applications, including
web development, data science, artificial intelligence (AI), machine learning, and automation. Its versatility and beginner-friendly, readable syntax have made it one of the most popular languages in the world. 
Key uses for Python include:

    Data Analysis and Machine Learning Python has become a staple in data science due to its extensive libraries (like NumPy, pandas, and TensorFlow), which enable complex statistical calculations, data visualization, and the building of machine learning algorithms.
    Web Development It is often used for the back-end (server-side) of websites and applications, processing logic, interacting with databases, and ensuring security. Companies like Google, Netflix, and Instagram use Python for their services.
    Automation and Scripting Python can automate repetitive tasks, such as renaming files, finding and downloading online content, or processing errors across multiple files. This makes work more efficient for both programmers and non-programmers.
    Software Testing and Prototyping In software development, Python aids in build control, bug tracking, and automating tests for new products or features, and its fast prototyping capabilities are valuable for startups.
    Scientific Computing and Research Python's robust libraries are used extensively in academic research across various fields, including bioinformatics, biology, and mathematics.
    Embedded Systems and Gaming While not its primary use, Python can be found in embedded systems and graphic design applications (e.g., as a scripting language for Maya, a 3D modeling tool). 

Python's design emphasizes code readability and simplicity, which contributes to its wide adoption across various industries and job functions, from developers and software engineers to data analysts and AI researchers.""",
    """Machine learning (ML) is a subset of artificial intelligence focused on building systems that learn patterns from data and then use those patterns to make predictions, decisions, or recommendations. Instead of writing explicit rules for every scenario, you provide examples (data) and a learning algorithm finds a model that generalizes beyond those examples.

A typical ML workflow includes collecting and cleaning data, splitting it into training/validation/test sets, selecting features (or learning them automatically), training a model, evaluating it with appropriate metrics, and then deploying it to make predictions on new inputs. A key challenge is avoiding overfitting—when a model performs well on training data but poorly on new data—so techniques like regularization, cross-validation, and careful monitoring are essential.

Common learning paradigms include:
    Supervised learning: learn from labeled examples (e.g., spam detection, price prediction).
    Unsupervised learning: discover structure without labels (e.g., clustering customers, dimensionality reduction).
    Reinforcement learning: learn by trial and error with rewards (e.g., game-playing agents, robotics).
Deep learning is a family of ML methods (neural networks with many layers) that has been especially successful in vision, speech, and language tasks.

ML is used across many industries: recommendation systems, search ranking, fraud detection, anomaly detection, medical imaging, predictive maintenance, and personalization. In practice, ML success depends as much on data quality, problem framing, and evaluation as on the specific algorithm, and teams must also consider issues like bias, data leakage, privacy, and ongoing model drift after deployment.""",
    """Vector databases store and search high-dimensional vector embeddings—numeric representations of text, images, audio, or other data produced by embedding models. The core idea is that “similar” items end up close together in vector space, allowing semantic search: you can retrieve relevant content even when it doesn’t share the same exact keywords as the query.

A vector database typically supports:
    Ingestion: store an embedding plus an ID and metadata (source, timestamp, tags, etc.).
    Similarity search: find the nearest neighbors to a query vector using metrics like cosine similarity, dot product, or Euclidean distance.
    Indexing for speed: use approximate nearest neighbor (ANN) structures (e.g., HNSW or IVF-based indexes) to search efficiently at scale, trading a small amount of accuracy for large performance gains.
    Metadata filtering: constrain results by structured fields (e.g., only documents from a certain product, user, or date range).
Hybrid approaches often combine semantic retrieval (vectors) with lexical retrieval (BM25/keyword search) to improve precision for proper nouns, codes, and rare terms.

Vector databases are widely used in Retrieval-Augmented Generation (RAG) systems: documents are chunked, embedded, and stored; at query time the system embeds the question, retrieves the most similar chunks, and provides them as grounded context to a language model. This reduces hallucinations and enables answering questions using up-to-date or proprietary knowledge. Practical considerations include chunking strategy, embedding model choice, deduplication, index rebuild/update behavior, and evaluation of retrieval quality (recall, precision, latency, and cost).""",
 ]

# Chunk, embed and store the documents (one embeddings request per chunk)
rag.index_documents(documents)

# Query the system: the question is embedded, the most similar chunks are
# retrieved (k defaults to 5) and passed to the LLM as context
question = "What is Python used for?"
answer = rag.query(question)

print(f"Question: {question}")
print(f"Answer: {answer}")


Indexing 3 documents...
Indexed 3 chunks
Question: What is Python used for?
Answer: Python is a high-level, general-purpose language used across many domains because of its readable syntax and rich libraries. Common uses include:

- Data analysis, machine learning and AI (with libraries like NumPy, pandas, TensorFlow)
- Web development (server-side/back-end logic, database interaction, security) — used by companies such as Google, Netflix, Instagram
- Automation and scripting (automating repetitive tasks, file processing, web downloads)
- Software testing, build control and rapid prototyping
- Scientific computing and academic research (bioinformatics, mathematics, etc.)
- Embedded systems and game/graphics scripting (as a secondary role or scripting language)

Overall, Python is popular for tasks that benefit from quick development, extensive libraries, and readable code.
