In [None]:
%pip install -U langchain-community pymupdf langchain langchain-text-splitters

In [None]:
from langchain_community.document_loaders.pdf import PyMuPDFLoader

# Load the source PDF; the resulting documents are kept in `pages`
# and reused by every splitter further down.
PDF_PATH = "ES Mod1@AzDOCUMENTS2.pdf"

loader = PyMuPDFLoader(PDF_PATH)
pages = loader.load()

# Number of documents the loader produced
print(len(pages))

In [None]:
# Show the first loaded document as a quick sanity check of the load step.
print(pages[0])


In [None]:
import tiktoken

# All token counts in this notebook use the cl100k_base encoding.
# NOTE(review): a bare `tiktoken.encoding_for_model('gpt-4.1-mini')` call used to
# follow this function; its result was discarded, so it has been removed.
# Confirm whether a model-specific encoding was intended instead of cl100k_base.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`.

    Args:
        text (str): The text to measure.

    Returns:
        int: Length of the token list produced by `tokenizer.encode`.
    """
    # disallowed_special=() is passed through unchanged; presumably it lets text
    # that contains special-token strings be encoded instead of rejected
    # (see tiktoken's `Encoding.encode`).
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

# Token count of every loaded page, followed by min / average / max.
token_counts = [tiktoken_len(page.page_content) for page in pages]

min_token_count = min(token_counts)
avg_token_count = int(sum(token_counts) / len(token_counts))  # truncated to a whole number
max_token_count = max(token_counts)

# print token counts
summary = (
    ("Min", min_token_count),
    ("Avg", avg_token_count),
    ("Max", max_token_count),
)
for label, value in summary:
    print(f"{label}: {value}")

In [None]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Split the loaded pages into overlapping chunks. Sizes are measured with
# `len`, i.e. in characters rather than tokens.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

chunks = text_splitter.split_documents(pages)

# Report how many chunks were produced and show the first one
print(f"Number of chunks: {len(chunks)}")
print(chunks[0])

In [None]:
pip install nltk

In [None]:
from langchain_text_splitters.nltk import NLTKTextSplitter
import nltk

# Ensure the tokenizer data NLTK needs is available. Some NLTK versions expect
# 'punkt', others 'punkt_tab', so both are attempted on a best-effort basis.
# Failures are reported instead of being silently ignored; the cell keeps going
# as long as at least one resource could be obtained.
available_resources = []
for resource in ('punkt', 'punkt_tab'):
    try:
        # nltk.download returns a truthy value on success
        if nltk.download(resource, quiet=True):
            available_resources.append(resource)
    except Exception as exc:
        print(f"Warning: downloading NLTK resource '{resource}' raised: {exc}")

if not available_resources:
    print("Warning: no NLTK tokenizer data could be downloaded; the splitter below may fail.")

# Sentence-based splitter for English text
nltk_splitter = NLTKTextSplitter(
    language='english'
)

chunks = nltk_splitter.split_documents(pages)

print(f"Number of chunks: {len(chunks)}")
print(chunks[0])

In [None]:
import os
from getpass import getpass
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import OpenAIEmbeddings

# Resolve the OpenAI API key. Prefer the environment variable, e.g.
#   export OPENAI_API_KEY="sk-..."
# and otherwise prompt for it interactively (the input is not echoed).
openai_api_key = (
    os.getenv("OPENAI_API_KEY")
    or getpass("Enter your OpenAI API key (input hidden): ").strip()
)
if not openai_api_key:
    raise ValueError("OpenAI API key not provided. Set OPENAI_API_KEY environment variable or provide it when prompted.")
# Make the key visible to anything that reads it from the environment
os.environ["OPENAI_API_KEY"] = openai_api_key

# Embedding model, constructed with the resolved key
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Semantic chunker backed by that embedding model
semantic_chunker = SemanticChunker(embeddings=embeddings)

# Only the first page's text is split here
chunks = semantic_chunker.split_text(pages[0].page_content)

# Print every resulting chunk, numbered from 1
for number, piece in enumerate(chunks, start=1):
    print(f"Chunk {number}:\n{piece}\n---")


## Semantic Chunking

In [None]:
pip install huggingface_hub[hf_xet]

In [None]:
pip install sentence-transformers

In [None]:
pip install sentence-transformers

In [None]:
# Install required package for Hugging Face sentence-transformers (fix ImportError)
# %pip install -q sentence-transformers

from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_community.document_loaders import TextLoader
# from langchain_core.documents import Document 

# Sample text for demonstration
# text_data = """
# Apple's new Vision Pro headset is a spatial computing device that blends digital content with the physical world. 
# It features a micro-OLED display system and advanced eye-tracking technology.
# The company also announced new MacBook Pro models with the M3 chip family, offering significant performance improvements for professional users.
# In a separate development, a recent study on climate change highlights the urgent need for global cooperation. 
# Researchers suggest that renewable energy sources are crucial for mitigating rising temperatures. 
# The report emphasizes policy changes and sustainable practices.
# """

# Embedding model loaded by name from Hugging Face
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Semantic chunker driven by that model. The threshold type/amount control how
# readily the text is split; other threshold types mentioned for this parameter
# are "standard_deviation" and "interquartile".
semantic_chunker = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=80,  # intended: top 20% of semantic changes trigger a split
)

# Chunk every loaded page; `semantic_chunks` is consumed by the embedding and
# vector-store cells further down.
semantic_chunks = semantic_chunker.transform_documents(pages)

# Print each chunk with its 1-based position and character length
for position, document in enumerate(semantic_chunks, start=1):
    body = document.page_content
    print(f"--- Chunk {position} (Length: {len(body)}) ---")
    print(body)
    print("\n")



In [None]:
pip install huggingface_hub[hf_xet]

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Build chunk vectors (embeddings) directly with Hugging Face Transformers.

# Sentence-transformer checkpoint used for both tokenizer and model
hf_model_name = "sentence-transformers/all-MiniLM-L6-v2"

hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
hf_model = AutoModel.from_pretrained(hf_model_name)


def get_chunk_embedding(text):
    """Embed `text` by averaging the model's last hidden state.

    Args:
        text: Input passed to `hf_tokenizer` (truncation and padding enabled).

    Returns:
        numpy.ndarray: The first row of the pooled output.
    """
    encoded = hf_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        hidden_states = hf_model(**encoded).last_hidden_state
    # Mean over dim=1 (presumably the token axis).
    # NOTE(review): the attention mask is not applied, so padded positions would
    # be averaged in if `text` were a batch — confirm only single strings are passed.
    pooled = hidden_states.mean(dim=1)
    return pooled[0].numpy()


# One vector per semantic chunk
chunk_vectors = [get_chunk_embedding(doc.page_content) for doc in semantic_chunks]

print(f"Created {len(chunk_vectors)} chunk vectors.")
print(chunk_vectors[0])  # Show the first vector

In [None]:
pip install chromadb

In [None]:
import chromadb
from chromadb.utils import embedding_functions

# --- Configuration ---
# Sentence-transformer model name handed to Chroma's embedding function below.
# An upstream cell defines `hf_model_name` with the "sentence-transformers/"
# prefixed form of this name; this cell uses its own constant instead.
HF_MODEL_NAME = 'all-MiniLM-L6-v2'

# Initialize the ChromaDB client with default settings.
# NOTE(review): the default client is believed to keep data in memory only
# (nothing persisted to disk) — confirm before relying on persistence.
client = chromadb.Client()

# Create the collection for the document chunks, or fetch it if it already
# exists. Documents and queries on this collection are embedded with the
# SentenceTransformer embedding function configured here.
collection = client.get_or_create_collection(
    name="embedded_systems_chunks",
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(model_name=HF_MODEL_NAME)
)

# --- Prepare and store the chunks (requires `semantic_chunks` from an earlier cell) ---
# Text, metadata and a stable id for every semantic chunk.
# NOTE(review): "source" is built from the chunk's position in `semantic_chunks`,
# not from the page the chunk came from — confirm this label is what is wanted.
documents = [chunk.page_content for chunk in semantic_chunks]
metadatas = [{"source": f"page_{i+1}"} for i in range(len(semantic_chunks))]
ids = [f"chunk_{i}" for i in range(len(documents))]

# upsert (instead of add) keeps this cell re-runnable: the ids are fixed, so a
# second run replaces the existing entries rather than colliding with them.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"Stored {len(documents)} chunks in ChromaDB collection 'embedded_systems_chunks'.")

# --- Retrieval check ---
# Read the stored entries back, explicitly requesting "embeddings" so the
# vectors are part of the result and can be inspected.
results_with_embeddings = collection.get(
    include=["metadatas", "documents", "embeddings"]
)

ids = results_with_embeddings['ids']
metadatas = results_with_embeddings['metadatas']
chunk_vectors = results_with_embeddings['embeddings']

# Confirm that embeddings actually came back
if chunk_vectors is not None and len(chunk_vectors) > 0:
    print(f"\n--- Retrieval Check ---")
    print(f"Successfully retrieved embeddings for {len(chunk_vectors)} chunks.")
    # Label the snippet with the id that was actually returned first instead of
    # assuming the first row is 'chunk_0'.
    print(f"Embedding of '{ids[0]}' starts with: {chunk_vectors[0][:5]}...")
else:
    print("Error: Embeddings are still None or empty.")

# --- Similarity search against the collection ---

# Question to look up among the stored chunks
query_text = "What is a single-chip device that controls systems?"

print(f"\n--- Similarity Search for: '{query_text}' ---")

# Ask the collection for the single closest chunk, requesting only the fields
# that are reported below.
query_results = collection.query(
    query_texts=[query_text],
    n_results=1,
    include=['documents', 'metadatas', 'distances'],
)

# A usable hit needs a non-empty result whose first document list is non-empty
found_match = bool(
    query_results
    and query_results.get('documents')
    and query_results['documents'][0]
)

if not found_match:
    print("No relevant chunks found.")
else:
    print("\n✅ Closest Chunk Found:")
    print(f"Document ID: {query_results['ids'][0][0]}")
    print(f"Distance (Lower is better): {query_results['distances'][0][0]:.4f}")
    print(f"Metadata: {query_results['metadatas'][0][0]}")
    print(f"Content: **{query_results['documents'][0][0]}**")

In [None]:
def query_chunk_by_query(query_text, collection, n_results=1):
    """
    Look up the best-matching chunk for a text query in a ChromaDB collection.

    Args:
        query_text (str): The text query to search for.
        collection (chromadb.Collection): The collection holding the document chunks.
        n_results (int): How many results to request from the collection.

    Returns:
        dict | None: The first hit as a dict with the keys 'id', 'distance',
        'metadata' and 'content', or None when the query produced no documents.
        Only the first hit is returned, whatever `n_results` is.
    """
    hits = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        include=['documents', 'metadatas', 'distances'],
    )

    # Guard clauses: empty result, missing 'documents', or no document for the query
    if not hits:
        return None
    documents = hits.get('documents')
    if not documents or not documents[0]:
        return None

    # Results are nested per query; [0][0] is the top hit of the only query
    return {
        'id': hits['ids'][0][0],
        'distance': hits['distances'][0][0],
        'metadata': hits['metadatas'][0][0],
        'content': documents[0][0],
    }

In [None]:
# Example usage of the query function
# query_text = "Briefly discuss how Cortex-M3 address the requirements of the 32-bit embedded processor market"
query_text = " Briefly explain the Thumb-2 technology and its advantages over thumb instruction set with relevat diagram."

result = query_chunk_by_query(query_text, collection)
if result:
    print("\n✅ Query Result:")
    print(f"Document ID: {result['id']}")
    print(f"Distance: {result['distance']:.4f}")
    print(f"Metadata: {result['metadata']}")
    print(f"Content: **{result['content']}**")
else:
    # query_chunk_by_query returns None when nothing matched; say so instead
    # of leaving the cell without output.
    print("No relevant chunks found.")

In [None]:
# The module name was misspelled as `lanchain`; the package installed at the
# top of this notebook is `langchain`.
import langchain