# The Hound of the Baskervilles
https://www.gutenberg.org/cache/epub/3070/pg3070.txt

In [None]:
import requests

In [None]:
# Plain-text edition of the book hosted on Project Gutenberg (ebook #3070).
url = "https://www.gutenberg.org/cache/epub/3070/pg3070.txt"

In [None]:
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the notebook forever.
response = requests.get(url, timeout=30)

In [None]:
# Fail loudly on an unsuccessful download: silently skipping the assignment
# would leave book_full_text undefined and break every later cell with a
# confusing NameError.
if response.status_code != 200:
    raise RuntimeError(
        f"Download failed with HTTP {response.status_code} for {response.url}"
    )
book_full_text = response.text

## Calculating tokens using tiktoken

In [None]:
from tiktoken import encoding_for_model

In [None]:
# Look up the tiktoken encoding associated with the "gpt-4o" model name.
encoder = encoding_for_model("gpt-4o")

In [None]:
# Tokenize the full book text with that encoding.
tokens = encoder.encode(book_full_text)

In [None]:
# Report how many tokens the whole book occupies.
print(f"Number of tokens: {len(tokens)}")

## Fixed Window Chunking

In [None]:
from typing import List

In [None]:
def naive_splitter_v1(text: str) -> List[str]:
    """Break ``text`` apart on every newline character.

    Empty strings produced by consecutive newlines are kept in the result.
    """
    newline = "\n"
    lines = text.split(newline)
    return lines

In [None]:
def naive_splitter_v2(text: str, separators: List[str] = ["\n", "\r"]) -> List[str]:
    """Split ``text`` on each separator in turn, discarding empty pieces.

    The separators are applied one after another: the pieces left by one
    separator are split again by the next. With an empty ``separators`` list
    the text is returned unchanged as a single-element list.
    """
    pieces = [text]
    for delimiter in separators:
        refined = []
        for piece in pieces:
            for fragment in piece.split(delimiter):
                # Drop the empty strings left by adjacent or trailing delimiters.
                if fragment:
                    refined.append(fragment)
        pieces = refined
    return pieces

In [None]:
def fixed_window_splitter(text: str, chunk_size: int = 1000) -> List[str]:
    """Cut ``text`` into consecutive, non-overlapping windows of ``chunk_size`` characters.

    Args:
        text: The text to split.
        chunk_size: Number of characters per chunk; must be positive.

    Returns:
        The chunks in order. The last chunk may be shorter than ``chunk_size``;
        an empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. (Without this check a
            negative size silently produced no chunks at all.)
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

In [None]:
def fixed_window_with_overlap_splitter(text: str, chunk_size: int = 1000, chunk_overlap: int = 10) -> List[str]:
    """Cut ``text`` into windows of ``chunk_size`` characters that overlap by ``chunk_overlap``.

    Each chunk starts ``chunk_size - chunk_overlap`` characters after the
    previous one, so neighbouring chunks share ``chunk_overlap`` characters.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The chunks in order. An empty ``text`` yields an empty list, and no
        empty or overlap-only trailing chunk is produced.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            outside ``[0, chunk_size)``. An overlap that large would keep the
            window from ever advancing (an endless loop).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
        )

    step = chunk_size - chunk_overlap
    text_length = len(text)
    chunks = []

    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window would
        # only repeat the overlap tail (or be empty), so stop here.
        if end >= text_length:
            break

    return chunks

In [None]:
def recursive_character_splitter(text: str, separators: List[str] = ["\n", "\r"], chunk_size: int = 1000, chunk_overlap: int = 10) -> List[str]:
    """Split ``text`` using LangChain's ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        separators: Separator strings handed to the LangChain splitter.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``splitter.split_text(text)``.
    """
    # Imported inside the function, so langchain is only needed when this
    # splitter is actually called.
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(separators=separators, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(text)

In [None]:
def semantic_splitter(text: str, chunk_size: int = 1000, chunk_overlap: int = 10) -> List[str]:
    """Greedily pack whole sentences into chunks of at most ``chunk_size`` characters.

    The text is split into sentences with Flair's ``SegtokSentenceSplitter``;
    sentences are then joined with single spaces until adding the next one
    would exceed ``chunk_size``, at which point a new chunk is started.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters, including the spaces
            that join sentences. A single sentence longer than this still
            becomes its own (oversized) chunk, because sentences are never cut.
        chunk_overlap: Accepted for signature parity with the other splitters;
            this function does not use it.

    Returns:
        The chunks in order, with no empty chunks.
    """
    # Only the sentence splitter is needed; importing it lazily keeps flair
    # optional for the rest of the notebook.
    from flair.splitter import SegtokSentenceSplitter

    sentences = SegtokSentenceSplitter().split(text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        sentence_text = sentence.to_plain_string()

        if not current_chunk:
            # The first sentence of a chunk is always accepted, even when it is
            # longer than chunk_size; this avoids emitting an empty chunk.
            current_chunk = sentence_text
        elif len(current_chunk) + 1 + len(sentence_text) <= chunk_size:
            # The +1 accounts for the joining space.
            current_chunk += " " + sentence_text
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence_text

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


In [None]:
from flair.splitter import SegtokSentenceSplitter
from typing import List
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_openai import AzureOpenAIEmbeddings
import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

# Lazily created Azure OpenAI embeddings client, shared by all get_embedding calls
# so that a new client is not constructed for every sentence.
_EMBEDDINGS_CLIENT = None


def _get_embeddings_client():
    """Return the shared AzureOpenAIEmbeddings client, creating it on first use."""
    global _EMBEDDINGS_CLIENT
    if _EMBEDDINGS_CLIENT is None:
        _EMBEDDINGS_CLIENT = AzureOpenAIEmbeddings(
            model="text-embedding-ada-002-01",
            api_version="2023-05-15",
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        )
    return _EMBEDDINGS_CLIENT


def get_embedding(sentence: str) -> np.ndarray:
    """Embed ``sentence`` with the Azure OpenAI embedding deployment.

    The endpoint and API key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_API_KEY`` environment variables when the client is first
    created; later changes to those variables are not picked up.

    Args:
        sentence: The text to embed.

    Returns:
        The embedding as a NumPy array, matching the annotated return type.
    """
    return np.asarray(_get_embeddings_client().embed_query(sentence))


def cosine_sim(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine similarity between ``vec1`` and ``vec2``."""
    # Each vector is wrapped in a list so it is passed as a single-row matrix;
    # the score is then read from position [0][0] of the result.
    pairwise_scores = cosine_similarity([vec1], [vec2])
    first_row = pairwise_scores[0]
    return first_row[0]

def can_merge(sentences: List[str], chunk_size: int, similarity_threshold: float) -> bool:
    """Decide whether ``sentences`` may form a single chunk.

    The group qualifies when its space-joined text is at most ``chunk_size``
    characters long and no pair of consecutive sentences has a cosine
    similarity below ``similarity_threshold``.
    """
    # Size check first: it needs no embeddings at all.
    if len(" ".join(sentences)) > chunk_size:
        return False

    # Embed every sentence, then compare each one with its successor.
    vectors = [get_embedding(sentence) for sentence in sentences]
    return all(
        not (cosine_sim(left, right) < similarity_threshold)
        for left, right in zip(vectors, vectors[1:])
    )

def recursive_merge(sentences: List[str], chunk_size: int = 1000, similarity_threshold: float = 0.8) -> List[str]:
    """
    Group consecutive sentences into chunks, subject to size and similarity limits.

    Sentences are added to a pending group one at a time. As soon as
    ``can_merge`` rejects the group, everything except the newest sentence is
    emitted as a chunk and the newest sentence starts the next group.

    Args:
        sentences (List[str]): Sentences to group, in document order.
        chunk_size (int): Maximum chunk length in characters, passed to ``can_merge``.
        similarity_threshold (float): Minimum similarity, passed to ``can_merge``.

    Returns:
        List[str]: The chunks, each the space-joined text of its sentences.
    """
    if not sentences:
        return []

    merged = []
    pending = []

    for sentence in sentences:
        pending.append(sentence)

        if can_merge(pending, chunk_size, similarity_threshold):
            # The group is still acceptable; keep growing it.
            continue

        # A rejected single-sentence group is kept as is: there is nothing
        # earlier to emit, so it stays pending until the next sentence arrives.
        if len(pending) > 1:
            *finished, carry_over = pending
            merged.append(" ".join(finished))
            pending = [carry_over]

    # Emit whatever is still pending after the last sentence.
    if pending:
        merged.append(" ".join(pending))

    return merged

def embedding_chunking(text: str, chunk_size: int, similarity_threshold: float = 0.5) -> List[str]:
    """
    Split ``text`` into chunks of similar consecutive sentences.

    The text is split into sentences with Flair's ``SegtokSentenceSplitter`` and
    the sentences are grouped by ``recursive_merge``.

    Args:
        text (str): The input text to be split.
        chunk_size (int): Maximum size of each chunk in characters.
        similarity_threshold (float): Minimum similarity between consecutive
            sentences for them to share a chunk. Defaults to 0.5, the value
            this function previously hard-coded.

    Returns:
        List[str]: The chunks returned by ``recursive_merge``.
    """
    splitter = SegtokSentenceSplitter()

    # recursive_merge works on plain strings, so unwrap each Flair sentence.
    sentence_texts = [sentence.to_plain_string() for sentence in splitter.split(text)]

    return recursive_merge(
        sentence_texts,
        chunk_size=chunk_size,
        similarity_threshold=similarity_threshold,
    )

# Example usage: run the embedding-based chunker over the full downloaded book.
# NOTE(review): `display` is only defined in a later cell of this notebook; on a
# fresh kernel this call presumably resolves to IPython's built-in display
# instead — confirm the intended execution order.
text = (book_full_text
)

display(embedding_chunking(text, chunk_size=1000))

In [None]:
def display(items: List[str]):
    """Print every item between ``|`` bars, each followed by an end-of-chunk marker line."""
    end_marker = "=========END OF CHUNK==========="
    for chunk in items:
        # The bars make leading and trailing whitespace of a chunk visible.
        print(f"|{chunk}|", end_marker, sep="\n")

In [None]:
# Preview 400-character windows that overlap by 40 characters.
display(fixed_window_with_overlap_splitter(book_full_text, chunk_size=400, chunk_overlap=40))

In [None]:
# LangChain recursive splitter with the given separators, 400-character chunks and a 100-character overlap.
recursive_character_splitter(book_full_text, separators=["\n\n", "\n\r", "\n", "\r"], chunk_size=400, chunk_overlap=100)

In [None]:
# Sentence-based chunking with a chunk_size of 1000.
# NOTE: semantic_splitter never reads chunk_overlap, so the 200 passed here has no effect.
semantic_splitter(book_full_text, chunk_size=1000, chunk_overlap=200)

## Embedding Splitter

In [None]:
def embedding_splitter(text_data, chunk_size=400, similarity_threshold=0.7, max_sentences=100):
    """Group consecutive sentences into chunks by embedding similarity and size.

    The text is split into sentences with Flair, each sentence is embedded with
    an Azure OpenAI embedding deployment, and sentences are then appended to the
    current chunk while they stay similar to it and the chunk stays small enough.

    Args:
        text_data: The text to split.
        chunk_size: Maximum summed character length of the sentences in a chunk.
            A single sentence longer than this still forms its own chunk.
        similarity_threshold: Minimum cosine similarity between a sentence and
            the mean embedding of the current chunk for the sentence to join it.
            Defaults to 0.7, the value previously hard-coded.
        max_sentences: Only the first ``max_sentences`` sentences are processed
            (one embedding request is made per sentence). Defaults to 100, the
            cap previously hard-coded; pass ``None`` to process the whole text.

    Returns:
        A list of chunks, where each chunk is a list of sentence strings.
    """
    import os
    from langchain_openai.embeddings import AzureOpenAIEmbeddings
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    from dotenv import load_dotenv, find_dotenv
    from tqdm import tqdm
    from flair.splitter import SegtokSentenceSplitter

    load_dotenv(find_dotenv())

    # The API version is supplied through the environment. The client is built
    # without an explicit endpoint or key, so those must come from its own
    # configuration lookup.
    # NOTE(review): presumably AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY, as
    # used elsewhere in this notebook — confirm for your langchain_openai version.
    os.environ["OPENAI_API_VERSION"] = "2023-05-15"

    # "deployment" is the name of your Azure embedding deployment.
    embedding_model = AzureOpenAIEmbeddings(deployment="text-embedding-ada-002-01")

    # Step 1: split the text into plain-string sentences (optionally capped).
    def split_into_sentences(text):
        splitter = SegtokSentenceSplitter()
        sentence_str = [sentence.to_plain_string() for sentence in splitter.split(text)]
        # Slicing with None keeps every sentence.
        return sentence_str[:max_sentences]

    # Step 2: embed each sentence, one request per sentence so tqdm can show progress.
    def get_embeddings(sentences):
        embeddings = []
        for sentence in tqdm(sentences, desc="Generating embeddings"):
            # embed_documents returns one embedding per input; take the only one.
            embedding = embedding_model.embed_documents([sentence])
            embeddings.append(embedding[0])
        return embeddings

    # Step 3: form chunks from the sentences, their embeddings and the two limits.
    def form_chunks(sentences, embeddings, similarity_threshold=0.7, chunk_size=500):
        chunks = []
        current_chunk = []
        current_chunk_emb = []
        current_chunk_length = 0  # Summed character length of the current chunk

        for sentence, emb in zip(sentences, embeddings):
            emb = np.array(emb)
            sentence_length = len(sentence)

            if current_chunk:
                # Compare the sentence with the mean embedding of the current chunk.
                chunk_emb = np.mean(np.array(current_chunk_emb), axis=0).reshape(1, -1)
                similarity = cosine_similarity(emb.reshape(1, -1), chunk_emb)[0][0]

                if similarity < similarity_threshold or current_chunk_length + sentence_length > chunk_size:
                    # Too dissimilar or too long: close the chunk and start a new one.
                    chunks.append(current_chunk)
                    current_chunk = []
                    current_chunk_emb = []
                    current_chunk_length = 0

            # The sentence joins the current (possibly just-started) chunk.
            current_chunk.append(sentence)
            current_chunk_emb.append(emb)
            current_chunk_length += sentence_length

        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    sentences = split_into_sentences(text_data)
    embeddings = get_embeddings(sentences)
    chunks = form_chunks(
        sentences,
        embeddings,
        similarity_threshold=similarity_threshold,
        chunk_size=chunk_size,
    )

    return chunks

In [None]:
# Chunk the book with the embedding-based splitter; each chunk is a list of sentences.
# NOTE: with its default settings embedding_splitter only processes the first 100 sentences.
chunks = embedding_splitter(book_full_text, chunk_size=1000)

In [None]:
# embedding_splitter returns each chunk as a list of sentences, while display
# expects one string per chunk; join the sentences so the chunk text is printed
# rather than a Python list repr.
display([" ".join(chunk) for chunk in chunks])

In [None]:
# Number of chunks produced by the embedding-based splitter.
len(chunks)

In [None]:
def agentic_chunking(text_data):
    """Ask an Azure OpenAI chat model to split ``text_data`` into coherent chunks.

    The whole document is placed into a single prompt that asks the model to
    return the chunks as an array of strings, each at most 1000 characters.

    Args:
        text_data: The document text to be chunked.

    Returns:
        The raw result of invoking the prompt-plus-model chain. It is returned
        unparsed; extracting the array of strings is left to the caller.
    """
    # Local import: this cell used os without importing it, so it failed with a
    # NameError unless another cell had already imported os.
    import os

    from langchain_openai import AzureChatOpenAI
    from langchain.prompts import PromptTemplate

    # temperature=0: the task is to reproduce the document verbatim, so sampling
    # randomness only raises the risk of the model rewording the text.
    llm = AzureChatOpenAI(model="gpt-4o",
                          azure_endpoint=os.getenv(
                              "AZURE_OPENAI_ENDPOINT"),
                          api_key=os.getenv("AZURE_OPENAI_API_KEY"),
                          api_version="2023-03-15-preview",
                          verbose=True,
                          temperature=0)
    prompt = """I am providing a document below. 
    Please split the document into chunks that maintain semantic coherence and ensure that each chunk represents a complete and meaningful unit of information. 
    Each chunk should stand alone, preserving the context and meaning without splitting key ideas across chunks. 
    Use your understanding of the content’s structure, topics, and flow to identify natural breakpoints in the text. 
    Ensure that no chunk exceeds 1000 characters length, and prioritize keeping related concepts or sections together.

    Do not modify the document, just split to chunks and return them as an array of strings, where each string is one chunk of the document.
    Return the entire document; do not stop in the middle of a sentence.

    Document:
    {document}
    """

    prompt_template = PromptTemplate.from_template(prompt)

    # Pipe the filled-in prompt into the chat model.
    chain = prompt_template | llm

    result = chain.invoke({"document": text_data})
    return result





In [None]:
# Ask the chat model to chunk the full book in a single request.
# NOTE(review): the entire book is sent in one prompt — confirm it fits the
# model's context and output limits.
result = agentic_chunking(book_full_text)