Semantic Chunking
* Semantic chunking is a process of spliting document into a meaningful unit or chunks base on semantic similarity, not number of tokens or line.

In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [10]:
# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
# Sample text
text = """
    Langchain is a framework for building software applications with LLM.
    Langchain integrates LLM with tools like huggingface, OpenAI, Pinnecone.
    You can create chain, agent, memory and retrievers.
    Zuma rock is located in Abuja, the capital of Nigeria.
    Abuja is the capital of Nigeria.
"""

In [12]:
#Step 1: Split the text into sentences
sentences = [sentence.strip() for sentence in text.split('\n') if sentence.strip()]

#Step 2: Embed each sentence
embeddings = model.encode(sentences)

#Step 3: Initialize parameters
threshold = 0.7 # Control chunk tightness

chunks = []

current_chunk = [sentences[0]]

for i in range(1, len(sentences)):
    similarity = cosine_similarity(
        [embeddings[i - 1]],
        [embeddings[i]]
    )[0][0]
    
    if similarity >= threshold:
        current_chunk.append(sentences[i])
    
    else:
        chunks.append(' '.join(current_chunk))
        current_chunk = [sentences[i]]
        
# Append the last chunk
chunks.append('. '.join(current_chunk))

print('\n Semantic chunks')

for idx, chunk in enumerate(chunks):
    print(f'\nChunk: {idx+1}: \n{chunk}')


 Semantic chunks

Chunk: 1: 
Langchain is a framework for building software applications with LLM. Langchain integrates LLM with tools like huggingface, OpenAI, Pinnecone.

Chunk: 2: 
You can create chain, agent, memory and retrievers.

Chunk: 3: 
Zuma rock is located in Abuja, the capital of Nigeria.

Chunk: 4: 
Abuja is the capital of Nigeria.


RAG Pipeline 

In [3]:
import os
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain.schema.runnable import RunnableLambda, RunnableMap
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv

In [5]:
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from langchain.schema import Document  # assuming you're using LangChain
import numpy as np

class SemanticChunker:
    
    def __init__(self, model_name='all-MiniLM-L6-v2', threshold=0.7):
        # Load model correctly
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold
        
    def split(self, text: str):
        # Correct sentence splitting
        sentences = [s.strip() for s in text.split('\n') if s.strip()]
        
        # Encode sentences
        embeddings = self.model.encode(sentences)
        
        chunks = []
        current_chunk = [sentences[0]]
        
        # Compare each sentence with the previous one
        for i in range(1, len(sentences)):
            similarity = cosine_similarity(
                [embeddings[i - 1]], 
                [embeddings[i]]
            )[0][0]
            
            if similarity >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append('. '.join(current_chunk) + '.')
                current_chunk = [sentences[i]]
                
        # Append last chunk
        chunks.append('. '.join(current_chunk) + '.')
        
        return chunks
    
    def split_documents(self, docs):
        results = []
        
        for doc in docs:
            for chunk in self.split(doc.page_content):
                results.append(
                    Document(page_content=chunk, metadata=doc.metadata)
                )
                
        return results

In [25]:
# Sample text
text = """
    Langchain is a framework for building software applications with LLM.
    Langchain integrates LLM with tools like huggingface, OpenAI, Pinnecone.
    You can create chain, agent, memory and retrievers.
    Zuma rock is located in Abuja, the capital of Nigeria.
    Abuja is the capital of Nigeria.
"""

doc = Document(page_content=text)
doc

Document(metadata={}, page_content='\n    Langchain is a framework for building software applications with LLM.\n    Langchain integrates LLM with tools like huggingface, OpenAI, Pinnecone.\n    You can create chain, agent, memory and retrievers.\n    Zuma rock is located in Abuja, the capital of Nigeria.\n    Abuja is the capital of Nigeria.\n')

In [26]:
# Chunking
chunker = SemanticChunker(threshold=0.7)
chunks = chunker.split_documents([doc])
chunks

[Document(metadata={}, page_content='Langchain is a framework for building software applications with LLM.. Langchain integrates LLM with tools like huggingface, OpenAI, Pinnecone..'),
 Document(metadata={}, page_content='You can create chain, agent, memory and retrievers..'),
 Document(metadata={}, page_content='Zuma rock is located in Abuja, the capital of Nigeria..'),
 Document(metadata={}, page_content='Abuja is the capital of Nigeria..')]

In [27]:
# VectorStore

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever()


  embeddings = OpenAIEmbeddings()


In [28]:
# Prompt template
template = '''
    Answer the question based on the following context
    
    {context}
    
    Question: {question}
'''

prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\n    Answer the question based on the following context\n\n    {context}\n\n    Question: {question}\n')

In [30]:
# LLM
llm = init_chat_model(model='groq:gemma2-9b-it', temperature=0.4)

#LCEL Chain with retrieval

rag_chain = (
    RunnableMap({
        'context': lambda x: retriever.invoke(x['question']),
        'question': lambda x: x['question'],
    })
    | prompt
    | llm
    | StrOutputParser()
)

# Text query
query = {
    'question': 'What is the use of langchain?'
}

result = rag_chain.invoke(query)
print(result)

BadRequestError: Error code: 400 - {'error': {'message': 'The model `gemma2-9b-it` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}

Semantic chunker with langchain

In [31]:
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import TextLoader

In [32]:
# Load document
loader = TextLoader('text.txt')
docs = loader.load()

# Initialize embedding model
embedding = OpenAIEmbeddings()

# Create semantic chunker
chunker = SemanticChunker(embedding)

# Split the document
chunks = chunker.split_documents(docs)

#output
for i, chunk in enumerate(chunks):
    print(f'\n chunk {i+1}: \n{chunk.page_content}')


 chunk 1: 
Large language models can generate humanâ€‘like text by learning patterns from massive datasets. Businesses use large language models to automate customer support and improve communication workflows. Training a large language model requires significant computational power and specialized hardware. Large language models can summarize long documents while preserving the main ideas and context. Researchers continue to improve large language models to reduce bias and increase reliability. Many developers integrate large language models into applications to enhance user experiences. Large language models can understand complex prompts and produce detailed, contextâ€‘aware responses. Using large language models responsibly involves monitoring outputs and ensuring ethical guidelines are followed. Large language models can assist with creative tasks such as writing stories, brainstorming ideas, and drafting content. As large language models grow more advanced, they raise important 