In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sentence-embedding model that turns each text unit into a vector
model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
# Demo corpus: three paragraphs about reinforcement learning followed by two
# unrelated lines (Switzerland, India) that give the chunker off-topic material.
# The text is split on newlines further down, so every physical line here is
# treated as one unit, even when it holds more than one sentence.
sample_text = """Reinforcement learning is a way of teaching machines to make decisions by letting them learn from experience rather than instructions.
      Instead of being told exactly what to do, a learning agent interacts with an environment, takes actions, and receives feedback in the form of rewards or penalties. Over time, the agent adjusts its behavior to maximize the total reward it receives.

At the heart of reinforcement learning is a simple idea: actions have consequences. Some choices lead to good outcomes, others to bad ones, and many only reveal their value after a sequence of steps. Because of this, reinforcement learning focuses not just on immediate rewards but on long-term success. 
An agent must balance trying new actions (exploration) with using what it already knows works well (exploitation).

This learning style closely mirrors how humans and animals learn skills, from riding a bicycle to playing a game.
 By repeatedly experimenting and improving based on feedback, reinforcement learning systems can eventually discover effective strategies in complex and uncertain environments, even when no clear “right answer” is provided in advance.

Switzerland is a rich country with nature, swiss chocolates and black money.
India is rich in heritage and culture 
 """

In [4]:
## Step 1: Split the text into units
# The split is on newlines (not on sentence punctuation): every non-blank line,
# trimmed of surrounding whitespace, becomes one entry in `sentences`.
trimmed_lines = (line.strip() for line in sample_text.split("\n"))
sentences = [line for line in trimmed_lines if line]

In [5]:
### Step 2: Embed each sentence
# embeddings[i] is the vector for sentences[i]; the grouping loop relies on that alignment
embeddings = model.encode(sentences)

In [6]:
# Step 3: Initialise the grouping state
threshold = 0.7  # minimum cosine similarity for neighbouring units to share a chunk
chunks = list()  # finished chunks, each stored as one joined string
current_chunk = [sentences[0]]  # the first unit always opens the first chunk

In [8]:
# Sanity check: before grouping starts, the open chunk holds only the first unit
current_chunk

['Reinforcement learning is a way of teaching machines to make decisions by letting them learn from experience rather than instructions.']

In [9]:
## Step 4: Semantic grouping based on threshold

# Reset the accumulators so this cell is idempotent: re-running it (for example
# after changing `threshold`) must not append to chunks left over from an
# earlier execution of the notebook.
chunks = []
current_chunk = [sentences[0]]

for i in range(1, len(sentences)):
    # Cosine similarity between the previous unit and the current one.
    sim = cosine_similarity(
        [embeddings[i - 1]],
        [embeddings[i]]
    )[0][0]

    if sim >= threshold:
        # Similar enough to its predecessor: keep extending the open chunk.
        current_chunk.append(sentences[i])
    else:
        # Similarity dropped below the threshold: close the open chunk and
        # start a new one with the current unit.
        chunks.append(" ".join(current_chunk))
        current_chunk = [sentences[i]]

In [10]:
# The loop only closes a chunk when the next unit breaks away from it, so the
# chunk that is still open at the end has to be flushed explicitly.
chunks.append(str.join(" ", current_chunk))

In [11]:
# Print every chunk under a 1-based heading
print(" Semantic Chunks:")
for number, text in enumerate(chunks, start=1):
    print(f"\nChunk {number}:\n{text}")

 Semantic Chunks:

Chunk 1:
Reinforcement learning is a way of teaching machines to make decisions by letting them learn from experience rather than instructions.

Chunk 2:
Instead of being told exactly what to do, a learning agent interacts with an environment, takes actions, and receives feedback in the form of rewards or penalties. Over time, the agent adjusts its behavior to maximize the total reward it receives.

Chunk 3:
At the heart of reinforcement learning is a simple idea: actions have consequences. Some choices lead to good outcomes, others to bad ones, and many only reveal their value after a sequence of steps. Because of this, reinforcement learning focuses not just on immediate rewards but on long-term success.

Chunk 4:
An agent must balance trying new actions (exploration) with using what it already knows works well (exploitation).

Chunk 5:
This learning style closely mirrors how humans and animals learn skills, from riding a bicycle to playing a game.

Chunk 6:
By rep

## RAG Pipeline Modular Coding

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain_core.runnables import RunnableLambda, RunnableMap
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
import warnings

# os.environ only accepts string values, so the previous
# `os.environ[KEY] = os.getenv(KEY)` raised TypeError whenever the variable was
# unset and was a no-op whenever it was set. Check for the key explicitly and
# warn instead, so the cells that do not need the Groq LLM can still run.
# NOTE(review): presumably the Groq chat model created later reads this
# variable itself - confirm against the provider integration.
if not os.environ.get("GROQ_API_KEY"):
    warnings.warn(
        "GROQ_API_KEY is not set; export it in the environment before running "
        "the cells that call the Groq chat model."
    )

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class CustomThresholdSemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    Text is split on '.' into sentences and every sentence is embedded. A new
    chunk starts whenever the cosine similarity between two neighbouring
    sentences falls below ``threshold``.
    """

    def __init__(self,threshold=0.7,model="all-mpnet-base-v2"):
        """Create the chunker.

        Args:
            threshold: Minimum cosine similarity between adjacent sentences
                for them to stay in the same chunk.
            model: Model identifier handed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model)
        self.threshold  = threshold

    def split(self, text: str):
        """Split ``text`` into semantically grouped chunks.

        Args:
            text: Raw text; sentences are delimited by '.'.

        Returns:
            A list of chunk strings. Each chunk is its sentences joined with
            ". " and terminated with ".". The list is empty when ``text``
            contains no sentence (empty, whitespace-only or only periods).
        """
        sentences = [s.strip() for s in text.split('.') if s.strip()]

        # Nothing to chunk: return early instead of failing on sentences[0].
        if not sentences:
            return []

        # A lone sentence has no neighbour to compare with, so the embedding
        # pass can be skipped; the result is the same single chunk.
        if len(sentences) == 1:
            return [sentences[0] + "."]

        embeddings = self.model.encode(sentences)
        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            # Compare each sentence only with its immediate predecessor.
            sim = cosine_similarity([embeddings[i - 1]], [embeddings[i]])[0][0]
            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                # Similarity dropped: close the open chunk, start a new one.
                chunks.append(". ".join(current_chunk) + ".")
                current_chunk = [sentences[i]]

        # Flush the chunk that was still open when the loop ended.
        chunks.append(". ".join(current_chunk) + ".")
        return chunks

    def split_documents(self,docs):
        """Chunk every document and wrap each chunk in a new ``Document``.

        Args:
            docs: Iterable of objects exposing ``page_content`` and
                ``metadata``.

        Returns:
            A flat list of ``Document`` objects, one per chunk, in input
            order. Each chunk gets its own shallow copy of the source
            metadata, so editing one chunk's metadata does not affect its
            siblings or the source document.
        """
        result=[]
        for doc in docs:
            for chunk in self.split(doc.page_content):
                result.append(
                    Document(page_content=chunk, metadata=dict(doc.metadata))
                )

        return result

In [5]:
# Sample text: three sentences about LangChain followed by two about France
sample_text = """
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

# Wrap the raw text in a Document so the metadata travels with it
doc = Document(
    metadata=dict(source="just_trust_me_bro", author="morty"),
    page_content=sample_text,
)
doc

Document(metadata={'source': 'just_trust_me_bro', 'author': 'morty'}, page_content='\nLangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [6]:
# Use a stricter similarity cut-off (0.75) than the class default of 0.7
chunker = CustomThresholdSemanticChunker(threshold=0.75)
chunks = chunker.split_documents(docs=[doc])
doc  # echo the source document; the chunks themselves are shown in the next cell

Document(metadata={'source': 'just_trust_me_bro', 'author': 'morty'}, page_content='\nLangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [7]:
# Display the chunk Documents returned by split_documents
chunks

[Document(metadata={'source': 'just_trust_me_bro', 'author': 'morty'}, page_content='LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={'source': 'just_trust_me_bro', 'author': 'morty'}, page_content='You can create chains, agents, memory, and retrievers.'),
 Document(metadata={'source': 'just_trust_me_bro', 'author': 'morty'}, page_content='The Eiffel Tower is located in Paris.'),
 Document(metadata={'source': 'just_trust_me_bro', 'author': 'morty'}, page_content='France is a popular tourist destination.')]

In [8]:
### VectorStore
import os
from langchain_huggingface import HuggingFaceEmbeddings

# Embed the chunk Documents and index them with FAISS, then expose the index
# through the retriever interface used by the chain.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding)
retriever = vectorstore.as_retriever()



In [9]:
## Prompt Template

# The template exposes two placeholders, {context} and {question}; the chain
# supplies a value for each of them when it is invoked.
template = """Answer the question based on the following context:

{context}

Question: {question}
"""

# from_template infers the input variables from the placeholders above
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\n')

In [10]:
llm = init_chat_model("groq:llama-3.3-70b-versatile")


def format_docs(docs):
    """Join the page_content of the retrieved Documents into one text block."""
    return "\n\n".join(d.page_content for d in docs)


# LCEL chain with retrieval
rag_chain = (
    RunnableMap({
        # retriever.invoke returns Document objects. Interpolating that list
        # directly would put its Python repr (metadata included) into the
        # prompt, so reduce it to the passage text first.
        "context": lambda x: format_docs(retriever.invoke(x["question"])),
        "question": lambda x: x["question"],
    })
    | prompt
    | llm
    | StrOutputParser()
)

query = {"question":"what is langchain used for ?"}
result = rag_chain.invoke(query)
result


'LangChain is used for building applications with Large Language Models (LLMs). It provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone. Additionally, it allows you to create chains, agents, memory, and retrievers.'