In [1]:
import os
from httpcore import stream
import streamlit as st
from dotenv import load_dotenv
from typing import TypedDict, Annotated
import re

from langgraph.graph import StateGraph, START, END


from langgraph.graph.message import add_messages

from langchain_core.messages import BaseMessage,HumanMessage
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
import sqlite3
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from pydantic import BaseModel
from langchain_huggingface import HuggingFaceEmbeddings

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# LLM loader (cached)
@st.cache_resource
def get_llm():
    llm_endpoint = HuggingFaceEndpoint(
        repo_id="meta-llama/Llama-3.2-3B-Instruct",
        temperature=0.01,
        max_new_tokens=512,
        huggingfacehub_api_token=HF_TOKEN,
        task="text-generation",
    )
    return ChatHuggingFace(llm=llm_endpoint)

llm = get_llm()

prompt = HumanMessage(content="hello")

response = llm.invoke([prompt])
print(response.content)




Hello! How can I assist you today?


In [3]:
docs=(
    PyPDFLoader("./BOOK1.pdf").load()
+ PyPDFLoader("./BOOK2.pdf").load()
)

In [4]:
chunks=RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=150).split_documents(docs)


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)



In [6]:
vectore_store=FAISS.from_documents(chunks,embeddings)
retriever=vectore_store.as_retriever(search_type="similarity", search_kwargs={"k":4})

In [7]:
llm = get_llm()

In [8]:
class State(TypedDict):
    question:str
    docs:list[docs] #output of th edecomposition (sentence strips)

    strips:list[str] #after filtering (kept sentences)

    kept_strips:str #recomposed internal knowledge (joined kept_strips)
    refined_context:str
    anwser:str

In [None]:
def retrieve(state: State):
    query = state["question"]

    docs = retriever.invoke(query)

    return {
        "docs": docs
    }


In [10]:
## sentence-level decomposer
#make the list make its as a strip sentences
def decompose_to_sequence(text:str)->List[str]:
    text=re.sub(r"\s+"," ",text).strip()
    sentences=re.split(r"(?<=[.!?])\s+",text)
    return [s.strip() for s in sentences if len(s.strip())>20]

In [19]:
from pydantic import BaseModel

class KeepOrDrop(BaseModel):
    keep: bool

from langchain_core.prompts import ChatPromptTemplate

filter_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a strict relevance filter.\n"
            "Return ONLY valid JSON.\n"
            "Format: {\"keep\": true} or {\"keep\": false}\n"
            "Do not explain anything.\n"
        ),
        (
            "human",
            "Question: {question}\n\nSentence:\n{sentence}"
        ),
    ]
)

from langchain_core.output_parsers import PydanticOutputParser


parser = PydanticOutputParser(pydantic_object=KeepOrDrop)

filter_chain = filter_prompt | llm | parser





# -----------------------------
# REFINING (Decompose -> Filter -> Recompose)
# -----------------------------
from typing import List

def refine(state: State) -> State:

    q = state["question"]

    # Combine retrieved docs
    context = "\n\n".join(d.page_content for d in state["docs"]).strip()

    # 1️⃣ Decompose
    strips = decompose_to_sentences(context)

    # 2️⃣ Filter
    kept: List[str] = []

    for s in strips:
        try:
            result = filter_chain.invoke({
                "question": q,
                "sentence": s
            })
            if result.keep:
                kept.append(s)

        except Exception as e:
            print("Filter error:", e)
            continue

    # 3️⃣ Recompose
    refined_context = "\n".join(kept).strip()

    return {
        "strips": strips,
        "kept_strips": kept,
        "refined_context": refined_context,
    }


In [20]:
from langchain_core.prompts import ChatPromptTemplate

answer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful ML tutor. Answer ONLY using the provided refined bullets.\n"
            "If the bullets are empty or insufficient, say: 'I don't know based on the provided books.'",
        ),
        ("human", "Question: {question}\n\nRefined context:\n{refined_context}"),
    ]
)

def generate(state: State) -> State:
    out = (answer_prompt | llm).invoke({"question": state["question"], "refined_context": state['refined_context']})
    return {"answer": out.content}

In [21]:
g = StateGraph(State)
g.add_node("retriever", retriever)
g.add_node("refine", refine)
g.add_node("generate", generate)

g.add_edge(START, "retriever")
g.add_edge("retriever", "refine")
g.add_edge("refine", "generate")
g.add_edge("generate", END)

app= g.compile()



In [22]:
print(res['kept_strips'])

NameError: name 'res' is not defined

In [23]:
res = app.invoke({
    "question": "Explain the bias–variance tradeoff",
    "docs": [],
    "strips": [],
    "kept_strips": [],
    "refined_context": "",
    "answer": ""
})
print(res["answer"])


AttributeError: 'dict' object has no attribute 'replace'