In [None]:
# Install the third-party packages this notebook relies on (-q keeps pip's output quiet).
# NOTE(review): no versions are pinned here — consider pinning them so a fresh run resolves the same packages.
%pip install -q streamlit langchain langchain-community langchain-ollama chromadb pypdf pillow pytesseract beautifulsoup4 requests ffmpeg-python openai-whisper


In [None]:
# Upgrade the three LangChain packages to their newest releases.
# These packages are already listed in the first install cell, so this only matters for the -U upgrade.
%pip install -U langchain langchain-community langchain-ollama


In [None]:
# Remove langchain and install it again from scratch.
# The recorded output of this cell shows 1.2.6 being uninstalled and the same cached 1.2.6 wheel
# being installed again; pip also notes that the kernel may need a restart to use updated packages.
%pip uninstall -y langchain
%pip install -U langchain


Found existing installation: langchain 1.2.6
Uninstalling langchain-1.2.6:
  Successfully uninstalled langchain-1.2.6
Note: you may need to restart the kernel to use updated packages.
Collecting langchain
  Using cached langchain-1.2.6-py3-none-any.whl.metadata (4.9 kB)
Using cached langchain-1.2.6-py3-none-any.whl (108 kB)
Installing collected packages: langchain
Successfully installed langchain-1.2.6
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Diagnostic: show the path of the interpreter that is running this kernel.
import sys

print(f"{sys.executable}")


C:\Users\Asus\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe


In [None]:
import sys
!{sys.executable} -m pip install -U langchain langchain-community langchain-ollama




In [None]:
import os
import re
import uuid
import shutil
from typing import List, Dict
from collections import Counter

import requests
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
import whisper

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma

from langchain_ollama import OllamaEmbeddings, ChatOllama


In [None]:
# Central settings shared by the ingestion helpers, the vector store and the RAG pipeline.
CONFIG = dict(
    # Folder that ingest_data_folder() walks for input files.
    data_folder="./data",
    # On-disk location and collection name handed to the Chroma vector store.
    persist_directory="./chroma_db",
    collection_name="rag_documents",
    # Ollama model names: one for embeddings, one for answering.
    embedding_model="nomic-embed-text",
    llm_model="llama3",
    # Number of chunks retrieved per question.
    top_k_retrieval=5,
    # Splitter settings forwarded to chunk_text().
    chunk_size=500,
    chunk_overlap=50,
    # File extensions ingest_data_folder() will pick up (documents, images, videos).
    allowed_extensions=[
        ".pdf",
        ".txt",
        ".md",
        ".png",
        ".jpg",
        ".jpeg",
        ".mp4",
        ".mov",
        ".mkv",
    ],
)


In [None]:

class VectorDB:
    """Small wrapper around a persistent Chroma collection that is embedded via Ollama.

    Attributes:
        persist_directory: Directory this instance persists to (and that clear() removes).
        collection_name: Name of the Chroma collection.
        embedding_function: The OllamaEmbeddings instance used for documents and queries.
        vectorstore: The underlying Chroma vector store.
    """

    def __init__(self, persist_directory: str, embedding_model: str, collection_name: str):
        """Create the embedding function and open the Chroma collection.

        Args:
            persist_directory: Directory where Chroma keeps its data.
            embedding_model: Name of the Ollama embedding model.
            collection_name: Name of the collection inside the store.
        """
        # Remember our own location so clear() acts on this instance's data,
        # not on whatever a global configuration happens to point at.
        self.persist_directory = persist_directory
        self.collection_name = collection_name

        self.embedding_function = OllamaEmbeddings(
            model=embedding_model
        )

        self.vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embedding_function,
            persist_directory=persist_directory,
        )

    def add_texts(self, texts: List[str], metadatas: List[Dict]) -> int:
        """Store text chunks together with their metadata.

        Every chunk gets an "id" entry in its metadata (a fresh UUID unless the
        caller already supplied one), which is what search() reports back as "id".
        The caller's metadata dictionaries are copied, never modified.

        Args:
            texts: Chunks to embed and store.
            metadatas: One metadata dict per chunk; missing entries are treated as empty.

        Returns:
            The number of chunks handed to the store (0 for empty input).
        """
        if not texts:
            return 0

        supplied = list(metadatas or [])
        enriched: List[Dict] = []
        ids: List[str] = []
        for index in range(len(texts)):
            has_meta = index < len(supplied) and supplied[index]
            meta = dict(supplied[index]) if has_meta else {}
            meta.setdefault("id", str(uuid.uuid4()))
            enriched.append(meta)
            ids.append(str(meta["id"]))

        self.vectorstore.add_texts(
            texts=texts,
            metadatas=enriched,
            ids=ids,
        )

        return len(texts)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return the top_k stored chunks most similar to the query.

        Args:
            query: Free-text query to embed and search with.
            top_k: Maximum number of hits to return.

        Returns:
            One dict per hit with the keys "id", "text", "metadata" and "distance".
            "distance" is the raw score returned by the store
            (presumably lower means closer for Chroma — confirm).
        """
        hits = self.vectorstore.similarity_search_with_score(
            query,
            k=top_k,
        )

        return [
            {
                "id": doc.metadata.get("id", "unknown"),
                "text": doc.page_content,
                "metadata": doc.metadata,
                "distance": score,
            }
            for doc, score in hits
        ]

    def clear(self) -> None:
        """Delete this instance's persist directory from disk, if it exists.

        NOTE(review): the in-memory `self.vectorstore` handle is not rebuilt here;
        presumably callers create a new VectorDB after clearing — confirm.
        """
        if os.path.exists(self.persist_directory):
            shutil.rmtree(self.persist_directory)


In [None]:
def extract_text_from_url(url: str) -> str:
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with script/style/noscript content removed and pieces
        joined by single spaces.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()

    # Hand the raw bytes to the parser instead of resp.text: when the server
    # sends no charset, requests falls back to a default text encoding that
    # garbles UTF-8 pages. Only trust the response encoding if the header
    # actually declared one; otherwise let BeautifulSoup detect it.
    content_type = resp.headers.get("Content-Type", "")
    declared_encoding = resp.encoding if "charset" in content_type.lower() else None

    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared_encoding)

    # Drop elements whose content is code or fallback markup, not readable text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    return soup.get_text(separator=" ", strip=True)


In [None]:
def extract_text_from_image(filepath: str) -> str:
    """Run OCR on an image file and return the recognised text.

    Args:
        filepath: Path of the image to read.

    Returns:
        The text reported by pytesseract, with surrounding whitespace removed.
    """
    # The context manager closes the image's file handle even if OCR raises.
    with Image.open(filepath) as img:
        text = pytesseract.image_to_string(img)
    return text.strip()


In [None]:
# Whisper models that were already loaded, keyed by model name, so that
# transcribing several files does not reload the model for every file.
_WHISPER_MODELS = {}


def extract_text_from_video(filepath: str, model_name: str = "base") -> str:
    """Transcribe the audio of a media file with Whisper.

    Args:
        filepath: Path of the file to transcribe.
        model_name: Whisper model to use; defaults to "base".

    Returns:
        The transcript text with surrounding whitespace removed, or an empty
        string when the result contains no text.
    """
    model = _WHISPER_MODELS.get(model_name)
    if model is None:
        model = whisper.load_model(model_name)
        _WHISPER_MODELS[model_name] = model

    result = model.transcribe(filepath)
    return (result.get("text") or "").strip()

In [None]:
def load_raw_text_from_source(source: str) -> str:
    """Turn a URL or a local file path into plain text.

    Web addresses (http/https) are fetched; local files are dispatched on their
    lower-cased extension: PDF and text/markdown go through document loaders,
    images through OCR, videos through transcription.

    Args:
        source: URL or file path; surrounding whitespace is ignored.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the source is neither a URL nor a supported file type.
    """
    location = source.strip()

    if location.startswith(("http://", "https://")):
        return extract_text_from_url(location)

    suffix = os.path.splitext(location)[-1].lower()

    if suffix in (".pdf", ".txt", ".md"):
        if suffix == ".pdf":
            reader = PyPDFLoader(location)
        else:
            reader = TextLoader(location, encoding="utf-8")
        # Both loaders yield documents; glue their contents together line-wise.
        contents = [document.page_content for document in reader.load()]
        return "\n".join(contents).strip()

    if suffix in (".png", ".jpg", ".jpeg"):
        return extract_text_from_image(location)

    if suffix in (".mp4", ".mov", ".mkv"):
        return extract_text_from_video(location)

    raise ValueError(f"Unsupported source: {location}")


In [None]:
def chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split text into overlapping pieces with RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size: Size setting forwarded to the splitter.
        chunk_overlap: Overlap setting forwarded to the splitter.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


In [None]:
def ingest_source(vector_db: VectorDB, source: str, config: Dict) -> int:
    """Load one source, chunk its text and store the chunks.

    Args:
        vector_db: Store that receives the chunks.
        source: URL or file path to ingest.
        config: Settings dict; "chunk_size" and "chunk_overlap" are read.

    Returns:
        The number of chunks added, or 0 when the source yields fewer than
        10 characters of text (a notice is printed in that case).
    """
    text = load_raw_text_from_source(source)

    has_enough_text = bool(text) and len(text.strip()) >= 10
    if not has_enough_text:
        print(f"Skipping source (no readable text): {source}")
        return 0

    pieces = chunk_text(
        text,
        chunk_size=config["chunk_size"],
        chunk_overlap=config["chunk_overlap"],
    )

    # Every chunk carries the source it came from.
    piece_metadata = [{"source": source} for _ in pieces]
    return vector_db.add_texts(pieces, piece_metadata)

In [None]:
def ingest_data_folder(vector_db: VectorDB, config: Dict) -> int:
    """Ingest every supported file found under the configured data folder.

    The folder is walked recursively; files are selected by lower-cased
    extension and processed in sorted path order. A file that fails is
    reported and skipped so the remaining files are still ingested.

    Args:
        vector_db: Store that receives the chunks.
        config: Settings dict; "data_folder" and "allowed_extensions" are read
            here, and the dict is passed on to ingest_source().

    Returns:
        Total number of chunks added across all files.

    Raises:
        FileNotFoundError: If the data folder does not exist.
    """
    data_dir = config["data_folder"]

    if not os.path.exists(data_dir):
        raise FileNotFoundError(f"Data folder not found: {data_dir}")

    candidates = sorted(
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(data_dir)
        for filename in filenames
        if os.path.splitext(filename)[-1].lower() in config["allowed_extensions"]
    )

    if not candidates:
        print("No supported files found in data folder")
        return 0

    chunk_total = 0
    for file_path in candidates:
        try:
            count = ingest_source(vector_db, file_path, config)
            chunk_total += count
            print(f"Ingested {os.path.basename(file_path)} | chunks added: {count}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Failed ingesting {file_path}: {e}")

    print("Total chunks added:", chunk_total)
    return chunk_total


In [None]:
def ingest_links(vector_db: VectorDB, links: List[str], config: Dict) -> int:
    """Ingest a list of links one by one.

    A link that fails is reported and skipped so the remaining links are
    still processed.

    Args:
        vector_db: Store that receives the chunks.
        links: Sources to ingest, handed to ingest_source() one at a time.
        config: Settings dict passed on to ingest_source().

    Returns:
        Total number of chunks added across all links.
    """
    running_total = 0

    for url in links:
        try:
            n_chunks = ingest_source(vector_db, url, config)
            running_total += n_chunks
            print(f"Ingested link {url} | chunks added: {n_chunks}")
        except Exception as e:
            # Best effort: keep going with the next link.
            print(f"Failed ingesting link {url}: {e}")

    print("Total link chunks added:", running_total)
    return running_total



In [None]:
# Prompt text sent to the LLM. It has two placeholders, {context} and {question},
# and instructs the model to answer from the supplied context only, with a fixed
# fallback sentence when the context does not contain the answer.
RAG_PROMPT_TEMPLATE = """You are a helpful AI assistant.
Answer the question using ONLY the context below.

Context:
{context}

Question:
{question}

Rules:
- Do NOT use outside knowledge
- If the answer is not in the context, say:
  "I don't have enough information to answer this question."
- Be concise and accurate

Answer:
"""

# Template object built from the text above; RAGPipeline stores it and fills it via .format().
prompt_template = PromptTemplate(
    template=RAG_PROMPT_TEMPLATE,
    input_variables=["context", "question"],
)




In [None]:
class RAGPipeline:
    """Retrieval-augmented question answering: search the vector store, then ask the LLM.

    Attributes:
        vector_db: Store that is searched for relevant chunks.
        top_k: Number of chunks retrieved per question.
        llm: ChatOllama chat model used to generate the answer (temperature 0).
        prompt_template: Template that is filled with the context and the question.
    """

    def __init__(self, vector_db: "VectorDB", llm_model: str, top_k: int = 5):
        """Store the collaborators and create the chat model.

        Args:
            vector_db: Store to retrieve chunks from.
            llm_model: Name of the Ollama chat model.
            top_k: Number of chunks to retrieve per question.
        """
        self.vector_db = vector_db
        self.top_k = top_k
        self.llm = ChatOllama(model=llm_model, temperature=0)
        self.prompt_template = prompt_template

    def format_context(self, retrieved_chunks: List[Dict]) -> str:
        """Render retrieved chunks as numbered "Source N (origin):" blocks.

        Args:
            retrieved_chunks: Hits with "text" and "metadata" entries.

        Returns:
            The blocks joined by blank lines; an empty string for no chunks.
        """
        blocks = []
        for i, chunk in enumerate(retrieved_chunks, start=1):
            source = chunk.get("metadata", {}).get("source", "unknown")
            text = chunk.get("text", "")
            blocks.append(f"Source {i} ({source}):\n{text}")
        return "\n\n".join(blocks)

    def query(self, question: str) -> Dict:
        """Answer a question from the stored documents.

        Args:
            question: The user's question.

        Returns:
            A dict with "answer" (the model's reply, stripped), "sources"
            (distinct sources of the retrieved chunks, in retrieval order) and
            "retrieved_chunks" (the raw hits). When nothing is retrieved, the
            fixed fallback answer is returned and the LLM is not called.
        """
        retrieved = self.vector_db.search(question, top_k=self.top_k)

        if not retrieved:
            return {
                "answer": "I don't have enough information to answer this question.",
                "sources": [],
                "retrieved_chunks": [],
            }

        context = self.format_context(retrieved)
        prompt = self.prompt_template.format(context=context, question=question)

        # A chat model returns a message object whose text is in `.content`;
        # there is no `.response` attribute. Fall back to the raw return value
        # for models that hand back a plain string.
        response = self.llm.invoke(prompt)
        answer = getattr(response, "content", response)

        # dict.fromkeys removes duplicates while keeping retrieval order, so
        # the source list is stable from run to run (a set is not).
        sources = list(
            dict.fromkeys(
                chunk.get("metadata", {}).get("source", "unknown") for chunk in retrieved
            )
        )

        return {
            "answer": str(answer).strip(),
            "sources": sources,
            "retrieved_chunks": retrieved,
        }



In [None]:
# Open the vector store using the CONFIG entries that share the constructor's parameter names.
# NOTE(review): the output recorded for this cell is
# "AttributeError: 'OllamaEmbeddings' object has no attribute 'name'" — presumably a version
# mismatch between the installed vector-store and LangChain packages; confirm before relying on it.
vector_db = VectorDB(
    **{
        key: CONFIG[key]
        for key in ("persist_directory", "embedding_model", "collection_name")
    }
)

# Connect retrieval and generation; model name and retrieval depth come from CONFIG.
rag_pipeline = RAGPipeline(
    vector_db,
    llm_model=CONFIG["llm_model"],
    top_k=CONFIG["top_k_retrieval"],
)


AttributeError: 'OllamaEmbeddings' object has no attribute 'name'