In [43]:
pdf_file_path= r"/home/asad/projects/Owais AI trial tasks/RAG pipeline/data/indo-pak-histoy.pdf"

In [1]:
import os, fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer
from whoosh.index import create_in, open_dir, exists_in


In [2]:
CHROMA_DIR = "/home/asad/projects/Owais AI trial tasks/RAG pipeline/chroma_db"
WHOOSH_DIR = "/home/asad/projects/Owais AI trial tasks/RAG pipeline/whoosh"


In [3]:

# def load_text(file_path):
#     ext = file_path.lower().split('.')[-1]
#     if ext == "pdf":
#         doc = fitz.open(file_path)
#         return "\n".join([page.get_text() for page in doc])
#     elif ext in {"txt", "md"}:
#         with open(file_path, 'r', encoding='utf-8') as f:
#             return f.read()
#     else:
#         raise ValueError("Unsupported file")

def load_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([p.get_text() for p in doc])
    elif ext in {".txt", ".md"}:
        with open(file_path, encoding="utf-8") as f:
            return f.read()
    else:
        raise ValueError(f"Unsupported file type '{ext}' – supported: .pdf, .txt, .md")

def chunk_text(text, source):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return splitter.create_documents([text], metadatas=[{"source": source}])

def build_chroma(chunks):
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(chunks, embedding=embeddings, persist_directory=CHROMA_DIR)
    vectordb.persist()

def build_whoosh(chunks):
    if not os.path.exists(WHOOSH_DIR):
        os.mkdir(WHOOSH_DIR)
        schema = Schema(content=TEXT(analyzer=StemmingAnalyzer()), source=ID(stored=True))
        ix = create_in(WHOOSH_DIR, schema)
    else:
        ix = open_dir(WHOOSH_DIR)

    writer = ix.writer()
    for chunk in chunks:
        writer.add_document(content=chunk.page_content, source=chunk.metadata["source"])
    writer.commit()

def ingest(file_path):
    text = load_text(file_path)
    chunks = chunk_text(text, os.path.basename(file_path))
    build_chroma(chunks)
    build_whoosh(chunks)
    print(f"Ingested: {file_path}")

In [4]:
from whoosh.index import create_in, open_dir, exists_in

def build_whoosh(chunks):
    if not os.path.exists(WHOOSH_DIR):
        os.makedirs(WHOOSH_DIR)
    if not exists_in(WHOOSH_DIR):
        schema = Schema(content=TEXT(analyzer=StemmingAnalyzer()), source=ID(stored=True))
        ix = create_in(WHOOSH_DIR, schema)
    else:
        ix = open_dir(WHOOSH_DIR)

    writer = ix.writer()
    for chunk in chunks:
        writer.add_document(content=chunk.page_content, source=chunk.metadata["source"])
    writer.commit()


In [None]:
import os
os.environ["OPENAI_API_KEY"] = openai_api_key

In [7]:
files = [
        "/home/asad/projects/Owais AI trial tasks/RAG pipeline/data/indo-pak-histoy.pdf",
        "/home/asad/projects/Owais AI trial tasks/RAG pipeline/data/indo pak txt.txt",
        "/home/asad/projects/Owais AI trial tasks/RAG pipeline/data/richtext_converted_to_markdown.md"

    ]
for file in files:
    ingest(file)

  vectordb.persist()


Ingested: /home/asad/projects/Owais AI trial tasks/RAG pipeline/data/indo-pak-histoy.pdf
Ingested: /home/asad/projects/Owais AI trial tasks/RAG pipeline/data/indo pak txt.txt
Ingested: /home/asad/projects/Owais AI trial tasks/RAG pipeline/data/richtext_converted_to_markdown.md


In [12]:
# rag_query.py

import openai
from langchain.embeddings import OpenAIEmbeddings
from chromadb import PersistentClient
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
openai_client = OpenAI()
# CHROMA_DIR = "chroma_store"
# WHOOSH_DIR = "whoosh_index"
# openai.api_key = "your-openai-key"

def bm25_search(query):
    ix = open_dir(WHOOSH_DIR)
    with ix.searcher() as searcher:
        parser = QueryParser("content", schema=ix.schema)
        q = parser.parse(query)
        results = searcher.search(q, limit=5)
        return [{"text": r["content"], "source": r["source"]} for r in results]

def vector_search(query, embedding):
    db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding)
    results = db.similarity_search_with_score(query, k=5)
    return [{"text": r[0].page_content, "source": r[0].metadata["source"], "score": r[1]} for r in results]

def rerank(all_docs, query, embeddings):
    q_vec = embeddings.embed_query(query)
    doc_vecs = [embeddings.embed_query(doc["text"]) for doc in all_docs]
    scores = cosine_similarity([q_vec], doc_vecs)[0]
    for i, score in enumerate(scores):
        all_docs[i]["confidence"] = round(float(score), 2)
    return sorted(all_docs, key=lambda x: -x["confidence"])

def generate_response(query, top_docs):
    context = "\n\n".join(f"[{d['source']}]\n{d['text']}" for d in top_docs)
    prompt = f"Use the following context to answer:\n{context}\n\nQ: {query}\nA:"

    resp = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        stream=False  # change to True if you want streaming output
    )

    return resp.choices[0].message

def query_rag(query):
    embeddings = OpenAIEmbeddings()
    docs = bm25_search(query) + vector_search(query, embeddings)
    print(docs)
    if not docs:
        return "No results found.", []
    ranked = rerank(docs, query, embeddings)[:3]
    answer = generate_response(query, ranked)
    return answer, ranked



In [13]:

q = "what happened in 1947?"
answer, sources = query_rag(q)
print("🔍 Answer:\n", answer)
print("\n📚 Sources:")
for s in sources:
    print(f"{s['source']} | Confidence: {s['confidence']}")

[{'text': '(founded in 1906) played pivotal roles. While Congress aimed for a unified Indian independence, the Muslim League feared marginalization of\n\nMuslims in a Hindu-majority India.\n\n2\\. Partition of British India – 1947\n\nThe demand for a separate Muslim state, Pakistan, was formally introduced by Muhammad Ali Jinnah in the Lahore Resolution (1940). Astensions grew, the British decided to leave India, leading to the Mountbatten Plan (1947), which resulted in the partition into India and Pakistan.', 'source': 'richtext_converted_to_markdown.md', 'score': 0.4264531433582306}, {'text': '(founded in 1906) played pivotal roles. While Congress aimed for a unified Indian independence, the Muslim League feared marginalization of\n\nMuslims in a Hindu-majority India.\n\n2\\. Partition of British India – 1947\n\nThe demand for a separate Muslim state, Pakistan, was formally introduced by Muhammad Ali Jinnah in the Lahore Resolution (1940). Astensions grew, the British decided to leav