In [1]:

import os
import glob
import json
import re
from typing import List, Dict

import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss

import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration
PDF_FOLDER = "./pdfs"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
EMBEDDING_DIM = 384
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200
INDEX_PATH = "jharkhand_faiss.index"
METADATA_PATH = "jharkhand_metadata.json"

In [3]:
# Gemini API key
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY")
print(GEMINI_API_KEY)
genai.configure(api_key=GEMINI_API_KEY)

print("Configuration done. PDFs expected in:", PDF_FOLDER)

AIzaSyAEX75wi2PJu038QuAdeelmTZ6_-mQNVqY
Configuration done. PDFs expected in: ./pdfs


In [4]:
# 3) PDF loading and text extraction

def extract_text_from_pdf(path: str) -> str:
    texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            txt = page.extract_text()
            if txt:
                texts.append(txt)
    return "\n".join(texts)


def load_all_pdfs(folder: str) -> Dict[str, str]:
    pdfs = glob.glob(os.path.join(folder, "*.pdf"))
    results = {}
    for p in pdfs:
        name = os.path.basename(p)
        print(f"Extracting: {name} ...")
        try:
            results[name] = extract_text_from_pdf(p)
        except Exception as e:
            print(f"Failed to read {name}: {e}")
    return results

if not os.path.exists(PDF_FOLDER) or len(glob.glob(os.path.join(PDF_FOLDER, "*.pdf"))) == 0:
    os.makedirs(PDF_FOLDER, exist_ok=True)
    docs = {"demo_policy.txt": "This is a demo policy about agriculture subsidies in Jharkhand. Eligibility: residents of Jharkhand; amount: up to INR 10,000; effective: 2024-01-01."}
    print("No PDFs detected — using demo text.")
else:
    docs = load_all_pdfs(PDF_FOLDER)

Extracting: 1. Jharkhand Food and Feed Processing Policy 2024.pdf ...
Extracting: 2. Jharkhand Export Policy 2023.pdf ...
Extracting: 3. Jharkhand MSME Promotion Policy 2023.pdf ...
Extracting: 4. Jharkhand Electric Vehicle Policy 2022.pdf ...


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value


Extracting: 5. Jharkhand Ethanol Production Promotion Policy, 2022.pdf ...
Extracting: 6. Jharkhand Industrial Park and Logistic Policy 2022.pdf ...
Extracting: 7. Jharkhand Industrial and Investment Promotion Policy 2021.pdf ...
Extracting: 8. Jharkhand Procurement Policy 2014.pdf ...
Extracting: BPO Policy 2016.pdf ...
Extracting: IPR_Assignment.pdf ...
Extracting: ITes Policy 2016.pdf ...
Extracting: Jharkhand State Mineral Policy 2017.pdf ...
Extracting: visionhindi.pdf ...


In [5]:
# 4) Text cleaning and chunking

def clean_text(t: str) -> str:
    return re.sub(r"\s+", " ", t).strip()


def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP):
    text = clean_text(text)
    chunks, start, L = [], 0, len(text)
    while start < L:
        end = min(start + chunk_size, L)
        chunks.append(text[start:end])
        if end == L:
            break
        start = end - overlap
    return chunks

corpus = []
for fname, txt in docs.items():
    chunks = chunk_text(txt)
    for i, c in enumerate(chunks):
        corpus.append({"id": f"{fname}_chunk_{i}", "source": fname, "text": c})

print(f"Prepared {len(corpus)} chunks from {len(docs)} documents.")

Prepared 901 chunks from 13 documents.


In [6]:
# 5) Embeddings and FAISS index

embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
texts = [c["text"] for c in corpus]
embeddings = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
faiss.normalize_L2(embeddings)

index = faiss.IndexFlatIP(EMBEDDING_DIM)
index.add(embeddings)
print("FAISS index built with", index.ntotal, "vectors")

faiss.write_index(index, INDEX_PATH)
with open(METADATA_PATH, "w", encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False, indent=2)

Batches: 100%|██████████| 29/29 [00:12<00:00,  2.35it/s]

FAISS index built with 901 vectors





In [7]:

# 6) Retriever

def load_index_and_metadata(index_path: str = INDEX_PATH, metadata_path: str = METADATA_PATH):
    idx = faiss.read_index(index_path)
    with open(metadata_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    return idx, meta


index, metadata = load_index_and_metadata()

def retrieve(query: str, k: int = 5):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)
    results = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0:
            continue
        meta = metadata[idx]
        results.append({"score": float(score), "id": meta["id"], "source": meta["source"], "text": meta["text"]})
    return results

print(retrieve("What are agriculture subsidies for Jharkhand?", k=3))


[{'score': 0.739046573638916, 'id': '1. Jharkhand Food and Feed Processing Policy 2024.pdf_chunk_44', 'source': '1. Jharkhand Food and Feed Processing Policy 2024.pdf', 'text': 'nt Subsidy (CPIS) in addition to what is already available under clause(s) 8.1, 8.2, 8.3 and 8.4 subject to the same maximum monetary limits. The Government of Jharkhand has identified the following products under the One District One Product Programme: Sl. No. District Name Product Name 1 Deoghar Dairy products 2 Dumka Dairy products 3 East Singhbhum Cashew Nut 4 Hazaribagh Jaggery 5 Jamtara Paddy based(Puffed rice, flattened rice, rice) 6 Khunti Tamarind 7 Koderma Ragi(Millet) 8 Latehar Mahua 9 Lohardaga Green Peas& other vegetable processing 10 Pakur Meat Processing 11 Ranchi Honey processing 12 Sahibganj Pickle 13 Saraikela Kharsawan Chironji 14 West Singhbhum Custard Apple 15 Bokaro Paddy/Rice based unit 16 Chatra Tomato 17 Dhanbad Paddy/Rice based unit 18 Garhwa Chilli 19 Giridih Mai'}, {'score': 0.720971

In [8]:
# 7) RAG with Gemini

SYSTEM_PROMPT = (
    "You are an assistant that answers questions about Jharkhand government policies. "
    "Use only the provided source excerpts and cite source filenames. "
    "If the answer is not present, say you don't know."
)

def generate_answer_gemini(question: str, contexts: List[Dict]) -> str:
    context_block = "\n\n".join([f"Source: {c['source']}\n{c['text']}" for c in contexts])

    prompt = f"""
    {SYSTEM_PROMPT}

    CONTEXT:
    {context_block}

    QUESTION: {question}

    Provide a clear, concise answer with sources.
    """
    model = genai.GenerativeModel("gemini-1.5-pro")
    response = model.generate_content(prompt)
    return response.text


def answer_question(question: str, k: int = 5) -> Dict:
    contexts = retrieve(question, k=k)
    ans = generate_answer_gemini(question, contexts)
    return {"question": question, "answer": ans, "contexts": contexts}


In [12]:
question = "What are agriculture subsidies for Jharkhand?"
out = answer_question(question, k=5)
print(out)

ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-pro"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
}
violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-pro"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
}
violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_input_token_count"
  quota_id: "GenerateContentInputTokensPerModelPerMinute-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-pro"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 3
}
]

In [11]:

# 8) Gradio UI

try:
    import gradio as gr
except Exception:
    gr = None

if gr:
    def qa_fn(question):
        out = answer_question(question, k=5)
        srcs = "\n\n".join([f"[{i+1}] {c['source']} (score={c['score']:.3f})\n{c['text'][:400]}..." for i,c in enumerate(out['contexts'])])
        return out['answer'], srcs

    with gr.Blocks() as demo:
        gr.Markdown("# Jharkhand Policies — RAG QnA (Gemini)")
        txt = gr.Textbox(lines=2, placeholder="Ask about Jharkhand policies...", label="Question")
        out_ans = gr.Textbox(lines=8, label="Answer")
        out_ctx = gr.Textbox(lines=12, label="Retrieved contexts")
        btn = gr.Button("Ask")
        btn.click(fn=qa_fn, inputs=txt, outputs=[out_ans, out_ctx])

    # To launch locally: demo.launch()
    demo.launch()
    print("Gradio UI ready. To launch, run: demo.launch()")
else:
    print("Gradio not installed — skip UI cell.")

# %%
# 9) Index and metadata

print("Index file:", INDEX_PATH)
print("Metadata file:", METADATA_PATH)
print("Notebook complete with Gemini integration.")
# demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Gradio UI ready. To launch, run: demo.launch()
Index file: jharkhand_faiss.index
Metadata file: jharkhand_metadata.json
Notebook complete with Gemini integration.


Traceback (most recent call last):
  File "c:\Users\ASUS\Desktop\Major Project\ML-KK\.venv\lib\site-packages\gradio\queueing.py", line 667, in process_events
    response = await route_utils.call_process_api(
  File "c:\Users\ASUS\Desktop\Major Project\ML-KK\.venv\lib\site-packages\gradio\route_utils.py", line 349, in call_process_api
    output = await app.get_blocks().process_api(
  File "c:\Users\ASUS\Desktop\Major Project\ML-KK\.venv\lib\site-packages\gradio\blocks.py", line 2274, in process_api
    result = await self.call_function(
  File "c:\Users\ASUS\Desktop\Major Project\ML-KK\.venv\lib\site-packages\gradio\blocks.py", line 1781, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "c:\Users\ASUS\Desktop\Major Project\ML-KK\.venv\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "c:\Users\ASUS\Desktop\Major Project\ML-KK\.venv\lib\site-packages\anyio\_backends\