# Upload PDF and OCR processing

In [6]:
from google.colab import files
uploaded = files.upload()  # 📂 Upload multiple PDFs here

import logging
logging.getLogger('ppocr').setLevel(logging.ERROR)

import fitz  # PyMuPDF
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
import os
import json
import cv2
import numpy as np
import re
from fuzzywuzzy import fuzz

ocr = PaddleOCR(use_angle_cls=True, lang='en')
output_data = []

def clean_text_advanced(text):
    text = text.replace("\n", " ")
    text = re.sub(r"[^\w\s]", "", text)
    return " ".join(text.split()).lower()

def chunk_text(text, max_words=70, overlap=20):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i + max_words]
        chunks.append(' '.join(chunk))
        i += max_words - overlap
    return chunks

def preprocess_image(image):
    img = np.array(image)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if gray.shape[0] < 1000 or gray.shape[1] < 1000:
        gray = cv2.resize(gray, (int(gray.shape[1] * 1.5), int(gray.shape[0] * 1.5)), interpolation=cv2.INTER_CUBIC)
    sharp = cv2.filter2D(gray, -1, np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]))
    denoised = cv2.fastNlMeansDenoising(sharp, None, h=10)
    return cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 10)

def should_save_ocr(ocr_text, native_text, fuzzy_thresh=90, length_ratio_thresh=1.2):
    ocr_clean = clean_text_advanced(ocr_text)
    native_clean = clean_text_advanced(native_text)
    score = fuzz.token_set_ratio(native_clean, ocr_clean)
    length_ratio = len(ocr_clean) / max(1, len(native_clean))
    return not (score >= fuzzy_thresh and length_ratio < length_ratio_thresh)

# 🔁 Process each uploaded PDF
for pdf_filename in uploaded.keys():
    print(f"\n📄 Processing PDF: {pdf_filename}")
    doc = fitz.open(pdf_filename)
    pages_images = convert_from_path(pdf_filename, dpi=300)

    for i, img in enumerate(pages_images):
        img.save(f"page_{i + 1}_{os.path.basename(pdf_filename)}.png")  # 🖼 Save per-pdf images separately

    for page_number in range(len(doc)):
        print(f"   📃 Page {page_number + 1}")
        page = doc[page_number]
        raw_text = page.get_text()

        if raw_text.strip():
            chunks = chunk_text(raw_text, max_words=70, overlap=20)
            for i, chunk in enumerate(chunks):
                output_data.append({
                    "type": "text",
                    "source": pdf_filename,
                    "section": f"Page {page_number + 1}, Chunk {i + 1}",
                    "content": chunk
                })

        try:
            image = preprocess_image(pages_images[page_number])
            result = ocr.ocr(image, cls=True)
            flat_results = [line for block in result for line in block]
            filtered_lines = [line[1][0].strip() for line in flat_results if line[1][0].strip()]
            full_ocr_text = " ".join(filtered_lines).strip()

            if len(full_ocr_text.split()) >= 10 and should_save_ocr(full_ocr_text, raw_text):
                chunks = chunk_text(full_ocr_text, max_words=70, overlap=20)
                for i, chunk in enumerate(chunks):
                    output_data.append({
                        "type": "image",
                        "source": pdf_filename,
                        "section": f"Page {page_number + 1}, OCR Chunk {i + 1}",
                        "content": chunk
                    })

        except Exception as e:
            print(f"⚠️ OCR failed on Page {page_number + 1}: {e}")

# 💾 Save all chunked data from all PDFs
with open("parsed_output_chunked.json", 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print("\n✅ All PDFs processed. Combined chunked data saved.")
files.download("parsed_output_chunked.json")

Saving MLOps Final Exam (Solution)(Spring-2023).pdf to MLOps Final Exam (Solution)(Spring-2023).pdf
download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:17<00:00, 234kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:18<00:00, 542kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:16<00:00, 135kiB/s]

[2025/06/22 21:47:55] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c





📄 Processing PDF: MLOps Final Exam (Solution)(Spring-2023).pdf
   📃 Page 1
[2025/06/22 21:48:19] ppocr DEBUG: dt_boxes num : 45, elapsed : 0.5992431640625
[2025/06/22 21:48:19] ppocr DEBUG: cls num  : 45, elapsed : 0.17390918731689453
[2025/06/22 21:48:25] ppocr DEBUG: rec_res num  : 45, elapsed : 5.953069448471069
   📃 Page 2
[2025/06/22 21:48:36] ppocr DEBUG: dt_boxes num : 42, elapsed : 0.25591540336608887
[2025/06/22 21:48:36] ppocr DEBUG: cls num  : 42, elapsed : 0.18630480766296387
[2025/06/22 21:48:53] ppocr DEBUG: rec_res num  : 42, elapsed : 17.3131046295166
   📃 Page 3
[2025/06/22 21:49:05] ppocr DEBUG: dt_boxes num : 53, elapsed : 0.3988921642303467
[2025/06/22 21:49:05] ppocr DEBUG: cls num  : 53, elapsed : 0.1772761344909668
[2025/06/22 21:49:11] ppocr DEBUG: rec_res num  : 53, elapsed : 5.8791844844818115
   📃 Page 4
[2025/06/22 21:49:23] ppocr DEBUG: dt_boxes num : 50, elapsed : 0.25315427780151367
[2025/06/22 21:49:24] ppocr DEBUG: cls num  : 50, elapsed : 0.1709206104

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Embeddings generated and stored in FAISS

In [4]:
import os
import re
import json
from google.colab import files

import faiss
import numpy as np
import torch
import clip
from tqdm import tqdm
from PIL import Image
from sentence_transformers import SentenceTransformer

# ── 1. Load chunked JSON for all uploaded PDFs ────────────────────────────────
with open("parsed_output_chunked.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ── 2. Initialize models once ────────────────────────────────────────────────
text_model = SentenceTransformer('all-MiniLM-L6-v2')
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# ── 3. Embed all text/OCR chunks into one FAISS index ─────────────────────────
text_embeddings = []
text_metadata   = []

print("\n🔤 Embedding text + OCR chunks…")
for entry in tqdm(data, desc="Text/OCR chunks"):
    if entry["type"] in ("text","image"):
        emb = text_model.encode(entry["content"], normalize_embeddings=True)
        text_embeddings.append(emb.astype(np.float32))
        text_metadata.append(entry)

if text_embeddings:
    dim = text_embeddings[0].shape[-1]
    t_index = faiss.IndexFlatL2(dim)
    t_index.add(np.stack(text_embeddings))
    faiss.write_index(t_index, "textual_index.faiss")

    with open("textual_metadata.json", "w", encoding="utf-8") as f:
        json.dump(text_metadata, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved textual_index.faiss ({len(text_embeddings)} vectors) + textual_metadata.json")
else:
    print("⚠️ No text/OCR chunks found—skipping textual index.")

# ── 4. Embed all full‐page images into one FAISS index ────────────────────────
image_embeddings = []
image_metadata   = []

print("\n🖼 Embedding full-page images…")
for img_fn in tqdm(os.listdir(), desc="Page images"):
    m = re.match(r"page_(\d+)_(.+)\.png$", img_fn)
    if not m:
        continue
    page_num, source_pdf = int(m.group(1)), m.group(2)

    # load & preprocess for CLIP
    img = Image.open(img_fn).convert("RGB")
    inp = clip_preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        feat = clip_model.encode_image(inp)
        feat = feat / feat.norm(dim=-1, keepdim=True)

    image_embeddings.append(feat.cpu().numpy().flatten().astype(np.float32))
    image_metadata.append({
        "page":     page_num,
        "filename": img_fn,
        "source":   source_pdf
    })

if image_embeddings:
    dim = image_embeddings[0].shape[0]
    i_index = faiss.IndexFlatL2(dim)
    i_index.add(np.stack(image_embeddings))
    faiss.write_index(i_index, "image_clip_index.faiss")

    with open("image_clip_metadata.json", "w", encoding="utf-8") as f:
        json.dump(image_metadata, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved image_clip_index.faiss ({len(image_embeddings)} vectors) + image_clip_metadata.json")
else:
    print("⚠️ No page images found—skipping image CLIP index.")

print("\n✅ All embedding indexes and metadata written.")

files.download("image_clip_index.faiss")

files.download("image_clip_metadata.json")

files.download("textual_metadata.json")
files.download("textual_index.faiss")






🔤 Embedding text + OCR chunks…


Text/OCR chunks: 100%|██████████| 124/124 [00:07<00:00, 16.47it/s]


💾 Saved textual_index.faiss (124 vectors) + textual_metadata.json

🖼 Embedding full-page images…


Page images: 100%|██████████| 26/26 [00:08<00:00,  3.00it/s]

💾 Saved image_clip_index.faiss (18 vectors) + image_clip_metadata.json

✅ All embedding indexes and metadata written.





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Streamlit App Code

In [6]:
# 2) Write your Streamlit app to app.py
%%bash
cat > app.py << 'EOF'
import streamlit as st
import json
import requests

import faiss
import numpy as np
import torch
import clip
from PIL import Image
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from google.colab import files

# ── CONFIG ────────────────────────────────────────────────────────────────────
LM_API_URL   = "https://ce55-119-73-124-175.ngrok-free.app/v1/chat/completions"
LM_MODEL     = "meta-llama-3-8b-instruct"
TOP_K_TEXT   = 5

# ── LOAD INDEXES & METADATA ────────────────────────────────────────────────────
text_index = faiss.read_index("textual_index.faiss")
with open("textual_metadata.json", "r", encoding="utf-8") as f:
    text_metadata = json.load(f)

image_index = faiss.read_index("image_clip_index.faiss")
with open("image_clip_metadata.json", "r", encoding="utf-8") as f:
    image_metadata = json.load(f)

# ── LOAD MODELS ────────────────────────────────────────────────────────────────
text_model    = SentenceTransformer("all-MiniLM-L6-v2")
device        = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preproc = clip.load("ViT-B/32", device=device)

# ── TEXT Q&A RETRIEVAL ─────────────────────────────────────────────────────────
def retrieve_text_context(query, top_k=TOP_K_TEXT):
    q_emb = text_model.encode(query, normalize_embeddings=True).astype(np.float32)
    _, I = text_index.search(np.expand_dims(q_emb,0), top_k)
    return [ text_metadata[i] for i in I[0] ]

# ── IMAGE → TEXT Q&A RETRIEVAL ────────────────────────────────────────────────
def retrieve_context_from_image(image_path):
    img = Image.open(image_path).convert("RGB")
    inp = clip_preproc(img).unsqueeze(0).to(device)
    with torch.no_grad():
        emb = clip_model.encode_image(inp)
        emb = emb / emb.norm(dim=-1, keepdim=True)
    emb_np = emb.cpu().numpy().astype(np.float32)
    _, I = image_index.search(emb_np, 1)
    idx = I[0][0]
    page_info = image_metadata[idx]
    page_num   = page_info["page"]
    src_pdf    = page_info["source"]
    image_chunks = [
        ch for ch in text_metadata
        if ch["type"]   == "image"
        and ch["source"]== src_pdf
        and ch["section"].startswith(f"Page {page_num},")
    ]
    if image_chunks:
        return image_chunks
    text_chunks = [
        ch for ch in text_metadata
        if ch["type"]   == "text"
        and ch["source"]== src_pdf
        and ch["section"].startswith(f"Page {page_num},")
    ]
    return text_chunks

# ── LLM CALL ───────────────────────────────────────────────────────────────────
def get_answer_from_llm(question, context_chunks):
    ctx = ""
    for ch in context_chunks:
        page_only = ch["section"].split(",")[0]
        src       = ch["source"]
        content   = ch["content"]
        ctx += f"[{src} | {page_only}]\n{content}\n\n"
    system = "You are a helpful assistant."
    user   = (
        "Use the following context snippets to answer the question. "
        "Always cite the PDF name and page number in your answer."
        "Answer my question as done in the following examples"
        "Q: What is value of investments for 2019?"
        "A: 1,200,056 Source: financials.pdf, Page: 15"
        "Q: What is value of inventories for June 2018?"
        "A: 522,354. Source: hbl.pdf, Page: 10"
        "Dont add english text, just answer in numerical value.\n\n"
        f"Context:\n{ctx}"
        f"Question: {question}\n"
        "Answer:"
    )
    payload = {
        "model": LM_MODEL,
        "messages": [
            {"role":"system", "content":system},
            {"role":"user",   "content":user}
        ],
        "temperature":0.7,
        "max_tokens":512
    }
    headers = {"Content-Type": "application/json"}
    try:
        response = requests.post(LM_API_URL, json=payload, headers=headers)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"❌ Error communicating with LLM: {e}"

# ── STREAMLIT UI ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="PDF & Image Q&A", layout="centered")
st.title("📚 PDF & Image Q&A Chat")

mode = st.radio("Mode", ("Text Q&A", "Image Q&A"))

if mode == "Text Q&A":
    question = st.text_input("Enter your question about the PDF")
    if st.button("Ask") and question:
        with st.spinner("Thinking..."):
            ctx = retrieve_text_context(question)
            answer = get_answer_from_llm(question, ctx)
        st.markdown("**Answer:**")
        st.write(answer)

else:
    uploaded = st.file_uploader("Upload an image", type=["png","jpg","jpeg"])
    question = st.text_input("Enter your question about the image")
    if st.button("Ask") and uploaded and question:
        with st.spinner("Thinking..."):
            tmp_path = f"/tmp/{uploaded.name}"
            with open(tmp_path, "wb") as f:
                f.write(uploaded.getbuffer())
            ctx = retrieve_context_from_image(tmp_path)
            answer = get_answer_from_llm(question, ctx)
        st.markdown("**Answer:**")
        st.write(answer)
EOF


# Get Authentication Token

In [9]:
!ngrok config add-authtoken "YOUR TOKEN"

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


# Launch the Interface

In [11]:
# 3) Launch Streamlit & Ngrok
from pyngrok import ngrok

# expose port 8501
public_url = ngrok.connect(8501)
print("▶️ Streamlit is live on:", public_url)

# run Streamlit in the background
import subprocess, sys
subprocess.Popen([sys.executable, "-m", "streamlit", "run", "app.py", "--server.port=8501"])


▶️ Streamlit is live on: NgrokTunnel: "https://bfd2-34-169-94-240.ngrok-free.app" -> "http://localhost:8501"


<Popen: returncode: None args: ['/usr/bin/python3', '-m', 'streamlit', 'run'...>