Title: Alltius AI Data Science Intern(PDF Parsing + JSON Extraction)

In [1]:
# Install required libraries
!pip install pymupdf pdfplumber pandas pillow

Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m79.6 MB/s[0m eta [36m0:

In [2]:
from google.colab import files

uploaded = files.upload()  # choose your factsheet PDF
pdf_path = list(uploaded.keys())[0]  # get uploaded file name
print("Uploaded:", pdf_path)

Saving [Fund Factsheet - May]360ONE-MF-May 2025.pdf.pdf to [Fund Factsheet - May]360ONE-MF-May 2025.pdf.pdf
Uploaded: [Fund Factsheet - May]360ONE-MF-May 2025.pdf.pdf


In [3]:
# Import libraries
import fitz  # PyMuPDF
import pdfplumber
import json, re, os
from pathlib import Path
from statistics import median

In [4]:
# Define helpers
def clean_text(s):
    if not s: return ""
    s = re.sub(r'\s+\n\s+', '\n', s)
    s = s.replace('\n', ' ')
    s = re.sub(r'\s{2,}', ' ', s)
    s = re.sub(r'Mutual Fund investments are subject to.*', '', s, flags=re.I)
    return s.strip()

def save_image_bytes(img_dict, out_dir, prefix="img"):
    data = img_dict["image"]
    ext = img_dict.get("ext", "png")
    idx = img_dict.get("xref", 0)
    fname = f"{prefix}_xref{idx}.{ext}"
    out_path = Path(out_dir) / fname
    with open(out_path, "wb") as f:
        f.write(data)
    return str(out_path)

def extract_tables_pdfplumber(pdf_path, page_no):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        if page_no < len(pdf.pages):
            page = pdf.pages[page_no]
            raw_tables = page.extract_tables()
            for t in raw_tables:
                tables.append(t)
    return tables

def extract_page_fitz(doc, page_number, pdf_path=None, images_dir=None):
    page = doc.load_page(page_number)
    text_dict = page.get_text("dict")
    blocks = text_dict.get("blocks", [])

    span_sizes = []
    for b in blocks:
        if b.get("type") == 0:
            for line in b.get("lines", []):
                for span in line.get("spans", []):
                    if isinstance(span.get("size"), (int, float)):
                        span_sizes.append(span["size"])
    median_sz = median(span_sizes) if span_sizes else 0
    heading_thresh = median_sz + 1.4

    page_content = []
    current_section = None

    for b in blocks:
        if b.get("type") == 0:  # text
            for line in b.get("lines", []):
                line_text = "".join(span.get("text", "") for span in line.get("spans", []))
                if not line_text.strip():
                    continue
                max_size = max((span.get("size", 0) for span in line.get("spans", [])), default=0)
                line_text_clean = re.sub(r'\s+', ' ', line_text).strip()

                is_upper = line_text_clean.upper() == line_text_clean and len(line_text_clean) <= 80
                if (max_size >= heading_thresh and max_size > 6) or is_upper:
                    current_section = line_text_clean
                    page_content.append({
                        "type": "section",
                        "section": current_section,
                        "sub_section": None,
                        "text": None
                    })
                else:
                    page_content.append({
                        "type": "paragraph",
                        "section": current_section,
                        "sub_section": None,
                        "text": clean_text(line_text_clean)
                    })

        elif b.get("type") == 1:  # image block
            bbox = b.get("bbox")
            page_content.append({
                "type": "chart",
                "section": current_section,
                "sub_section": None,
                "description": f"Image/chart at bbox {bbox}",
                "image": None
            })

    saved_images = []
    try:
        images = page.get_images(full=True)
        for imginfo in images:
            xref = imginfo[0]
            img_dict = doc.extract_image(xref)
            img_dict["xref"] = xref
            saved = save_image_bytes(img_dict, images_dir or ".", prefix=f"page{page_number+1}")
            saved_images.append({"xref": xref, "path": saved})
    except Exception:
        pass

    img_paths = [i["path"] for i in saved_images]
    ci = 0
    for c in page_content:
        if c["type"] == "chart":
            if ci < len(img_paths):
                c["image"] = img_paths[ci]
            ci += 1

    tables = []
    if pdf_path is not None:
        try:
            tables = extract_tables_pdfplumber(pdf_path, page_number)
        except Exception:
            pass

    for t in tables:
        page_content.append({
            "type": "table",
            "section": current_section,
            "description": None,
            "table_data": t
        })

    return page_content

def build_json(pdf_path, out_json, images_dir="images", verbose=False):
    doc = fitz.open(pdf_path)
    pages = []
    os.makedirs(images_dir, exist_ok=True)
    for i in range(doc.page_count):
        if verbose:
            print(f"Processing page {i+1}/{doc.page_count} ...")
        content = extract_page_fitz(doc, i, pdf_path=pdf_path, images_dir=images_dir)
        pages.append({
            "page_number": i+1,
            "content": content
        })
    final = {"pages": pages}
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(final, f, ensure_ascii=False, indent=2)
    return out_json


In [5]:
# Run extraction
output_json = "factsheet_output.json"
images_dir = "factsheet_images"

os.makedirs(images_dir, exist_ok=True)

result = build_json(pdf_path, output_json, images_dir=images_dir, verbose=True)

print("✅ JSON saved:", result)

Processing page 1/17 ...
Processing page 2/17 ...
Processing page 3/17 ...
Processing page 4/17 ...
Processing page 5/17 ...
Processing page 6/17 ...
Processing page 7/17 ...
Processing page 8/17 ...
Processing page 9/17 ...
Processing page 10/17 ...
Processing page 11/17 ...
Processing page 12/17 ...
Processing page 13/17 ...
Processing page 14/17 ...
Processing page 15/17 ...
Processing page 16/17 ...
Processing page 17/17 ...
✅ JSON saved: factsheet_output.json


In [6]:
# Preview JSON
with open(output_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# Show first page only
data["pages"][0]

{'page_number': 1,
 'content': [{'type': 'chart',
   'section': None,
   'sub_section': None,
   'description': 'Image/chart at bbox (530.4791870117188, 16.103736877441406, 579.3099975585938, 80.11248779296875)',
   'image': 'factsheet_images/page1_xref267.jpeg'},
  {'type': 'paragraph',
   'section': None,
   'sub_section': None,
   'text': 'June 2025'},
  {'type': 'paragraph',
   'section': None,
   'sub_section': None,
   'text': 'Page |'},
  {'type': 'paragraph', 'section': None, 'sub_section': None, 'text': ''},
  {'type': 'section', 'section': '1', 'sub_section': None, 'text': None},
  {'type': 'chart',
   'section': '1',
   'sub_section': None,
   'description': 'Image/chart at bbox (17.00787353515625, 30.14190673828125, 579.0892944335938, 825.1261596679688)',
   'image': 'factsheet_images/page1_xref269.jpeg'},
  {'type': 'section', 'section': 'MONTHLY', 'sub_section': None, 'text': None},
  {'type': 'section',
   'section': 'FACTSHEET',
   'sub_section': None,
   'text': None},

In [7]:
# Downloading results
from google.colab import files

files.download(output_json)  # download JSON file

!zip -r images.zip factsheet_images
files.download("images.zip")  # download extracted images


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: factsheet_images/ (stored 0%)
  adding: factsheet_images/page10_xref46.jpeg (deflated 13%)
  adding: factsheet_images/page15_xref100.jpeg (deflated 13%)
  adding: factsheet_images/page15_xref263.png (stored 0%)
  adding: factsheet_images/page1_xref263.png (stored 0%)
  adding: factsheet_images/page10_xref50.jpeg (deflated 14%)
  adding: factsheet_images/page7_xref28.jpeg (deflated 14%)
  adding: factsheet_images/page6_xref20.jpeg (deflated 12%)
  adding: factsheet_images/page13_xref84.jpeg (deflated 21%)
  adding: factsheet_images/page6_xref263.png (stored 0%)
  adding: factsheet_images/page5_xref263.png (stored 0%)
  adding: factsheet_images/page14_xref92.jpeg (deflated 14%)
  adding: factsheet_images/page12_xref263.png (stored 0%)
  adding: factsheet_images/page16_xref263.png (stored 0%)
  adding: factsheet_images/page17_xref263.png (stored 0%)
  adding: factsheet_images/page14_xref263.png (stored 0%)
  adding: factsheet_images/page3_xref263.png (stored 0%)
  adding: factsh

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>