In [2]:
!pip install llama-parse

Collecting llama-parse
  Downloading llama_parse-0.6.51-py3-none-any.whl.metadata (6.9 kB)
Collecting llama-cloud-services>=0.6.51 (from llama-parse)
  Downloading llama_cloud_services-0.6.51-py3-none-any.whl.metadata (3.5 kB)
Collecting llama-cloud==0.1.34 (from llama-cloud-services>=0.6.51->llama-parse)
  Downloading llama_cloud-0.1.34-py3-none-any.whl.metadata (1.2 kB)
Collecting llama-index-core>=0.12.0 (from llama-cloud-services>=0.6.51->llama-parse)
  Downloading llama_index_core-0.12.52.post1-py3-none-any.whl.metadata (2.5 kB)
Collecting python-dotenv<2.0.0,>=1.0.1 (from llama-cloud-services>=0.6.51->llama-parse)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting aiosqlite (from llama-index-core>=0.12.0->llama-cloud-services>=0.6.51->llama-parse)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting banks<3,>=2.2.0 (from llama-index-core>=0.12.0->llama-cloud-services>=0.6.51->llama-parse)
  Downloading banks-2.2.0-py3-none-any.w

In [1]:
import nest_asyncio
nest_asyncio.apply()
import os
# Read the LlamaCloud key from the environment instead of committing it to the
# notebook. The key that used to be hard-coded here has been exposed and should
# be revoked. Set the variable before running, e.g. `%env LLAMA_CLOUD_API_KEY=...`;
# the value is an empty string when the variable is unset.
LLAMA_CLOUD_API_KEY=os.environ.get("LLAMA_CLOUD_API_KEY", "")

In [2]:
from llama_parse import LlamaParse

In [51]:
  # Prompt text that the driver loop hands to process_challenge_1a, which forwards
  # it to LlamaParse as `parsing_instruction`. It asks for a single JSON object with
  # the document "title" and an "outline" list of {"level", "text", "page"} entries;
  # extract_outline_using_doc_index later decodes each returned document with
  # json.loads, so the reply format requested here must stay valid JSON.
  parsinginstruct = """
You are an intelligent document parser designed to extract a structured outline from research papers, reports, whitepapers, policy documents, and technical PDFs.

Your goal is to produce a clean, hierarchical representation of the document’s headings in the following JSON format:

{
  "title": "<document title>",
  "outline": [
    {
      "level": "H1",
      "text": "<heading text>",
      "page": <page number>
    },
    ...
  ]
}

INSTRUCTIONS:

1. Title Extraction:
   - Extract the document title from the first page or cover.This is the title for the entire document.It should appear only once in the json file for the entire pdf.
   - It is usually the largest, most centered, and prominent text.
   - It may span multiple lines.
   - Do not extract logos or headers.
   1. The **document title** (only once for the entire document)

2. Outline Extraction:
   - Only extract **meaningful hierarchical section headings** that define the structure of the document:
     - H1: Top-level (e.g., Introduction, Abstract, Methodology)
     - H2: Sub-sections (e.g., 1.1 Scope)
     - H3: Sub-sub-sections (e.g., 1.1.1 Details)
    - Some documents may not contain all levels (H1, H2, H3) — extract whatever is present.
   - Do not assume a fixed font size threshold for each level — use layout and semantics to infer hierarchy.


3. Do not classify something as a heading simply because it is:
   - Bold
   - Uppercase
   - Indented or stylistically emphasized
   These features are only weak signals. Headings must match structural importance semantically and visually.

4. Special Case Clarification:
   - If a document contains a "Table of Contents", **only the heading 'Table of Contents' itself is to be extracted** (as H1).
   - The rest of the lines under it (even if numbered or bold) are **not actual headings**, but references — ignore them.
   - Similarly, ignore section previews or index lists inside TOC or elsewhere.

   - For “Table of Contents”, extract only the heading "Table of Contents" as heading — ignore all lines below it.
   - Do **not** extract references from inside a TOC block.
   - Recognize and include important headings like “Revision History” if present.


5. Use semantic understanding to identify section boundaries:
   - Don’t rely on font size alone.
   - Some headings may be multiline — treat them as one unit.
   - Use knowledge of common section names (e.g., Introduction, Abstract, References).

6. Multilingual Handling:
   - Support headings in other languages  — infer meaning contextually.
   -Follow the same process for multilingual documents.
   -Do not miss any headings or subheadings.
   -Give more focus to visual features for multilingual document handling(if text is bold and big it is a heading)

7. Ignore:
   - Figure/table/image captions, footnotes, headers/footers, text inside tables or graphics.

8. Output:
   - Return only JSON as shown above for the entire PDF.
   - Each heading object must include: { level: "H1"|"H2"|"H3", text: "heading text", page: page_number }
   - Page number starts from 1.
   -There is only one title for the entire document.It shouldnt appear for each page

Be robust even on complex layouts or scanned text.
"""


In [None]:
def process_challenge_1a(pdf_path,parsinginstruct, lang):
  """Parse one PDF with LlamaParse under a custom parsing instruction.

  Args:
    pdf_path: Path of the PDF handed to ``LlamaParse.load_data``.
    parsinginstruct: Prompt text forwarded as ``parsing_instruction``.
    lang: Language code forwarded as ``language``.

  Returns:
    The value returned by ``load_data``. The caller in this notebook passes it
    to ``extract_outline_using_doc_index``, which iterates it and reads
    ``.text`` from each item.
  """
  parser = LlamaParse(
      api_key=LLAMA_CLOUD_API_KEY,
      result_type='markdown',
      parsing_instruction=parsinginstruct,
      language=lang,
  )
  return parser.load_data(pdf_path)



In [16]:


def _decode_page_json(raw_text):
    """Decode one page of parser output into a dict.

    The text is tried as-is first. If that fails, the outermost ``{...}`` span
    is retried so that Markdown code fences or prose around the JSON object do
    not cause the page to be discarded.

    Raises:
        ValueError: if no JSON object can be decoded (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or the decoded value is not an object.
    """
    candidate = raw_text.strip()
    try:
        decoded = json.loads(candidate)
    except json.JSONDecodeError:
        start, end = candidate.find("{"), candidate.rfind("}")
        if start == -1 or end < start:
            raise
        decoded = json.loads(candidate[start:end + 1])
    if not isinstance(decoded, dict):
        raise ValueError(f"expected a JSON object, got {type(decoded).__name__}")
    return decoded


def extract_outline_using_doc_index(markdown_docs, output_dir, pdf_filename):
    """Merge per-page JSON outlines into one ``<pdf name>.json`` file.

    Args:
        markdown_docs: Iterable of objects exposing a ``.text`` attribute that
            holds the JSON produced for one page. The position in the iterable
            (1-based) is used as the page number; any ``page`` value inside the
            JSON is ignored.
        output_dir: Directory the result is written to (created if missing).
        pdf_filename: Name of the source PDF; its extension is replaced by
            ``.json`` to form the output file name.

    Pages whose text cannot be decoded are reported with ``print`` and skipped.
    The first non-empty title wins. Headings are de-duplicated on
    ``(level, text)``, keeping the occurrence on the latest page, and written
    in document order.
    """
    all_headings = []
    true_title = None

    for page_number, doc in enumerate(markdown_docs, start=1):  # pages are 1-indexed
        try:
            parsed = _decode_page_json(doc.text)
        except (ValueError, TypeError, AttributeError) as e:
            print(f"Error parsing markdown on page {page_number}: {e}")
            continue

        # Only set the title once; tolerate a null / non-string title.
        title = parsed.get("title")
        if not true_title and isinstance(title, str) and title.strip():
            true_title = title.strip()

        outline = parsed.get("outline")
        if not isinstance(outline, list):
            continue

        for item in outline:
            if not isinstance(item, dict):
                continue
            level = item.get("level")
            text = item.get("text")
            if not level or not isinstance(text, str) or not text.strip():
                continue

            # Override the model-reported page with the index-based page number.
            all_headings.append({
                "level": level,
                "text": text.strip(),
                "page": page_number
            })

    # Deduplicate: keep latest occurrence by page. The key is re-inserted on
    # replacement so the dict order follows the position of the kept occurrence,
    # which keeps same-page headings in document order after the stable sort.
    deduped = {}
    for entry in all_headings:
        key = (entry["level"], entry["text"])
        if key not in deduped or entry["page"] > deduped[key]["page"]:
            deduped.pop(key, None)
            deduped[key] = entry

    # Sort by page number (stable, so within-page order is preserved)
    sorted_headings = sorted(deduped.values(), key=lambda x: x["page"])

    final_json = {
        "title": true_title or "Untitled Document",
        "outline": sorted_headings
    }

    os.makedirs(output_dir, exist_ok=True)

    # Save to file using original PDF name (replace .pdf with .json)
    json_filename = os.path.splitext(pdf_filename)[0] + ".json"
    output_path = os.path.join(output_dir, json_filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)



In [21]:
!pip install langdetect pdf2image pytesseract

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/981.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Building wheels for collected packages: langdetect
  Building wheel for langdete

In [35]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.0-py3-none-any.whl (292 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4508440 sha256=06813cdd70ae935

In [36]:
from pdf2image import convert_from_path
import pytesseract
import fasttext
import tempfile
import os

# Pretrained language identification model (download if not already)
# The path is relative, so the file is looked up (and saved) in the current
# working directory.
FASTTEXT_MODEL_PATH = "lid.176.ftz"
if not os.path.exists(FASTTEXT_MODEL_PATH):
    # Imported lazily: urllib is only needed when the model file is missing.
    import urllib.request
    urllib.request.urlretrieve(
        "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz",
        FASTTEXT_MODEL_PATH
    )

# Load the model
# ft_model is a module-level global that detect_pdf_language calls predict() on.
ft_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

# Two-letter language code → Tesseract language name.
# detect_pdf_language looks predictions up here and falls back to "eng"
# for any code that is not listed.
lang_map = {
    "hi": "hin",
    "ta": "tam",
    "fr": "fra",
    "en": "eng",
    "bn": "ben",
    "mr": "mar",
    "gu": "guj",
    "te": "tel",
    "ur": "urd",
    "kn": "kan",
    "ml": "mal",
    "pa": "pan",
    "or": "ori",
    "as": "asm",
    "ne": "nep",
}

def detect_pdf_language(pdf_path, max_pages=2):
    """Guess the language of a PDF by OCR-ing its first pages.

    Args:
        pdf_path: Path of the PDF to inspect.
        max_pages: Number of leading pages rendered and OCR-ed (default 2).

    Returns:
        ``(tesseract_lang, lang_pred)``: the Tesseract language name looked up
        in ``lang_map`` (``"eng"`` when the code is not listed) and the raw
        code predicted by the fastText model with ``__label__`` stripped.
    """
    # Convert first few pages to images
    page_images = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=max_pages)

    # Bootstrap OCR always uses the English model; the result only feeds the
    # language classifier below.
    ocr_text = "".join(
        pytesseract.image_to_string(page_image, lang='eng') for page_image in page_images
    )

    # Run fasttext language detection on the text directly. Newlines are
    # replaced because the text is passed to predict() as a single line.
    # (An earlier version also wrote the text to a NamedTemporaryFile with
    # delete=False that was never read or removed, leaking one file per call.)
    top_label = ft_model.predict(ocr_text.replace('\n', ' '))[0][0]
    lang_pred = top_label.replace("__label__", "")

    # Map to Tesseract OCR language code
    tesseract_lang = lang_map.get(lang_pred, "eng")

    print(f"🌐 Detected language: {lang_pred} → Using Tesseract lang: {tesseract_lang}")

    return tesseract_lang, lang_pred


In [43]:
def run_ocr_in_detected_language(pdf_path):
    """OCR every page of a PDF using the language chosen by detect_pdf_language.

    Returns the text of all pages concatenated in page order, with no
    separator added between pages.
    """
    tesseract_lang, _ = detect_pdf_language(pdf_path)

    page_texts = [
        pytesseract.image_to_string(page_image, lang=tesseract_lang)
        for page_image in convert_from_path(pdf_path, dpi=300)
    ]
    return "".join(page_texts)


In [25]:
# Language code → Tesseract language name.
# This cell rebinds the `lang_map` that detect_pdf_language reads, so it must
# not lose entries from the earlier table: "or", "as" and "ne" are included
# here as well, otherwise those languages would silently fall back to "eng".
lang_map = {
    "af": "afr", "ar": "ara", "bn": "ben", "zh-cn": "chi_sim", "zh-tw": "chi_tra",
    "en": "eng", "fr": "fra", "de": "deu", "gu": "guj", "hi": "hin", "it": "ita",
    "ja": "jpn", "kn": "kan", "ko": "kor", "ml": "mal", "mr": "mar", "ta": "tam",
    "te": "tel", "ur": "urd", "pa": "pan", "es": "spa", "ru": "rus",
    "or": "ori", "as": "asm", "ne": "nep",
}

In [44]:
import tempfile
import os

def save_to_markdown(text, pdf_path):
    """Write ``text`` to ``<system temp dir>/<pdf base name>.md`` as UTF-8.

    Only the base name of ``pdf_path`` is used; its extension is replaced by
    ``.md``. Returns the path of the file that was written.
    """
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    target = os.path.join(tempfile.gettempdir(), stem + ".md")
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(text)
    return target


# Step 4: Parse markdown using LlamaParse
def parse_markdown_with_llamaparse(md_path, api_key):
    """Run LlamaParse (markdown result type) on ``md_path`` and return its output.

    ``api_key`` is passed straight to the ``LlamaParse`` constructor.
    """
    return LlamaParse(api_key=api_key, result_type="markdown").load_data(md_path)

In [27]:
!pip install poppler

[31mERROR: Could not find a version that satisfies the requirement poppler (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for poppler[0m[31m
[0m

In [28]:
!pip install pdfinfo

[31mERROR: Could not find a version that satisfies the requirement pdfinfo (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pdfinfo[0m[31m
[0m

In [29]:
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (302 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


In [46]:
!pip install numpy==1.26.4 --quiet
import numpy as np


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [48]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m153.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [49]:
def detect_lang(file_path, max_pages=3, default="en"):
  """Guess a PDF's language from its embedded text layer.

  Args:
    file_path: Path of the PDF to inspect.
    max_pages: How many leading pages may be tried. The first page that has
      extractable text and a successful detection decides the result.
    default: Code returned when no page yields a detection (no pages, no
      text layer, or langdetect finds no usable features).

  Returns:
    The language code reported by ``langdetect.detect``, or ``default``.
  """
  # Local import so the function does not depend on cell execution order for
  # this name; langdetect is already a dependency of the notebook.
  from langdetect.lang_detect_exception import LangDetectException

  with open(file_path,'rb') as file:
    pdf_reader=PyPDF2.PdfReader(file)
    page_count=min(len(pdf_reader.pages), max_pages)
    for page_index in range(page_count):
      # extract_text() can yield nothing for image-only pages.
      text=pdf_reader.pages[page_index].extract_text() or ""
      if not text.strip():
        continue
      try:
        return detect(text)
      except LangDetectException:
        # Text without detectable language features; try the next page.
        continue
  return default

In [56]:
import os
import json
from pdf2image import convert_from_path
import pytesseract

from langdetect import detect
import PyPDF2








# Path to the input directory containing PDF files
input_dir = "/content/input_dir"
output_dir="output_dir"

# List all PDF files in the directory. os.listdir order is arbitrary, so the
# paths are sorted to make the processing order the same on every run.
pdf_files = sorted(
    os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.lower().endswith(".pdf")
)

# Now loop through them and process each
for pdf_path in pdf_files:
    filename = os.path.basename(pdf_path)
    lang=detect_lang(pdf_path)
    # English and non-English PDFs take the same path: the detected language is
    # only forwarded to LlamaParse. The OCR helpers (run_ocr_in_detected_language,
    # save_to_markdown) are not part of this flow.
    obj=process_challenge_1a(pdf_path,parsinginstruct,lang)
    extract_outline_using_doc_index(obj,output_dir,filename)













Started parsing the file under job_id a03898d4-84a4-4d52-a775-21051a661b6e
Error parsing markdown on page 1: Expecting ',' delimiter: line 21 column 13 (char 261)


In [8]:
!mkdir input_dir