In [12]:
import os
from unstructured.partition.pdf import partition_pdf
import json
from typing import Dict
from dotenv import load_dotenv
from tqdm import tqdm
# Load variables from a local .env file into the process environment.
# extract_pdf_text_by_page later reads `strategy` and `infer_table_structure`
# from the environment via os.getenv.
load_dotenv()

True

In [13]:

# Folder holding the source PDFs, relative to the notebook's working directory.
dataset_path = "dataset/pdfs"

# Keep only entries whose name ends in ".pdf" and prefix each with the folder.
datasets = [
    os.path.join(dataset_path, entry)
    for entry in os.listdir(dataset_path)
    if entry.endswith(".pdf")
]
datasets

['dataset/pdfs/sbi.pdf', 'dataset/pdfs/icici.pdf', 'dataset/pdfs/HDFC.pdf']

In [14]:
def _env_flag(name: str, default: bool = False) -> bool:
    """Interpret the environment variable `name` as an on/off switch.

    os.getenv always returns a string, so values such as "false" or "0" are
    truthy when used directly; this helper maps them to a real bool.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "y", "on"}


def extract_pdf_text_by_page(pdf_path: str) -> dict:
    """
    Extracts text from a PDF file page by page using Unstructured's partition_pdf.

    Two environment variables configure the extraction:
        strategy: forwarded to partition_pdf as-is; when unset or empty,
            "auto" is used (the default listed in the Unstructured docs;
            confirm for the installed version).
        infer_table_structure: parsed as a boolean ("1", "true", "yes", "y",
            "on", case-insensitive, mean True; anything else or unset means False).

    Args:
        pdf_path (str): Full path to the PDF file.

    Returns:
        dict: A dictionary where keys are page numbers (int) in ascending order
        and values are the page's text fragments joined with newlines.
        Elements without a page number are grouped under key 0.

    Raises:
        FileNotFoundError: If the provided file path does not exist.
        ValueError: If no elements are returned, or if the extraction itself
            raises a ValueError.
        RuntimeError: For any other unexpected error; the original exception
            is attached as __cause__.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    # An unset variable would otherwise be forwarded as strategy=None.
    strategy = os.getenv("strategy") or "auto"
    infer_tables = _env_flag("infer_table_structure")

    try:
        elements = partition_pdf(
            filename=pdf_path,
            strategy=strategy,
            infer_table_structure=infer_tables,
        )
        if not elements:
            raise ValueError(f"No extractable content found in {pdf_path}")

        pagewise_text = {}
        for el in elements:
            page_num = el.metadata.page_number or 0
            # Register the page even when this element contributes no text,
            # so pages with only blank elements still appear in the result.
            lines = pagewise_text.setdefault(page_num, [])

            clean_text = (el.text or "").strip()
            if clean_text:  # skip empty fragments
                lines.append(clean_text)

        # Convert list of lines per page to single string
        return {
            page: "\n".join(lines)
            for page, lines in sorted(pagewise_text.items())
        }

    except ValueError:
        # Documented error type: let it through instead of re-wrapping it.
        raise
    except Exception as e:
        raise RuntimeError(f"Failed to extract text from {pdf_path}: {e}") from e
    
    


def create_json(
    file_path: str,
    year: int,
    company: str,
    data: Dict[int, str],
    output_dir: str = "dataset/json",
) -> str:
    """
    Creates a structured JSON file from page-wise extracted PDF text.

    The file holds a list with one record per page:
    {"page_num", "content", "year", "company"}, in the iteration order of `data`.

    Args:
        file_path (str): Path to the original PDF file. Only its base name,
            minus the final extension, is used to name the JSON file.
        year (int): The year associated with the document.
        company (str): The company name associated with the document.
        data (Dict[int, str]): Dictionary mapping page numbers to text content.
        output_dir (str): Directory the JSON file is written to; created if
            missing. Defaults to "dataset/json".

    Returns:
        str: Path to the saved JSON file.

    Raises:
        ValueError: If data is not a dictionary or contains invalid content.
        RuntimeError: For any file writing or JSON serialization errors; the
            original exception is attached as __cause__.
    """
    if not isinstance(data, dict):
        raise ValueError("Expected `data` to be a dictionary of page_num -> text")

    # Validate everything before touching the filesystem so that bad input
    # surfaces as the documented ValueError and leaves no partial output.
    final_text = []
    for page_num, text in data.items():
        if not isinstance(page_num, int):
            raise ValueError(f"Invalid page number: {page_num}")
        if not isinstance(text, str):
            raise ValueError(f"Invalid text for page {page_num}")
        final_text.append({
            "page_num": page_num,
            "content": text,
            "year": year,
            "company": company
        })

    # splitext removes only the last extension, so "hdfc.q4.pdf" becomes
    # "hdfc.q4.json" rather than being cut at the first dot.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_path = os.path.join(output_dir, f"{stem}.json")

    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(final_text, f, indent=4, ensure_ascii=False)
    except Exception as e:
        raise RuntimeError(f"Failed to write JSON for {file_path}: {e}") from e

    print(f"✅ JSON saved to: {output_path}")
    return output_path



def process_pdfs(datasets, year=2024):
    """
    Converts each PDF in `datasets` to a page-wise JSON file.

    For every path the text is extracted with extract_pdf_text_by_page and
    saved with create_json. The company name is the PDF's base name minus its
    final extension.

    Args:
        datasets: Iterable of PDF file paths.
        year (int): Year recorded on every page record. Defaults to 2024.

    Raises:
        Whatever extract_pdf_text_by_page or create_json raise; the first
        failure stops the loop.
    """
    for path in tqdm(datasets):
        data = extract_pdf_text_by_page(path)
        # splitext keeps dots inside the name ("hdfc.bank.pdf" -> "hdfc.bank")
        # instead of cutting at the first dot.
        company_name = os.path.splitext(os.path.basename(path))[0]
        create_json(path, year, company_name, data)
        

In [15]:

        
# Convert every PDF path in `datasets` to JSON; `year` is left at
# process_pdfs' default of 2024.
process_pdfs(datasets)


 33%|███▎      | 1/3 [01:18<02:37, 78.99s/it]

✅ JSON saved to: dataset/json/sbi.json


 67%|██████▋   | 2/3 [02:31<01:15, 75.11s/it]

✅ JSON saved to: dataset/json/icici.json


100%|██████████| 3/3 [03:26<00:00, 68.76s/it]

✅ JSON saved to: dataset/json/HDFC.json





# Cleaning data

In [3]:
import os 
# From here on `dataset_path` / `datasets` refer to the JSON files, not the PDFs.
dataset_path = "dataset/json"

# Keep only entries whose name ends in ".json" and prefix each with the folder.
datasets = [
    os.path.join(dataset_path, entry)
    for entry in os.listdir(dataset_path)
    if entry.endswith(".json")
]
datasets

['dataset/json/icici_2024.json',
 'dataset/json/sbi_2024.json',
 'dataset/json/HDFC_2024.json']

In [7]:
import re

def clean_pdf_json_content(data: list) -> list:
    """
    Adds a normalised `clean_content` field to every record that has string `content`.

    The records are modified in place and the same list is returned. Records
    without a `content` key, or whose `content` is not a string, are left
    untouched. The original `content` value is never changed.

    Cleaning applies these substitutions in order, then strips the ends:
        1. join words split by a hyphen at a line break
        2. collapse runs of spaces
        3. drop box-drawing table border characters
        4. blank out lines holding only an optional "Page"/"PAGE" and digits
           (NOTE: this also removes any other digits-only line)
        5. collapse repeated newlines
        6. turn every whitespace run, newlines included, into a single space

    Args:
        data (list): Page records, e.g. as loaded from the extracted JSON files.

    Returns:
        list: The same `data` list, with `clean_content` added where applicable.
    """
    # (pattern, replacement, flags), applied top to bottom; order matters.
    substitutions = (
        (r'-\n(\w+)', r'\1', 0),
        (r'[ ]{2,}', ' ', 0),
        (r'[─═╚╩╝╔╦╗╠╣╬]+', '', 0),
        (r'^\s*(Page|PAGE)?\s*\d+\s*$', '', re.MULTILINE),
        (r'\n{2,}', '\n', 0),
        (r'\s+', ' ', 0),
    )

    def clean_text(text: str) -> str:
        cleaned = text
        for pattern, replacement, flags in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
        return cleaned.strip()

    for record in data:
        if 'content' not in record or not isinstance(record['content'], str):
            continue
        record['clean_content'] = clean_text(record['content'])

    return data


import json
from tqdm import tqdm
# Rewrite every JSON file in place with a `clean_content` field added to its
# records. `files` holds one path per iteration.
for files in tqdm(datasets):
    # create_json writes UTF-8 with non-ASCII characters unescaped, so read with
    # an explicit encoding rather than the platform default.
    with open(files, 'r', encoding='utf-8') as f:
        data = json.load(f)

    updated_data = clean_pdf_json_content(data)

    # Same encoding on the way out; ensure_ascii=False keeps the text readable
    # and consistent with how create_json wrote it.
    with open(files, 'w', encoding='utf-8') as f:
        json.dump(updated_data, f, indent=2, ensure_ascii=False)



100%|██████████| 3/3 [00:00<00:00,  8.85it/s]
