In [1]:
from typing import List, Dict, Any
import PyPDF2
from tqdm import tqdm
import os
import uuid
import glob
import json
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
import pandas as pd
import shutil
import subprocess
import os
from urllib.parse import urlparse, unquote

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PreprocessorResponse:
    """Container for per-page conversion results that can be persisted as JSON.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, results: List[Dict[str, Any]]):
        # List of per-page records; stored as given and dumped verbatim by save_to_json.
        self.results = results

    def save_to_json(self, output_file_path: str) -> bool:
        """Write ``self.results`` to ``output_file_path`` as indented JSON.

        The parent directory is created when missing. Errors are reported on
        stdout rather than raised, so the method never propagates an exception.

        Args:
            output_file_path: Destination path of the JSON file.

        Returns:
            True when the file was written, False when any step failed.
        """
        try:
            # Serialise before touching the filesystem so that a non-serialisable
            # record cannot leave a truncated JSON file behind.
            payload = json.dumps(self.results, indent=4)

            # Ensure the directory exists (exist_ok avoids a check-then-create race).
            directory = os.path.dirname(output_file_path)
            if directory:
                if not os.path.exists(directory):
                    print(f"Creating directory: {directory}")
                os.makedirs(directory, exist_ok=True)

            # Explicit encoding keeps the output independent of the platform default.
            with open(output_file_path, 'w', encoding='utf-8') as json_file:
                json_file.write(payload)
            print(f"Results saved to {output_file_path}")
            return True
        except Exception as e:
            print(f"An error occurred while saving to JSON: {e}")
            return False

class Preprocessor:
    """Converts a PDF to Markdown one page at a time.

    Each page is written to a temporary single-page PDF with PyPDF2 and then
    converted with a docling ``DocumentConverter``.
    """

    def get_pdf_page_count(self, pdf_path: str) -> int:
        """Return the number of pages in ``pdf_path``.

        Returns:
            The page count, or -1 when the file cannot be opened or parsed
            (the error is printed, not raised).
        """
        try:
            with open(pdf_path, 'rb') as pdf_file:
                return len(PyPDF2.PdfReader(pdf_file).pages)
        except Exception as e:
            print(f"An error occurred while counting pages: {e}")
            return -1

    def extract_page(self, input_pdf_path: str, output_pdf_path: str, page_number: int) -> bool:
        """Write page ``page_number`` (0-based) of ``input_pdf_path`` to ``output_pdf_path``.

        Errors are printed rather than raised.

        Returns:
            True when the single-page PDF was written, False otherwise. Callers
            must check this: on failure ``output_pdf_path`` may be missing or
            still hold content from an earlier call.
        """
        try:
            with open(input_pdf_path, 'rb') as input_pdf:
                reader = PyPDF2.PdfReader(input_pdf)

                if page_number < 0 or page_number >= len(reader.pages):
                    raise ValueError("Invalid page number.")

                writer = PyPDF2.PdfWriter()
                writer.add_page(reader.pages[page_number])

                with open(output_pdf_path, 'wb') as output_pdf:
                    writer.write(output_pdf)
            return True
        except Exception as e:
            print(f"An error occurred while extracting page: {e}")
            return False

    def process(self, file_path: str, num_threads: int = 20, do_ocr: bool = False):
        """Convert every page of ``file_path`` to Markdown.

        Args:
            file_path: Path of the PDF to convert.
            num_threads: Thread count passed to docling's accelerator options.
            do_ocr: Whether docling should run OCR (disabled by default).

        Returns:
            A ``PreprocessorResponse`` whose results hold one record per
            successfully converted page: ``page_content`` (Markdown),
            ``metadata.page`` (1-based page number) and a random ``id``.
            Pages that fail to extract or convert are reported and skipped.

        Raises:
            ValueError: If the page count of the PDF cannot be determined.
            Exception: Errors raised while building the converter propagate,
                instead of being reported once per page.
        """
        import tempfile  # local import: only needed for the scratch directory below

        page_count = self.get_pdf_page_count(file_path)
        if page_count == -1:
            raise ValueError("Failed to get the page count of the PDF.")

        # The converter does not depend on the page, so build it once per document
        # instead of once per page.
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = do_ocr
        pipeline_options.accelerator_options.num_threads = num_threads
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

        results = []
        # A private scratch directory avoids clobbering a "temp.pdf" in the working
        # directory and lets the temporary page file be cleaned up afterwards.
        temp_dir = tempfile.mkdtemp(prefix="pdf_pages_")
        temp_pdf_path = os.path.join(temp_dir, "page.pdf")
        try:
            for i in tqdm(range(page_count)):
                # Skip the page when extraction failed; otherwise the file left
                # over from the previous page would be converted under this
                # page's number.
                if not self.extract_page(file_path, temp_pdf_path, i):
                    print(f"Skipping page {i + 1}: extraction failed.")
                    continue

                try:
                    result = converter.convert(temp_pdf_path)
                    result_md = result.document.export_to_markdown()

                    results.append({
                        "page_content": result_md,
                        "metadata": {
                            "page": i + 1,
                        },
                        "id": str(uuid.uuid4())
                    })

                except Exception as e:
                    print(f"An error occurred while converting page {i + 1}: {e}")
        finally:
            # Best-effort cleanup; never mask a conversion error with a cleanup error.
            shutil.rmtree(temp_dir, ignore_errors=True)

        return PreprocessorResponse(results)

In [6]:
# Utils

def folder_to_zip(folder_path, output_zip_path):
    """Pack ``folder_path`` into a ZIP archive.

    ``output_zip_path`` is handed to ``shutil.make_archive`` as the archive
    base name, so the ".zip" extension is appended by ``make_archive``.

    Returns:
        The path of the created archive, or None when archiving failed
        (the error is printed, not raised).
    """
    try:
        # Delegate the actual archiving to shutil.make_archive.
        archive_path = shutil.make_archive(
            base_name=output_zip_path,
            format='zip',
            root_dir=folder_path,
        )
        print(f"Folder berhasil diubah menjadi ZIP: {archive_path}")
    except Exception as err:
        print(f"Terjadi kesalahan: {err}")
        archive_path = None
    return archive_path

# Input URL

In [7]:
# ADJUSTABLE: PDF URLs to download and convert, one entry per document.
urls = [
    "https://storage.googleapis.com/cesgs-dart/TCFD%20Report/AR/2020/HK_0002_AR_2020(1).pdf",
]

In [10]:
# Download every URL in `urls`, convert the PDF page by page and store the
# result under data/JSON/, mirroring the object path inside the bucket.
# URLs that fail at any step are collected and written to failed.csv.
failed = []

# Path prefix (bucket name) stripped from the URL path when building the output path.
bucket_prefix = "/cesgs-dart/"

# One instance is enough for the whole batch.
preprocessor = Preprocessor()

for idx, url in enumerate(urls):
    print(f"Memproses ({idx + 1}/{len(urls)}) - {url}", flush=True)

    # Fully percent-decoded URL path, e.g. "/cesgs-dart/TCFD Report/AR/2020/HK_0002_AR_2020(1).pdf".
    url_path = unquote(urlparse(url).path)
    pdf_path = os.path.basename(url_path)

    try:
        if not pdf_path:
            raise ValueError("URL does not end in a file name.")

        # 1. Download the file. "-O" pins the local file name so that the file
        #    processed below is exactly the one just downloaded (without it wget
        #    picks the name itself and appends ".1" when the file already exists).
        encoded_url = url.replace("(", "%28").replace(")", "%29")
        try:
            subprocess.run(
                ["wget", "-q", "-O", pdf_path, encoded_url],
                check=True,
                stderr=subprocess.PIPE
            )
        except subprocess.CalledProcessError as download_error:
            # Surface whatever wget printed; CalledProcessError's message omits stderr.
            detail = (download_error.stderr or b"").decode(errors="replace").strip()
            raise RuntimeError(
                f"wget exited with status {download_error.returncode}. {detail}".strip()
            ) from download_error

        # 2. Convert the PDF.
        response = preprocessor.process(pdf_path)

        # 3. Build the output path from the decoded URL path and save the result.
        if url_path.startswith(bucket_prefix):
            relative_path = url_path[len(bucket_prefix):]
        else:
            relative_path = url_path.lstrip("/")
        output_path = os.path.normpath(os.path.join(
            "data", "JSON",
            os.path.splitext(relative_path)[0] + '.json'
        ))
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # save_to_json handles its own errors; if it reports a failure by
        # returning False, record this URL as failed instead of passing silently.
        if response.save_to_json(output_path) is False:
            raise RuntimeError(f"Could not save results to {output_path}")

    except Exception as e:
        print(f"Gagal memproses {url}: {str(e)}")
        failed.append(url)

    finally:
        # 4. Remove the downloaded PDF whether or not processing succeeded.
        if pdf_path and os.path.exists(pdf_path):
            os.remove(pdf_path)

# Save the list of failed URLs to CSV.
if failed:
    pd.Series(failed).to_csv("failed.csv", index=False)
    print(f"{len(failed)} URL gagal, disimpan ke failed.csv")

Memproses (1/1) - https://storage.googleapis.com/cesgs-dart/TCFD%20Report/AR/2020/HK_0002_AR_2020(1).pdf


100%|██████████| 1/1 [00:04<00:00,  4.63s/it]

Results saved to data\JSON\TCFD Report\AR\2020\HK_0002_AR_2020(1).json





# Input Filepath

In [3]:
# ADJUSTABLE: root folder that holds the PDFs, one sub-folder level deep
# (e.g. data/PDF/AR/report.pdf).
# The trailing "" keeps a trailing path separator on pdf_dir; code that strips
# this prefix from file paths by plain string replacement relies on it.
pdf_dir = os.path.join("data", "PDF", "")

# os.path.join builds the pattern with the platform's separator instead of a
# hard-coded backslash (the old "*\*.pdf" literal also contained an invalid
# escape sequence). sorted() makes the processing order deterministic.
files = sorted(
    os.path.normpath(f)
    for f in glob.glob(os.path.join(pdf_dir, "*", "*.pdf"))
)
len(files)

1

In [4]:
# Convert every PDF listed in `files` and store the result under data/JSON/,
# mirroring the file's location relative to `pdf_dir`.
# Files that fail at any step are collected and written to failed.csv.
failed = []

# One instance is enough for the whole batch.
preprocessor = Preprocessor()

for idx, pdf_path in enumerate(files):
    print(f"Memproses ({idx + 1}/{len(files)}) - {pdf_path}", flush=True)

    try:
        # 1. Convert the PDF.
        response = preprocessor.process(pdf_path)

        # 2. Build the output path. relpath/splitext replace the former
        #    str.replace(pdf_dir, "")[:-4], which only worked when pdf_dir was
        #    spelled exactly like the normalised file paths and occurred once.
        relative_stem = os.path.splitext(os.path.relpath(pdf_path, pdf_dir))[0]
        output_path = os.path.normpath(os.path.join(
            "data", "JSON",
            # Kept from the previous version so existing output names do not change.
            relative_stem.replace("%20", " ") + '.json'
        ))
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # save_to_json handles its own errors; if it reports a failure by
        # returning False, record this file as failed instead of passing silently.
        if response.save_to_json(output_path) is False:
            raise RuntimeError(f"Could not save results to {output_path}")

    except Exception as e:
        print(f"Gagal memproses {pdf_path}: {str(e)}")
        failed.append(pdf_path)

# Save the list of failed files to CSV.
if failed:
    pd.Series(failed).to_csv("failed.csv", index=False)
    print(f"{len(failed)} file gagal, disimpan ke failed.csv")

Memproses (1/1) - data\PDF\AR\ID_ADRO_AR_2022.pdf


100%|██████████| 504/504 [26:50<00:00,  3.20s/it]

Results saved to data\JSON\AR\ID_ADRO_AR_2022.json



