In [1]:
pip install gdown



**Get files from drive**

The files are stored on Google Drive as a compressed archive. The cell below downloads the archive into the Colab runtime; the cell after it extracts the contents.

In [2]:
import gdown

# File ID taken from the Google Drive share link
file_id = "1RV2djXbnkrbKTWB3-KlF92sjtcaA2ts3"
output_file = "documents.zip"  # Local name for the downloaded archive

# Build the direct-download URL and fetch the archive into the runtime.
downloaded_path = gdown.download(f"https://drive.google.com/uc?id={file_id}", output_file, quiet=False)

# gdown.download can return None when the file could not be retrieved
# (e.g. quota exceeded or missing permissions). Fail loudly here instead of
# printing a success message and letting the extraction cell fail later.
if downloaded_path is None:
    raise RuntimeError(f"Download failed for Google Drive file id {file_id}")
print(f"Downloaded file: {output_file}")


Downloading...
From (original): https://drive.google.com/uc?id=1RV2djXbnkrbKTWB3-KlF92sjtcaA2ts3
From (redirected): https://drive.google.com/uc?id=1RV2djXbnkrbKTWB3-KlF92sjtcaA2ts3&confirm=t&uuid=3f9fc393-2b6e-4211-a661-529d18cce0f1
To: /content/documents.zip
100%|██████████| 955M/955M [00:33<00:00, 28.4MB/s]

Downloaded file: documents.zip





**Extract zip files**

In [1]:
import zipfile
import os

def extract_nested_zip(zip_file, output_folder):
    """
    Recursively extract a ZIP archive, including ZIP archives nested inside it.

    The archive is unpacked into ``output_folder``. Afterwards every file under
    ``output_folder`` whose name ends in ``.zip`` (case-insensitive) is unpacked
    into a sibling directory named after the archive with that extension
    removed, and the nested archive is then deleted.

    Args:
        zip_file: Path of the archive to extract.
        output_folder: Directory that receives the extracted contents.

    Raises:
        zipfile.BadZipFile: If ``zip_file`` or a nested archive is not a valid ZIP.
    """
    zip_suffix = ".zip"

    # Extract the main ZIP file
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(output_folder)
        print(f"Extracted: {zip_file} to {output_folder}")

    # Check for nested ZIP files in the extracted folder
    for root, _, files in os.walk(output_folder):
        for file in files:
            # Case-insensitive so archives named e.g. "DATA.ZIP" are not skipped.
            if not file.lower().endswith(zip_suffix):
                continue
            nested_zip_path = os.path.join(root, file)
            # Strip only the trailing extension; str.replace would also remove
            # ".zip" from the middle of names such as "batch.zip.v2.zip".
            nested_output_folder = os.path.join(root, file[:-len(zip_suffix)])
            os.makedirs(nested_output_folder, exist_ok=True)
            # Recursively extract the nested ZIP file
            extract_nested_zip(nested_zip_path, nested_output_folder)
            # Delete the nested ZIP file after extraction to save space
            os.remove(nested_zip_path)
            print(f"Deleted nested ZIP: {nested_zip_path}")

# Example usage
# NOTE(review): the download cell saves the archive as "documents.zip", while
# this cell opens "compressed_pdf_outputs.zip" — confirm which archive is
# intended, or that this file is placed in the runtime some other way.
zip_file = "compressed_pdf_outputs.zip"  # Main ZIP file
output_folder = "/content/extracted_files"  # Where to extract everything
# Make sure the destination exists before extracting into it.
os.makedirs(output_folder, exist_ok=True)

extract_nested_zip(zip_file, output_folder)


Extracted: compressed_pdf_outputs.zip to /content/extracted_files


**Getting the PDFs and the PNGs**

In [8]:
import os
import subprocess
from multiprocessing import Pool

def convert_single_docx(docx_path, pdf_output_folder, docx_root=None):
    """
    Convert a single DOCX file to PDF using LibreOffice while preserving folder structure.

    The PDF is written below ``pdf_output_folder`` at the same sub-path that
    ``docx_path`` has relative to ``docx_root``. Conversion failures are
    printed and appended to ``conversion_errors.log`` instead of being raised,
    so one bad document does not abort a whole batch.

    Args:
        docx_path: Path of the DOCX file to convert.
        pdf_output_folder: Root folder that receives the generated PDFs.
        docx_root: Root folder the DOCX files were collected from. When
            omitted, the module-level ``docx_folder`` variable is used, which
            is what this function relied on before the parameter existed.
    """
    if docx_root is None:
        # Legacy fallback for callers that still rely on the notebook global.
        docx_root = docx_folder

    # Determine relative path and output folder
    relative_path = os.path.relpath(docx_path, start=docx_root)
    subfolder = os.path.dirname(relative_path)
    output_subfolder = os.path.join(pdf_output_folder, subfolder)
    os.makedirs(output_subfolder, exist_ok=True)

    # Pass the arguments as a list (no shell) so paths containing quotes,
    # spaces or "$" reach LibreOffice unchanged and cannot inject shell syntax.
    command = [
        "libreoffice", "--headless",
        "--convert-to", "pdf",
        "--outdir", output_subfolder,
        docx_path,
    ]
    try:
        subprocess.run(command, check=True)
        print(f"Converted: {docx_path}")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable `libreoffice` binary, which
        # the shell-based version reported as a non-zero exit status.
        error_message = f"Error converting {docx_path}: {e}\n"
        print(error_message)
        with open("conversion_errors.log", "a") as error_log:
            error_log.write(error_message)

def batch_convert(docx_files, pdf_output_folder, docx_root=None):
    """
    Convert a batch of DOCX files to PDFs.

    Args:
        docx_files: Iterable of DOCX paths to convert, processed in order.
        pdf_output_folder: Root folder that receives the generated PDFs.
        docx_root: Root folder used to compute each file's relative sub-path;
            forwarded unchanged to ``convert_single_docx``.
    """
    for docx_file in docx_files:
        convert_single_docx(docx_file, pdf_output_folder, docx_root)

def parallel_docx_to_pdf(docx_folder, pdf_output_folder, excluded_folders=("5k", "20k"), num_workers=1):
    """
    Perform parallel conversion of DOCX files to PDF while preserving folder structure.

    Args:
        docx_folder: Root folder that is searched recursively for ``.docx`` files.
        pdf_output_folder: Root folder that receives the generated PDFs; created
            if it does not exist.
        excluded_folders: Directory names to skip. A directory is skipped when
            any component of its path equals one of these names.
        num_workers: Number of worker processes. Values below 1, or larger than
            the number of files found, are clamped to a usable value.
    """
    os.makedirs(pdf_output_folder, exist_ok=True)
    docx_files = []

    # Walk through the folder structure
    for root, _, files in os.walk(docx_folder):
        # Check if any part of the path contains any excluded folder name
        if any(excluded_folder in os.path.normpath(root).split(os.sep) for excluded_folder in excluded_folders):
            print(f"Skipping folder: {root}")
            continue
        for file in files:
            if file.endswith(".docx"):
                docx_files.append(os.path.join(root, file))

    if not docx_files:
        # Nothing to do; also avoids a zero chunk size below.
        print(f"No DOCX files found under {docx_folder}; nothing to convert.")
        return

    # Never use more workers than files, and always at least one.
    num_workers = max(1, min(num_workers, len(docx_files)))

    # Ceiling division: yields at most `num_workers` chunks and never a chunk
    # size of 0 (floor division did when there were fewer files than workers,
    # making range() raise ValueError).
    chunk_size = -(-len(docx_files) // num_workers)
    chunks = [docx_files[i:i + chunk_size] for i in range(0, len(docx_files), chunk_size)]

    with Pool(processes=num_workers) as pool:
        # Hand the search root to the workers explicitly so output sub-paths are
        # computed from the folder that was actually scanned.
        pool.starmap(batch_convert, [(chunk, pdf_output_folder, docx_folder) for chunk in chunks])

# Paths
docx_folder = "/content/extracted_files"  # Change to the path where your files are located
pdf_output_folder = "/content/pdf_outputs"  # Root folder that receives the generated PDFs

# Run the conversion
# Folders named "5k" or "20k" are skipped, and a single worker process is used.
parallel_docx_to_pdf(docx_folder, pdf_output_folder, excluded_folders=("5k", "20k"), num_workers=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Converted: /content/extracted_files/15k/Receipts3/Receipt_4979.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_2048.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_3688.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_4954.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_1152.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_4173.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_3151.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_4877.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_1725.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_1511.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_257.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_4003.docx
Converted: /content/extracted_files/15k/Receipts3/Receipt_4748.docx
Converted: /content/extracted_files/15k/Receipts3/Re

In [5]:
import os
from pdf2image import convert_from_path

def convert_pdf_to_png(pdf_file, png_output_folder, pdf_root=None):
    """
    Convert a single PDF to PNG(s) while preserving folder structure.

    Each page is saved as ``<pdf name>_page_<n>.png`` (pages numbered from 1)
    below ``png_output_folder``, in the same sub-path that ``pdf_file`` has
    relative to ``pdf_root``. Any failure is printed and appended to
    ``png_conversion_errors.log`` rather than raised.

    Args:
        pdf_file: Path of the PDF to render.
        png_output_folder: Root folder that receives the PNG files.
        pdf_root: Root folder the PDFs were collected from. When omitted, the
            module-level ``pdf_folder`` variable is used, which is what this
            function relied on before the parameter existed.
    """
    try:
        if pdf_root is None:
            # Legacy fallback for callers that still rely on the notebook global.
            pdf_root = pdf_folder

        # Determine relative path and output folder
        relative_path = os.path.relpath(pdf_file, start=pdf_root)
        subfolder = os.path.dirname(relative_path)
        output_subfolder = os.path.join(png_output_folder, subfolder)
        os.makedirs(output_subfolder, exist_ok=True)

        # File name without directory or extension; shared by all pages.
        base_name = os.path.splitext(os.path.basename(pdf_file))[0]

        # Convert PDF to PNGs
        pages = convert_from_path(pdf_file, dpi=300)  # Set DPI as needed
        for page_num, page in enumerate(pages, start=1):
            output_file = os.path.join(output_subfolder, f"{base_name}_page_{page_num}.png")
            page.save(output_file, "PNG")
            print(f"Saved: {output_file}")
    except Exception as e:
        # Deliberately broad: a single unreadable PDF must not stop the batch.
        error_message = f"Error converting {pdf_file}: {e}\n"
        print(error_message)
        with open("png_conversion_errors.log", "a") as error_log:
            error_log.write(error_message)

def process_pdfs_to_pngs(pdf_folder, png_output_folder):
    """
    Process all PDFs in a folder (including subfolders) to convert them to PNGs.

    Args:
        pdf_folder: Root folder that is searched recursively for ``.pdf`` files.
        png_output_folder: Root folder that receives the PNG files; created if
            it does not exist.
    """
    os.makedirs(png_output_folder, exist_ok=True)
    for root, _, files in os.walk(pdf_folder):
        for file in files:
            if file.endswith(".pdf"):
                pdf_file = os.path.join(root, file)
                # Pass the scanned root explicitly so the output layout follows
                # this call's argument rather than a same-named global.
                convert_pdf_to_png(pdf_file, png_output_folder, pdf_folder)

# Paths
# NOTE(review): the DOCX→PDF cell writes its PDFs to "/content/pdf_outputs",
# but this cell reads PDFs from inside the extracted archive — confirm this is
# the intended source folder.
pdf_folder = "/content/extracted_files/compressed_pdf_outputs"  # Path to the folder containing PDFs
png_output_folder = "/content/png_outputs"  # Path to save PNG files

# Convert PDFs to PNGs
process_pdfs_to_pngs(pdf_folder, png_output_folder)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Saved: /content/png_outputs/15k/Receipts3/Receipt_2917_page_1.png
Saved: /content/png_outputs/15k/Receipts3/Receipt_1494_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_3157_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_1540_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_865_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_303_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_3676_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_2877_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_3224_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_1317_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_4628_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_4650_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_4809_page_1.png
Saved: /content/png_outputs/20k/Receipts4/Receipt_310_page_1.png
Saved: /conten

**Sanity check: count the generated PNGs**

In [6]:
# NOTE(review): despite its name, this variable points at a PNG folder here and
# overwrites the PDF destination assigned in the DOCX→PDF cell.
pdf_output_folder = "/content/png_outputs/20k/Receipts4"

# Count the PNG files in that folder (the filter matches ".png", not PDFs)
pdf_files = [file for file in os.listdir(pdf_output_folder) if file.endswith(".png")]

# Report what was actually counted: PNG images.
print(f"Total PNGs in the output folder: {len(pdf_files)}")


Total PDFs in the output folder: 4999


**Downloads**

In [3]:
!pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [4]:
# Install the poppler command-line tools via apt.
# NOTE(review): pdf2image presumably relies on these binaries — run this cell
# before the PDF→PNG conversion cell.
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 0s (439 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123634 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...


In [5]:
# Refresh the apt package index, then install LibreOffice, which the DOCX→PDF
# conversion code invokes through the `libreoffice` command.
!sudo apt-get update
!sudo apt-get install -y libreoffice

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,632 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,517 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-u

In [8]:
# Smoke test: convert one DOCX from the 10k batch to PDF with headless LibreOffice.
!libreoffice --headless --convert-to pdf --outdir /content/test_output /content/extracted_files/10k/Receipts2/Receipt_1.docx

convert /content/extracted_files/10k/Receipts2/Receipt_1.docx -> /content/test_output/Receipt_1.pdf using filter : writer_pdf_Export


In [11]:
# Smoke test: convert one DOCX from the 20k batch to PDF with headless LibreOffice.
!libreoffice --headless --convert-to pdf --outdir /content/test_output/ /content/extracted_files/20k/Receipts4/Receipt_160.docx

convert /content/extracted_files/20k/Receipts4/Receipt_160.docx -> /content/test_output/Receipt_160.pdf using filter : writer_pdf_Export


In [11]:
import os
import zipfile

def zip_folder(folder_path, output_zip):
    """
    Pack every ``.png`` file found under ``folder_path`` into ``output_zip``.

    The folder is searched recursively. Entries are stored under their path
    relative to ``folder_path`` using DEFLATE compression. Each added file and
    the final count are printed; an extra notice is printed when nothing matched.
    """
    added = 0
    with zipfile.ZipFile(output_zip, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, names in os.walk(folder_path):
            for png_name in (name for name in names if name.endswith('.png')):
                source = os.path.join(current_dir, png_name)
                # Store the entry relative to the folder being zipped.
                archive.write(source, os.path.relpath(source, start=folder_path))
                print(f"Added to ZIP: {source}")  # Debugging print
                added += 1
        print(f"Total files added to ZIP: {added}")

    if not added:
        print("No PNG files were found in the folder.")

# Usage
folder_path = "/content/png_outputs"  # Update with your folder containing PNGs
output_zip = "/content/png_outputs.zip"  # Archive to create

# Bundle all PNGs under folder_path into a single ZIP archive.
zip_folder(folder_path, output_zip)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_1045_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_2029_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_3062_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_3610_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_1945_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_1321_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_1264_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_481_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_1294_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_150_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_734_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_4162_page_1.png
Added to ZIP: /content/png_outputs/20k/Receipts4/Receipt_3973_

In [None]:
from google.colab import files

# Path of the ZIP archive created by the zip_folder cell
compressed_file_path = "/content/png_outputs.zip"

# Trigger the download of the archive through google.colab's files helper
files.download(compressed_file_path)

In [None]:
from google.colab import files
# Same download as the previous cell, with the archive path written inline.
files.download("/content/png_outputs.zip")


In [16]:
from google.colab import drive
# Mount Google Drive inside the runtime at /content/drive.
drive.mount('/content/drive')

# Copy the ZIP file to Google Drive (`cp` leaves the original in /content)
!cp /content/png_outputs.zip /content/drive/MyDrive/


Mounted at /content/drive
