In [None]:
# 🧹 Cleanup: Remove all previously generated files except system folders
import shutil, os

# Drop the whole "splits" directory (and everything in it) when it is present
if os.path.exists("splits"):
    shutil.rmtree("splits")

# Delete every regular file in the working directory. Notebooks and the
# protected entries below are skipped; directories are never touched.
protected_entries = ["sample_data", "splits", ".config"]
for entry in os.listdir():
    if entry in protected_entries or entry.endswith(".ipynb"):
        continue
    try:
        if os.path.isfile(entry):
            os.remove(entry)
    except Exception as err:
        # Best effort: report the failure and carry on with the next entry
        print(f"Could not delete {entry}: {err}")


In [None]:
# 🛠️ Install dependencies
!pip install PyPDF2 pandas openpyxl

In [None]:
# 🧹 Clear the 'splits' folder before splitting
import os
import shutil

folder = 'splits'
if os.path.exists(folder):
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        # os.remove() raises on a directory, which would abort the cleanup
        # halfway; real subdirectories are removed recursively instead.
        # Symlinks are unlinked, never followed.
        if os.path.isdir(file_path) and not os.path.islink(file_path):
            shutil.rmtree(file_path)
        else:
            os.remove(file_path)
    print('✅ All files in the splits folder have been deleted.')
else:
    print('ℹ️ Folder splits does not exist.')

In [None]:
# 📦 Import libraries
from PyPDF2 import PdfReader, PdfWriter
import pandas as pd
import os, re

In [None]:
# 🔍 Function to extract car registration number
def extract_registration(text):
    """Return the first registration number found in ``text``.

    The pattern matches 1-3 capital letters, a hyphen, 1-2 capital letters,
    exactly one space and 1-4 digits, delimited by word boundaries
    (for example ``"AB-C 123"``).

    Args:
        text: Text to search. ``None`` or an empty string is treated as
            "nothing to search" instead of raising ``TypeError``.

    Returns:
        The matched substring, or the sentinel string ``'Not found'`` when
        there is no match.
    """
    if not text:
        # A page without extractable text simply yields no match.
        return 'Not found'
    match = re.search(r'\b[A-Z]{1,3}-[A-Z]{1,2} \d{1,4}\b', text)
    return match.group(0) if match else 'Not found'

In [None]:
# ✂️ Split PDF and extract registration numbers
def split_and_extract(input_pdf, pages_per_split=2, output_dir="splits", result_path="result.xlsx"):
    """Split a PDF into fixed-size chunks and record a registration number for each.

    Each chunk of ``pages_per_split`` consecutive pages is written to its own
    PDF inside ``output_dir``. The registration number is looked up in the
    text of the chunk's first page only and determines the file name; chunks
    without a match are named ``Not_found_<first page number>``.

    Args:
        input_pdf: Path (or file object) of the PDF to split.
        pages_per_split: Number of pages per output file (default 2). The
            last chunk may be shorter.
        output_dir: Directory that receives the split PDFs; created if missing.
        result_path: Path of the Excel summary to write.

    Returns:
        ``result_path``, the spreadsheet with one row per written file
        (columns ``File`` and ``Registration No``).

    Raises:
        ValueError: If ``pages_per_split`` is smaller than 1.
    """
    if pages_per_split < 1:
        raise ValueError("pages_per_split must be at least 1")

    reader = PdfReader(input_pdf)
    total_pages = len(reader.pages)
    results = []
    used_names = set()  # base names already written during this run
    os.makedirs(output_dir, exist_ok=True)

    for i in range(0, total_pages, pages_per_split):
        last = min(i + pages_per_split, total_pages)
        print(f"📄 Processing pages {i+1}-{last}")

        # Only the first page of the chunk is searched; guard against a page
        # that yields no text at all.
        text = reader.pages[i].extract_text()
        reg_number = extract_registration(text or "")

        if reg_number != "Not found":
            base_name = reg_number.replace(" ", "_").replace("-", "_")
        else:
            base_name = f"Not_found_{i+1:03d}"

        # The same registration number can appear on several chunks. Without
        # a suffix the later file would silently overwrite the earlier one
        # while both rows still show up in the spreadsheet.
        safe_name = base_name
        duplicate_no = 2
        while safe_name in used_names:
            safe_name = f"{base_name}_{duplicate_no}"
            duplicate_no += 1
        used_names.add(safe_name)

        name = f"{safe_name}.pdf"
        print(f"📝 Saving file: {name}")
        path = os.path.join(output_dir, name)

        writer = PdfWriter()
        for page_index in range(i, last):
            writer.add_page(reader.pages[page_index])
        with open(path, "wb") as f_out:
            writer.write(f_out)

        results.append({"File": name, "Registration No": reg_number})

    df = pd.DataFrame(results)
    df.to_excel(result_path, index=False)
    return result_path

In [None]:
# 📤 Upload PDF
from google.colab import files
import os

uploaded = files.upload()
# The returned mapping is keyed by file name and is empty when the dialog is
# cancelled; fail with a clear message instead of an opaque IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a PDF.")
# Uploaded files are written relative to the current working directory, so
# resolve the first name against it rather than assuming "/content".
pdf_file = os.path.abspath(next(iter(uploaded)))

In [None]:
# 🚀 Run extraction and download result
# Requires the earlier cells to have run: `pdf_file` and `files` come from the
# upload cell, `split_and_extract` from the cell that defines it.
# split_and_extract writes the split PDFs plus a spreadsheet and returns the
# spreadsheet's path.
output_file = split_and_extract(pdf_file)
# Hand the spreadsheet to the Colab download helper.
files.download(output_file)

In [None]:
# 📦 Archive all split PDF files into a ZIP
import shutil
# Pack the contents of the "splits" folder into "split_files.zip"
# (make_archive appends the ".zip" extension to the base name).
shutil.make_archive("split_files", 'zip', "splits")

In [None]:
# 📥 Download ZIP file with all split PDFs
from google.colab import files
# Expects "split_files.zip" to exist already; it is produced by the archive cell.
files.download("split_files.zip")