In [None]:
# Install required libraries
!pip install pyyaml==5.1
!pip install torch torchvision
!apt-get install tesseract-ocr-all
!pip install pytesseract
!apt-get install -y poppler-utils
!pip install pytesseract pdf2image
!pip install PyPDF2 pdf2image pytesseract
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

# Import necessary libraries
import os
import zipfile

import cv2
import numpy as np
from pytesseract import image_to_string, Output
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import torch
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor, DefaultTrainer
from detectron2.config import get_cfg
from detectron2.utils.logger import setup_logger
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.data.datasets import register_coco_instances

# Enable detectron2's logger so training/inference progress is printed.
setup_logger()

# Extract the dataset from the zip file
zip_path = '/content/data.zip'  # Path to the ZIP file
extract_dir = '/content/train'  # Directory the dataset is expected to end up in

# Unpack the archive when it is present. Extraction targets '/content';
# presumably the archive carries a top-level 'train/' folder -- verify against the ZIP layout.
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('/content')
elif not os.path.isdir(extract_dir):
    # Neither the archive nor previously extracted data is available: say so now
    # instead of letting the dataset loading fail later without explanation.
    print(f"Warning: {zip_path} not found and {extract_dir} does not exist; "
          "dataset registration and training will fail.")

# Register the custom COCO-format dataset under a train and a validation name.
# NOTE(review): both names point at the same annotation file and image root,
# so the "val" split is the training data -- confirm this is intended.
for split_name in ("my_dataset_train", "my_dataset_val"):
    register_coco_instances(split_name, {}, "/content/train/instances_Train.json", "/content/train")

# Build the training configuration on top of the model-zoo Faster R-CNN config.
zoo_config = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(zoo_config))

# Model: start from the model-zoo checkpoint that belongs to the same config.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_config)
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
# NOTE(review): this has to match the number of categories in the COCO annotations;
# only seven class names are listed further down in this script -- confirm.
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 8

# Data: the datasets registered earlier in this script.
cfg.DATASETS.TRAIN = ("my_dataset_train",)
cfg.DATASETS.TEST = ("my_dataset_val",)
cfg.DATALOADER.NUM_WORKERS = 2

# Solver: batch size, learning rate and number of iterations.
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.0025
cfg.SOLVER.MAX_ITER = 500

# Make sure the output directory exists, then train from the configured weights
# (resume=False: do not pick up an earlier checkpoint from the output directory).
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

# Switch the config over to inference: load the weights written by training
# and keep only detections scoring at least 0.9.
trained_weights = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.WEIGHTS = trained_weights
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.9
predictor = DefaultPredictor(cfg)

# Class names come from the training dataset's metadata; they map the predicted
# class indices back to labels. `.get` yields None if "thing_classes" is not set.
metadata = MetadataCatalog.get("my_dataset_train")
class_names = metadata.get("thing_classes")

# Input PDF and output locations
pdf_path = '/content/TTA1.pdf'  # Your PDF file
output_folder = '/content/extracted_files'
main_text_path = os.path.join(output_folder, "main.txt")
os.makedirs(output_folder, exist_ok=True)

# Render the PDF with pdf2image; `pages` holds the converted pages
pages = convert_from_path(pdf_path)

# Detected classes are handled in two groups: regions saved as image crops
# and regions run through OCR
image_classes = ['Table', 'Image', 'Figure', 'Form']
text_classes = ['Text', 'Page Number', 'Header and Footer']

# Process each page of the PDF: detect layout regions, OCR the text regions into
# main.txt and save the image-like regions as JPEG crops (referenced from main.txt).
with open(main_text_path, 'w', encoding='utf-8') as main_file:
    for page_num, page in enumerate(pages):
        # Convert the PIL page (RGB) to the BGR array layout used by cv2
        page_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
        outputs = predictor(page_image)
        instances = outputs["instances"].to("cpu")

        # One row per detection: [x1, y1, x2, y2], truncated to integer pixels.
        boxes = instances.pred_boxes.tensor.numpy().astype(int)
        classes = instances.pred_classes.tolist()

        # Detections are not returned in reading order, so visit them top-to-bottom,
        # then left-to-right, to make main.txt follow the page layout.
        reading_order = np.lexsort((boxes[:, 0], boxes[:, 1]))

        for i in reading_order:
            i = int(i)  # original detection index, kept for the output file names
            class_name = class_names[classes[i]]
            x1, y1, x2, y2 = boxes[i].tolist()
            cropped_img = page_image[y1:y2, x1:x2]

            # Integer truncation can collapse a thin box to zero area; an empty
            # crop cannot be OCR'd or written to disk, so skip it.
            if cropped_img.size == 0:
                continue

            if class_name in text_classes:
                # OCR for text classes. pytesseract assumes RGB channel order,
                # so convert the BGR crop back before handing it over.
                rgb_crop = cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB)
                ocr_text = image_to_string(rgb_crop, config='--psm 6')
                main_file.write(ocr_text + "\n\n")
            elif class_name in image_classes:
                # Save images for classes like 'Table', 'Image', etc.
                image_output_dir = os.path.join(output_folder, class_name)
                os.makedirs(image_output_dir, exist_ok=True)
                output_image_path = os.path.join(image_output_dir, f"page_{page_num + 1}_instance_{i}.jpg")
                # cv2.imwrite reports failure through its return value; only
                # reference the file in main.txt when it was actually written.
                if cv2.imwrite(output_image_path, cropped_img):
                    main_file.write(f"[Image saved at: {output_image_path}]\n\n")
                else:
                    print(f"Warning: could not write {output_image_path}")

print(f"Processing complete. The extracted data is saved in {main_text_path}.")
