In [1]:
!pip install python-doctr
!pip install "python-doctr[torch]"

Collecting python-doctr
  Downloading python_doctr-0.11.0-py3-none-any.whl.metadata (33 kB)
Collecting pypdfium2<5.0.0,>=4.11.0 (from python-doctr)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting langdetect<2.0.0,>=1.0.9 (from python-doctr)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz<4.0.0,>=3.0.0 (from python-doctr)
  Downloading rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting anyascii>=0.3.2 (from python-doctr)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Downloading python_doctr-0.11.0-py3-none-any.whl (304 kB)

In [2]:
import os
import glob
import json
from doctr.io import DocumentFile
from doctr.models import ocr_predictor, kie_predictor

# Initialize models
ocr_model = ocr_predictor(det_arch='db_resnet50', reco_arch='vitstr_small', pretrained=True)
kie_model = kie_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

# Define input and output base directories
input_base = '/kaggle/input/find-it-again-dataset/findit2'  # adjust this to your input folder path
output_base = '/kaggle/working/doctr_output'         # adjust this to your desired output folder path

# Ensure output subfolders exist
for subfolder in ['train', 'test', 'val']:
    os.makedirs(os.path.join(output_base, subfolder), exist_ok=True)

# Process images in each subfolder
for subfolder in ['train', 'test', 'val']:
    input_folder = os.path.join(input_base, subfolder)
    
    # List image files; adjust extensions if needed
    image_paths = (glob.glob(os.path.join(input_folder, '*.[pP][nN][gG]')) +
                   glob.glob(os.path.join(input_folder, '*.[jJ][pP][gG]')) +
                   glob.glob(os.path.join(input_folder, '*.[jJ][pP][eE][gG]')))
    
    for image_path in image_paths:
        try:
            # Read image from file
            image = DocumentFile.from_images(image_path)
            result = ocr_model(image)
            
            # Obtain outputs
            result_json = result.export()
            string_result = result.render()
            
            # Use the input file name (without extension) for the output file names
            base_name = os.path.splitext(os.path.basename(image_path))[0]
            json_output_path = os.path.join(output_base, subfolder, f"{base_name}.json")
            text_output_path = os.path.join(output_base, subfolder, f"{base_name}.txt")
            
            # Save JSON output
            with open(json_output_path, 'w', encoding='utf-8') as json_file:
                json.dump(result_json, json_file, ensure_ascii=False, indent=4)
            
            # Save text output
            with open(text_output_path, 'w', encoding='utf-8') as text_file:
                text_file.write(string_result)
            
            print(f"Processed {image_path}")
        except Exception as e:
            print(f"Error processing {image_path}: {e}")


Downloading https://doctr-static.mindee.com/models?id=v0.7.0/db_resnet50-79bd7d70.pt&src=0 to /root/.cache/doctr/models/db_resnet50-79bd7d70.pt


  0%|          | 0/102021912 [00:00<?, ?it/s]

  state_dict = torch.load(archive_path, map_location="cpu")


Downloading https://doctr-static.mindee.com/models?id=v0.7.0/vitstr_small-fcd12655.pt&src=0 to /root/.cache/doctr/models/vitstr_small-fcd12655.pt


  0%|          | 0/85720631 [00:00<?, ?it/s]

Downloading https://doctr-static.mindee.com/models?id=v0.3.1/crnn_vgg16_bn-9762b0b0.pt&src=0 to /root/.cache/doctr/models/crnn_vgg16_bn-9762b0b0.pt


  0%|          | 0/63286381 [00:00<?, ?it/s]

Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006387813.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51005442379.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006556728.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006913067.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51005361908.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006389898.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006556734.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51005361946.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006647984.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006557187.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51005719888.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51005447840.png
Processed /kaggle/input/find-it-again-dataset/findit2/train/X51006387953.png