In [2]:
import lmdb
import cv2
import os
import glob

def check_image_is_valid(image_path):
    """Report whether OpenCV can decode the file at ``image_path``.

    Args:
        image_path: Path of the image file to probe.

    Returns:
        True when ``cv2.imread`` yields an image object; False when it
        yields ``None`` or when any exception is raised while reading.
    """
    try:
        decoded = cv2.imread(image_path)
    except Exception:
        # Any failure while reading is treated the same as an undecodable file.
        return False
    return decoded is not None

def write_cache(env, cache):
    """Store every entry of ``cache`` in ``env`` using one write transaction.

    Args:
        env: Object whose ``begin(write=True)`` returns a context-managed
            transaction exposing ``put(key, value)``.
        cache: Mapping of string keys to values; each key is encoded with
            ``str.encode()`` before being stored, values are passed as-is.
    """
    transaction = env.begin(write=True)
    with transaction as txn:
        for key in cache:
            txn.put(key.encode(), cache[key])

def create_lmdb_dataset(image_dir, label_file, output_path, map_size=2 * 1024 * 1024 * 1024):
    """Build an LMDB text-recognition dataset from an image folder and a label file.

    Each non-blank line of ``label_file`` must look like
    ``<relative image path>\\t<label>``. For every line whose image exists and
    can be decoded, two records are stored: ``image-%09d`` (raw file bytes) and
    ``label-%09d`` (UTF-8 label). Record indices start at 1, and a final
    ``num-samples`` record holds the number of stored samples; this is the
    layout PaddleOCR / deep-text-recognition-benchmark LMDB readers look up.

    Args:
        image_dir: Directory that the image paths in ``label_file`` are relative to.
        label_file: UTF-8 text file (a leading BOM is tolerated) with one
            tab-separated ``path``/``label`` pair per line.
        output_path: Directory of the LMDB environment; created if missing.
        map_size: Maximum size in bytes the LMDB file may grow to. Defaults to
            2 GiB; raise it for larger datasets.

    Blank lines are ignored; lines without a tab and images that are missing
    or undecodable are reported on stdout and skipped.
    """
    os.makedirs(output_path, exist_ok=True)
    env = lmdb.open(output_path, map_size=map_size)

    cache = {}
    cnt = 0  # number of samples stored so far

    try:
        # "utf-8-sig" drops a BOM if present, which would otherwise become
        # part of the first image path and make that sample look missing.
        with open(label_file, "r", encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # blank lines carry no sample

                # Split on the first tab only so a label may itself contain tabs.
                parts = line.split("\t", 1)
                if len(parts) != 2:
                    print(f"Skip malformed label line: {line!r}")
                    continue
                img_path, label = parts

                img_path = os.path.join(image_dir, img_path)

                if not os.path.exists(img_path) or not check_image_is_valid(img_path):
                    print(f"Skip invalid image: {img_path}")
                    continue

                with open(img_path, "rb") as img_file:
                    image_bin = img_file.read()

                # Keys are 1-based: readers iterate indices 1..num-samples.
                cnt += 1
                cache[f'image-{cnt:09d}'] = image_bin
                cache[f'label-{cnt:09d}'] = label.encode('utf-8')

                if cnt % 1000 == 0:
                    write_cache(env, cache)
                    cache = {}
                    print(f"{cnt} images processed.")

        # Flush the remaining records together with the sample count.
        cache['num-samples'] = str(cnt).encode('utf-8')
        write_cache(env, cache)
        print(f"Final {cnt} images processed.")
    finally:
        # Always release the environment so its lock/file handles do not stay
        # held by a long-lived process such as a notebook kernel.
        env.close()

    print(f"Dataset created at: {output_path}")

if __name__ == "__main__":
    # Source images and their tab-separated label list, plus the folder that
    # receives the LMDB environment.
    image_dir, label_file = "./dataset/train_images/", "./dataset/train_labels.txt"
    output_path = "./train_data/data_lmdb_release/training/"

    create_lmdb_dataset(
        image_dir=image_dir,
        label_file=label_file,
        output_path=output_path,
    )


Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(1).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(10).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(1000).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100000).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100001).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100002).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100003).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100004).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100005).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100006).png
Skip invalid image: D:/Private/Project/kaggle/OCR/data_OCR/final_data\2a(100007

PaddlePaddle Version: 2.6.2
CUDA Available: True
GPU Count: 1


AttributeError: module 'paddle.device.cuda' has no attribute 'current_device'

In [4]:
import paddle

# Environment sanity check: print the installed Paddle version, the device
# Paddle reports via get_device(), and whether this build was compiled with CUDA.
print("Paddle version:", paddle.__version__)
print("Device:", paddle.device.get_device())
print("CUDA support:", paddle.is_compiled_with_cuda())


Paddle version: 2.6.2
Device: gpu:0
CUDA support: True


In [None]:
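# Reference commands, meant to be run from a PowerShell prompt rather than in this notebook.
# Setting PYTHONUTF8=1 turns on Python's UTF-8 mode before single-image recognition inference:
# $env:PYTHONUTF8=1
# python tools/infer_rec.py -c output/rec_ppocr_v4/config.yml -o Global.infer_img=data_OCR/final_data/img_1.jpg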
