In [1]:
import os
import unicodedata
import random
import math
from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageOps
import numpy as np

# --- Inputs -----------------------------------------------------------------
# Devanagari text corpus and the directory that holds the .ttf/.otf fonts.
corpus_file = "ne.txt"
fonts_dir = "fonts/static"

# --- Outputs ----------------------------------------------------------------
# Rendered images go to output_images; the ground-truth file goes to output_text.
output_images = "data/images"
output_text = "data/"

# --- Generation parameters --------------------------------------------------
num_images = 10000            # how many images to render in total
max_lines_per_image = 25      # upper bound on corpus lines joined into one image
image_size = (256, 64)        # (width, height) in pixels
# NOTE(review): draw_text samples its own font-size range by default; this
# constant does not appear to be read by the other cells shown here - confirm.
font_size_range = (32, 48)    # (min, max) font size

# Make sure both output folders exist before anything is written to them.
for _out_dir in (output_images, output_text):
    os.makedirs(_out_dir, exist_ok=True)


In [2]:
def apply_skew(img, max_angle=5, fillcolor="white"):
    """Apply a random horizontal shear to ``img`` via an affine transform.

    Args:
        img: Image object exposing ``size`` and ``transform`` (a PIL image).
        max_angle: The shear angle is drawn uniformly from
            ``[-max_angle, max_angle]`` degrees.
        fillcolor: Colour used for output pixels whose source falls outside
            the input image. Defaults to ``"white"`` so the wedges exposed by
            the shear match the white canvas (the same padding colour that
            ``apply_rotation`` uses) instead of Pillow's default fill.

    Returns:
        The sheared image, same size as the input.
    """
    angle = random.uniform(-max_angle, max_angle)
    shear = math.tan(math.radians(angle))
    return img.transform(
        img.size,
        Image.AFFINE,
        # Output (x, y) samples input (x + shear * y, y): rows shift
        # horizontally in proportion to their y coordinate.
        (1, shear, 0, 0, 1, 0),
        resample=Image.BICUBIC,
        fillcolor=fillcolor,
    )

def apply_blur(img, max_radius=1.5):
    """Gaussian-blur ``img`` with a radius drawn uniformly from [0, max_radius].

    When the sampled radius is not strictly positive the input image is
    returned untouched; otherwise the filtered copy is returned.
    """
    sigma = random.uniform(0, max_radius)
    if not sigma > 0:
        # Nothing to blur with a zero radius.
        return img
    return img.filter(ImageFilter.GaussianBlur(radius=sigma))

def apply_rotation(img, max_angle=5):
    """Rotate ``img`` by a random angle in ``[-max_angle, max_angle]`` degrees.

    The rotation is requested with ``expand=True`` and a white fill colour,
    and the object returned by ``img.rotate`` is handed back to the caller.
    """
    degrees = random.uniform(-max_angle, max_angle)
    rotated = img.rotate(degrees, expand=True, fillcolor="white")
    return rotated

def change_text_color(draw, x, y, text, font, colors=["black"]):
    """Draw ``text`` at ``(x, y)`` using a colour picked at random from ``colors``.

    ``draw`` must provide a ``text`` method accepting a position, the string,
    and ``font`` / ``fill`` keyword arguments. Nothing is returned.
    """
    chosen = random.choice(colors)
    position = (x, y)
    draw.text(position, text, font=font, fill=chosen)

# def apply_sin_cos_distortion(img, amplitude_range=(2,5), frequency_range=(0.05,0.15)):
#     """Apply vertical sin/cos wave distortion."""
#     arr = np.array(img)
#     h, w = arr.shape[:2]
#     amplitude = random.uniform(*amplitude_range)
#     frequency = random.uniform(*frequency_range)
    
#     new_arr = np.zeros_like(arr)
#     for i in range(h):
#         shift = int(amplitude * math.sin(2*math.pi*frequency*i))
#         new_arr[i] = np.roll(arr[i], shift, axis=0)
#     return Image.fromarray(new_arr)

def decenter_text(x, y, text_width, text_height, img_width, img_height, margin=10):
    """Pick a random top-left position for a text box inside an image.

    The incoming ``x`` and ``y`` are ignored; fresh coordinates are sampled.
    Each coordinate is an integer between ``margin`` and
    ``image extent - text extent - margin`` (inclusive). If that upper bound
    would fall below ``margin`` the coordinate is simply ``margin``.

    Returns:
        ``(x, y)`` tuple of the sampled position.
    """
    x_upper = img_width - text_width - margin
    if x_upper < margin:
        x_upper = margin

    y_upper = img_height - text_height - margin
    if y_upper < margin:
        y_upper = margin

    # x is sampled before y (tuple elements evaluate left to right).
    return random.randint(margin, x_upper), random.randint(margin, y_upper)

def _split_render_clusters(text):
    """Split ``text`` into units that have to be drawn together.

    A new unit starts at every character except when the character
    * is a combining mark (Unicode category ``M*``: matras, nukta, anusvara...),
    * is ZWJ / ZWNJ, or
    * is a letter that directly follows a virama (canonical combining
      class 9) or a ZWJ, i.e. continues a conjunct.

    This is a lightweight approximation of grapheme clustering, not a full
    UAX #29 implementation.
    """
    zwj, zwnj = "\u200d", "\u200c"
    clusters = []
    for ch in text:
        if clusters:
            prev = clusters[-1][-1]
            category = unicodedata.category(ch)
            attaches = category.startswith("M") or ch in (zwj, zwnj)
            continues_conjunct = category.startswith("L") and (
                unicodedata.combining(prev) == 9 or prev == zwj
            )
            if attaches or continues_conjunct:
                clusters[-1] += ch
                continue
        clusters.append(ch)
    return clusters


def adjust_char_spacing(draw, x, y, text, font, spacing_range=(0, 5), fill="black"):
    """Draw ``text`` left to right with random extra spacing between glyph units.

    The text is drawn cluster by cluster rather than code point by code
    point: in Devanagari a vowel sign or a virama conjunct drawn in isolation
    cannot be shaped together with its base consonant, so splitting them
    would render wrong glyphs. For text where every code point is its own
    unit the behaviour is the same as drawing per character.

    Args:
        draw: Object providing ``text`` and ``textbbox`` (a PIL ``ImageDraw``).
        x, y: Starting position of the first unit.
        text: String to draw.
        font: Font passed through to ``draw``.
        spacing_range: ``(low, high)`` inclusive bounds of the random integer
            gap added after each unit.
        fill: Text colour.

    Returns:
        ``(x, y)`` where ``x`` is the pen position after the last unit and its
        trailing gap, and ``y`` is unchanged.
    """
    for cluster in _split_render_clusters(text):
        draw.text((x, y), cluster, font=font, fill=fill)
        # Measure the unit that was just drawn to know how far to advance.
        bbox = draw.textbbox((x, y), cluster, font=font)
        cluster_width = bbox[2] - bbox[0]
        x += cluster_width + random.randint(*spacing_range)
    return x, y


In [3]:
def draw_text(text, fontfile, output_path, base_image_size, font_size_range=(20, 48)):
    """Render ``text`` in black on a white canvas, augment it and save it.

    Steps: NFC-normalise the text, sample a font size, shrink the font while
    the text is wider than the base canvas, size the canvas, place the text
    at a random offset, apply random skew / blur / rotation and write the
    result to ``output_path``.

    Args:
        text: String to render.
        fontfile: Path of a font loadable with ``ImageFont.truetype``.
        output_path: Destination passed to ``Image.save``.
        base_image_size: ``(width, height)`` minimum canvas size in pixels.
        font_size_range: ``(min, max)`` inclusive font sizes. The starting
            size is sampled from this range and ``min`` is the floor of the
            shrink loop. The default keeps the previously hard-coded 20-48.

    Notes:
        The canvas grows beyond ``base_image_size`` in either dimension when
        the text plus margins does not fit, so neither long nor tall text is
        clipped. Rotation uses ``expand=True`` and may enlarge it further.
    """
    text = unicodedata.normalize("NFC", text)

    min_font_size, max_font_size = font_size_range
    margin = 10            # same value as decenter_text's default margin
    padding = 2 * margin   # margin on both sides of the text
    base_width, base_height = base_image_size

    # Start with a random font size
    font_size = random.randint(min_font_size, max_font_size)
    pil_font = ImageFont.truetype(fontfile, font_size)

    # Scratch canvas: only needed to obtain a Draw object for measuring.
    measure = ImageDraw.Draw(Image.new("RGB", base_image_size, color="white"))
    bbox = measure.textbbox((0, 0), text, font=pil_font)

    # Shrink font if too wide, but never below the minimum size
    while (bbox[2] - bbox[0]) + padding > base_width and font_size > min_font_size:
        font_size -= 1
        pil_font = ImageFont.truetype(fontfile, font_size)
        bbox = measure.textbbox((0, 0), text, font=pil_font)

    left, top, right, bottom = bbox
    text_width = right - left
    text_height = bottom - top

    # Grow the canvas in whichever dimension the text does not fit.
    img_width = max(base_width, text_width + padding)
    img_height = max(base_height, text_height + padding)

    # Create the final image
    img = Image.new("RGB", (img_width, img_height), color="white")
    draw = ImageDraw.Draw(img)

    # Random decentering of the ink box inside the canvas
    x, y = decenter_text(0, 0, text_width, text_height, img_width, img_height, margin=margin)

    # textbbox((0, 0), ...) reports where the ink lands relative to the draw
    # origin; its left/top are generally not zero. Subtract them so the ink
    # box (not the draw origin) sits at (x, y) - otherwise the text is pushed
    # down/right by that offset and can run past the canvas edge.
    draw.text((x - left, y - top), text, font=pil_font, fill="black")

    # Apply random augmentations
    if random.random() < 0.5:
        img = apply_skew(img)
    if random.random() < 0.5:
        img = apply_blur(img)
    if random.random() < 0.3:
        img = apply_rotation(img)
    # if random.random() < 0.3:
    #     img = apply_sin_cos_distortion(img)

    img.save(output_path)



In [5]:
gt_file_path = os.path.join(output_text, "gt.txt")

# Load the corpus: one candidate text line per non-blank input line.
with open(corpus_file, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Sorted so the candidate list does not depend on directory listing order.
font_files = sorted(
    os.path.join(fonts_dir, f) for f in os.listdir(fonts_dir)
    if f.lower().endswith((".ttf", ".otf"))
)

# Fail early with a clear message - and before gt.txt is truncated - instead
# of an IndexError from the random module inside the loop.
if not lines:
    raise ValueError(f"No non-empty lines found in corpus file: {corpus_file}")
if not font_files:
    raise FileNotFoundError(f"No .ttf/.otf fonts found in: {fonts_dir}")

counter = 0
with open(gt_file_path, "w", encoding="utf-8") as gt_file:
    for _ in range(num_images):
        # pick random number of lines to concatenate
        num_lines = random.randint(1, max_lines_per_image)
        selected_lines = random.choices(lines, k=num_lines)
        # Concatenate with spaces and NFC-normalise here, so the label
        # written to gt.txt is the same code point sequence that draw_text
        # renders (it normalises its input to NFC as well).
        text = unicodedata.normalize("NFC", " ".join(selected_lines))

        font_file = random.choice(font_files)
        file_name = f"image_{counter:05d}.jpg"
        output_path = os.path.join(output_images, file_name)

        draw_text(text, font_file, output_path, image_size)

        # write to gt.txt: "<file name> <text>" per line
        gt_file.write(f"{file_name} {text}\n")

        counter += 1

print(f"Generated {counter} images in {output_images}")
print(f"GT file saved at: {gt_file_path}")


Generated 10000 images in data/images
GT file saved at: data/gt.txt
