In [1]:
import colorsys
import os
import random
import re

import pandas as pd
from PIL import Image, ImageDraw, ImageFont, ImageFilter

In [2]:
# Load the sentence corpus; later cells read its 'text' column.
SENTENCES_CSV = 'sentences.csv'
df = pd.read_csv(SENTENCES_CSV)

In [3]:
def generate_random_hiragana(n):
    """Return a string of ``n`` random hiragana characters.

    Each character is drawn independently with ``random.randint`` from the
    code point range U+3041..U+3096 (inclusive on both ends).

    Args:
        n: Number of characters to generate. Values <= 0 yield ``''``.

    Returns:
        A ``str`` of length ``max(n, 0)``.
    """
    # Every code point in the sampled range is already a valid pick, so no
    # regex filtering (and no per-call regex compilation) is needed.
    return ''.join(chr(random.randint(0x3041, 0x3096)) for _ in range(n))

In [4]:

def generate_random_color():
    """Return a random ``(red, green, blue)`` tuple.

    Every channel is an int drawn from 20..255 inclusive, so the result is
    never black.
    """
    lowest, highest = 20, 255
    # Channels are drawn in red, green, blue order.
    channels = [random.randint(lowest, highest) for _ in range(3)]
    return tuple(channels)


In [5]:
def generate_img_with_random_furi(sentence, file_name,
                                  font_path="fonts/NotoSerifJP/NotoSerifJP-Black.otf",
                                  font_size=16,
                                  furigana_font_size=8,
                                  image_width=50):
    """Render ``sentence`` as one vertical column of text and save it to ``file_name``.

    The characters are drawn top to bottom in black. Several random
    augmentations are applied, each decided by its own ``random.random()`` draw:

    * background: random colour with probability 0.3, otherwise white;
    * per character: two random hiragana drawn to the right of it as fake
      furigana with probability 0.2;
    * Gaussian blur with probability 0.3 (radius ~ N(0.5, 0.2) clamped to [0, 1]);
    * horizontal shear with probability 0.3 (amount ~ N(0, 0.1)).

    Args:
        sentence: Non-empty text to render, one character per row.
        file_name: Destination path handed to ``Image.save``.
        font_path: Font file used for both the main text and the furigana.
        font_size: Size of the main text font.
        furigana_font_size: Size of the furigana font.
        image_width: Width of the generated image in pixels.

    Raises:
        ValueError: If ``sentence`` is empty (there is no character to
            measure the row height from).
    """
    if not sentence:
        raise ValueError("sentence must contain at least one character")

    font = ImageFont.truetype(font_path, font_size)
    furigana_font = ImageFont.truetype(font_path, furigana_font_size)

    # Top-left corner of the first character.
    x, y = 10, 10

    # Every row uses the same square cell, sized from the FIRST character only:
    # the larger of its bbox right edge and bbox height, plus padding.
    PADDING = 2
    first_bbox = font.getbbox(sentence[0])
    char_height = first_bbox[3] - first_bbox[1]
    char_width = first_bbox[2]
    char_size = PADDING + max(char_width, char_height)

    # Total height of the vertical sentence.
    total_height = char_size * len(sentence)

    if random.random() < 0.3:
        background_color = generate_random_color()
    else:
        background_color = 'white'

    image = Image.new("RGB", (image_width, total_height + font_size), background_color)
    draw = ImageDraw.Draw(image)

    for char in sentence:
        draw.text((x, y), char, font=font, fill="black")

        # Random furigana, stacked vertically to the right of the character.
        if random.random() < 0.2:
            furigana_txt = generate_random_hiragana(2)
            # Small hand-tuned offsets to make the furigana placement look better.
            furigana_x = x - 2 + char_size
            furigana_y = y + 0.1
            for furigana_char in furigana_txt:
                draw.text((furigana_x, furigana_y), furigana_char, font=furigana_font, fill='black')
                furigana_y += char_size / 2

        # Move down one cell for the next character.
        y += char_size

    # Randomly blur.
    if random.random() < 0.3:
        mean_radius = 0.5
        std_dev_radius = 0.2  # spread of the radius distribution
        blur_radius = max(0, min(1, random.gauss(mean_radius, std_dev_radius)))
        image = image.filter(ImageFilter.GaussianBlur(radius=blur_radius))

    # Randomly shear along the x-axis.
    if random.random() < 0.3:
        mean_shear = 0.0
        std_dev_shear = 0.1  # spread of the shear distribution
        shear_amount = random.gauss(mean_shear, std_dev_shear)
        image = image.transform(
            image.size,
            Image.AFFINE,
            (1, shear_amount, 0, 0, 1, 0),
            resample=Image.BICUBIC,
            # Pixels that the shear pulls in from outside the source image are
            # otherwise not painted with the background (in practice they come
            # out black, the same colour as the text).
            fillcolor=background_color,
        )

    image.save(file_name)


In [6]:
import numpy as np

def generate_string_with_gaussian_chunks(base_string, mean_length=6, std_dev=1):
    """Split ``base_string`` into consecutive chunks of Gaussian-distributed length.

    Chunk lengths are sampled with ``np.random.normal(mean_length, std_dev)``,
    truncated to int and clamped to a minimum of 1. The chunks are taken left
    to right without gaps or overlap, so ``''.join(result) == base_string``;
    the last chunk is cut short at the end of the string.

    Args:
        base_string: The string to split.
        mean_length: Mean of the sampled chunk length.
        std_dev: Standard deviation of the sampled chunk length.

    Returns:
        A list of non-empty substrings; ``[]`` for an empty input.
    """
    string_length = len(base_string)
    if string_length == 0:
        return []

    # One sample per character is always enough, because every chunk consumes
    # at least one character.
    chunk_lengths = np.random.normal(loc=mean_length, scale=std_dev, size=string_length).astype(int)

    chunks = []
    start = 0
    for sampled_length in chunk_lengths:
        # Clamp BEFORE slicing: a sampled length of 0 would give an empty chunk
        # and a negative one would turn into a wrong (end-relative) slice bound.
        length = max(1, int(sampled_length))
        chunks.append(base_string[start:start + length])
        start += length
        if start >= string_length:
            break
    return chunks

# Example usage
# input_string = "abcdefghijklmnopqrstuvwxyz"
# generate_string_with_gaussian_chunks(input_string)


In [12]:
# Sanity check: chunk the first sentence of the corpus and display the result.
first_sentence = df['text'].iloc[0]
chunks = generate_string_with_gaussian_chunks(first_sentence)
chunks

['午後から雨', 'が心配だっ', 'たので遠出', 'はせず、『', 'ふれあいロ', 'ード』を走', 'って来ました', '！']

In [8]:
# Column of sentences that the image-generation loop iterates over.
text = df['text']

In [9]:
limit = 100000  # maximum number of images to generate
output_dir = './train_imgs'
# Image.save does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# the file name is just the index of the label
dict_training = {'labels': []}

counter = 0
limit_broken = False
for sentence in text:
    chunks = generate_string_with_gaussian_chunks(sentence)
    for chunk in chunks:
        if not chunk:
            # Nothing to render; skipping keeps label index == image file index.
            continue
        file_name = f'{counter}.jpg'
        generate_img_with_random_furi(chunk, f'{output_dir}/{file_name}')
        dict_training['labels'].append(chunk)
        counter += 1
        if counter >= limit:
            limit_broken = True
            break
    # Propagate the inner break so generation stops once the limit is hit.
    if limit_broken:
        break


In [10]:
# Build the label table under its own name so the corpus frame `df`
# (read via df['text'] by earlier cells) is not overwritten and those
# cells can still be re-run.
labels_df = pd.DataFrame(dict_training)

# Specify the CSV file path
csv_file_path = 'training_labels.csv'

# Save the labels to CSV; row i is the label of image '<i>.jpg'
labels_df.to_csv(csv_file_path, index=False)
