# Setup / Imports

In [2]:
# Importing necessary functions
# https://www.geeksforgeeks.org/python-data-augmentation/
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import array_to_img, img_to_array, load_img
import numpy as np
from collections import deque
import random
from pathlib import Path  # to get filenames without extensions
import shutil  # to copy files
import string
from tqdm import tqdm

In [3]:
import os
# Current working directory at the time this cell runs; the cells below build
# every input and output path by joining onto this value.
cwd = os.getcwd()

# Generating Background Data

`frame_images_DB` can be downloaded from Prof. Lior Wolf's [YTFaces Dataset](https://www.cs.tau.ac.il/~wolf/ytfaces).

In [None]:
# Build a pool of background images: for every video folder found under
# frame_images_DB/<subfolder>/<video_folder>/, copy one randomly chosen frame
# into backgrounds/ as bg_1_<index>.jpg.
background_base_path = os.path.join(cwd, "frame_images_DB")
output_background_base_path = os.path.join(cwd, "backgrounds")
# exist_ok=True so that re-running this cell (Restart & Run All) does not fail
# when the output directory is already there.
os.makedirs(output_background_base_path, exist_ok=True)
# Running counter used in the output filenames. Deliberately not called `iter`,
# which would shadow the builtin for the rest of the kernel session.
background_index = 0
for subfolder in os.listdir(background_base_path):
    subf_path = os.path.join(background_base_path, subfolder)
    if not os.path.isdir(subf_path):  # skip stray files; folder names may legitimately contain '.'
        continue
    for video_folder in os.listdir(subf_path):
        vidf_path = os.path.join(subf_path, video_folder)
        if not os.path.isdir(vidf_path):
            continue
        frame_filenames = os.listdir(vidf_path)
        if not frame_filenames:  # random.choice raises IndexError on an empty list
            continue
        random_frame = random.choice(frame_filenames)
        shutil.copyfile(
            os.path.join(vidf_path, random_frame),
            os.path.join(output_background_base_path, "bg_1_" + str(background_index) + ".jpg"),
        )
        background_index += 1

# Generating Data with Hands

In [8]:
# Augmentation settings handed to the ImageDataGenerator constructor.
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
# width_shift_range / height_shift_range are intentionally left out (they were
# tried with a value of 10): shifting is irrelevant because the image will be
# placed in a larger image anyway.
augmentation_settings = {
    "rotation_range": 25,
    "brightness_range": (0.5, 1.7),
    "shear_range": 0.2,
    "zoom_range": 0.4,
    "fill_mode": "constant",
    "cval": 0,  # pixels exposed by the deformation are filled with black
    # Direction matters in sign language, so no mirroring in either axis.
    "horizontal_flip": False,
    "vertical_flip": False,
}
datagen = ImageDataGenerator(**augmentation_settings)

In [18]:
def get_nonzero_sub_image(image):
    """Locate the tightest box that still contains every non-black pixel.

    Rows and columns on the border of `image` that are entirely zero are
    excluded from the box.

    Parameters
    ----------
    image : array-like
        Array indexed as (row, col) or (row, col, channel, ...). A pixel counts
        as non-black when any of its channel values is non-zero.

    Returns
    -------
    tuple of int
        (min_row, min_col, max_row, max_col), all inclusive indices, so the
        sub-image is image[min_row:max_row + 1, min_col:max_col + 1].
        If the image has no non-zero pixel at all, the box spans the whole
        image: (0, 0, rows - 1, cols - 1).
    """
    nonzero_mask = np.asarray(image) != 0
    if nonzero_mask.ndim > 2:
        # Collapse the channel axes: one boolean per (row, col) position.
        nonzero_mask = nonzero_mask.any(axis=tuple(range(2, nonzero_mask.ndim)))

    rows_with_content = np.flatnonzero(nonzero_mask.any(axis=1))
    cols_with_content = np.flatnonzero(nonzero_mask.any(axis=0))

    if rows_with_content.size == 0:
        # Entirely black: fall back to the full image, using inclusive indices
        # like every other return value of this function.
        return (0, 0, max(nonzero_mask.shape[0] - 1, 0), max(nonzero_mask.shape[1] - 1, 0))

    return (
        int(rows_with_content[0]),
        int(cols_with_content[0]),
        int(rows_with_content[-1]),
        int(cols_with_content[-1]),
    )

In [19]:
def threshold_dark_background(image, norm_threshold=110):
    """Blacken the dark regions connected to the border of `image`, in place.

    A breadth-first flood fill is started from every border pixel. A visited
    pixel whose channel vector has a Euclidean norm below `norm_threshold` is
    set to zero and the fill continues through its 8 neighbours. A visited
    pixel that is not dark stops the fill; it is additionally set to zero when
    it lies in the last two rows or last two columns, to account for a border
    artefact in the dataset.

    Parameters
    ----------
    image : numpy.ndarray
        Array indexed as (row, col, channel). It is modified in place.
    norm_threshold : float, optional
        Pixels with a norm strictly below this value count as "dark".

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    if n_rows == 0 or n_cols == 0:
        return image  # no pixels, nothing to flood fill

    pixels_to_visit = deque()
    visited_pixels = set()
    # Seed the fill with the whole border.
    for row in range(n_rows):
        pixels_to_visit.append((row, 0))
        pixels_to_visit.append((row, n_cols - 1))
    for col in range(n_cols):
        pixels_to_visit.append((0, col))
        pixels_to_visit.append((n_rows - 1, col))

    while pixels_to_visit:
        next_pixel = pixels_to_visit.popleft()
        if next_pixel in visited_pixels:
            continue
        visited_pixels.add(next_pixel)
        row, col = next_pixel

        # Plain tuple indexing instead of image[*next_pixel]: same element, but
        # the starred form is a syntax error before Python 3.11.
        # float64 keeps the comparison independent of the image dtype.
        pixel_norm = np.linalg.norm(np.asarray(image[row, col], dtype=np.float64))
        if pixel_norm < norm_threshold:
            image[row, col] = 0  # dark: threshold to black so it is removable later
            for row_delta in (-1, 0, 1):
                for col_delta in (-1, 0, 1):
                    neighbour = (row + row_delta, col + col_delta)
                    if not (0 <= neighbour[0] < n_rows and 0 <= neighbour[1] < n_cols):
                        continue  # out of bounds
                    if neighbour in visited_pixels:
                        continue  # already handled; keeps the queue small
                    pixels_to_visit.append(neighbour)
        elif ((n_rows - 1) - row < 2) or ((n_cols - 1) - col < 2):
            # Account for the odd bright border on the bottom/right of the dataset images.
            image[row, col] = 0
    return image

In [20]:
def generate_xml(filename, path, image_shape, sign, bounds):
    """Build the XML annotation text for one generated image.

    Parameters
    ----------
    filename : str or path-like
        Written to the <filename> element.
    path : str or path-like
        Written to the <path> element.
    image_shape : sequence
        (height, width, depth) of the image, i.e. indices 0, 1 and 2 map to
        <height>, <width> and <depth>.
    sign : str
        Class label, written to the object's <name> element.
    bounds : sequence
        (xmin, ymin, xmax, ymax) of the bounding box.

    Returns
    -------
    str
        The complete <annotation> document. Text values are XML-escaped so that
        characters such as '&' or '<' in a path or filename cannot produce a
        malformed document.
    """
    # Local import keeps this function self-contained without touching the
    # notebook's import cell.
    from xml.sax.saxutils import escape

    height, width, depth = image_shape[0], image_shape[1], image_shape[2]
    xmin, ymin, xmax, ymax = bounds[0], bounds[1], bounds[2], bounds[3]
    return f'''<annotation>
    <folder>xml</folder>
    <filename>{escape(str(filename))}</filename>
    <path>{escape(str(path))}</path>
    <source>
        <database>Unknown</database>
    </source>
    <size>
        <width>{width}</width>
        <height>{height}</height>
        <depth>{depth}</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>{escape(str(sign))}</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>{xmin}</xmin>
            <ymin>{ymin}</ymin>
            <xmax>{xmax}</xmax>
            <ymax>{ymax}</ymax>
        </bndbox>
    </object>
</annotation>'''

Pathikreet's gestures dataset can be downloaded from his [Kaggle page](https://www.kaggle.com/datasets/pathikreet/indian).

In [21]:
def process_sign(sign, total_samples=None, iterations_per_sample=10, flatten_structure=False,
                 hand_size=(128, 128), background_size=(480, 270), pad_fraction=0.5):
    """Generate augmented, annotated training images for one sign.

    For each sampled hand image of `sign`: the dark background is thresholded
    to black, the hand is padded with black, randomly deformed with the
    module-level `datagen`, cropped to its non-black content and pasted at a
    random position onto a randomly chosen background image. Every result is
    saved as a JPEG together with an XML annotation holding the bounding box.

    Relies on names defined in other cells: `cwd`, `datagen`,
    `threshold_dark_background`, `get_nonzero_sub_image` and `generate_xml`.

    Parameters
    ----------
    sign : str
        Name of the sign; also the sub-folder of `pathikreet_dataset` that is read.
    total_samples : int or None, optional
        Number of input images to sample at random. None, or a value larger
        than the number of available files, means "use all of them".
    iterations_per_sample : int, optional
        Number of augmented images produced per input image.
    flatten_structure : bool, optional
        If True, images and XML files all go directly into `param_dataset`;
        otherwise into `param_dataset/img/<sign>` and `param_dataset/xml/<sign>`.
    hand_size : tuple of int, optional
        Size passed to the hand image's `resize` before processing.
    background_size : tuple of int, optional
        Size passed to the background image's `resize`.
    pad_fraction : float, optional
        Black padding added on every side of the hand, as a fraction of the
        hand array's number of rows.

    Returns
    -------
    None
    """
    sign_input_dir = os.path.join(cwd, 'pathikreet_dataset', sign)
    dataset_root = os.path.join(cwd, 'param_dataset')
    if flatten_structure:
        sign_output_img_dir = dataset_root
        sign_output_xml_dir = dataset_root
    else:
        sign_output_img_dir = os.path.join(dataset_root, 'img', sign)
        sign_output_xml_dir = os.path.join(dataset_root, 'xml', sign)
    os.makedirs(sign_output_img_dir, exist_ok=True)  # make output directory if it doesn't exist
    os.makedirs(sign_output_xml_dir, exist_ok=True)  # make output directory if it doesn't exist

    background_dir = os.path.join(cwd, 'backgrounds')
    # Listed once up front instead of once per sample.
    background_filenames = os.listdir(background_dir)

    all_filenames = os.listdir(sign_input_dir)
    if total_samples is None:
        total_samples = len(all_filenames)
    total_samples = min(total_samples, len(all_filenames))

    for img_filename in tqdm(random.sample(all_filenames, total_samples)):
        hand_img = load_img(os.path.join(sign_input_dir, img_filename))
        hand_img = hand_img.resize(hand_size)
        background_img = load_img(os.path.join(background_dir, random.choice(background_filenames)))
        background_img = background_img.resize(background_size)

        # Converting the images to arrays
        hand_array = img_to_array(hand_img)
        bg_array = img_to_array(background_img)
        pad_by = int(hand_array.shape[0] * pad_fraction)

        hand_array_thresholded = threshold_dark_background(hand_array)

        # Black padding around the hand leaves room for the deformation.
        hand_array_padded = np.pad(
            hand_array_thresholded,
            ((pad_by, pad_by), (pad_by, pad_by), (0, 0)),
            mode='constant',
        )

        # datagen.flow expects a batch axis in front.
        hand_batch = np.expand_dims(hand_array_padded, axis=0)

        # datagen.flow yields batches indefinitely; zipping with range() stops
        # after exactly iterations_per_sample batches, so no augmented image is
        # generated only to be thrown away.
        augmented_batches = datagen.flow(hand_batch, batch_size=1)
        for iteration, deformed_image_list in zip(range(iterations_per_sample), augmented_batches):
            deformed_image = deformed_image_list[0]
            # Crop away the black borders on every side.
            min_r, min_c, max_r, max_c = get_nonzero_sub_image(deformed_image)
            deformed_image_cropped = deformed_image[min_r:max_r + 1, min_c:max_c + 1, :]

            final_processed_array = np.copy(bg_array)
            bg_height, bg_width = final_processed_array.shape[0], final_processed_array.shape[1]

            # Guard for cut-outs larger than the background (possible with
            # non-default sizes): trim them so that placement stays valid.
            deformed_image_cropped = deformed_image_cropped[:bg_height, :bg_width, :]
            crop_height, crop_width = deformed_image_cropped.shape[0], deformed_image_cropped.shape[1]

            # randint's upper bound is exclusive; +1 allows a placement flush
            # with the bottom/right edge and avoids a ValueError when the
            # cut-out is exactly as large as the background.
            hand_y_placement = np.random.randint(0, bg_height - crop_height + 1)
            hand_x_placement = np.random.randint(0, bg_width - crop_width + 1)

            # Paste only the non-black pixels of the hand onto the background.
            target_region = final_processed_array[
                hand_y_placement:hand_y_placement + crop_height,
                hand_x_placement:hand_x_placement + crop_width,
                :,
            ]  # a view: writing to it writes into final_processed_array
            hand_mask = deformed_image_cropped.max(axis=2) > 0
            target_region[hand_mask] = deformed_image_cropped[hand_mask]

            bounds = (
                hand_x_placement,
                hand_y_placement,
                hand_x_placement + crop_width,
                hand_y_placement + crop_height,
            )
            final_processed_img = array_to_img(final_processed_array)

            output_filename = sign + "_" + Path(img_filename).stem + "_" + str(iteration)
            output_img_filename = output_filename + '.jpg'
            output_img_path = os.path.join(sign_output_img_dir, output_img_filename)
            output_xml_path = os.path.join(sign_output_xml_dir, output_filename + '.xml')

            final_processed_img.save(output_img_path)
            corresponding_xml_text = generate_xml(
                output_img_filename, output_img_path, final_processed_array.shape, sign, bounds
            )
            # Explicit encoding so the file does not depend on the platform default.
            with open(output_xml_path, "w", encoding="utf-8") as xml_outfile:
                xml_outfile.write(corresponding_xml_text)

In [22]:
# Labels to generate data for: the digits 1-9, the letters A-Z and the 'none' class.
all_signs = [*map(str, range(1, 10)), *string.ascii_uppercase, 'none']
for sign in all_signs:
    print("Processing signs for", sign)
    process_sign(sign, flatten_structure=True, iterations_per_sample=1, total_samples=300)

Processing signs for 1


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.32it/s]


Processing signs for 2


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:08<00:00,  4.41it/s]


Processing signs for 3


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.32it/s]


Processing signs for 4


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:06<00:00,  4.49it/s]


Processing signs for 5


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.31it/s]


Processing signs for 6


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.30it/s]


Processing signs for 7


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:13<00:00,  4.10it/s]


Processing signs for 8


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:13<00:00,  4.09it/s]


Processing signs for 9


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.33it/s]


Processing signs for A


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:13<00:00,  4.06it/s]


Processing signs for B


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.35it/s]


Processing signs for C


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:10<00:00,  4.23it/s]


Processing signs for D


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:34<00:00,  3.17it/s]


Processing signs for E


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:10<00:00,  4.25it/s]


Processing signs for F


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:14<00:00,  4.04it/s]


Processing signs for G


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:12<00:00,  4.16it/s]


Processing signs for H


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:04<00:00,  4.63it/s]


Processing signs for I


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.33it/s]


Processing signs for J


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:12<00:00,  4.16it/s]


Processing signs for K


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:17<00:00,  3.89it/s]


Processing signs for L


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:10<00:00,  4.23it/s]


Processing signs for M


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.31it/s]


Processing signs for N


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:11<00:00,  4.17it/s]


Processing signs for O


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:03<00:00,  4.70it/s]


Processing signs for P


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:10<00:00,  4.23it/s]


Processing signs for Q


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:14<00:00,  4.03it/s]


Processing signs for R


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:05<00:00,  4.58it/s]


Processing signs for S


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:04<00:00,  4.65it/s]


Processing signs for T


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:08<00:00,  4.39it/s]


Processing signs for U


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:07<00:00,  4.47it/s]


Processing signs for V


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:07<00:00,  4.44it/s]


Processing signs for W


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:05<00:00,  4.56it/s]


Processing signs for X


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:08<00:00,  4.35it/s]


Processing signs for Y


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:13<00:00,  4.06it/s]


Processing signs for Z


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:12<00:00,  4.11it/s]


Processing signs for none


100%|████████████████████████████████████████████████████████████████████████████████| 125/125 [00:21<00:00,  5.78it/s]
