# Data Cleaning & Training Preparation

__Goals:__ 
1. Rename the collected raw data files
1. Supplement the images to increase the number of images available for training

## 0 Setup

### 0.1 Imports

In [20]:
# import utility libraries
import cv2
import os
import random
import re

# import data analysis libraries
import matplotlib.pyplot as plt
import numpy as np

## 1. Rename raw data

### 1.1 Renaming utility functions

In [None]:
def rename_training_data(src_dir: str, dest_dir: str):
    '''
        move every file in src_dir into dest_dir under a sequential,
        zero-padded name (000001.jpg, 000002.jpg, ...)

        src_dir - directory holding the raw image files
        dest_dir - directory that receives the renamed files; it is created
                   (including parents) when it does not exist yet

        Files are numbered in the sorted order of their original names.
        Sub-directories of src_dir are left untouched and do not consume a
        number. Every file receives a .jpg extension regardless of its
        original one.

        NOTE: os.rename is used for the move, so a file that already exists
        under the same name in dest_dir may be replaced (platform dependent).
    '''

    # os.rename raises if the destination directory is missing
    os.makedirs(dest_dir, exist_ok=True)

    # only regular files are images; skip folders such as notebook checkpoints
    old_fnames = [
        fname for fname in sorted(os.listdir(src_dir))
        if os.path.isfile(os.path.join(src_dir, fname))
    ]

    for i, old_fname in enumerate(old_fnames, start=1):
        new_fname = f'{i:06d}.jpg'

        old_fpath = os.path.join(src_dir, old_fname)
        new_fpath = os.path.join(dest_dir, new_fname)

        os.rename(old_fpath, new_fpath)

### 1.2 Execute rename

In [21]:
# path of the top-level training data directory (nothing is created here)
train_dirpath = '../data/train'

# per-class training directories:
#   "braces"    - images labelled as needing braces
#   "no_braces" - images labelled as not needing braces
braces_train_dirpath, no_braces_train_dirpath = (
    os.path.join(train_dirpath, label) for label in ('braces', 'no_braces')
)

Upload the images saved from stage __2 Raw Image Data Extraction__ into the `data/raw/braces` and `data/raw/no_braces` directories.

In [25]:

# move the uploaded raw images of each class into its training directory
for raw_dirpath, dest_dirpath in (
    ('../data/raw/braces', braces_train_dirpath),
    ('../data/raw/no_braces/', no_braces_train_dirpath),
):
    rename_training_data(raw_dirpath, dest_dirpath)

## 2 Image supplementation

### 2.1 Set Up

In [None]:
# path constants shared by the image supplementation cells
DATA_DIR = '../data'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')

# one training sub-directory per class label
BRACES_DIR = os.path.join(DATA_DIR, 'train', 'braces')
NOBRACES_DIR = os.path.join(DATA_DIR, 'train', 'no_braces')

In [None]:
class ImageSupplementer:
    '''
        Generates additional training images from an existing image by
        zooming (cropping all four edges) and shifting (cropping one edge).

        Images are handled as arrays where shape[0] is the height (rows) and
        shape[1] is the width (columns).
    '''

    def __init__(self):
        pass

    def resize(self, img, output_dim=None):
        '''
            resize an image with cv2.resize

            img - image array
            output_dim - optional (width, height) pair; when omitted the
                         image's current width and height are used

            returns - the resized image array
        '''

        # if output dimensions are given, use them
        if output_dim is not None:
            output_width = output_dim[0]
            output_height = output_dim[1]

        # else keep the current width and height of the image
        else:
            output_width = img.shape[1]
            output_height = img.shape[0]

        # cv2.resize expects the target size as (width, height)
        return cv2.resize(img, (output_width, output_height))


    def zoom(self, img, zoom_max = 10, zoom_min = 5):
        '''
            zoom into the image by a random number of pixels

            A zoom amount is drawn uniformly from zoom_min..zoom_max
            (inclusive); half of it (floored) is trimmed from every edge and
            the result is resized back to the original dimensions.

            returns - the zoomed image array
        '''

        # randomly select the zoom amount (both bounds inclusive)
        zoom_px = random.randint(zoom_min, zoom_max)

        # divide by 2 bc the zoom is split evenly between opposite edges
        half_zoom = zoom_px // 2

        # get original image dimensions
        img_width = img.shape[1]
        img_height = img.shape[0]

        # slice image array based on zoom amount
        output_img = img[half_zoom:(img_height - half_zoom), half_zoom:(img_width - half_zoom)]

        # resize image to original dimensions
        return self.resize(output_img, output_dim=(img_width, img_height))


    @staticmethod
    def _shift_crops(img, left_shift, up_shift, right_shift, down_shift):
        '''
            crop one edge of the image per direction

            returns - tuple (left, up, right, down) where
                left  keeps the left-most columns (right edge trimmed)
                up    keeps the top-most rows (bottom edge trimmed)
                right keeps the right-most columns (left edge trimmed)
                down  keeps the bottom-most rows (top edge trimmed)
        '''

        img_width = img.shape[1]
        img_height = img.shape[0]

        return (
            img[:, :(img_width - left_shift)],
            img[:(img_height - up_shift), :],
            img[:, right_shift:],
            img[down_shift:, :],
        )


    def shift(self, img, shift_max = 10, shift_min = 5):
        '''returns 4 shifted images in a tuple:
                (left_shift, up_shift, right_shift, down_shift)

            Each shift amount is drawn independently and uniformly from
            shift_min..shift_max (inclusive). Each cropped image is then,
            with probability 0.5, resized back to the original dimensions;
            otherwise it is returned at its cropped size.
        '''

        # independent draws, so ranges with fewer than 4 values also work
        shifts = [random.randint(shift_min, shift_max) for _ in range(4)]

        # decide per direction whether to stretch back to the original size
        resize_flags = [v > 0.5 for v in np.random.random(size=4)]

        img_width = img.shape[1]
        img_height = img.shape[0]

        crops = self._shift_crops(img, *shifts)

        return tuple(
            self.resize(crop, output_dim=(img_width, img_height)) if do_resize else crop
            for crop, do_resize in zip(crops, resize_flags)
        )


    def altered_img(self, img, zoom_max = 10, zoom_min = 5, shift_max = 10, shift_min = 5):
        '''
            convenience function to zoom and shift image randomly within a given range

            returns - list of images in the format [zoomed, shifted left, shifted up, shifted right, shifted down]
        '''

        img_zoom = self.zoom(img, zoom_max, zoom_min)
        img_left, img_up, img_right, img_down = self.shift(img, shift_max, shift_min)

        return [img_zoom, img_left, img_up, img_right, img_down]

    def read_img(self, fpath):
        '''
            convenience function to read image (overwrite if needed)
        '''

        return plt.imread(fpath)

    def generate_alt_imgs(self, fpath, zoom_max = 10, zoom_min = 5, shift_max = 10, shift_min = 5):
        '''
            convenience function to read and alter images

            returns - same list as altered_img, for the image stored at fpath
        '''

        img = self.read_img(fpath)
        return self.altered_img(img, zoom_max, zoom_min, shift_max, shift_min)

    def generate_alt_img_fpaths(self, fpath):
        '''
            build the file paths for the altered versions of the image at fpath

            Each path sits in the same directory and keeps the extension of
            fpath, with a suffix added to the base name.

            returns - list of paths in the format
                      [<base>_zoom, <base>_left, <base>_up, <base>_right, <base>_down]
                      (same order as the images returned by altered_img)
        '''

        dirpath, fname = os.path.split(fpath)
        fbase, ext = os.path.splitext(fname)

        return [
            os.path.join(dirpath, f'{fbase}_{alt}' + ext)
            for alt in ['zoom', 'left', 'up', 'right', 'down']
        ]

    def generate_and_save_imgs(self, fpath, zoom_max = 10, zoom_min = 5, shift_max = 10, shift_min = 5):
        '''
            generate the altered images for the image at fpath and save each
            one next to the original under its suffixed file name
        '''

        new_fpaths = self.generate_alt_img_fpaths(fpath)
        new_imgs = self.generate_alt_imgs(fpath, zoom_max, zoom_min, shift_max, shift_min)
        print('successfully generated alternate images...')

        # distinct loop names so the fpath argument is not shadowed
        for new_fpath, new_img in zip(new_fpaths, new_imgs):
            plt.imsave(new_fpath, new_img)
        print('successfully saved alternate images.')

### 2.2 Execute image supplementation

In [None]:
# original (un-supplemented) images are named with exactly six digits before
# the extension, e.g. 000001.jpg; supplemented files carry a suffix such as
# 000001_zoom.jpg and therefore do not match
original_img_re = re.compile(r'[0-9]{6}\.[A-Za-z]{1,3}')

# os.listdir returns names in arbitrary order, so sort them to make the
# processing order (and img_fpaths[0]) reproducible between runs

# get the file paths for the "needs braces" training images
braces_fpaths = [os.path.join(BRACES_DIR, fname) for fname in sorted(os.listdir(BRACES_DIR)) if original_img_re.match(fname)]
# get the file paths for the "doesn't need braces" training images
nobraces_fpaths = [os.path.join(NOBRACES_DIR, fname) for fname in sorted(os.listdir(NOBRACES_DIR)) if original_img_re.match(fname)]

# concatenate the file paths
img_fpaths = braces_fpaths + nobraces_fpaths

In [None]:
# helper object that produces and writes the altered images
img_supplementer = ImageSupplementer()

# zoom / shift ranges applied to every original image
supplement_ranges = {'zoom_max': 15, 'zoom_min': 5, 'shift_max': 15, 'shift_min': 5}
separator = '='*50

# write the altered copies of each original image next to it
for img_fpath in img_fpaths:
    print(f'supplementing {img_fpath}...')
    img_supplementer.generate_and_save_imgs(img_fpath, **supplement_ranges)
    print(separator)

### 2.3 Review image supplementation

In [None]:
# review the first original image together with its supplemented versions
img_1_fpath = img_fpaths[0]
img_1_dir, img_1_fname = os.path.split(img_1_fpath)
a = os.path.splitext(img_1_fname)[0]

# img_fpaths only lists the originals, so look in the image's own directory
# for every file sharing its base name (e.g. 000001.jpg, 000001_zoom.jpg, ...)
img_1_arr = sorted(
    os.path.join(img_1_dir, fname)
    for fname in os.listdir(img_1_dir)
    if fname.startswith(a)
)

# squeeze=False always yields a 2-D array of axes, even for a single row;
# flatten it so axs can be indexed per image
fig, axs = plt.subplots(nrows=len(img_1_arr), ncols=1, figsize=(8, 42), squeeze=False)
axs = axs.ravel()
for ax, fpath in zip(axs, img_1_arr):
    ax.set_title(fpath)
    ax.imshow(plt.imread(fpath))