# Data Augmentation over Kunisch Patterns. 
## Seminario de Tesis I, Primavera 2022 
### MDS Program. University of Chile.
#### Supervisors: Prof. Benjamín Bustos, Prof. Iván Sipirán
#### Author: Matías Vergara

Performs data augmentation on the patterns by applying geometric transformations (flips, rotations by multiples of 90 degrees, and random crops).

## Imports

In [None]:
import cv2
import pandas as pd
import os
import math
import random

## Mounting Google Drive

In [None]:
# Mount Google Drive when running inside Colab; otherwise read and write
# relative to the current working directory.
try:
    from google.colab import drive
except ImportError:
    # Not running in Colab: only a missing module triggers the local fallback,
    # so a failed mount or a keyboard interrupt is no longer silently ignored.
    folder_path = './'
else:
    drive.mount('/content/drive')
    folder_path = 'drive/MyDrive/TesisMV/'

Mounted at /content/drive


## Transformations 

In [None]:
def rotate90(path):
    """Load the image at ``path`` and rotate it 90 degrees clockwise.

    Args:
        path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A ``(image, tag)`` tuple: the rotated image and the string
        ``"rot90"`` identifying the transformation.
    """
    image = cv2.imread(path)
    # The rotation flags live in the public ``cv2`` namespace; the nested
    # ``cv2.cv2`` module is an implementation detail that recent
    # opencv-python builds no longer expose.
    rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    return rotated, "rot90"


def rotate180(path):
    """Load the image at ``path`` and rotate it 180 degrees.

    Args:
        path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A ``(image, tag)`` tuple: the rotated image and the string
        ``"rot180"`` identifying the transformation.
    """
    image = cv2.imread(path)
    # The rotation flags live in the public ``cv2`` namespace; the nested
    # ``cv2.cv2`` module is an implementation detail that recent
    # opencv-python builds no longer expose.
    rotated = cv2.rotate(image, cv2.ROTATE_180)
    return rotated, "rot180"


def rotate270(path):
    """Load the image at ``path`` and rotate it 270 degrees clockwise.

    Args:
        path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A ``(image, tag)`` tuple: the rotated image and the string
        ``"rot270"`` identifying the transformation.
    """
    image = cv2.imread(path)
    # 270 degrees clockwise is 90 degrees counter-clockwise, so a single
    # rotation replaces the former 180 + 90 pair. The flag is taken from the
    # public ``cv2`` namespace rather than the private ``cv2.cv2`` module.
    rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    return rotated, "rot270"


def invertX(path):
    """Load the image at ``path`` and mirror it with ``cv2.flip`` code 1.

    Args:
        path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A ``(image, tag)`` tuple: the flipped image and the string
        ``"invX"`` identifying the transformation.
    """
    source = cv2.imread(path)
    # A positive flip code mirrors around the vertical axis (left <-> right).
    mirrored = cv2.flip(source, 1)
    return mirrored, "invX"


def invertY(path):
    """Load the image at ``path`` and mirror it with ``cv2.flip`` code 0.

    Args:
        path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A ``(image, tag)`` tuple: the flipped image and the string
        ``"invY"`` identifying the transformation.
    """
    source = cv2.imread(path)
    # Flip code 0 mirrors around the horizontal axis (top <-> bottom).
    mirrored = cv2.flip(source, 0)
    return mirrored, "invY"


def crop(path, min_width=1/2, min_height=1/2, max_width=1/1.1,
         max_height=1/1.1):
    """Load the image at ``path`` and return a random rectangular crop of it.

    The crop size is drawn uniformly between the minimum and maximum
    fractions of the original size, and its position is drawn so that the
    rectangle always lies inside the image.

    Args:
        path: Path of the image file, read with ``cv2.imread``.
        min_width: Smallest crop width, as a fraction of the image width.
        min_height: Smallest crop height, as a fraction of the image height.
        max_width: Largest crop width, as a fraction of the image width.
        max_height: Largest crop height, as a fraction of the image height.

    Returns:
        A ``(image, tag)`` tuple: the cropped image and the string ``"crop"``.

    Raises:
        OSError: If ``cv2.imread`` could not read the image.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError("Could not read image: {}".format(path))

    # Only the first two axes are used, so colour and grayscale images are
    # handled alike.
    height, width = image.shape[:2]

    min_w = math.ceil(width * min_width)
    min_h = math.ceil(height * min_height)
    # A maximum below the minimum is treated as equal to the minimum, so the
    # sampling ranges below can never be empty because of it.
    max_w = max(min_w, math.ceil(width * max_width))
    max_h = max(min_h, math.ceil(height * max_height))

    # The origin leaves room for at least the minimum size; the size is then
    # capped by both the configured maximum and the remaining space.
    x1 = random.randint(0, width - min_w)
    w = random.randint(min_w, min(max_w, width - x1))
    y1 = random.randint(0, height - min_h)
    h = random.randint(min_h, min(max_h, height - y1))

    cropped = image[y1:y1 + h, x1:x1 + w]
    return cropped, "crop"

def apply_transformations(path, transformations):
    """Apply every transformation to one pattern image and store the results.

    Args:
        path: Path of the original image, laid out as
            ``.../<pattern>/<pattern>.png``
            (e.g. ``drive/MyDrive/Tesis/patterns/originals/37a/37a.png``).
        transformations: Iterable of callables that take ``path`` and return
            an ``(image, name)`` tuple.

    Returns:
        A dict mapping each synthetic object name (``<pattern>_<name>``) to
        the label values returned by ``map_transformation``.
    """
    # The pattern id is the image's parent folder. Reading it from the end of
    # the path keeps this correct whatever the depth of the data folder.
    pattern = path.split("/")[-2]

    new_entries = {}
    crop_count = 0
    for transformation in transformations:
        result, transf_name = transformation(path)
        if transf_name == "crop":
            # Several crops may be requested for one pattern; number them so
            # that each gets a distinct synthetic name.
            transf_name += str(crop_count)
            crop_count += 1
        obj_name = pattern + "_" + transf_name
        # ``df`` is the module-level label table loaded by the notebook.
        new_entries[obj_name] = map_transformation(df, path, obj_name, result)
    return new_entries

def map_transformation(df, path, obj_name, image):
    """Save a synthetic image and return the labels of its source pattern.

    The image is written to
    ``<folder_path>patterns/sintetics/<obj_name>/<obj_name>.png``.

    Args:
        df: Label table indexed by pattern id.
        path: Path of the original image, laid out as
            ``.../<pattern>/<pattern>.png``
            (e.g. ``drive/MyDrive/Tesis/patterns/originals/37a/37a.png``).
        obj_name: Name of the synthetic object; used for both the folder and
            the file name.
        image: Image to write with ``cv2.imwrite``.

    Returns:
        The label values of the original pattern's row in ``df``.
    """
    # The pattern id is the image's parent folder. Reading it from the end of
    # the path keeps this correct whatever the depth of the data folder.
    pattern = path.split("/")[-2]

    target_dir = folder_path + "patterns/sintetics/" + obj_name
    # makedirs with exist_ok creates a missing "sintetics" parent and lets the
    # notebook be re-run without failing on folders from a previous run.
    os.makedirs(target_dir, exist_ok=True)
    cv2.imwrite(target_dir + "/" + obj_name + ".png", image)

    labels = df.loc[[pattern]]
    return labels.values[0]

# Transformation sets. Each ends with three random crops; the flips and
# rotations differ per set.
HOR_TRANSFORMATIONS = [invertX, rotate180] + [crop] * 3
VER_TRANSFORMATIONS = [invertY, rotate180] + [crop] * 3
COMMON_TRANSFORMATIONS = (
    [invertX, invertY, rotate90, rotate180, rotate270] + [crop] * 3
)

## Dataset splitting


In [None]:
# Load the label table (one row per pattern, indexed by pattern id).
df = pd.read_json(folder_path + "labels/normalized_df.json", orient='index', encoding='latin-1')
# Work on a copy of the ids: ``df.index.values`` can expose the array that
# backs the index, and shuffling it in place would reorder the index labels
# under the rows (or fail when that array is read-only).
index = df.index.to_numpy(copy=True)
colnames = df.columns

# 10% validation, 20% test (both rounded up), the remainder for training.
valNumber = math.ceil(0.1 * len(index))
testNumber = math.ceil(0.2 * len(index))
trainNumber = len(index) - valNumber - testNumber

print("Patterns for training: {}".format(trainNumber))
print("Patterns for validation: {}".format(valNumber))
print("Patterns for testing: {}".format(testNumber))

# NOTE(review): the shuffle is unseeded, so every run produces a different
# split; call random.seed(...) beforehand if reproducibility is required.
random.shuffle(index)

elem_train = index[:trainNumber]
elem_val = index[trainNumber:trainNumber+valNumber]
elem_test = index[trainNumber+valNumber:]

assert (valNumber + testNumber + trainNumber) == len(index)

Patterns for training: 542
Patterns for validation: 78
Patterns for testing: 156


## Augmentation
(Applied only to the training set.)



In [None]:
new_entries = {}

for pattern in elem_train:  # only the training set is augmented
    path = folder_path + "patterns/originals/" + pattern + "/" + pattern + ".png"
    print(path)
    labels = df.loc[[pattern]]
    is_hor = labels['horizontal'].values[0]
    is_ver = labels['vertical'].values[0]

    # Pick the transformation set from the pattern's orientation labels.
    if is_hor and not is_ver:
        transformations = HOR_TRANSFORMATIONS
    elif is_ver and not is_hor:
        transformations = VER_TRANSFORMATIONS
    elif not is_hor and not is_ver:
        transformations = COMMON_TRANSFORMATIONS
    else:
        # Patterns flagged as both horizontal and vertical are not augmented.
        continue

    sintetics = apply_transformations(path, transformations)
    # Update in place instead of rebuilding the whole dict on every pattern.
    new_entries.update(sintetics)

# Add every original training entry to new_entries and write the
# "augmented_train_df.json" dataset (synthetic entries plus originals).
for p in elem_train:
    new_entries[p] = df.loc[p].values

df_train = pd.DataFrame.from_dict(new_entries, columns=colnames, orient='index')
df_train.to_json(folder_path + "labels/" + "augmented_train_df.json", orient='index')

# Collect the validation entries and write the "val_df.json" dataset.
val_entries = {p: df.loc[p].values for p in elem_val}

df_val = pd.DataFrame.from_dict(val_entries, columns=colnames, orient='index')
df_val.to_json(folder_path + "labels/" + "val_df.json", orient='index')

# Collect the test entries and write the "test_df.json" dataset.
test_entries = {p: df.loc[p].values for p in elem_test}

df_test = pd.DataFrame.from_dict(test_entries, columns=colnames, orient='index')
df_test.to_json(folder_path + "labels/" + "test_df.json", orient='index')

drive/MyDrive/Tesis/patterns/originals/55b/55b.png
drive/MyDrive/Tesis/patterns/originals/3c/3c.png
drive/MyDrive/Tesis/patterns/originals/76c/76c.png
drive/MyDrive/Tesis/patterns/originals/57f/57f.png
drive/MyDrive/Tesis/patterns/originals/10d/10d.png
drive/MyDrive/Tesis/patterns/originals/93e/93e.png
drive/MyDrive/Tesis/patterns/originals/66c/66c.png
drive/MyDrive/Tesis/patterns/originals/67e/67e.png
drive/MyDrive/Tesis/patterns/originals/45e/45e.png
drive/MyDrive/Tesis/patterns/originals/43h/43h.png
drive/MyDrive/Tesis/patterns/originals/2c/2c.png
drive/MyDrive/Tesis/patterns/originals/12f/12f.png
drive/MyDrive/Tesis/patterns/originals/93j/93j.png
drive/MyDrive/Tesis/patterns/originals/94b/94b.png
drive/MyDrive/Tesis/patterns/originals/94a/94a.png
drive/MyDrive/Tesis/patterns/originals/11d/11d.png
drive/MyDrive/Tesis/patterns/originals/19e/19e.png
drive/MyDrive/Tesis/patterns/originals/53d/53d.png
drive/MyDrive/Tesis/patterns/originals/8d/8d.png
drive/MyDrive/Tesis/patterns/original