# Aprendizaje Multietiqueta de Patrones Geométricos en Objetos de Herencia Cultural
# Split and data augmentation
## Seminario de Tesis II, Primavera 2022
### Master of Data Science. Universidad de Chile.
#### Prof. guía: Benjamín Bustos - Prof. coguía: Iván Sipirán
#### Autor: Matías Vergara

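Splits the dataset into training, validation and test sets, and performs data augmentation on the training patterns by applying image transformations (reflections, rotations, crops, blur, etc.).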

## Imports

In [12]:
import cv2
import pandas as pd
from IPython.display import display
import os
import math
import random
import shutil
import imgaug.augmenters as aug
import numpy as np

## Mounting Google Drive

In [13]:
# Mounting google Drive
try:
    from google.colab import drive
    drive.mount('/content/drive')
    root_dir = 'drive/MyDrive/TesisMV/'
except Exception:
    # Not running on Colab (or the mount failed): fall back to the local
    # repository layout. `Exception` is caught instead of using a bare
    # `except` so KeyboardInterrupt / SystemExit are not swallowed.
    root_dir = '../'

## Dataset and model selection

It is enough to select the dataset flags and the number of crops (which only has an effect if the corresponding flag is selected) and then run the rest of the cells. Two folders will be created, one for labels and one for patterns. Both are named after the selected flags, separated by "_".

In [14]:
# When True, the 'subchapter' column is used as class label instead of
# 'chapter', and outputs are written under a 'subchapters/' prefix.
SUBCHAPTERS = False

# Enabled augmentation flags. Valid entries and the transformations each
# one enables:
#   'ref'     -> invertX, invertY
#   'rot'     -> rotate90, rotate180, rotate270
#   'crop'    -> crop (repeated CROP_TIMES)
#   'blur'    -> blur
#   'emboss'  -> emboss
#   'randaug' -> randaug
#   'rain'    -> rain
#   'elastic' -> elastic
DS_FLAGS = []

# Repetition counts of the randomised transformations, keyed by flag name.
CROP_TIMES = 1
RANDOM_TIMES = 1
ELASTIC_TIMES = 1
MAP_TIMES = dict(
    crop=CROP_TIMES,
    randaug=RANDOM_TIMES,
    elastic=ELASTIC_TIMES,
)

## Transformations

In [15]:
# Sort the flags so the same selection always maps to the same folder name.
DS_FLAGS = sorted(DS_FLAGS)
MULTIPLE_TRANSF = ['crop', 'randaug', 'elastic']

# Flags of repeatable transformations are moved to the end of the name and
# suffixed with their repetition count (e.g. 'crop' -> 'crop2').
COPY_FLAGS = DS_FLAGS.copy()
for t in MULTIPLE_TRANSF:
    if t in DS_FLAGS:
        COPY_FLAGS.remove(t)
        COPY_FLAGS.append(t + str(MAP_TIMES[t]))

# Build the folder name once, AFTER the suffixes are known, so that the
# 'subchapters/' prefix is kept when a repeatable flag is present
# (previously the name was rebuilt inside the loop and lost the prefix).
data_flags = '_'.join(COPY_FLAGS) if COPY_FLAGS else 'base'
if SUBCHAPTERS:
    data_flags = 'subchapters/' + data_flags


In [16]:
def rotate90(path):
    """Load the image at `path` and rotate it 90 degrees clockwise.

    Returns:
        tuple: (rotated image, "rot90" name tag).
    """
    image = cv2.imread(path)
    # The constant is taken from the top-level `cv2` namespace: the nested
    # `cv2.cv2` module is not available in recent opencv-python releases.
    rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    return rotated, "rot90"


def rotate180(path):
    """Load the image at `path` and rotate it 180 degrees.

    Returns:
        tuple: (rotated image, "rot180" name tag).
    """
    image = cv2.imread(path)
    # The constant is taken from the top-level `cv2` namespace: the nested
    # `cv2.cv2` module is not available in recent opencv-python releases.
    rotated = cv2.rotate(image, cv2.ROTATE_180)
    return rotated, "rot180"


def rotate270(path):
    """Load the image at `path` and rotate it 270 degrees clockwise.

    Returns:
        tuple: (rotated image, "rot270" name tag).
    """
    image = cv2.imread(path)
    # 270 degrees clockwise == 90 degrees counter-clockwise, so a single
    # rotation replaces the former 180 + 90 sequence. The constant is taken
    # from the top-level `cv2` namespace because the nested `cv2.cv2`
    # module is not available in recent opencv-python releases.
    rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    return rotated, "rot270"


def invertX(path):
    """Read the image at `path` and mirror it with OpenCV flip code 1.

    Returns:
        tuple: (flipped image, "invX" name tag).
    """
    flip_code = 1  # positive code: flip around the vertical axis
    source = cv2.imread(path)
    return cv2.flip(source, flip_code), "invX"


def invertY(path):
    """Read the image at `path` and mirror it with OpenCV flip code 0.

    Returns:
        tuple: (flipped image, "invY" name tag).
    """
    flip_code = 0  # zero code: flip around the horizontal axis
    source = cv2.imread(path)
    return cv2.flip(source, flip_code), "invY"


def crop(path, min_width = 1/2, min_height= 1/2, max_width = 1/1.1,
         max_height = 1/1.1):
    """Load the image at `path` and cut a random rectangular window out of it.

    Args:
        path: path of the image to read.
        min_width, min_height: minimum window size, as a fraction of the
            image width / height.
        max_width, max_height: maximum window size, as a fraction of the
            image width / height.

    Returns:
        tuple: (cropped image, "crop" name tag).
    """
    image = cv2.imread(path)
    # Only the first two axes are read, so both colour and grayscale
    # arrays are handled.
    height, width = image.shape[0], image.shape[1]
    min_w = math.ceil(width * min_width)
    min_h = math.ceil(height * min_height)
    max_w = math.ceil(width * max_width)
    max_h = math.ceil(height * max_height)

    x1 = random.randint(0, width - min_w)
    # The window may not exceed the configured maximum nor run past the
    # image border; `max(min_w, ...)` keeps the range valid if max < min.
    w = random.randint(min_w, max(min_w, min(max_w, width - x1)))
    y1 = random.randint(0, height - min_h)
    h = random.randint(min_h, max(min_h, min(max_h, height - y1)))

    cropped = image[y1:y1+h, x1:x1+w]
    return cropped, "crop"

def blur(path):
    """Read the image at `path` and apply imgaug's AverageBlur with k=(4, 11).

    Returns:
        tuple: (augmented image, "blur" name tag).
    """
    source = cv2.imread(path)
    augmenter = aug.AverageBlur(k=(4, 11))
    return augmenter(image=source), "blur"

def emboss(path):
    """Read the image at `path` and apply imgaug's Emboss augmenter.

    Returns:
        tuple: (augmented image, "embs" name tag).
    """
    source = cv2.imread(path)
    augmenter = aug.Emboss(alpha=(0.0, 1.0), strength=(0.5, 1.5))
    return augmenter(image=source), "embs"

def elastic(path):
    """Read the image at `path` and apply imgaug's PiecewiseAffine distortion.

    Returns:
        tuple: (augmented image, "elastic" name tag).
    """
    source = cv2.imread(path)
    augmenter = aug.PiecewiseAffine(scale=(0.03, 0.075))
    return augmenter(image=source), "elastic"

def randaug(path):
    """Read the image at `path` and apply imgaug's RandAugment with m=(2, 9).

    Returns:
        tuple: (augmented image, "randaug" name tag).
    """
    source = cv2.imread(path)
    augmenter = aug.RandAugment(m=(2, 9))
    return augmenter(image=source), "randaug"

def snow(path):
    """Read the image at `path` and apply imgaug's Snowflakes augmenter.

    Returns:
        tuple: (augmented image, "snow" name tag).
    """
    source = cv2.imread(path)
    augmenter = aug.Snowflakes(flake_size=(0.6, 0.5), speed=(0.2, 0.5))
    return augmenter(image=source), "snow"


def rain(path):
    """Read the image at `path` and apply imgaug's Rain augmenter.

    Returns:
        tuple: (augmented image, "rain" name tag).
    """
    source = cv2.imread(path)
    augmenter = aug.Rain(speed=(0.1, 0.5))
    return augmenter(image=source), "rain"


def apply_transformations(pin, pout, transformations):
    """Apply every transformation to the image at `pin` and save the results.

    Args:
        pin: path of the source image,
            e.g. ``../patterns/originals/84e/84e.png``.
        pout: output directory; it is created if it does not exist.
        transformations: callables that take an image path and return an
            ``(image, tag)`` tuple.

    Returns:
        list of str: names (without extension) of the written images, each
        of the form ``<source file stem>_<tag>``. Tags listed in
        MULTIPLE_TRANSF get a running counter appended so that repeated
        applications do not overwrite each other.
    """
    # Take the pattern name from the file name instead of a fixed position
    # in the split path: the fixed index only worked when root_dir was
    # exactly one component deep (it returned 'patterns' under Colab).
    pattern_name = os.path.splitext(os.path.basename(pin))[0]
    os.makedirs(pout, exist_ok=True)

    new_names = []
    i = 0
    for transformation in transformations:
        result, transf_name = transformation(pin)
        if transf_name in MULTIPLE_TRANSF:  # repeatable transformations
            transf_name += str(i)
            i += 1
        obj_name = pattern_name + "_" + transf_name
        # os.path.join inserts the separator; plain concatenation wrote the
        # file next to `pout` with the directory name glued to the file name.
        cv2.imwrite(os.path.join(pout, obj_name + ".png"), result)
        new_names.append(obj_name)
    return new_names

# Select data augmentation functions based on data flags

# Flag name -> transformation functions enabled by that flag.
MAP_FLAGS = {'ref': [invertX, invertY],
             'rot': [rotate90, rotate180, rotate270],
             'crop': [crop],
             'blur': [blur],
             'emboss': [emboss],
             'randaug': [randaug],
             'rain': [rain],
             'elastic': [elastic]
             # snow is not working properly
             }

# Every transformation enabled by the selected flags.
ALLOWED_TRANSFORMATIONS = []
for f in DS_FLAGS:
    ALLOWED_TRANSFORMATIONS += MAP_FLAGS[f]

# Candidate transformations per pattern orientation: HOR_* is used for
# patterns labelled horizontal only, VER_* for vertical only, COMMON_* for
# the remaining patterns.
HOR_TRANSFORMATIONS = [invertX, rotate180, blur, rain, emboss]
VER_TRANSFORMATIONS = [invertY, rotate180, blur, rain, emboss]
COMMON_TRANSFORMATIONS = [invertX, invertY, rotate90, rotate180, rotate270,
                          blur, rain, emboss]

# Repeatable transformations are added once per requested repetition. Each
# flag uses its OWN count from MAP_TIMES (previously RANDOM_TIMES was used
# for all of them, so CROP_TIMES and ELASTIC_TIMES had no effect).
for t in MULTIPLE_TRANSF:
    if t in DS_FLAGS:
        HOR_TRANSFORMATIONS += MAP_FLAGS[t] * MAP_TIMES[t]
        VER_TRANSFORMATIONS += MAP_FLAGS[t] * MAP_TIMES[t]
        COMMON_TRANSFORMATIONS += MAP_FLAGS[t] * MAP_TIMES[t]


def mergeTransformations(flags, map_flags, map_times, trans_list):
    """Remove from `trans_list` every transformation whose flag is disabled.

    The list is filtered in place and also printed.

    Args:
        flags: names of the enabled flags.
        map_flags: dict mapping a flag name to its list of transformations.
        map_times: unused; kept so existing calls keep working.
        trans_list: list of transformations to filter (mutated in place).

    Returns:
        The same `trans_list` object, after filtering.
    """
    # Collect everything that belongs to a flag that was not selected ...
    disabled = [el for k, v in map_flags.items() if k not in flags for el in v]
    # ... and drop all of its occurrences in a single pass. Slice
    # assignment keeps the caller's list object, so the in-place contract
    # of the original remove() loop is preserved.
    trans_list[:] = [el for el in trans_list if el not in disabled]
    print(trans_list)
    return trans_list

# Drop from each candidate list the transformations whose flag is not in
# DS_FLAGS. The lists are filtered in place, so the return values are not
# stored.
mergeTransformations(DS_FLAGS, MAP_FLAGS, MAP_TIMES, HOR_TRANSFORMATIONS)
mergeTransformations(DS_FLAGS, MAP_FLAGS, MAP_TIMES, VER_TRANSFORMATIONS)
mergeTransformations(DS_FLAGS, MAP_FLAGS, MAP_TIMES, COMMON_TRANSFORMATIONS)

[]
[]
[]


[]

In [17]:
# Load the label table and the class table, then either reuse a previously
# saved train/val/test split or create (and save) a new random one.
labels_dir = os.path.join(root_dir, "labels")
df = pd.read_json(os.path.join(labels_dir, "normalized_df.json"), orient='index')
classes = pd.read_csv(os.path.join(labels_dir, "class_labels.csv"), index_col=0)

found_train_elems = os.path.isfile(os.path.join(labels_dir, "elem_train.npy"))
found_val_elems = os.path.isfile(os.path.join(labels_dir, "elem_val.npy"))
found_test_elems = os.path.isfile(os.path.join(labels_dir, "elem_test.npy"))

colnames = df.columns

if not ((found_train_elems and found_val_elems) and found_test_elems):
    print("ATENCIÓN: No se encontraron indices para cada conjunto. \n Creando desde 0.")

    display(classes)

    # `index` has to exist before the subset sizes are computed; it used to
    # be assigned only after the shuffle, which raised NameError on a
    # fresh run.
    index = df.index.values
    valNumber = math.ceil(0.1 * len(index))
    testNumber = math.ceil(0.2 * len(index))
    trainNumber = len(index) - valNumber - testNumber

    print("Patterns for training: {}".format(trainNumber))
    print("Patterns for validation: {}".format(valNumber))
    print("Patterns for testing: {}".format(testNumber))

    # Shown before and after the shuffle to check that the index <-> label
    # correspondence is not altered by sample().
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.precision', 3,
                           ):
        display(df.loc[['3i']])

    df = df.sample(frac=1)
    index = df.index.values

    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.precision', 3,
                           ):
        display(df.loc[['3i']])

    elem_train = index[:trainNumber]
    elem_val = index[trainNumber:trainNumber+valNumber]
    elem_test = index[trainNumber+valNumber:]

    # Check the actual subsets (the former check compared the sizes with
    # their own definition and could never fail).
    assert len(elem_train) + len(elem_val) + len(elem_test) == len(index)

    with open(os.path.join(labels_dir, 'elem_train.npy'), 'wb') as f:
        np.save(f, elem_train)

    with open(os.path.join(labels_dir, 'elem_val.npy'), 'wb') as f:
        np.save(f, elem_val)

    with open(os.path.join(labels_dir, 'elem_test.npy'), 'wb') as f:
        np.save(f, elem_test)

else:  # the split files already existed
    print("Cargando indices previamente generados")
    # NOTE(review): allow_pickle=True unpickles the file contents; only
    # load split files that were produced by this notebook.
    with open(os.path.join(labels_dir, 'elem_train.npy'), 'rb') as f:
        elem_train = np.load(f, allow_pickle = True)

    with open(os.path.join(labels_dir, 'elem_val.npy'), 'rb') as f:
        elem_val = np.load(f, allow_pickle = True)

    with open(os.path.join(labels_dir, 'elem_test.npy'), 'rb') as f:
        elem_test = np.load(f, allow_pickle = True)

print(elem_train)
print(elem_val)
print(elem_test)

Cargando indices previamente generados
['53d' '86e' '65d' '37f' '26a' '18g' '21i' '87e' '19c' '67a' '70a' '77b'
 '40d' '58a' '5e' '14d' '63c' '92e' '81b' '37d' '11h' '16i' '77h' '11j'
 '35b' '21f' '13a' '16j' '93a' '48b' '77i' '39a' '6c' '96i' '50a' '55d'
 '32e' '71l' '2j' '13e' '89h' '36m' '46g' '1f' '29b' '14e' '69g' '61e'
 '48e' '59c' '51c' '69h' '78d' '8c' '66e' '14a' '46f' '79c' '23e' '95j'
 '16e' '45e' '66d' '3a' '48a' '62d' '27c' '29c' '14h' '20b' '5a' '35a'
 '36e' '70g' '84f' '32h' '26d' '72d' '19a' '1k' '69a' '65b' '18c' '58i'
 '36h' '73b' '46e' '27f' '53i' '1a' '52c' '49d' '41a' '38h' '39j' '40b'
 '67d' '47l' '46k' '1g' '58e' '80f' '21j' '16a' '4d' '91c' '43i' '36b'
 '6a' '13d' '19b' '67i' '85g' '83e' '56c' '11g' '42e' '6e' '64c' '58b'
 '50c' '73c' '21g' '88d' '52d' '83g' '67m' '87c' '17a' '67j' '96d' '33f'
 '14f' '66a' '89f' '76f' '79a' '57i' '17c' '78f' '88f' '43c' '10e' '78e'
 '90e' '76c' '88g' '79b' '83d' '5d' '15a' '24a' '40f' '18b' '3d' '39g'
 '6d' '31c' '71m' '73d' '94

## Augmentation
(Only over training set)

In [18]:
def _class_label(pattern):
    """Return the class folder name of `pattern` (chapter or subchapter)."""
    column = 'subchapter' if SUBCHAPTERS else 'chapter'
    return classes.loc[[pattern]][column].values[0]


def _original_path(pattern):
    """Return the path of the original image of `pattern`."""
    return os.path.join(root_dir, "patterns", "originals", pattern, pattern + ".png")


def _subset_dir(pattern, subset):
    """Return the output folder of `pattern` inside `subset` (train/val/test)."""
    return os.path.join(root_dir, "patterns", data_flags, subset, _class_label(pattern))


# --- Training set: augmented images plus the original pattern -----------
new_entries = {}

for pattern in elem_train:
    row = df.loc[[pattern]]
    path_in = _original_path(pattern)
    path_out = _subset_dir(pattern, "train")

    is_hor = row['horizontal'].values[0]
    is_ver = row['vertical'].values[0]
    # Horizontal-only and vertical-only patterns get their own lists; any
    # other combination (both or neither) uses the common list.
    if is_hor and not is_ver:
        chosen = HOR_TRANSFORMATIONS
    elif is_ver and not is_hor:
        chosen = VER_TRANSFORMATIONS
    else:
        chosen = COMMON_TRANSFORMATIONS

    new_names = apply_transformations(path_in, path_out, chosen)
    # Every augmented image inherits the label row of its source pattern.
    row_values = row.values[0]
    for name in new_names:
        new_entries[name] = row_values

    # add the base pattern to the folder
    os.makedirs(path_out, exist_ok = True)
    shutil.copy(path_in, path_out)

# --- Validation and test sets: originals only ---------------------------
for subset, members in (("val", elem_val), ("test", elem_test)):
    for pattern in members:
        target_dir = _subset_dir(pattern, subset)
        os.makedirs(target_dir, exist_ok = True)
        shutil.copy(_original_path(pattern), target_dir)

# --- Label files --------------------------------------------------------
# Add the original training patterns after the augmented entries and write
# the dataset "augmented_train_df.json".
new_entries.update({p: df.loc[p].values for p in elem_train})

labels_output = os.path.join(labels_dir, data_flags)
os.makedirs(labels_output, exist_ok = True)

df_train = pd.DataFrame.from_dict(new_entries, columns=colnames, orient='index')
df_train.to_json(os.path.join(labels_output, "augmented_train_df.json"), orient='index')

# Label rows of the validation patterns -> "val_df.json"
val_entries = {p: df.loc[p].values for p in elem_val}
df_val = pd.DataFrame.from_dict(val_entries, columns=colnames, orient='index')
df_val.to_json(os.path.join(labels_output, "val_df.json"), orient='index')

# Label rows of the test patterns -> "test_df.json"
test_entries = {p: df.loc[p].values for p in elem_test}
df_test = pd.DataFrame.from_dict(test_entries, columns=colnames, orient='index')
df_test.to_json(os.path.join(labels_output, "test_df.json"), orient='index')

In [19]:
# Sanity check: reload the exported test labels and show the row of
# pattern "39d" with every column visible.
prueba = pd.read_json(os.path.join(labels_output, "test_df.json"), orient='index')
display_options = ('display.max_rows', None,
                   'display.max_columns', None,
                   'display.precision', 3)
with pd.option_context(*display_options):
    display(prueba.loc[["39d"]])

Unnamed: 0,pendent,teardrop,horizontal,panel,group,vertical,bar,floating,enclosing,shorter,line,alternately,framed,pannel,filling,ornament,alternating,simple,andrew's,cross,chevron,inserted,crossing,composition,comb-like,pattern,free,design,metopal,diagonally,right,left,inclined,stack,oblique,half-lines,opposed,diagonal,hatched,triangle,interlocked,apex,large,small,stacked,field,latticing,separated,solid,cross-hatching,obliquely,disposed,wavy,band,double,triple,multiple,steep,wave,half,moon,single,hook,lightning,pair,parallel,rectangle,innermost,concentric,set,coffer,square,george's,dotted,quarter,cross-bar,cross-hatched,checkerboard,centre,cruciform,latticed,double-decker,metope,wolftooth,eight-pointed,star,hatching,simplified,hour-glass,neck,spandrel,differently,checkerboarded,saltire,shoulder,dot,doubled,horizontally,t-shaped,quatrefoil,central,circle,outline,stalk,reserved,background,quartered,lozenge,strong,surrounded,half-moon,double-halfmoon,hexafoil,octofoil,octofail,battlement,zigzag-filled,checked,interspaces,link,high,slim,rectilinear,attached,arrangement,frgt.,gear-pattern,formed,turning,meander,angle,chain,separate,reversed,inverted,enclosed,intertwined,quadruple,step,additional,downward,turn,detached,wind,swastika,connected,t-meander,beam,remaining,space,meander-like,z-shaped,continuous,repetition,standing,alternatively,meander-type,trident,sape,two-sided,'comb',type,mã©andre,hachurã©,panneau,tree,outlined,acute-angled,hooked,arm,two-tiered,net,careless,row,pointing,outwards,spaced,slack,leaf,outside,three-tiered,metopes,four-tiered,triple-outline,quatered,side,cross-hathed,tangential,blob,linked,check,tapestry,net.,massed,like,underlying,schematized,lozenge-cross,upwards,arrow,overlapping,shape,intersecting,version,previous,upright,form,zigzag,border,form-square,schoulder,filled,intertwinded,linear,rhodian,root,stock,dogtooth,downwards,accompanied,antithetic,ray,opposite,direction,hour-glasses,ax,axe,superimposed,windmill,dissimilar,white,dark,massive
,spacious,obtuse,metope-triglyph-frieze,e,cut,fringe,ear,corn,v-chevrons,fishbone,facing,three-limbed,sigma,turned,four-limbed,m-chevrons,limbed,scribble,degeneration,column,variously,unframed,tadpole,dash,rectangular,rosette,flower,spiked,eight-armed,floor,stalked,sixteen-pointed,point,lightning-wheel,semicircle,three-quarter,disposition,confronted,quarter-circle,sound-waves,scale,losenges,eleven-pointed,forming,lined,branch,dif.,triple-line,maltese,three-winged,fan,wing,midrib,connecting,arc,near,rim,v-shaped,four-leaved,twelve-leaved,seven-leaved,four-spoked,wheel,circular,sunburst,flanking,tangent,tagential,tangets,elongated,crossed,concentrique,cable,doted,spiral,inner,looped,running,half-circles,pothook,tongue,plant,figure,volute,palm-tree,fish,serpent,light,body,bird,striped,worm,angular,raised,head,bird-seed,long
39d,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
