<a href="https://colab.research.google.com/github/mapo-lp/mapo-lp.github.io/blob/master/AUMENTO_Y_TFRECORDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Aumento de imágenes y generación de TFRecords

##### Input: imágenes y archivos .xml desde GitHub
##### Output: tfrecords y label_map.pbtxt a GitHub

####Params

In [None]:
# Number of augmentation passes run over each image set (train and test).
generate_images_count =  100#@param {type:"integer"}

# Debug aid: when True, a later cell plots augmented training images with their bounding boxes.
show_generated_images = True #@param {type:"boolean"}
# How many distinct source images get their augmented versions plotted.
images_to_show =  1#@param {type:"integer"}

# Repository holding the images and their .xml annotations; it is cloned in the next cell.
images_repo_url = 'https://github.com/mapo-lp/test' #@param {type:"string"}

import os
# Local checkout directory: the last URL component, resolved against the current working directory.
# NOTE(review): the clone cell runs from /content, so this assumes the notebook's
# working directory is /content when this cell executes — confirm.
repo_dir_path = os.path.abspath(os.path.join('.', os.path.basename(images_repo_url)))
path_annotations = repo_dir_path+'/annotations' 
path_images_train = repo_dir_path+'/images/train'
path_images_test = repo_dir_path+'/images/test' 

# Credentials used by the final cell to push the generated files.
# NOTE(review): the password is embedded in the remote URL below, so it will presumably
# be stored in the repo's git config and may appear in cell output — consider a
# personal access token read with getpass instead.
github_pass = '' #@param {type:"string"}
# NOTE(review): the push target is hard-coded to mapo-lp/wally.git with user mapo-lp,
# while the images are cloned from `images_repo_url` (default mapo-lp/test) — confirm
# that pushing to a different repository is intended.
repo = 'https://mapo-lp:'+github_pass+'@github.com/mapo-lp/wally.git'
# Identity written to the global git config before committing.
github_mail = '' #@param {type:"string"}
github_user = '' #@param {type:"string"}

#### Clonamos el repo con imágenes

In [None]:
# Clone the images repository into /content; the directory it creates is the one
# `repo_dir_path` is expected to point at.
# NOTE(review): `git clone` fails if the directory already exists, so re-running
# this cell in the same session will report an error.
%cd /content
!git clone {images_repo_url}

####Instalamos libs

In [None]:
# Fetch the TensorFlow models repository and switch Colab to TensorFlow 1.x.
%cd /content
!git clone --quiet https://github.com/tensorflow/models.git
%tensorflow_version 1.x
# Compile the object_detection .proto definitions into Python modules in place.
%cd /content/models/research
!protoc object_detection/protos/*.proto --python_out=.

import os

# Expose the object-detection API (and slim) through PYTHONPATH so that the
# `!python ...` subprocesses started later can import it.
# - `.get(..., '')` avoids a KeyError when PYTHONPATH is not set at all.
# - Entries are only added when missing, so re-running the cell does not keep
#   growing the variable.
_research_paths = ['/content/models/research/', '/content/models/research/slim/']
_pythonpath_entries = [entry for entry in os.environ.get('PYTHONPATH', '').split(':') if entry]
_pythonpath_entries += [entry for entry in _research_paths if entry not in _pythonpath_entries]
os.environ['PYTHONPATH'] = ':'.join(_pythonpath_entries)

from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from imgaug import augmenters as iaa 
import imageio
import pandas as pd
import numpy as np
import re
import os
import glob
import xml.etree.ElementTree as ET
import shutil

####Funciones

In [None]:
# Function that will extract column data for our CSV file as pandas DataFrame
def xml_to_csv(path):
    """Collect the bounding-box annotations of every ``*.xml`` file in ``path``.

    Each ``<object>`` element found in an annotation file becomes one row.
    Elements are looked up by tag name (``filename``, ``size/width``,
    ``size/height``, ``object/name``, ``object/bndbox/{xmin,ymin,xmax,ymax}``)
    rather than by child position, so files whose children are ordered
    differently, or that omit optional tags, are still read correctly.

    Args:
        path: Directory containing the ``.xml`` annotation files.

    Returns:
        pandas.DataFrame with columns ``filename, width, height, class, xmin,
        ymin, xmax, ymax``; empty (but with those columns) when no annotation
        is found.

    Raises:
        ValueError: if a required element is missing or empty in a file.
    """
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']

    def required_text(node, tag, xml_file):
        # Fail with the offending file and tag instead of an opaque lookup error.
        text = node.findtext(tag) if node is not None else None
        if text is None or not text.strip():
            raise ValueError("missing <%s> in annotation file %s" % (tag, xml_file))
        return text.strip()

    def to_int(text):
        # Some labelling tools write coordinates as floats ("12.0").
        return int(float(text))

    xml_list = []
    # glob order depends on the filesystem; sort so the row order is reproducible
    for xml_file in sorted(glob.glob(path + '/*.xml')):
        root = ET.parse(xml_file).getroot()
        objects = root.findall('object')
        if not objects:
            # the original also produced no rows for files without objects
            continue
        filename = required_text(root, 'filename', xml_file)
        size = root.find('size')
        width = to_int(required_text(size, 'width', xml_file))
        height = to_int(required_text(size, 'height', xml_file))
        for member in objects:
            bndbox = member.find('bndbox')
            xml_list.append((
                filename,
                width,
                height,
                required_text(member, 'name', xml_file),
                to_int(required_text(bndbox, 'xmin', xml_file)),
                to_int(required_text(bndbox, 'ymin', xml_file)),
                to_int(required_text(bndbox, 'xmax', xml_file)),
                to_int(required_text(bndbox, 'ymax', xml_file)),
            ))
    return pd.DataFrame(xml_list, columns=column_name)
   
# Start from an empty annotations directory: remove whatever a previous run
# (or the cloned repo) left there, then create it again.
annotations_dir_present = os.path.exists(path_annotations)
if annotations_dir_present:
    shutil.rmtree(path_annotations)
os.mkdir(path_annotations)

In [None]:
# function to convert BoundingBoxesOnImage object into DataFrame
def bbs_obj_to_df(bbs_object):
    """Return the boxes of ``bbs_object`` as a DataFrame of corner coordinates.

    ``bbs_object`` must provide ``to_xyxy_array()``; each row of that array
    becomes one row with the columns ``xmin, ymin, xmax, ymax``.
    """
    corner_columns = ['xmin', 'ymin', 'xmax', 'ymax']
    return pd.DataFrame(bbs_object.to_xyxy_array(), columns=corner_columns)

In [None]:
def image_aug(df, images_path, aug_images_path, image_prefix, augmentor):
    """Augment every image listed in ``df`` once and return the new annotations.

    For each distinct ``filename`` in ``df`` the image is read from
    ``images_path``, passed through ``augmentor`` together with its bounding
    boxes, and written to ``aug_images_path`` under ``image_prefix + filename``.
    Boxes that end up fully outside the augmented image are dropped, boxes that
    are partially outside are clipped. Images left without any box are neither
    written nor listed in the result.

    Args:
        df: DataFrame with columns ``filename, width, height, class, xmin,
            ymin, xmax, ymax`` (one row per bounding box).
        images_path: Directory containing the source images.
        aug_images_path: Directory that receives the augmented images.
        image_prefix: Prefix prepended to each file name for the augmented copy.
        augmentor: Callable accepting ``image=`` and ``bounding_boxes=`` and
            returning ``(augmented_image, augmented_bounding_boxes)``.

    Returns:
        DataFrame with the same columns as ``df`` describing the augmented
        images, one row per surviving bounding box, indexed ``0..n-1``.
    """
    coord_columns = ['xmin', 'ymin', 'xmax', 'ymax']
    augmented_frames = []

    # sort=False keeps the order of first appearance, like df['filename'].unique()
    for filename, group_df in df.groupby('filename', sort=False):
        group_df = group_df.reset_index(drop=True)
        image = imageio.imread(os.path.join(images_path, filename))

        bbs = BoundingBoxesOnImage.from_xyxy_array(group_df[coord_columns].values, shape=image.shape)
        # Tag every box with the row it came from. Boxes can be dropped below, and
        # without the tag the survivors could not be matched to their class row.
        for row_position, box in enumerate(bbs.bounding_boxes):
            box.label = row_position

        # the local is not called `image_aug` so it does not shadow this function
        augmented_image, bbs_aug = augmentor(image=image, bounding_boxes=bbs)
        # drop boxes that fell out of the image, clip the partially visible ones
        bbs_aug = bbs_aug.remove_out_of_image().clip_out_of_image()

        # nothing left to annotate: skip the image entirely
        if len(bbs_aug.bounding_boxes) == 0:
            continue

        imageio.imwrite(os.path.join(aug_images_path, image_prefix + filename), augmented_image)

        # keep only the annotation rows whose box survived, in box order
        surviving_rows = [box.label for box in bbs_aug.bounding_boxes]
        info_df = (group_df
                   .drop(coord_columns, axis=1)
                   .iloc[surviving_rows]
                   .reset_index(drop=True))
        # the augmented image may have a different size than the original
        info_df['width'] = augmented_image.shape[1]
        info_df['height'] = augmented_image.shape[0]
        info_df['filename'] = image_prefix + info_df['filename']

        # both frames are indexed 0..k-1 in the same box order, so this is a row-wise join
        augmented_frames.append(pd.concat([info_df, bbs_obj_to_df(bbs_aug)], axis=1))

    if not augmented_frames:
        return pd.DataFrame(columns=['filename', 'width', 'height', 'class',
                                     'xmin', 'ymin', 'xmax', 'ymax'])
    # one concat at the end instead of growing a DataFrame inside the loop
    return pd.concat(augmented_frames, ignore_index=True)

In [None]:
# Parse the original annotations of both splits and keep a temporary CSV copy of
# each one; the temporary files are deleted again once the tfrecords exist.
train_labels_df = xml_to_csv(path_images_train)
train_labels_df.to_csv(path_or_buf=path_annotations+'/temp_train_labels.csv', index=None)

test_labels_df = xml_to_csv(path_images_test)
test_labels_df.to_csv(path_or_buf=path_annotations+'/temp_test_labels.csv', index=None)

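Las transformaciones usadas abajo son las de la librería imgaug (importada como `iaa`); las opciones disponibles están listadas en https://imgaug.readthedocs.io/en/latest/source/overview_of_augmenters.html. Como referencia adicional, la librería albumentations ofrece transformaciones similares: https://albumentations.readthedocs.io/en/latest/api/index.html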

In [None]:
# Augmentation pipeline: every call applies 4 of the transformations listed below.
aug = iaa.SomeOf(
    n=4,
    children=[
        # geometric transformations
        iaa.Affine(scale=(0.2, 1.5)),
        iaa.Affine(rotate=(-120, 60)),
        iaa.Affine(translate_percent={"x": (-0.8, 0.3), "y": (-0.3, 0.3)}),
        iaa.Fliplr(1),
        # brightness / contrast changes
        iaa.Add((-10, 10), per_channel=0.5),
        iaa.Multiply((0.75, 1.25), per_channel=0.5),
        iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5),
        iaa.Crop(px=(0, 20)),
        # blur, noise and colour shifts
        iaa.GaussianBlur((0, 2.5)),
        iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.01 * 255), per_channel=0.5),
        iaa.AddToHueAndSaturation((-5, 8)),
        # local and perspective distortions
        iaa.PiecewiseAffine(scale=(0.01, 0.03)),
        iaa.PerspectiveTransform(scale=(0.01, 0.2)),
    ],
)

In [None]:
# Give each split a fresh, empty `aug_images` working directory (train first, then test).
for split_images_dir in (path_images_train, path_images_test):
    split_aug_dir = split_images_dir+'/aug_images'
    if os.path.exists(split_aug_dir):
        shutil.rmtree(split_aug_dir)
    os.mkdir(split_aug_dir)

#### Aumentamos imágenes

In [None]:
# Run `generate_images_count` augmentation passes over the training images.
# Pass i writes its files with the prefix 'aug<i>_'. The per-pass frames are
# collected first and concatenated once, which avoids re-copying a growing
# DataFrame on every iteration.
train_aug_frames = [
    image_aug(train_labels_df, path_images_train+'/', path_images_train+'/aug_images/', 'aug'+str(i)+'_', aug)
    for i in range(int(generate_images_count))
]
if train_aug_frames:
    # ignore_index gives the combined frame unique row labels
    augmented_images_train_df = pd.concat(train_aug_frames, ignore_index=True)
else:
    # zero passes requested: keep an empty frame with the expected columns
    augmented_images_train_df = pd.DataFrame(columns=['filename','width','height','class','xmin','ymin','xmax','ymax'])

In [None]:
if show_generated_images:
  %matplotlib inline
  import matplotlib as mpl
  import matplotlib.pyplot as plt
  import PIL.ImageDraw as ImageDraw
  IMAGE_SIZE = (12, 8)

  i = 0

  for filename in train_labels_df['filename'].unique():
    
    if i == images_to_show:
      break;

    asd = augmented_images_train_df[augmented_images_train_df["filename"].str.contains(filename)]
    
    for filename1 in asd['filename'].unique():
      grouped = asd.groupby('filename')
      group_df = grouped.get_group(filename1)
    
      plt.figure(figsize=IMAGE_SIZE)
      image = imageio.imread(path_images_train+'/aug_images/'+filename1)
      plt.imshow(image)

      for index, row in group_df.iterrows():
          coord = [[row['xmin'], row['ymax']], [row['xmax'], row['ymax']], [row['xmax'], row['ymin']], [row['xmin'], row['ymin']]]
          coord.append(coord[0]) #repeat the first point to create a 'closed loop'
          coord.append(coord[1])
          coord.append(coord[2])
          coord.append(coord[3])
          xs, ys = zip(*coord) #create lists of x and y values
          plt.plot(xs,ys, linewidth=4) 
    
    i+=1

In [None]:
# Run `generate_images_count` augmentation passes over the test images.
# Pass i writes its files with the prefix 'aug<i>_'. The per-pass frames are
# collected first and concatenated once, which avoids re-copying a growing
# DataFrame on every iteration.
test_aug_frames = [
    image_aug(test_labels_df, path_images_test+'/', path_images_test+'/aug_images/', 'aug'+str(i)+'_', aug)
    for i in range(int(generate_images_count))
]
if test_aug_frames:
    # ignore_index gives the combined frame unique row labels
    augmented_images_test_df = pd.concat(test_aug_frames, ignore_index=True)
else:
    # zero passes requested: keep an empty frame with the expected columns
    augmented_images_test_df = pd.DataFrame(columns=['filename','width','height','class','xmin','ymin','xmax','ymax'])

In [None]:
# Final label files: the original annotations followed by the augmented ones.
train_label_parts = [train_labels_df, augmented_images_train_df]
all_labels_train_df = pd.concat(train_label_parts)
all_labels_train_df.to_csv(path_annotations+'/train_labels.csv', index=False)

test_label_parts = [test_labels_df, augmented_images_test_df]
all_labels_test_df = pd.concat(test_label_parts)
all_labels_test_df.to_csv(path_annotations+'/test_labels.csv', index=False)

In [None]:
# Copy the augmented images next to the originals so every file listed in the
# label CSVs can be found directly under images/train and images/test.
train_aug_dir = path_images_train+'/aug_images/'
for aug_name in os.listdir(train_aug_dir):
    shutil.copy(train_aug_dir+aug_name, path_images_train+'/'+aug_name)

test_aug_dir = path_images_test+'/aug_images/'
for aug_name in os.listdir(test_aug_dir):
    shutil.copy(test_aug_dir+aug_name, path_images_test+'/'+aug_name)

####Generamos train.record, test.record y label_map.pbtxt a partir de los .csv

In [None]:
# The helper scripts below are addressed relative to the repository root.
%cd {repo_dir_path}

# Run the repository's xml_to_csv.py over the train annotations; `-l annotations`
# presumably makes it write `label_map.pbtxt` into annotations/, which is where
# the commands below read it from.
# NOTE(review): its CSV goes to `trainn_labels.csv` (double n), which nothing else in
# this notebook reads. Presumably this keeps it from overwriting `train_labels.csv`,
# which already contains the augmented rows — confirm. The file is not removed below.
!python code/xml_to_csv.py -i images/train -o annotations/trainn_labels.csv -l annotations

# Generate `train.record`
!python code/generate_tfrecord.py --csv_input=annotations/train_labels.csv --output_path=annotations/train.record --img_path=images/train --label_map annotations/label_map.pbtxt

# Generate `test.record`
!python code/generate_tfrecord.py --csv_input=annotations/test_labels.csv --output_path=annotations/test.record --img_path=images/test --label_map annotations/label_map.pbtxt

# Clean up: remove the temporary label CSVs written earlier
!rm '{path_annotations}/temp_train_labels.csv'
!rm '{path_annotations}/temp_test_labels.csv'

# ...and the aug_images working directories (their files were already copied
# into images/train and images/test)
if os.path.exists(path_images_train+'/aug_images'):
  shutil.rmtree(path_images_train+'/aug_images')

if os.path.exists(path_images_test+'/aug_images'):
  shutil.rmtree(path_images_test+'/aug_images')

####Subimos esos archivos a Github.

In [None]:
# Point the checkout at the push target defined in the params cell and set the
# committer identity.
# NOTE(review): `repo` contains the password in clear text, so it will presumably be
# stored in this checkout's git config and may be echoed in the cell output — confirm
# and consider a token-based alternative. The mail and user values are interpolated
# unquoted into the shell commands.
%cd {repo_dir_path}
!git remote rm origin
!git init
!git remote add origin {repo}
!git config --global user.email {github_mail}
!git config --global user.name {github_user}

# Commit only the three generated artifacts and push them to `main`.
%cd {path_annotations}
!git add test.record
!git add train.record
!git add label_map.pbtxt
!git commit -m 'tfrecords actualizados' 
!git push -u origin main          

###Listo.