In [1]:
import pandas as pd
import numpy as np
import os
import shutil
from PIL import Image, ImageOps
import cv2

from dcc_functions import how_many_files_in_folder, folders_summary
from dcc_functions import translate_picture, crop_and_square_image, add_sp_noise, add_background_noise

In [2]:
# Root of the dataset; every sub-folder below is expected to contain one folder per label
DATA_FOLDER = "./data/dcc_data/"

# Folders present before any clean-up ("label_book" is merged into "train" later on)
INITIAL_FOLDERS = [
    "train",
    "val",
    "label_book",
]

# Folders that remain once the label book has been merged: "train" and "val"
FOLDERS = INITIAL_FOLDERS[:2]

# Roman numerals used as class labels (and as sub-folder names)
LABELS = [
    "i", "ii", "iii", "iv", "v",
    "vi", "vii", "viii", "ix", "x",
]

# Print the number of pictures per folder and label (the trailing ";" hides the returned value)
folders_summary(
    DATA_FOLDER,
    INITIAL_FOLDERS,
    LABELS,
    display_ratio=3,
);

train      : i    : 261 : ***************************************************************************************
train      : ii   : 157 : ****************************************************
train      : iii  : 186 : **************************************************************
train      : iv   : 281 : *********************************************************************************************
train      : v    : 196 : *****************************************************************
train      : vi   : 181 : ************************************************************
train      : vii  : 193 : ****************************************************************
train      : viii : 199 : ******************************************************************
train      : ix   : 234 : ******************************************************************************
train      : x    : 179 : ***********************************************************
Total Number of pictures in train : 2067 

v

### Duplicates & erroneous pictures removal

In [3]:
# Both sheets are read from the same labelling workbook
labeling_workbook = "./roman-numerals-labeling-plb-20210830.xlsx"

# Sheet "duplicates": names of the duplicate files to remove
duplicates = pd.read_excel(labeling_workbook, sheet_name="duplicates")
duplicates_list = duplicates["file"].tolist()

# Sheet "analysis": rows flagged with to_be_removed == 1 are the unreadable files
files_analysis = pd.read_excel(labeling_workbook, sheet_name="analysis", usecols="B:L")
flagged_for_removal = files_analysis["to_be_removed"] == 1
erroneous_list = files_analysis.loc[flagged_for_removal, "file"].tolist()

# Duplicates first, then erroneous files
removal_list = duplicates_list + erroneous_list
print(
    len(duplicates_list), "duplicates +",
    len(erroneous_list), "errouneous =",
    len(removal_list), "pictures to remove.",
)

53 duplicates + 266 errouneous = 319 pictures to remove.


In [4]:
nb_files_removed_total = 0

# A set gives constant-time membership tests while scanning the folders
removal_names = set(removal_list)

# We remove every file in the folders which is listed as "to be removed"
for folder in INITIAL_FOLDERS:
    for label in LABELS:

        nb_files, files = how_many_files_in_folder(DATA_FOLDER+folder+"/"+label+"/*.png")

        for file in files:
            # Compare on the bare file name instead of a fixed 40-character slice
            # of the path, so names of any length are matched correctly.
            if os.path.basename(file) in removal_names:

                os.remove(file)
                nb_files_removed_total += 1

print ("Total Number of files removed:", nb_files_removed_total, "\n")

folders_summary(DATA_FOLDER, INITIAL_FOLDERS, LABELS, display_ratio=3);

Total Number of files removed: 319 

train      : i    : 231 : *****************************************************************************
train      : ii   : 136 : *********************************************
train      : iii  : 145 : ************************************************
train      : iv   : 256 : *************************************************************************************
train      : v    : 174 : **********************************************************
train      : vi   : 169 : ********************************************************
train      : vii  : 164 : ******************************************************
train      : viii : 169 : ********************************************************
train      : ix   : 203 : *******************************************************************
train      : x    : 155 : ***************************************************
Total Number of pictures in train : 1802 

val        : i    : 77 : *************************
va

### Move wrongly labelled pictures to the right folders

In [6]:
# Rows whose "true_label" is not 0 carry the label the picture should have
files_to_move = files_analysis[files_analysis["true_label"] != 0]

# We move every file listed as "to be moved" to the right folder.
# "xi" pictures are mirrored left-to-right (symmetry about the vertical axis)
# and transferred to the "ix" folder.
for index, row in files_to_move.iterrows():

    initial_path = DATA_FOLDER+row["subset"]+"/"+row["label"]+"/"+row["file"]

    if row["true_label"] != "xi":

        new_path = DATA_FOLDER+row["subset"]+"/"+row["true_label"]+"/"+row["file"]
        shutil.move(initial_path, new_path)

    else:

        new_path = DATA_FOLDER+row["subset"]+"/ix/"+row["file"]

        # The context manager releases the file handle before the file is
        # overwritten or deleted below.
        with Image.open(initial_path) as im_original:
            im_vertical_sym = ImageOps.mirror(im_original)

        im_vertical_sym.save(new_path)

        # When the picture already sits in the "ix" folder both paths are the
        # same file: deleting it would throw away the mirrored picture just saved.
        if initial_path != new_path:
            os.remove(initial_path)

print("Total Number of files transfered:", len(files_to_move), "\n")

folders_summary(DATA_FOLDER, INITIAL_FOLDERS, LABELS, display_ratio=3);

Total Number of files transfered: 110 

train      : i    : 226 : ***************************************************************************
train      : ii   : 151 : **************************************************
train      : iii  : 141 : ***********************************************
train      : iv   : 253 : ************************************************************************************
train      : v    : 177 : ***********************************************************
train      : vi   : 163 : ******************************************************
train      : vii  : 164 : ******************************************************
train      : viii : 161 : *****************************************************
train      : ix   : 200 : ******************************************************************
train      : x    : 161 : *****************************************************
Total Number of pictures in train : 1797 

val        : i    : 76 : *************************
v

### Move label-book pictures to the training folders

In [7]:
# Every picture of the label book joins the folder of the same label in "train"

folder = INITIAL_FOLDERS[2] #label_book

for label in LABELS:

    nb_files, files = how_many_files_in_folder(f"{DATA_FOLDER}{folder}/{label}/*.png")

    for file in files:

        # Same path with "label_book" swapped for "train" ...
        path_in_train = file.replace(INITIAL_FOLDERS[2], INITIAL_FOLDERS[0])
        # ... and backslashes normalised to forward slashes
        new_path = path_in_train.replace("\\", "/")

        shutil.move(file, new_path)

# The "label_book" folder is deleted afterwards to keep the directory clean
shutil.rmtree(f"{DATA_FOLDER}{folder}")

folder_size_val, folder_size_train = folders_summary(
    DATA_FOLDER,
    INITIAL_FOLDERS,
    LABELS,
    display_ratio=3,
)

train      : i    : 231 : *****************************************************************************
train      : ii   : 156 : ****************************************************
train      : iii  : 147 : *************************************************
train      : iv   : 258 : **************************************************************************************
train      : v    : 182 : ************************************************************
train      : vi   : 168 : ********************************************************
train      : vii  : 169 : ********************************************************
train      : viii : 166 : *******************************************************
train      : ix   : 205 : ********************************************************************
train      : x    : 165 : *******************************************************
Total Number of pictures in train : 1847 

val        : i    : 76 : *************************
val        : ii   : 79

### Pictures augmentation

In [8]:
# Augmentation choices are stored in a DataFrame: one row per label,
# one column per transformation (1 = enabled, 0 = disabled)
index = pd.Index(LABELS, name="label")

# Flag applied to every label, listed in the column order of the table
default_flags = {
    "hor_sym": 0,
    "ver_sym": 0,
    "a_clock_10": 1,
    "clock_10": 1,
    "a_clock_20": 1,
    "clock_20": 1,
    "transl1": 1,
    "transl2": 1,
    "transl3": 1,
    "crop": 1,
    "sp_noise": 0,
    "bckg_noise": 0,
}

transformation_table = pd.DataFrame(
    {column: [flag]*len(LABELS) for column, flag in default_flags.items()},
    index=index,
)

transformation_table

Unnamed: 0_level_0,hor_sym,ver_sym,a_clock_10,clock_10,a_clock_20,clock_20,transl1,transl2,transl3,crop,sp_noise,bckg_noise
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
i,0,0,1,1,1,1,1,1,1,1,0,0
ii,0,0,1,1,1,1,1,1,1,1,0,0
iii,0,0,1,1,1,1,1,1,1,1,0,0
iv,0,0,1,1,1,1,1,1,1,1,0,0
v,0,0,1,1,1,1,1,1,1,1,0,0
vi,0,0,1,1,1,1,1,1,1,1,0,0
vii,0,0,1,1,1,1,1,1,1,1,0,0
viii,0,0,1,1,1,1,1,1,1,1,0,0
ix,0,0,1,1,1,1,1,1,1,1,0,0
x,0,0,1,1,1,1,1,1,1,1,0,0


In [9]:
# We can either test the number of files that will be generated (=estimate)
# or apply the transformations and create augmented pictures (=save)
mode = "save"  # switch to "estimate" for a dry run that only counts files

# Seeding makes the coin flips below repeatable from one run to the next
# (for a given ordering of the files on disk).
AUGMENTATION_SEED = 42
np.random.seed(AUGMENTATION_SEED)

# These limits allow to balance the number of files created for each label:
# once a label folder counts more pictures than its limit, the remaining
# originals of that label are not augmented.
MAX_FILES_PER_LABEL = {"train": 910, "val": 95}


def _augment_background_noise(file, im_original):
    """Apply add_background_noise with a randomly drawn noise_00N.png (N in 1..9).

    Returns the augmented picture and the file-name suffix to save it under.
    """
    noise_path = "./data/00_noisy_background/noise_00"+str(np.random.randint(1, 10))+".png"
    img_bckg, random_choice = add_background_noise(file, noise_path)
    return img_bckg, "_bckg_noise_"+str(random_choice)


# One entry per column of transformation_table, in the order the transformations
# are tried. Each callable receives the file path and the opened PIL picture and
# returns (augmented picture, suffix inserted before ".png").
AUGMENTATIONS = [
    # Horizontal Symmetry
    ("hor_sym", lambda file, im: (im.transpose(Image.FLIP_TOP_BOTTOM), "_hor")),
    # Vertical Symmetry
    ("ver_sym", lambda file, im: (ImageOps.mirror(im), "_vert")),
    # 10° Anti-Clockwise Rotation
    ("a_clock_10", lambda file, im: (im.rotate(10, fillcolor="white"), "_a_clock_10")),
    # 10° Clockwise Rotation
    ("clock_10", lambda file, im: (im.rotate(-10, fillcolor="white"), "_clock_10")),
    # 20° Anti-Clockwise Rotation
    ("a_clock_20", lambda file, im: (im.rotate(20, fillcolor="white"), "_a_clock_20")),
    # 20° Clockwise Rotation
    ("clock_20", lambda file, im: (im.rotate(-20, fillcolor="white"), "_clock_20")),
    # Translation 1, mostly horizontal
    ("transl1", lambda file, im: (translate_picture(cv2.imread(file), 30, 10), "_transl1")),
    # Translation 2, mostly vertical
    ("transl2", lambda file, im: (translate_picture(cv2.imread(file), 10, 30), "_transl2")),
    # Translation 3, both horizontal and vertical
    ("transl3", lambda file, im: (translate_picture(cv2.imread(file), 30, 30), "_transl3")),
    # White area cropping
    ("crop", lambda file, im: (crop_and_square_image(cv2.imread(file), padding=0.01), "_crop")),
    # Salt and Pepper Noise
    ("sp_noise", lambda file, im: (add_sp_noise(cv2.imread(file, cv2.IMREAD_GRAYSCALE)), "_sp_noise")),
    # Background noise transfer
    ("bckg_noise", _augment_background_noise),
]

total_number_of_files = 0

# NOTE(review): augmented pictures are written next to the originals with a
# ".png" extension, so running this cell a second time presumably lists them
# as originals too - confirm before re-running on an already augmented folder.
for folder in FOLDERS:
    for label in LABELS:

        # We record augmentation instructions from the DataFrame, in AUGMENTATIONS order
        flags = [transformation_table.loc[label, column] for column, _ in AUGMENTATIONS]

        nb_files, files = how_many_files_in_folder(DATA_FOLDER+folder+"/"+label+"/*.png")

        original_number_of_files = nb_files
        folder_number_of_files = nb_files
        total_number_of_files += nb_files

        # Folders without a configured limit are never cut short
        max_files = MAX_FILES_PER_LABEL.get(folder, float("inf"))

        for file in files:

            # The limit is checked once per original picture, so the final count
            # may exceed it by the augmentations of the last picture processed
            if folder_number_of_files > max_files:
                break

            # The context manager closes the picture's file handle after use
            with Image.open(file) as im_original:

                for (column, transform), flag in zip(AUGMENTATIONS, flags):

                    # Each enabled transformation is applied with probability 1/2;
                    # the coin is only flipped when the transformation is enabled
                    if flag == 1 and np.random.randint(2):

                        # In "estimate" mode nothing is computed or written,
                        # the picture is only counted
                        if mode == "save":
                            augmented, suffix = transform(file, im_original)
                            augmented.save(file.replace(".png", suffix+".png"))

                        folder_number_of_files += 1
                        total_number_of_files += 1

        # Columns: folder, label, original count (twice), the 12 flags,
        # count after augmentation, running total, bar (one star per 10 pictures)
        print (folder.ljust(6), label.ljust(5), str(nb_files).ljust(4), str(original_number_of_files).ljust(4),
               *flags,
               str(folder_number_of_files).ljust(4), str(total_number_of_files).ljust(5), "*"*(folder_number_of_files//10))

train  i     231  231  0 0 1 1 1 1 1 1 1 1 0 0 913  913   *******************************************************************************************
train  ii    156  156  0 0 1 1 1 1 1 1 1 1 0 0 793  1706  *******************************************************************************
train  iii   147  147  0 0 1 1 1 1 1 1 1 1 0 0 713  2419  ***********************************************************************
train  iv    258  258  0 0 1 1 1 1 1 1 1 1 0 0 911  3330  *******************************************************************************************
train  v     182  182  0 0 1 1 1 1 1 1 1 1 0 0 902  4232  ******************************************************************************************
train  vi    168  168  0 0 1 1 1 1 1 1 1 1 0 0 825  5057  **********************************************************************************
train  vii   169  169  0 0 1 1 1 1 1 1 1 1 0 0 840  5897  ************************************************************************************

In [10]:
# Picture counts per folder and label after augmentation
# (display_ratio=10 presumably draws one star per 10 pictures - confirm in dcc_functions)
folders_summary(
    DATA_FOLDER,
    FOLDERS,
    LABELS,
    display_ratio=10,
);

train      : i    : 913 : *******************************************************************************************
train      : ii   : 793 : *******************************************************************************
train      : iii  : 713 : ***********************************************************************
train      : iv   : 911 : *******************************************************************************************
train      : v    : 902 : ******************************************************************************************
train      : vi   : 825 : **********************************************************************************
train      : vii  : 840 : ************************************************************************************
train      : viii : 847 : ************************************************************************************
train      : ix   : 915 : **************************************************************************************