In [None]:
import numpy as np
from PIL import Image
import pandas as pd
from tqdm import tqdm
import os
import imgaug as ia
from imgaug import augmenters as iaa
import cv2

In [None]:
# Seed passed to imgaug below so the random augmentations are reproducible across runs.
SEED = 777
# Presumably (height, width, channels) of one 4-channel sample; not referenced by the
# code visible in this notebook -- TODO confirm it is still needed.
SHAPE = (512, 512, 4)
# Root directory; train.csv, train/ and train_oversampling/ are all resolved relative to it.
DIR = '.'

ia.seed(SEED)

In [None]:
# Idea: build a new dataset from the existing one in which the absolute frequency of the less
#       dominant localizations is increased through image augmentation.

In [None]:
# Load the label table that the per-class absolute frequencies ("counts", one entry per
# label) are later derived from, and remember where the training images live.

train_labels = pd.read_csv('{}/train.csv'.format(DIR))
path_to_train = '{}/train/'.format(DIR)

# Class index -> localization name. The index of each name is its position in the
# sequence below (0..27).
label_names = dict(enumerate((
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Peroxisomes",
    "Endosomes",
    "Lysosomes",
    "Intermediate filaments",
    "Actin filaments",
    "Focal adhesion sites",
    "Microtubules",
    "Microtubule ends",
    "Cytokinetic bridge",
    "Mitotic spindle",
    "Microtubule organizing center",
    "Centrosome",
    "Lipid droplets",
    "Plasma membrane",
    "Cell junctions",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Cytoplasmic bodies",
    "Rods & rings",
)))

# Inverse lookup: localization name -> class index.
reverse_train_labels = {name: index for index, name in label_names.items()}

# Transform the space-separated target string of a sample into one indicator per class
# column: 1 when the class is present, otherwise the column keeps its existing value.
def fill_targets(row):
    """One-hot encode the ``Target`` entry of a single row, in place.

    Parameters
    ----------
    row : pandas.Series
        A row holding a ``Target`` string of space-separated class indices and one
        entry per class name in ``label_names``.

    Returns
    -------
    pandas.Series
        The same ``row`` object, with ``Target`` replaced by an integer array of the
        parsed indices and the entry of every listed class set to 1.

    Raises
    ------
    KeyError
        If ``Target`` contains an index that is not a key of ``label_names``.
    """
    # The builtin ``int`` replaces the removed ``np.int`` alias (gone since NumPy 1.24).
    class_indices = np.array(row.Target.split(" ")).astype(int)
    row.Target = class_indices
    # Iterate over the local array rather than reading it back from the row, so the
    # loop does not depend on how pandas stores an array inside a single cell.
    for num in class_indices:
        name = label_names[int(num)]
        row.loc[name] = 1
    return row

# Add one column per class name, initialised to 0, then let fill_targets set the
# columns of the classes listed in each row's Target to 1.
for class_name in label_names.values():
    train_labels[class_name] = 0

train_labels = train_labels.apply(fill_targets, axis=1)

# Column-wise sum of the indicator columns; its values are used to produce "counts".
indicator_columns = train_labels.drop(columns=["Id", "Target"])
target_counts = indicator_columns.sum(axis=0)

In [None]:
# auxiliary functions

# Tells whether every class has already reached an absolute frequency of at least nmin.
def isGoalState(counts, nmin):
    """Return True when no entry of ``counts`` is smaller than ``nmin``.

    An empty ``counts`` is considered to satisfy the goal.
    """
    return all(frequency >= nmin for frequency in counts)

# Image augmentation pipeline, as described in the report.
# The outer Sequential wraps a single OneOf, which (per imgaug's OneOf semantics) applies
# one of the listed augmenters to each image. With only one child in the Sequential,
# random_order=True has nothing to reorder.
seq = iaa.Sequential([
                iaa.OneOf([
                    iaa.Fliplr(0.5), # horizontal flip, applied with probability 0.5
                    iaa.Crop(percent=(0, 0.1)), # random crop of 0-10% of the image
                    # Small gaussian blur with random sigma between 0 and 0.5.
                    # But we only blur about 50% of all images.
                    iaa.Sometimes(0.5,
                        iaa.GaussianBlur(sigma=(0, 0.5))
                    ),
                    # Strengthen or weaken the contrast in each image.
                    iaa.ContrastNormalization((0.75, 1.5)),
                    # Add gaussian noise.
                    # For 50% of all images, we sample the noise once per pixel.
                    # For the other 50% of all images, we sample the noise per pixel AND
                    # channel. This can change the color (not only brightness) of the
                    # pixels.
                    iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5),
                    # Multiply pixel values by a random factor between 0.8 and 1.2
                    # (presumably sampled per channel for about 20% of the images, by
                    # analogy with per_channel above -- confirm against the imgaug docs).
                    iaa.Multiply((0.8, 1.2), per_channel=0.2),
                    # Apply affine transformations to each image.
                    # Scale/zoom them, translate/move them, rotate them and shear them.
                    iaa.Affine(
                        scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
                        translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)},
                        rotate=(-180, 180),
                        shear=(-8, 8)
                    )
                ])], random_order=True)

# Elements common to two lists.
def intersection(lst1, lst2):
    """Return the distinct elements present in both lists, as a list.

    Duplicates are collapsed and the order of the result is unspecified.
    """
    first_unique = set(lst1)
    second_unique = set(lst2)
    return list(first_unique.intersection(second_unique))

In [None]:
# Initialization

# Label table in its raw form (Target still a space-separated string), as expected by
# new_dataset.
data = pd.read_csv(DIR + '/train.csv')
# Minimum absolute frequency every class must reach after oversampling.
nmin = 20
# Take a copy: `.values` may expose the buffer backing target_counts, and new_dataset
# increments the entries of the array it is given. Copying keeps target_counts intact
# and makes this cell safe to re-run.
counts = target_counts.values.copy()
print ('counts before oversampling:')
print(counts)

In [None]:
def new_dataset(data, nmin, counts):
    """Oversample rare classes by writing augmented copies of existing samples.

    The samples of ``data`` are visited cyclically. Whenever a sample carries at least
    one class whose frequency is still below ``nmin``, its four channel images
    (``_red``, ``_green``, ``_blue``, ``_yellow``) are loaded from ``path_to_train``,
    augmented with ``seq`` and saved under ``DIR + '/train_oversampling/'`` with the
    name ``<Id>_ia<ncycle>``; the sample's target is kept unchanged. The loop stops
    once every class has reached ``nmin``. Finally the original rows plus one row per
    augmented sample are written to ``train_oversampling.csv`` in the same directory,
    and the resulting counts are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an ``Id`` column (image name prefix) and a ``Target`` column
        (whitespace-separated class indices).
    nmin : int
        Minimum absolute frequency every class must reach.
    counts : array-like of int
        Current absolute frequency of each class, indexed by class index. It is copied,
        so the caller's array is left untouched.

    Raises
    ------
    ValueError
        If a class is below ``nmin`` but no sample in ``data`` carries it; the loop
        could otherwise never terminate. Raised before anything is written.
    """
    # Plain lists give positional access regardless of the DataFrame's index.
    samples_names = data['Id'].tolist()
    samples_labels = data['Target'].tolist()
    dataset_size = len(samples_names)
    parsed_labels = [[int(item) for item in labels.split()] for labels in samples_labels]

    # Private copy: the frequencies are incremented below and the caller's array may
    # share memory with other objects.
    counts = np.array(counts, copy=True)
    number_of_classes = counts.shape[0]

    # Fail fast when the goal is unreachable instead of cycling forever.
    available = set()
    for labels in parsed_labels:
        available.update(labels)
    unreachable = [j for j in range(number_of_classes)
                   if counts[j] < nmin and j not in available]
    if unreachable:
        raise ValueError(
            'classes {} are below nmin={} but no sample in data contains them'.format(
                unreachable, nmin))

    savepath = DIR + '/train_oversampling/'
    os.makedirs(savepath, exist_ok=True)
    channel_suffixes = ('_red.png', '_green.png', '_blue.png', '_yellow.png')

    # Rows for the augmented samples; collected here and concatenated once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0 and is quadratic in a loop).
    new_rows = []
    i = 0
    ncycle = 0

    # While some class is still below nmin, keep cycling through the dataset. Every
    # sample carrying such a class is copied with augmentation, keeping its target.
    while not isGoalState(counts, nmin):

        sample_name = samples_names[i]
        sample_labels_list = parsed_labels[i]

        wanted = [j for j in range(number_of_classes) if counts[j] < nmin]

        if intersection(wanted, sample_labels_list) != []:

            # Load all four channels first so a missing file aborts before any write;
            # the context manager closes each file handle once the pixels are copied.
            filepath = path_to_train + sample_name
            channels = []
            for suffix in channel_suffixes:
                with Image.open(filepath + suffix) as channel_image:
                    channels.append(np.array(channel_image))

            # Freeze one random transformation for this sample and apply it to every
            # channel. Augmenting the four channels as independent images would give
            # each a different flip/rotation/crop and misalign them.
            seq_det = seq.to_deterministic()
            # ncycle counts loop iterations, which keeps the generated names unique.
            new_sample_name = sample_name + '_ia' + str(ncycle)
            for suffix, channel in zip(channel_suffixes, channels):
                augmented = seq_det.augment_image(channel)
                Image.fromarray(augmented).save(savepath + new_sample_name + suffix)

            new_rows.append({'Id': new_sample_name, 'Target': samples_labels[i]})

            for label in sample_labels_list:
                counts[label] += 1

        # Wrap around to the first sample after the last one.
        i = (i + 1) % dataset_size
        ncycle += 1

    if new_rows:
        new_data = pd.concat([data, pd.DataFrame(new_rows)], ignore_index=True)
    else:
        new_data = data.copy(deep=True)

    print('new counts is:')
    print(counts)
    new_data.to_csv(savepath + 'train_oversampling' + '.csv', index=False)

In [None]:
# Run the oversampling: writes the augmented images and train_oversampling.csv
# under DIR + '/train_oversampling/'.
new_dataset(data, nmin, counts)