### Cambiamos el directorio de trabajo (en el servidor ngpu.ugr.es)

In [23]:
# NOTE(review): absolute, user-specific path. Every relative
# "../../../datasets/..." path used later in this notebook is resolved from
# the working directory set here.
%cd /mnt/homeGPU1/pbedmar/pycharm/experiments/taming_transformers/taming-transformers/

/mnt/homeGPU1/pbedmar/pycharm/experiments/taming_transformers/taming-transformers


## Normalización de imágenes

### Funciones de normalización de imágenes, separación entre train y test para taming-transformer

In [24]:
import glob
import os
import shutil

from PIL import Image, ImageOps
from sklearn.model_selection import train_test_split

In [25]:
def resize_with_padding(img, required_size):
    """Fit an image inside a white ``required_size`` x ``required_size`` square.

    The image is shrunk with ``Image.thumbnail`` so that neither side exceeds
    ``required_size`` (aspect ratio kept), then centred on the square by
    padding the remaining space with white.

    Args:
        img: ``PIL.Image.Image`` to fit. It is left unmodified.
        required_size: Side length, in pixels, of the square result.

    Returns:
        A new ``PIL.Image.Image`` of size ``(required_size, required_size)``.
    """
    # thumbnail() resizes in place, so operate on a copy to avoid silently
    # shrinking the caller's image.
    resized = img.copy()
    resized.thumbnail((required_size, required_size))

    delta_w = required_size - resized.size[0]
    delta_h = required_size - resized.size[1]
    pad_w = delta_w // 2
    pad_h = delta_h // 2
    # Border order is (left, top, right, bottom); when the leftover is odd the
    # extra pixel goes to the right/bottom edge.
    padding = (pad_w, pad_h, delta_w - pad_w, delta_h - pad_h)

    # A colour name is translated by Pillow to the image's own mode, whereas
    # the RGB tuple (255, 255, 255) is rejected for single-band images.
    return ImageOps.expand(resized, padding, fill="white")

def apply_resize(images_path, preprocess_folder="preprocess", required_size=192):
    """Convert every PNG in a directory into a padded, square RGB JPEG.

    Each ``*.png`` file directly inside ``images_path`` is passed through
    ``resize_with_padding`` and saved as ``<name>.jpg`` in the
    ``preprocess_folder`` sub-directory of ``images_path``, which is created
    when missing.

    Args:
        images_path: Directory holding the source PNG files.
        preprocess_folder: Name of the output sub-directory (a trailing slash
            is accepted).
        required_size: Side length, in pixels, of the square output images.
    """
    output_dir = os.path.join(images_path, preprocess_folder)
    os.makedirs(output_dir, exist_ok=True)

    for filename in sorted(glob.glob(os.path.join(images_path, "*.png"))):
        # Swap only the extension: str.replace("png", "jpg") would also
        # rewrite "png" occurring elsewhere in the file name.
        stem = os.path.splitext(os.path.basename(filename))[0]

        # The context manager releases the file handle after each image.
        with Image.open(filename) as img:
            resized = resize_with_padding(img, required_size)
            # NOTE(review): convert("RGB") drops any alpha channel without
            # compositing it onto a background - confirm the source PNGs have
            # no transparent regions.
            resized.convert("RGB").save(os.path.join(output_dir, stem + ".jpg"))

def gen_metadata(directory, seed=1):
    """Write train/test file lists for the JPEG images in a directory.

    The ``*.jpg`` files directly inside ``directory`` are split with
    ``train_test_split`` (``test_size=0.25``). Their paths, as returned by
    ``glob`` (i.e. prefixed with ``directory``), are written one per line and
    without a trailing newline to ``xx_train.txt`` and ``xx_test.txt`` inside
    ``directory``.

    Args:
        directory: Directory containing the images (trailing slash optional).
        seed: ``random_state`` forwarded to ``train_test_split``.

    Raises:
        ValueError: If ``directory`` contains no ``*.jpg`` files.
    """
    # glob returns matches in arbitrary order; sorting makes the split depend
    # only on `seed` and on the set of file names.
    filenames = sorted(glob.glob(os.path.join(directory, "*.jpg")))
    if not filenames:
        raise ValueError("No .jpg files found in " + repr(directory))

    train, test = train_test_split(filenames, test_size=0.25, random_state=seed)

    for list_name, subset in (("xx_train.txt", train), ("xx_test.txt", test)):
        with open(os.path.join(directory, list_name), "w") as f:
            f.write("\n".join(subset))

### Preprocesando imágenes de moléculas organometálicas y los contraejemplos de la Universidad del Néguev (articles_molecules y not_molecules)

256x256 moléculas

In [26]:
# Molecule PNGs -> 256x256 JPEGs written under <images_path>/preprocess256/.
images_path = "../../../datasets/negev/articles_molecules/"
apply_resize(images_path=images_path, preprocess_folder="preprocess256/", required_size=256)

In [27]:
# Train/test lists for the preprocessed molecule images.
preprop_images_path = f"{images_path}preprocess256/"
gen_metadata(directory=preprop_images_path)

256x256 contraejemplos

In [28]:
# Counterexample PNGs -> 256x256 JPEGs written under <images_path>/preprocess256/.
images_path = "../../../datasets/negev/not_molecules/"
apply_resize(images_path=images_path, preprocess_folder="preprocess256/", required_size=256)

In [29]:
# Train/test lists for the preprocessed counterexample images.
preprop_images_path = f"{images_path}preprocess256/"
gen_metadata(directory=preprop_images_path)

## Aumentando conjunto de imágenes

In [30]:
import os

import imgaug as ia
from imgaug import augmenters as iaa
from sklearn.model_selection import train_test_split
import cv2

### Tres conjuntos de transformaciones de imágenes que vamos a aplicar

In [31]:
# Mild pipeline: a light Gaussian blur wrapped in Sometimes(0.5, ...) plus a
# small affine jitter (down-scaling to 80-100 %, rotation up to 25 degrees,
# shear up to 5 degrees). random_order=True lets imgaug shuffle the order of
# the child augmenters; cval=255 is the constant used to fill pixels exposed by
# the affine transform (white for uint8 images).
augmentation_seq = iaa.Sequential([
                      iaa.Sometimes(0.5, iaa.GaussianBlur(sigma=(0, 0.5))),
                      iaa.Affine(
                          scale={"x": (0.8, 1.0), "y": (0.8, 1.0)},
                          rotate=(-25, 25),
                          shear=(-5,5),
                          cval=255
                      )
], random_order=True)


# Medium pipeline: same blur as augmentation_seq, a wider affine jitter
# (scale 70-100 %, rotation up to 45 degrees, shear up to 10 degrees, up to
# 10 % translation on each axis) and, in addition, contrast change, brightness
# multiplication, additive Gaussian noise and horizontal flips.
# NOTE(review): Fliplr mirrors the drawing, including any text labels in it -
# confirm mirrored molecules are acceptable training samples.
augmentation_seq_2 = iaa.Sequential([
                      iaa.Sometimes(0.5, iaa.GaussianBlur(sigma=(0, 0.5))),
                      iaa.LinearContrast((0.75, 1.5)),
                      iaa.Affine(
                          scale={"x": (0.7, 1.0), "y": (0.7, 1.0)},
                          rotate=(-45, 45),
                          shear=(-10,10),
                          translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
                          cval=255
                      ),
                      iaa.Multiply((0.8, 1.2), per_channel=0.25),
                      iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.3),
                      iaa.Fliplr(0.3)
], random_order=True)

# Strongest pipeline: compared with augmentation_seq_2 it widens the blur,
# contrast, brightness and horizontal-translation ranges and adds vertical
# flips, an elastic distortion wrapped in Sometimes(0.7, ...), one of
# Sharpen/Emboss (OneOf picks a single child) and pixel dropout.
# NOTE(review): Fliplr/Flipud mirror the drawing, including any text labels in
# it - confirm mirrored molecules are acceptable training samples.
augmentation_seq_3 = iaa.Sequential([
                      iaa.Sometimes(0.7, iaa.GaussianBlur(sigma=(0, 0.6))),
                      iaa.LinearContrast((0.75, 2)),
                      iaa.Affine(
                          scale={"x": (0.7, 1.0), "y": (0.7, 1.0)},
                          rotate=(-45, 45),
                          shear=(-10,10),
                          translate_percent={"x": (-0.2, 0.2), "y": (-0.1, 0.1)},
                          cval=255
                      ),
                      iaa.Multiply((0.6, 1.4), per_channel=0.25),
                      iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.3),
                      iaa.Fliplr(0.2),
                      iaa.Flipud(0.2),
                      iaa.Sometimes(0.7, iaa.ElasticTransformation(alpha=(0.75,3), sigma=(0.2, 0.5))),
                      iaa.OneOf([
                          iaa.Sharpen(alpha=(0,1), lightness=(0.5, 1.5)),
                          iaa.Emboss(alpha=(0,1), strength=(0.75, 2))
                      ]),
                      iaa.Dropout((0.01,0.15), per_channel=0.5)
], random_order=True)

In [32]:
def aug(directory, directory_aug, aug_seq, repeat=2, seed=1):
    """Write an augmented copy of a directory of JPEG images.

    Every ``*.jpg`` directly inside ``directory`` is read with OpenCV,
    ``aug_seq`` is run over ``repeat`` concatenated copies of the whole set,
    and the originals followed by the augmented images are written as
    ``0.jpg``, ``1.jpg``, ... into the ``directory_aug`` sub-directory of
    ``directory`` (created when missing).

    Output files are renumbered, so the link to each source image is lost; a
    random split of the output (as ``gen_metadata`` performs) can therefore
    place variants of one source image in both subsets.

    Args:
        directory: Directory holding the source images (trailing slash
            optional).
        directory_aug: Name of the output sub-directory.
        aug_seq: imgaug augmenter, called as ``aug_seq(images=...)``.
        repeat: Number of augmented variants generated per source image.
        seed: Value passed to ``ia.seed`` before augmenting.

    Raises:
        IOError: If OpenCV cannot decode one of the source files.
    """
    ia.seed(seed)

    # glob returns matches in arbitrary order; sorting keeps the output
    # numbering (and the image each random draw applies to) stable across runs.
    filenames = sorted(glob.glob(os.path.join(directory, "*.jpg")))

    images = []
    for filename in filenames:
        img = cv2.imread(filename)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise IOError("Could not read image: " + filename)
        images.append(img)

    augmented_images = images + list(aug_seq(images=images * repeat))

    output_dir = os.path.join(directory, directory_aug)
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): files left in output_dir by an earlier run are not removed.
    for index, img in enumerate(augmented_images):
        cv2.imwrite(os.path.join(output_dir, str(index) + ".jpg"), img)

256x256 moléculas

In [33]:
# Source of the augmentation runs: the preprocessed 256x256 molecule images.
images_path = "../../../datasets/negev/articles_molecules/"
preprop_images_path = f"{images_path}preprocess256/"

In [34]:
# Mild pipeline, 3 augmented variants per image, then the train/test lists.
aug(directory=preprop_images_path, directory_aug="aug", aug_seq=augmentation_seq, repeat=3)
gen_metadata(directory=f"{preprop_images_path}aug")

In [35]:
# Medium pipeline, 3 augmented variants per image, then the train/test lists.
aug(directory=preprop_images_path, directory_aug="aug2", aug_seq=augmentation_seq_2, repeat=3)
gen_metadata(directory=f"{preprop_images_path}aug2")

In [36]:
# Strongest pipeline, 3 augmented variants per image, then the train/test lists.
aug(directory=preprop_images_path, directory_aug="aug3", aug_seq=augmentation_seq_3, repeat=3)
gen_metadata(directory=f"{preprop_images_path}aug3")

256x256 contraejemplos

In [37]:
# Source of the augmentation runs: the preprocessed 256x256 counterexamples.
images_path = "../../../datasets/negev/not_molecules/"
preprop_images_path = f"{images_path}preprocess256/"

In [38]:
# Mild pipeline, 2 augmented variants per image, then the train/test lists.
aug(directory=preprop_images_path, directory_aug="aug", aug_seq=augmentation_seq, repeat=2)
gen_metadata(directory=f"{preprop_images_path}aug")

In [39]:
# Medium pipeline, 2 augmented variants per image, then the train/test lists.
aug(directory=preprop_images_path, directory_aug="aug2", aug_seq=augmentation_seq_2, repeat=2)
gen_metadata(directory=f"{preprop_images_path}aug2")

In [40]:
# Strongest pipeline, 2 augmented variants per image, then the train/test lists.
aug(directory=preprop_images_path, directory_aug="aug3", aug_seq=augmentation_seq_3, repeat=2)
gen_metadata(directory=f"{preprop_images_path}aug3")

### Combinando moléculas organometálicas y contraejemplos en un mismo directorio

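Concretamente, vamos a combinar los resultados de articles_molecules/preprocess256/aug2 (cada molécula original más sus tres variantes aumentadas) con not_molecules/preprocess256 (los contraejemplos preprocesados, sin aumentar). Así obtendremos un dataset balanceado entre ejemplos positivos y negativos.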

In [36]:
# glob returns matches in arbitrary order. Sorting fixes the position of every
# file in the combined list, which determines the number each file receives
# when the list is copied and therefore the seeded train/test split.
positive_filenames = sorted(glob.glob("../../../datasets/negev/articles_molecules/preprocess256/aug2/"+"*.jpg"))
negative_filenames = sorted(glob.glob("../../../datasets/negev/not_molecules/preprocess256/"+"*.jpg"))

# Positives first, negatives after.
combined_filenames = positive_filenames + negative_filenames

In [37]:
new_directory = "../../../datasets/negev/combined/256"
os.makedirs(new_directory, exist_ok=True)

# Files are renumbered 0.jpg, 1.jpg, ... in list order. They are copied
# byte-for-byte: decoding and re-encoding each JPEG would add another round of
# lossy compression without changing the content.
for i, filename in enumerate(combined_filenames):
    shutil.copyfile(filename, os.path.join(new_directory, str(i) + ".jpg"))

In [38]:
# Train/test lists for the combined positive + negative dataset.
gen_metadata(directory=new_directory)