# Artificial Neural Networks and Deep Learning

---

## Homework 2: Preprocessing
This notebook is meant to pre-process the dataset.

## ⚙️ Import Libraries

In [None]:
import os
from datetime import datetime

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

# Fix the seed of every random number generator used in this notebook
# (NumPy and TensorFlow) so that a full re-run reproduces the same results.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Report the runtime environment.
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {tfk.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU devices: {len(gpu_devices)}")

## ⏳ Load the Data

In [None]:
# Open the archive that holds both the labelled training set and the test set.
data = np.load("mars_for_students.npz")

# Position 0 along the second axis is read as the images, position 1 as the masks.
training_set = data["training_set"]
X_train, y_train = training_set[:, 0], training_set[:, 1]

X_test = data["test_set"]

# Print the shape of every loaded array.
for description, array in (
    ("Training X", X_train),
    ("Training y", y_train),
    ("Test X", X_test),
):
    print(f"{description} shape: {array.shape}")

### Inspect *X_train* and *y_train* sets

In [None]:
# Draw ten random sample indices to preview (drawn with replacement, so an
# index may appear more than once).
X_train_size = len(X_train)
random_indices = np.random.randint(0, X_train_size, size=10)
print(random_indices)

# For every sampled index show the image first, then its mask.
for index in random_indices:
    for picture in (X_train[index], y_train[index]):
        plt.imshow(picture)
        plt.title(index)
        plt.axis('off')
        plt.show()

From the training set, we can see that **X_train** contains the satellite images taken of the surface of Mars, while **y_train** contains the segmentation mask of each corresponding image.

## Remove the Outliers
Analysing the X_train and y_train sets, it is possible to notice some outliers. For example, some pictures (e.g. the one with index 62) contain the picture of an alien.

Comparing the pictures with their masks, the mask appears to be the same for every alien picture, even when the picture itself is flipped. So we can scan the dataset for all the masks that are identical to the mask of picture 62 and then remove the corresponding samples from the dataset.

In [None]:
# Pixel-wise equality check between two images.
def are_same(image1, image2):
    """Return True when both arrays have the same shape and identical values.

    Args:
        image1: First array to compare (must expose ``.shape``).
        image2: Second array to compare (must expose ``.shape``).

    Returns:
        True if the shapes match and every element is equal, False otherwise.
        When the shapes differ an error message is printed before returning
        False.
    """
    shapes_match = image1.shape == image2.shape
    if shapes_match:
        return np.array_equal(image1, image2)

    print("ERROR: The images are not the same size.")
    return False

In [None]:
# Index of a known alien sample; it seeds the list of outliers to remove.
first_alien_index = 62
alien_indices = [first_alien_index]

# Display the alien image first, then its segmentation mask.
for picture in (X_train[first_alien_index], y_train[first_alien_index]):
    plt.imshow(picture)
    plt.title(first_alien_index)
    plt.axis('off')
    plt.show()

In [None]:
# Detect all the samples whose mask is identical to the mask of the first alien.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell does not accumulate duplicated indices and inflate the count below.
reference_mask = y_train[first_alien_index]
alien_indices = [first_alien_index] + [
    i
    for i in range(len(X_train))
    if i != first_alien_index and are_same(reference_mask, y_train[i])
]

print(f"{len(alien_indices)} images have the same mask of the first alien picture:")
print(alien_indices)

In [None]:
# Remove every sample listed in alien_indices from both arrays, so that images
# and masks stay aligned.
# NOTE: this cell rebinds X_train / y_train, so it is not idempotent: running
# it a second time would delete whatever samples now sit at those indices.
X_train, y_train = (
    np.delete(samples, alien_indices, axis=0) for samples in (X_train, y_train)
)

for array_name, array in (("X_train", X_train), ("y_train", y_train)):
    print(f"Shape of {array_name} without aliens: ", array.shape)

In [None]:
# Check for duplicated samples (same image AND same mask).
# A sample is a duplicate when an earlier sample has identical image pixels
# and an identical mask.
unique_images_indices = []
duplicated_images_indices = []

for i in range(len(X_train)):
    # Index of the first earlier sample identical to sample i, or None.
    # Masks are arrays, so they must be compared with are_same(): `==` yields
    # an element-wise array whose truth value is ambiguous (ValueError).
    first_match = next(
        (
            j
            for j in range(i)
            if are_same(X_train[i], X_train[j]) and are_same(y_train[i], y_train[j])
        ),
        None,
    )

    if first_match is None:
        unique_images_indices.append(i)
    else:
        print(f"Image at index {i} already exists at index {first_match} with the same mask.")
        duplicated_images_indices.append(i)

if not duplicated_images_indices:
    print("There are no duplicates")

We checked whether there are duplicated images (same image and same mask), but we did not find any.
We save the cleaned dataset.

In [None]:
# Count how many pixels carry each label value across all the masks.
unique, counts = np.unique(y_train, return_counts=True)

print('Number of occurrences of each label: ')
for label, occurrences in zip(unique, counts):
    print(f'Label {label}: {occurrences}')

As we can see, the dataset is highly imbalanced.

In [None]:
# Hold out 20% of the samples for validation; the fixed random_state makes
# the split reproducible.
# NOTE: this cell rebinds X_train / y_train, so re-running it splits the
# already reduced training set again.
split_arrays = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = split_arrays

# Display the resulting shapes
for kind, train_part, val_part in (
    ("images", X_train, X_val),
    ("masks", y_train, y_val),
):
    print(f"Training {kind} shape: {train_part.shape}, Validation {kind} shape: {val_part.shape}")

In [None]:
# Append a trailing channel axis to every image set and divide the pixel
# values by 255.
# NOTE: this cell rebinds its inputs, so re-running it would add another axis
# and rescale the values a second time.
X_train, X_val, X_test = (
    images[..., np.newaxis] / 255.0 for images in (X_train, X_val, X_test)
)

# Per-sample input shape and number of distinct label values in the masks
input_shape = X_train.shape[1:]
num_classes = np.unique(y_train).size

for array_name, array in (("X_train", X_train), ("y_train", y_train), ("X_test", X_test)):
    print(f"{array_name} shape: {array.shape}")
print(f"Number of classes: {num_classes}")

In [None]:
from matplotlib.colors import ListedColormap

# One colour per mask label value; vmin / vmax pin the value -> colour mapping
# so that a mask missing some labels is still drawn with the same colours.
mask_colors = ['blue', 'green', 'yellow', 'orange', 'red']
cmap = ListedColormap(mask_colors)

vmin = 0
vmax = 4

def cut_big_rock(X, y, alpha=1.0, source_index=479, box=(80, 125, 0, 30), show_source=True):
    """Paste a fixed patch from one sample onto every sample at a random position.

    A rectangular region (image pixels and the matching mask pixels) is cut
    from ``X[source_index]`` / ``y[source_index]`` and pasted into a copy of
    every sample at an independently drawn random location. Image and mask
    receive the patch at the same location, so they stay aligned.

    Args:
        X: Image array indexed as (sample, row, column[, channel]).
        y: Mask array indexed as (sample, row, column), aligned with ``X``.
        alpha: Unused. Kept only so that existing calls passing it keep working
            (earlier versions drew an unused CutMix lambda from it).
        source_index: Index of the sample the patch is cut from.
        box: ``(x1, x2, y1, y2)`` column / row bounds (end-exclusive) of the
            patch inside the source sample.
        show_source: When True, display the source image and its mask before
            augmenting. This uses the notebook-level ``plt``, ``cmap``,
            ``vmin`` and ``vmax``.

    Returns:
        Tuple ``(X_cut, y_cut)``: augmented copies of ``X`` and ``y``. The
        inputs are not modified.

    Raises:
        IndexError: If ``source_index`` is not a valid index into ``X``.
        ValueError: If ``X`` and ``y`` disagree on their first three
            dimensions, or if ``box`` is empty or does not fit in a sample.
    """
    if X.shape[:3] != y.shape[:3]:
        raise ValueError(
            f"X and y are not aligned: X has shape {X.shape}, y has shape {y.shape}."
        )
    if not -len(X) <= source_index < len(X):
        raise IndexError(
            f"source_index {source_index} is out of range for {len(X)} samples."
        )

    height, width = X.shape[1:3]
    x1, x2, y1, y2 = box
    if not (0 <= x1 < x2 <= width and 0 <= y1 < y2 <= height):
        raise ValueError(
            f"box {box} is empty or does not fit in a {height}x{width} sample."
        )

    if show_source:
        # Plot the sample that is actually passed in (not the notebook globals).
        plt.imshow(X[source_index])
        plt.show()
        plt.imshow(y[source_index], cmap=cmap, vmin=vmin, vmax=vmax)
        plt.show()

    patch_h = y2 - y1
    patch_w = x2 - x1
    # The patches are views of the untouched inputs, so pasting into the
    # copies below never alters them.
    image_patch = X[source_index, y1:y2, x1:x2]
    mask_patch = y[source_index, y1:y2, x1:x2]

    X_cut = X.copy()
    y_cut = y.copy()

    for ind in range(len(X_cut)):
        # randint's upper bound is exclusive: "+ 1" makes the last position
        # where the patch still fits reachable (and allows a full-size patch).
        left = np.random.randint(0, width - patch_w + 1)
        top = np.random.randint(0, height - patch_h + 1)
        X_cut[ind, top:top + patch_h, left:left + patch_w] = image_patch
        y_cut[ind, top:top + patch_h, left:left + patch_w] = mask_patch

    return X_cut, y_cut

# Build the augmented copies of the training images and masks.
X_cut_r, y_cut_r = cut_big_rock(X_train, y_train)

# Preview a couple of augmented samples: one row per sample, image on the
# left and mask on the right. `cmap`, `vmin` and `vmax` are the ones defined
# at the top of this cell.
preview_indices = range(122, 124)

fig, axes = plt.subplots(
    len(preview_indices), 2, figsize=(10, 3 * len(preview_indices)), squeeze=False
)
for (image_ax, mask_ax), i in zip(axes, preview_indices):
    image_ax.imshow(np.squeeze(X_cut_r[i]))
    image_ax.set_title(f"Augmented image {i}")
    image_ax.axis('off')

    mask_ax.imshow(y_cut_r[i], cmap=cmap, vmin=vmin, vmax=vmax)
    mask_ax.set_title(f"Augmented mask {i}")
    mask_ax.axis('off')

fig.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown, plt.savefig() would
# write a new, empty figure. Saving once, outside the loop, also avoids
# overwriting the file on every iteration.
fig.savefig("oversampled.png")
plt.show()

In [None]:
# Save the clean dataset (without outliers)
#np.savez_compressed('clean_dataset', images=X_train, labels=y_train, test_set=X_test)

In [None]:
# Hand-picked indices of samples that contain a big rock.
# NOTE(review): X_train / y_train are reassigned by earlier cells (outlier
# removal, train/validation split), so these indices presumably refer to one
# specific ordering of the dataset - confirm which one before relying on them.
big_rock_indices = [
    31, 56, 57, 140, 145, 158, 162, 241, 326, 382, 406, 443, 470, 475, 572,
    614, 728, 737, 739, 813, 832, 849, 856, 872, 922, 985, 1026, 1030, 1125,
    1162, 1168, 1169, 1182, 1266, 1337, 1443, 1456, 1475, 1491, 1526, 1527,
    1544, 1576, 1633, 1684, 1781, 1792, 1883, 1919, 2014, 2055, 2086, 2102,
    2111, 2156, 2193, 2199, 2253, 2331, 2351, 2412, 2417, 2498,
]

# Indices beyond the current array length are reported and skipped instead of
# aborting the whole cell with an IndexError.
out_of_range_indices = [index for index in big_rock_indices if index >= len(X_train)]
if out_of_range_indices:
    print(
        f"WARNING: skipping {len(out_of_range_indices)} indices outside X_train "
        f"(size {len(X_train)}): {out_of_range_indices}"
    )

for index in big_rock_indices:
    if index >= len(X_train):
        continue

    # Image and mask side by side, labelled with the index they come from.
    fig, (image_ax, mask_ax) = plt.subplots(1, 2, figsize=(10, 3))
    image_ax.imshow(np.squeeze(X_train[index]))
    image_ax.set_title(f"Image {index}")
    mask_ax.imshow(y_train[index])
    mask_ax.set_title(f"Mask {index}")
    plt.show()
    # Release the figure: dozens of open figures would otherwise pile up.
    plt.close(fig)