# Data Pre-Processing and Autoencoders for Dimensionality Reduction (Kalia)

## Downloading the Data

In [None]:
# connecting notebook to google drive
# click on the URL, give permissions and copy and paste the authorisation code to connect
from google.colab import drive
drive.mount('/gdrive')

# access the relevant folder in the Google Drive to download the data in
import os
os.chdir('/gdrive/My Drive/Cactus')

# downloading the data from Kaggle
# make sure you have the right Kaggle library installed
!pip3 install kaggle==1.5.6
!kaggle -v

# make a .kaggle folder in your drive 
!mkdir .kaggle
# save kaggle.json file with your username and unique key in .kaggle
!echo '{"username":"USERNAME","key":"UNIQUE_KEY"}' > /root/.kaggle/kaggle.json
# change the file permissions so you can read and write from this file (but not execute)
!chmod 600 /root/.kaggle/kaggle.json

# get data from https://www.kaggle.com/c/aerial-cactus-identification/
!kaggle competitions download -c aerial-cactus-identification

# unzip data
!unzip train.zip
!unzip test.zip

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import imageio
import cv2
from sklearn.model_selection import train_test_split

%tensorflow_version 1.x

import keras
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model, load_model

## Data Pre-Processing

In [None]:
# importing the training label data
# (read relative to the current working directory; the "id" and "has_cactus"
# columns of this frame are used below to match labels to image files)
train_label = pd.read_csv('train.csv')

# viewing the first 5 lines of the data
train_label.head()

In [None]:
# accessing the folder with the training data
os.chdir('/gdrive/My Drive/Cactus/train')
# list of filenames in the order images will be imported
filenames = glob.glob('*.*')

# id -> label lookup built once, so matching every file costs a single pass
# instead of scanning the whole label table for each filename
label_by_id = dict(zip(train_label["id"], train_label["has_cactus"]))

# labels re-ordered to follow `filenames`; a file with no entry in train_label
# keeps the dummy value "a" so that missing labels stay visible
sorted_cactus = [label_by_id.get(name, "a") for name in filenames]

# create new dataframe of sorted labels
d = {'id': filenames, 'has_cactus': sorted_cactus}
labels_sort = pd.DataFrame(data=d)

# accessing the main folder to save the csv
os.chdir('/gdrive/My Drive/Cactus')
# save dataframe as csv to folder
labels_sort.to_csv("train_label_sorted.csv")

In [None]:
# preview the re-ordered labels (one row per image file, in file-listing order)
labels_sort.head()

In [None]:
# standardised image size
IMAGE_SIZE = (32, 32)

# list we will save our image arrays to
train_img = []

# loop to resize and reshape the RGB images into arrays
# The files are read in the order of labels_sort['id'] rather than from a second
# directory listing, so image i is guaranteed to belong to label row i.
for filename in labels_sort['id']:
    # read the image file from the train folder (relative to the current working directory)
    image = imageio.imread(os.path.join('train', filename))
    # resize the data for standardisation
    image = cv2.resize(image, IMAGE_SIZE)
    # convert the image data to an array and append it to our list
    train_img.append(np.asarray(image))

In [None]:
# number of images loaded; should equal the number of rows in labels_sort
print(len(train_img))

In [None]:
# visual sanity check: display one resized image (index 150 is an arbitrary sample)
plt.imshow(train_img[150])

In [None]:
# stack the per-image arrays into a single array
pixels = np.array(train_img)

# scale the pixel values by 1/255 as float32
# (assumes 8-bit images, which puts the values in [0, 1] - TODO confirm)
pixels = pixels.astype('float32') / 255.

# make the (n_images, 32, 32, 3) layout expected by the autoencoders explicit
train_data = pixels.reshape(len(pixels), 32, 32, 3)

# first split: training set vs. a hold-out set
# (no test_size given, so scikit-learn's default split fraction applies)
train_data, holdout_data, train_label, holdout_label = train_test_split(
    train_data, labels_sort, random_state=42)

# second split: the hold-out set is halved into validation and testing sets
eval_data, test_data, eval_label, test_label = train_test_split(
    holdout_data, holdout_label, test_size=0.5, random_state=42)

In [None]:
# saving our original dataset and training, validation, and testing numpy arrays
np.save("train_img", train_img)
np.save("train_data", train_data)
np.save("eval_data", eval_data)
np.save("test_data", test_data)

# the label splits are DataFrames holding string ids next to the labels, so
# NumPy stores them as pickled object arrays
np.save("train_label", train_label)
np.save("eval_label", eval_label)
np.save("test_label", test_label)

# loading all our arrays
train_img = np.load("train_img.npy")
train_data = np.load("train_data.npy")
eval_data = np.load("eval_data.npy")
test_data = np.load("test_data.npy")

# object arrays can only be read back with allow_pickle=True (np.load refuses
# them by default). Unpickling can execute code, so only do this for files
# written by this notebook.
train_label = np.load("train_label.npy", allow_pickle=True)
eval_label = np.load("eval_label.npy", allow_pickle=True)
test_label = np.load("test_label.npy", allow_pickle=True)

## Autoencoders

### Autoencoder 1 (8 layers)

In [None]:
# input placeholder: 32x32 RGB images
input_img = Input(shape=(32, 32, 3))

## ENCODER ##
# Four stages (Conv1..Conv4): two 3x3 ReLU convolutions followed by a 2x2
# max-pool with stride 2, with the filter count shrinking at every stage.
encoded = input_img
for n_filters in (256, 128, 64, 32):
    for _ in range(2):
        encoded = Conv2D(filters=n_filters, kernel_size=(3, 3),
                         activation='relu', padding='same')(encoded)
    encoded = MaxPooling2D(pool_size=(2, 2), padding='same', strides=2)(encoded)
    print(encoded.shape)


## DECODER ##
# Mirror image of the encoder (DeConv1..DeConv4): two 3x3 ReLU convolutions
# followed by 2x2 up-sampling, with the filter count growing at every stage.
decoded = encoded
for n_filters in (32, 64, 128, 256):
    for _ in range(2):
        decoded = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(decoded)
    decoded = UpSampling2D((2, 2))(decoded)
    print(decoded.shape)

# output layer: 3 channels with a sigmoid activation
decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(decoded)
print(decoded.shape)


## ENCODER and AUTOENCODER ##

# full model: image -> reconstruction
autoencoder = Model(input_img, decoded)
# encoder only: image -> bottleneck representation (shares layers with the full model)
encoder = Model(input_img, encoded)

autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')


In [None]:
# fitting the training data to the autoencoder model
# input and target are the same images because the model learns to reconstruct
# its input; the validation set is only used to monitor val_loss
autoencoder.fit(train_data, train_data,
                epochs=1000,
                batch_size=256,
                shuffle=True,
                validation_data=(eval_data, eval_data))

In [None]:
# make sure the output folder exists, otherwise saving fails on a fresh Drive folder
os.makedirs('autoencoders', exist_ok=True)

# saving whole model
autoencoder.save('autoencoders/autoencoder_model1.h5')

# saving encoder for dimensionality reduction
encoder.save('autoencoders/encoder_model1.h5')

# loading whole model (round-trip check that the saved file can be read back)
model1 = load_model('autoencoders/autoencoder_model1.h5')

In [None]:
# history.history stores the training and validation loss of our model when fitting
loss_curves = autoencoder.history.history

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(loss_curves['loss'])
ax.plot(loss_curves['val_loss'])
ax.set_title('Autoencoder loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# reconstructed images from validation set
reconst_test = autoencoder.predict(eval_data)

# number of images
n = 10
# number of rows in plot
row = 2

plt.figure(figsize=(20, 4))
for i in range(n):
    # top row: original image, bottom row: its reconstruction
    for row_idx, batch in enumerate((eval_data, reconst_test)):
        ax = plt.subplot(row, n, row_idx * n + i + 1)
        plt.imshow(batch[i].reshape(32, 32, 3))
        plt.gray()
        # hide both axes so only the picture is shown
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

plt.show()

### Autoencoder 2 (6 layers)

In [None]:
# input layer: 32x32 RGB images
input_img_2 = Input(shape=(32, 32, 3))

## ENCODER ##
# Three stages (Conv1..Conv3): two 3x3 ReLU convolutions followed by a 2x2
# max-pool with stride 2, with the filter count shrinking at every stage.
encoded_2 = input_img_2
for n_filters in (256, 128, 64):
    for _ in range(2):
        encoded_2 = Conv2D(filters=n_filters, kernel_size=(3, 3),
                           activation='relu', padding='same')(encoded_2)
    encoded_2 = MaxPooling2D(pool_size=(2, 2), padding='same', strides=2)(encoded_2)
    print(encoded_2.shape)

## DECODER ##
# Mirror image of the encoder (DeConv1..DeConv3): two 3x3 ReLU convolutions
# followed by 2x2 up-sampling, with the filter count growing at every stage.
decoded_2 = encoded_2
for n_filters in (64, 128, 256):
    for _ in range(2):
        decoded_2 = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(decoded_2)
    decoded_2 = UpSampling2D((2, 2))(decoded_2)
    print(decoded_2.shape)

# output layer: 3 channels with a sigmoid activation
decoded_2 = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(decoded_2)
print(decoded_2.shape)


## ENCODER and AUTOENCODER ##

# full model: image -> reconstruction
autoencoder_2 = Model(input_img_2, decoded_2)
# encoder only: image -> bottleneck representation (shares layers with the full model)
encoder_2 = Model(input_img_2, encoded_2)

autoencoder_2.compile(optimizer='adadelta', loss='binary_crossentropy')


In [None]:
# fitting the training data to the autoencoder model
# input and target are the same images because the model learns to reconstruct
# its input; the validation set is only used to monitor val_loss
autoencoder_2.fit(train_data, train_data,
                  epochs=1000,
                  batch_size=256,
                  shuffle=True,
                  validation_data=(eval_data, eval_data))

In [None]:
# history.history stores the training and validation loss of our model when fitting
loss_curves_2 = autoencoder_2.history.history

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(loss_curves_2['loss'])
ax.plot(loss_curves_2['val_loss'])
ax.set_title('Autoencoder loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# reconstructed images from validation set
reconst_test_2 = autoencoder_2.predict(eval_data)

# number of images
n = 10
# number of rows in plot
row = 2

plt.figure(figsize=(20, 4))
for i in range(n):
    # top row: original image, bottom row: its reconstruction
    for row_idx, batch in enumerate((eval_data, reconst_test_2)):
        ax = plt.subplot(row, n, row_idx * n + i + 1)
        plt.imshow(batch[i].reshape(32, 32, 3))
        plt.gray()
        # hide both axes so only the picture is shown
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

plt.show()

In [None]:
# make sure the output folder exists, otherwise saving fails on a fresh Drive folder
os.makedirs('autoencoders', exist_ok=True)

# saving whole model
autoencoder_2.save('autoencoders/autoencoder_model2.h5')
# saving encoder for dimensionality reduction
encoder_2.save('autoencoders/encoder_model2.h5')

# Principal Component Analysis (PCA) (Sandie)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# flatten the image data into rows of 96 values for PCA; -1 lets NumPy derive
# the row count, so the cell no longer breaks when the training set size changes
# NOTE(review): 96 = 32 pixels x 3 channels, i.e. every PCA sample is a single
# image ROW, not a whole image - confirm this is intended
pca_x = np.reshape(train_data, (-1, 96))

In [None]:
# standardise every feature column before PCA (fit and transform in one step)
scaler = StandardScaler()
scaled_x = scaler.fit_transform(pca_x)

In [None]:
# a fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95%) of the variance
pca = PCA(n_components=0.95)

# project the scaled data onto the retained components
pca_transformed = pca.fit_transform(scaled_x)

In [None]:
# (n_samples, n_components kept by the 95%-variance PCA)
pca_transformed.shape

In [None]:
# map the reduced data back to the original feature space
inv = pca.inverse_transform(pca_transformed)

# compare the first two features before (faint) and after (solid) the PCA round trip
fig, ax = plt.subplots()
ax.scatter(scaled_x[:, 0], scaled_x[:, 1], alpha=0.2)
ax.scatter(inv[:, 0], inv[:, 1], alpha=0.8)
ax.axis('equal');

In [None]:
# show up to the first 10 principal components as 12x8 grayscale tiles
fig, axes = plt.subplots(2, 5, figsize=(20, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.01, wspace=0.01))
# zip stops at the shorter sequence: the 95%-variance PCA may keep fewer than
# 10 components, in which case the remaining panels are simply left empty
# instead of raising an IndexError
# NOTE(review): each component has 96 values (one image row of 32 pixels x 3
# channels), so the 12x8 tile is just a display layout - confirm
for ax, component in zip(axes.flat, pca.components_):
    ax.imshow(component.reshape(12, 8), cmap='gray')

In [None]:
# fraction of the total variance explained by each retained component
pca.explained_variance_ratio_

In [None]:
# second PCA with a fixed number of components (10) instead of a variance target
pca2 = PCA(n_components=10)

# project the scaled data onto those 10 components
pca_transformed2 = pca2.fit_transform(scaled_x)

In [None]:
# map the 10-component representation back to the original feature space
inv2 = pca2.inverse_transform(pca_transformed2)

# compare the first two features before (faint) and after (solid) the PCA round trip
fig, ax = plt.subplots()
ax.scatter(scaled_x[:, 0], scaled_x[:, 1], alpha=0.2)
ax.scatter(inv2[:, 0], inv2[:, 1], alpha=0.8)
ax.axis('equal');

In [None]:
# show the 10 principal components of pca2 as 12x8 grayscale tiles
fig, axes = plt.subplots(2, 5, figsize=(20, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.01, wspace=0.01))
# one panel per component, filled row by row
for ax, component in zip(axes.flat, pca2.components_):
    ax.imshow(component.reshape(12, 8), cmap='gray')

In [None]:
# fraction of the total variance explained by each of the 10 components
pca2.explained_variance_ratio_

In [None]:
# PCA without a component limit, to see how the explained variance accumulates
# NOTE(review): this rebinds `pca`, replacing the 95%-variance model fitted
# above - re-running the earlier cells afterwards would use this model instead
pca = PCA().fit(scaled_x)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance');