In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy import stats
import scipy

In [None]:
# Notebook-wide matplotlib defaults: 10x10 inch figures and enlarged fonts
# for tick labels, axis labels, titles and legends.
plt.rcParams.update({
    'figure.figsize': [10., 10.],
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.labelsize': 16,
    'axes.titlesize': 16,
    'legend.fontsize': 14,
})

# Deep Learning

This notebook demonstrates various deep learning architectures using the MNIST data

The code uses Tensorflow / Keras, which you may need to install

In [None]:
from sklearn.datasets import fetch_openml
from tensorflow import keras
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Download MNIST from OpenML as flat feature vectors (one row per image).
# as_frame=False forces plain NumPy arrays: with the default as_frame='auto',
# newer scikit-learn versions return a pandas DataFrame, which has no
# .reshape() and therefore breaks the 2d (CNN) cells further down.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
# The labels are not delivered as integers; convert them so they can be
# one-hot encoded and compared against argmax predictions.
y = y.astype(np.int32)

In [None]:
# Split at sample 60000: everything before it is used for training,
# the remainder is held out for testing.
n_train = 60000
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

Performing so-called "one hot" encoding on the outputs

In [None]:
# Turn the integer labels into one-hot rows (one column per class),
# as required by the 'categorical_crossentropy' loss used below.
y_cat_train, y_cat_test = (
    keras.utils.to_categorical(labels) for labels in (y_train, y_test)
)

In [None]:
# Registry of compiled models, keyed by the `name` set in each architecture cell.
models = dict()

Below are multiple architecture examples. Try out different ones and build your own

In [None]:
# Disabled example: fully connected baseline (128 -> 64 -> 32 hidden units, 10-way softmax).
# Uncomment the whole cell to register it in `models`; it takes flat input vectors,
# so train/predict it with the "1d" variants in the fit and predict cells.
# name = 'simple'

# inputs = keras.Input(shape=(X_train.shape[1],))
# h = keras.layers.Dense(128, activation="relu")(inputs)
# h = keras.layers.Dense(64, activation="relu")(h)
# h = keras.layers.Dense(32, activation="relu")(h)
# outputs = keras.layers.Dense(10, activation='softmax')(h)

# models[name] = keras.Model(inputs=inputs, outputs=outputs)

# optimizer = keras.optimizers.Adam(0.0001)

# models[name].compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# models[name].summary()

In [None]:
# Disabled example: the dense baseline with a Dropout(0.01) layer in front of each
# of the three hidden Dense layers. Uncomment the whole cell to register it in
# `models`; it takes flat input vectors (use the "1d" fit/predict variants).
# name = 'dropout'

# inputs = keras.Input(shape=(X_train.shape[1],))
# h = keras.layers.Dropout(0.01)(inputs)
# h = keras.layers.Dense(128, activation="relu")(h)
# h = keras.layers.Dropout(0.01)(h)
# h = keras.layers.Dense(64, activation="relu")(h)
# h = keras.layers.Dropout(0.01)(h)
# h = keras.layers.Dense(32, activation="relu")(h)
# outputs = keras.layers.Dense(10, activation='softmax')(h)

# models[name] = keras.Model(inputs=inputs, outputs=outputs)

# optimizer = keras.optimizers.Adam(0.0001)

# models[name].compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Disabled example: the dense baseline with BatchNormalization after the first two
# hidden Dense layers. Uncomment the whole cell to register it in `models`;
# it takes flat input vectors (use the "1d" fit/predict variants).
# name = 'batchnorm'


# inputs = keras.Input(shape=(X_train.shape[1],))
# h = keras.layers.Dense(128, activation="relu")(inputs)
# h = keras.layers.BatchNormalization()(h)
# h = keras.layers.Dense(64, activation="relu")(h)
# h = keras.layers.BatchNormalization()(h)
# h = keras.layers.Dense(32, activation="relu")(h)
# outputs = keras.layers.Dense(10, activation='softmax')(h)

# models[name] = keras.Model(inputs=inputs, outputs=outputs)

# optimizer = keras.optimizers.Adam(0.0001)

# models[name].compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Disabled example: two single-filter 5x5 LocallyConnected2D layers followed by a
# small dense head. Takes (28, 28, 1) images, so use the "2d" fit/predict variants.
# NOTE(review): LocallyConnected2D appears to be unavailable in Keras 3 -
# confirm against the installed Keras version before enabling this cell.
# name = 'locally connected'


# inputs = keras.Input(shape=(28, 28, 1))
# h = keras.layers.LocallyConnected2D(1,  kernel_size=(5, 5), activation="relu")(inputs)
# h = keras.layers.LocallyConnected2D(1,  kernel_size=(5, 5), activation="relu")(h)
# h = keras.layers.Flatten()(h)
# h = keras.layers.Dense(32, activation="relu")(h)
# h = keras.layers.Dense(16, activation="relu")(h)
# outputs = keras.layers.Dense(10, activation='softmax')(h)

# models[name] = keras.Model(inputs=inputs, outputs=outputs)

# optimizer = keras.optimizers.Adam(0.0001)

# models[name].compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Disabled example: minimal CNN with two single-filter 5x5 Conv2D layers followed by
# a small dense head (same layout as the locally connected example, but with
# convolutions). Takes (28, 28, 1) images, so use the "2d" fit/predict variants.
# name = 'cnn_simple'


# inputs = keras.Input(shape=(28, 28, 1))
# h = keras.layers.Conv2D(1,  kernel_size=(5, 5), activation="relu")(inputs)
# h = keras.layers.Conv2D(1,  kernel_size=(5, 5), activation="relu")(h)
# h = keras.layers.Flatten()(h)
# h = keras.layers.Dense(32, activation="relu")(h)
# h = keras.layers.Dense(16, activation="relu")(h)
# outputs = keras.layers.Dense(10, activation='softmax')(h)

# models[name] = keras.Model(inputs=inputs, outputs=outputs)

# optimizer = keras.optimizers.Adam(0.0001)

# models[name].compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Convolutional classifier: three 3x3 convolution stages (the first two followed
# by 2x2 max-pooling), flattened into a 16-unit dense layer and a 10-way softmax.
name = 'CNN'

inputs = keras.Input(shape=(28, 28, 1))

# Layers are created in network order and then applied one after another.
cnn_layers = [
    keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
    keras.layers.MaxPool2D(pool_size=(2, 2)),
    keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
    keras.layers.MaxPool2D(pool_size=(2, 2)),
    keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
    keras.layers.Flatten(),
    keras.layers.Dense(16, activation="relu"),
]
h = inputs
for layer in cnn_layers:
    h = layer(h)
outputs = keras.layers.Dense(10, activation='softmax')(h)

models[name] = keras.Model(inputs=inputs, outputs=outputs)

optimizer = keras.optimizers.Adam(0.0001)

models[name].compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview of the CNN registered above.
cnn_model = models['CNN']
cnn_model.summary()

In [None]:
# Disabled example: the 'CNN' architecture with Dropout(0.01) on the input and
# BatchNormalization after every convolution. Takes (28, 28, 1) images, so use
# the "2d" fit/predict variants. Uncomment the whole cell to register it in `models`.
# name = 'CNN + Dropout + Batchnorm'


# inputs = keras.Input(shape=(28, 28, 1))
# h = keras.layers.Dropout(0.01)(inputs)
# h = keras.layers.Conv2D(32,  kernel_size=(3, 3), activation="relu")(h)
# h = keras.layers.BatchNormalization()(h)
# h = keras.layers.MaxPool2D(pool_size=(2,2))(h)
# h = keras.layers.Conv2D(64,  kernel_size=(3, 3), activation="relu")(h)
# h = keras.layers.BatchNormalization()(h)
# h = keras.layers.MaxPool2D(pool_size=(2,2))(h)
# h = keras.layers.Conv2D(64,  kernel_size=(3, 3), activation="relu")(h)
# h = keras.layers.BatchNormalization()(h)
# h = keras.layers.Flatten()(h)
# h = keras.layers.Dense(16, activation="relu")(h)
# outputs = keras.layers.Dense(10, activation='softmax')(h)

# models[name] = keras.Model(inputs=inputs, outputs=outputs)

# optimizer = keras.optimizers.Adam(0.0001)

# models[name].compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# models[name].summary()

We can now train the model using several epochs (1 epoch = churning through the full dataset once)

NB: depending on the model, you need to shape the inputs differently!

Training 30 Epochs (depending on the model and your computer hardware) can take a while

In [None]:
# train 1d models (dense etc)
# models[name].fit(X_train, y_cat_train, epochs=30, validation_data=(X_test, y_cat_test), batch_size=64)

# train 2d models (CNNs etc): flat rows are reshaped into 28x28 single-channel images
X_train_2d = X_train.reshape(-1, 28, 28, 1)
X_test_2d = X_test.reshape(-1, 28, 28, 1)
models[name].fit(
    X_train_2d,
    y_cat_train,
    epochs=30,
    validation_data=(X_test_2d, y_cat_test),
    batch_size=64,
)

Looking at the training history can help to gain some insight and, for example, to spot overfitting

In [None]:
# Plot training (dashed) and validation (solid) loss per epoch for every model
# in `models` that has been fitted.
# The loop variable is deliberately not called `name`: later cells use the
# global `name` to select the model for prediction and for the output file
# name, so this loop must not overwrite it.
for model_name, model in models.items(): #['simple', 'CNN + Dropout + Batchnorm']: #'dropout', 'batchnorm']:
    #bl = plt.plot(model.history.history['accuracy'], ls='--', label='Training Accuracy %s'%model_name)
    #plt.plot(model.history.history['val_accuracy'], ls='-', c=bl[0].get_color(), label='Testing Accuracy %s'%model_name)
    history = getattr(model, 'history', None)
    if history is None or 'loss' not in history.history:
        # Model was built but never fitted: there is no history to plot.
        continue
    bl = plt.plot(history.history['loss'], ls='--', label='Training Loss %s'%model_name)
    if 'val_loss' in history.history:
        # Only present when fit() was given validation data; share the colour of the training curve.
        plt.plot(history.history['val_loss'], ls='-', c=bl[0].get_color(), label='Testing Loss %s'%model_name)
plt.gca().set_xlabel('Epoch')
plt.gca().set_ylabel('Loss')
plt.legend()
plt.gca().set_yscale('log')
#plt.savefig('NN_history_cnn_best.png', bbox_inches='tight')

In [None]:
# predict 1d model
#y_pred = models[name].predict(X_test)

# predict 2d model: the test rows are reshaped into 28x28 single-channel images
trained_model = models[name]
y_pred = trained_model.predict(X_test.reshape(-1, 28, 28, 1))

The confusion matrix shows how good the assignment of digits to their respective classes is

In [None]:
# The predicted digit is the index of the largest model output in each row.
predicted_labels = y_pred.argmax(axis=1)
cm = confusion_matrix(y_test, predicted_labels)

In [None]:
# Draw the confusion matrix; it is transposed so that the true label runs along
# the x axis and the predicted label along the y axis.
fig, ax = plt.subplots()
ax.imshow(cm.T, cmap='YlGnBu', origin='lower')
ax.set_xlabel('True label')
ax.set_ylabel('Predicted label')
fig.savefig('NN_consfusion_%s.png'%name, bbox_inches='tight')

-> Try out different models and architectures and compare them!

# Auto encoder

As discussed in the lecture, another application of NNs is the autoencoder.
We first look at a linear autoencoder, which essentially replicates our good old PCA.

In [None]:
# linear
# Bias-free linear autoencoder: project the input onto 20 units and map it
# back to the original number of features.
n_features = X_train.shape[1]

inputs = keras.Input(shape=(n_features,))
bottleneck = keras.layers.Dense(20, activation="linear", use_bias=False)
h = bottleneck(inputs)
reconstruction = keras.layers.Dense(n_features, activation='linear', use_bias=False)
outputs = reconstruction(h)

ae = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
optimizer = keras.optimizers.Adam(0.00001)

# Reconstruction is a regression problem: classification 'accuracy' carries no
# meaning here, so the mean absolute error is tracked next to the MSE loss instead.
ae.compile(loss='mse', optimizer=optimizer, metrics=['mae'])

In [None]:
# Fit on the training split only and monitor the held-out test split, in line
# with how the non-linear autoencoder is trained further down (fitting on all
# of X would leave no unseen data to check the reconstruction on).
ae.fit(X_train, X_train, epochs=30, validation_data=(X_test, X_test), batch_size=32)

In [None]:
# Encoder half of the linear autoencoder: maps an input to its 20 bottleneck
# values, sharing the trained weights with `ae`.
encode = keras.Model(inputs, h)

In [None]:
# Encode every sample and scatter the first two bottleneck coordinates,
# coloured by the true digit label.
reduced_data = encode(X).numpy()

# plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and no
# longer available in recent Matplotlib releases.
plt_data = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], s=0.1, c=y, cmap=plt.get_cmap('Spectral', 10))
plt.colorbar()
#plt.savefig('mnist_encoded_true_labels.png', bbox_inches='tight')

And here is what our data reconstructed from 20 dimensions looks like... pretty similar to our 20d PCA!
Exercise: compare this NN to PCA in 2d

In [None]:
# Run all samples through the autoencoder and show the first 25
# reconstructions as 28x28 images on a 5x5 grid (filled row by row).
X_reco = ae(X).numpy()

fig, ax = plt.subplots(5,5)
for axis, image in zip(ax.flat, X_reco[:25]):
    axis.imshow(image.reshape(28, 28), cmap='Greys')

## Non-linear AE

It gets much more powerful when adding non-linearities back in

In [None]:
# Non-linear autoencoder: the encoder narrows 256 -> 64 -> 2 units (all ReLU),
# the decoder mirrors it with 64 -> 256 units and a linear output layer.
inputs = keras.Input(shape=(X_train.shape[1],))

encoded = inputs
for width in (256, 64, 2):
    encoded = keras.layers.Dense(width, activation="relu")(encoded)

# The decoder layers are kept as named objects so that they can be reused
# later to build a stand-alone decoder model.
decoder1 = keras.layers.Dense(64, activation="relu")
decoder2 = keras.layers.Dense(256, activation="relu")
decoder_out = keras.layers.Dense(X_train.shape[1], activation='linear')

decoded = decoder2(decoder1(encoded))
outputs = decoder_out(decoded)

ae = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
optimizer = keras.optimizers.Adam(0.001)

# Reconstruction is a regression problem: classification 'accuracy' carries no
# meaning here, so the mean absolute error is tracked next to the MSE loss instead.
ae.compile(loss='mse', optimizer=optimizer, metrics=['mae'])

In [None]:
# Train the autoencoder to reproduce its own input; the test split is used
# for validation only.
ae.fit(
    X_train,
    X_train,
    epochs=30,
    validation_data=(X_test, X_test),
    batch_size=64,
)

We can split up our model into the encoder and the decoder part:

In [None]:
# Encoder half of the non-linear autoencoder: maps an input to its 2 bottleneck
# values, sharing the trained weights with `ae`.
encode = keras.Model(inputs, encoded)

In [None]:
# Stand-alone decoder: a fresh 2-d input wired through the already trained
# decoder layers of the autoencoder (the layer objects, and thus weights, are shared).
# `shape` must be a tuple: the original `shape=2,` passed the bare integer 2,
# because the trailing comma only separated call arguments.
dec_inp = keras.Input(shape=(2,))
decoded_i = decoder1(dec_inp)
decoded_i = decoder2(decoded_i)
outputs_i = decoder_out(decoded_i)
decode = keras.Model(inputs=dec_inp, outputs=outputs_i)

In [None]:
# Encode every sample into the 2-d latent space and display the resulting array.
latent_codes = encode(X)
reduced_data = latent_codes.numpy()
reduced_data

For this 2d encoder, the digits separate much more nicely than in the PCA case, and the reconstructed images also look fantastic

In [None]:
# Scatter the two latent coordinates of every sample, coloured by the true digit label.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and no
# longer available in recent Matplotlib releases.
plt_data = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], s=0.1, c=y, cmap=plt.get_cmap('Spectral', 10))
plt.colorbar()
#plt.savefig('mnist_encoded_linear_true_labels.png', bbox_inches='tight')

In [None]:
# Run all samples through the autoencoder and show the first 25
# reconstructions as 28x28 images on a 5x5 grid (filled row by row).
X_reco = ae(X)
X_plot = X_reco.numpy()

fig, ax = plt.subplots(5,5)
for axis, image in zip(ax.flat, X_plot[:25]):
    axis.imshow(image.reshape(28, 28), cmap='Greys')

## Generate digits

We can try to use the decoder as a generator and generate artificial digits. The issue here is that this may not work very well (see lecture); it should rather be done via _variational_ AEs (see the corresponding notebook).

In [None]:
# Decode a single hand-picked latent point (a batch containing one 2-d vector).
inp = np.array([[100., 100.]], dtype=np.float32)
o = decode(inp).numpy()

In [None]:
# display a 2D manifold of the digits
n = 15 # figure with 15x15 digits
digit_size = 28

grid_x = np.linspace(-100., 1600, n)
grid_y = np.linspace(-100., 1200, n)

# Build all n*n latent points up front and decode them in ONE predict call
# instead of calling predict once per grid point.
# Row i of the mosaic takes its second latent coordinate from grid_x[i],
# column j takes its first latent coordinate from grid_y[j].
# NOTE(review): grid_x feeding the second coordinate (and grid_y the first)
# looks swapped relative to the names - kept as in the original; confirm intent.
z_grid = np.array([[xi, yi] for yi in grid_x for xi in grid_y]).astype(np.float32)
x_decoded = decode.predict(z_grid)

# (n*n, 784) -> (row, col, y, x) -> (row, y, col, x) -> one big image in which
# digit (i, j) occupies the tile [i*28:(i+1)*28, j*28:(j+1)*28].
figure = (
    x_decoded.reshape(n, n, digit_size, digit_size)
    .transpose(0, 2, 1, 3)
    .reshape(digit_size * n, digit_size * n)
)

plt.figure(figsize=(15, 15))
plt.imshow(figure, cmap='Greys')

plt.gca().get_xaxis().set_visible(False)
plt.gca().get_yaxis().set_visible(False)


#plt.savefig('AE_mnist.png', bbox_inches='tight')