# Part 1: CNNs and Transfer Learning in General

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, UpSampling2D, MaxPool2D
from tensorflow.keras import backend as K
from scipy.ndimage import zoom

## Data

We'll be using the [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset, which contains natural images tagged in 10 different categories (e.g., cars, dogs, birds).

In [None]:
num_classes = 10  # number of classes in the data
img_rows, img_cols, img_channels = 32, 32, 3  # input image dimensions


def _take_random_subset(images, labels, n_keep=1000, seed=0):
    """Return the same reproducible random subset of `images` and `labels`.

    The global NumPy seed is reset to `seed` before shuffling, so repeated
    calls with equally sized inputs select the same positions.
    """
    positions = np.arange(images.shape[0])
    np.random.seed(seed)
    np.random.shuffle(positions)
    chosen = positions[:n_keep]
    return images[chosen], labels[chosen]


# Load the data and convert the pixel arrays to float32
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train, x_test = x_train.astype('float32'), x_test.astype('float32')

# One-hot encode the integer class labels
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

# The backend may want the channel axis before or after the spatial axes;
# pick the matching per-image shape and reshape both splits to it.
if K.image_data_format() == 'channels_first':
    input_shape = (img_channels, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, img_channels)
x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# To speed things up, keep 1k random samples each for training and test
x_train, y_train = _take_random_subset(x_train, y_train)
x_test, y_test = _take_random_subset(x_test, y_test)

print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

## Exercise 1.1: Building a CNN from scratch

Let's build a CNN from scratch using a very small subset of CIFAR10 training data (1000 data points).

In [None]:
# Divide the raw pixel values of both splits by 255 for the from-scratch CNN.
x_scaled_train, x_scaled_test = (pixels / 255. for pixels in (x_train, x_test))

In [None]:
# Small CNN trained from scratch: two convolutions, one pooling step,
# then a dense classifier over the flattened feature maps.
model1 = Sequential()
for layer in (
    Conv2D(16, kernel_size=(3, 3), strides=1, padding='valid', activation='relu',
           input_shape=input_shape),
    Conv2D(16, kernel_size=(3, 3), strides=2, padding='valid', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
):
    model1.add(layer)

model1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model1.summary()

In [None]:
# Train the from-scratch CNN on the scaled training subset.
model1.fit(x=x_scaled_train, y=y_train, epochs=20, batch_size=128)

Finally, we evaluate the model:

In [None]:
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
train_score = model1.evaluate(x_scaled_train, y_train, verbose=0)
test_score = model1.evaluate(x_scaled_test, y_test, verbose=0)

for split_name, (split_loss, split_accuracy) in (('Train', train_score), ('Test', test_score)):
    print(split_name + ' loss:', split_loss)
    print(split_name + ' accuracy:', split_accuracy)

### Questions

1. Why is there such a big difference between the training and testing accuracy?
2. Why is the performance so poor?

## Exercise 1.2: Using pre-processed features

Let's load a pre-trained MobileNet network (an efficient, small network with only about 4M parameters but decent performance on ImageNet) and extract features from our training and test sets.

First, plot a sample image (so we can make sure our transformations are correct):

In [None]:
# Show the first training image; cast to int so imshow treats values as 0-255.
sample_image = x_train[0].astype(int)
plt.figure(figsize=(5, 5))
plt.imshow(sample_image)
plt.show()

Some pre-processing of the images for MobileNet (scaling and shifting the pixel values to lie in [-1, 1]):

In [None]:
# Apply MobileNet's input preprocessing to copies, leaving x_train/x_test untouched.
x_preprocess_train, x_preprocess_test = (
    preprocess_input(split.copy()) for split in (x_train, x_test)
)
(x_preprocess_train.shape, x_preprocess_test.shape)

Replot the sample image (we need to shift/scale it back to display colours properly):

In [None]:
# Shift and scale the preprocessed image before handing it to imshow.
displayable = (x_preprocess_train[0] + 1.0) * 127.5 / 255
plt.figure(figsize=(5, 5))
plt.imshow(displayable)
plt.show()

We need to resize the image because MobileNet expects one of a certain number of image sizes, e.g., 160x160x3, or 224x224x3. One way to do so is to use the `zoom` function, which adds in new pixels whose value is based on spline interpolation (this might take a few minutes).

In [None]:
def _upscale_images(images, factors=(5.0, 5.0, 1.0), report_every=200):
    """Zoom every image in `images` by `factors` and stack the results.

    A progress line is printed after every `report_every` images.
    """
    upscaled = []
    for count, image in enumerate(images, start=1):
        upscaled.append(zoom(image, factors))
        if count % report_every == 0:
            print('* Processed %d images' % count)
    return np.stack(upscaled)


print('Processing training data')
x_preprocess_train = _upscale_images(x_preprocess_train)

print('Processing testing data')
x_preprocess_test = _upscale_images(x_preprocess_test)

x_preprocess_train.shape, x_preprocess_test.shape

Plot the resized sample image (it again needs to be shifted/scaled to display properly):

In [None]:
# Same display rescaling as before, now applied to the resized image.
displayable = (x_preprocess_train[0] + 1.0) * 127.5 / 255
plt.figure(figsize=(5, 5))
plt.imshow(displayable)
plt.show()

We are now ready to build a model. Our basis will be the MobileNet network, with the parameters trained on ImageNet.  Notice we add the `include_top=False` parameter because we don't want to use the included ImageNet classifier.

In [None]:
# MobileNet base with ImageNet weights and without its classification head
# (include_top=False), built for 160x160 RGB inputs.
mobilenet = MobileNet(weights='imagenet', include_top=False, input_shape = (160,160,3))

In [None]:
# Print the layers, output shapes and parameter counts of the base network.
mobilenet.summary()

MobileNet outputs a (5, 5, 1024) tensor before its "top layer" (which we didn't download).  We compute this activation for our images (it represents a range of high-level features of our images). We then aggregate the activation using a `GlobalAveragePooling2D` layer, which compresses it into a (1, 1, 1024) tensor, which is equivalent to a 1024-vector.  `GlobalAveragePooling2D` works by taking the average of each 5x5 feature map.

Let's extract high-level features of our images using MobileNet (this may take a few minutes)

In [None]:
# Run the base network once over each split; the outputs feed the small classifier below.
x_features_train, x_features_test = [
    mobilenet.predict(images) for images in (x_preprocess_train, x_preprocess_test)
]
(x_features_train.shape, x_features_test.shape)

Let's now build a simple feed-forward network which uses these features as an input, to predict the right classes:

In [None]:
# Classifier over the pre-computed MobileNet features: average each 5x5
# feature map, then one hidden layer and a softmax output.
model2 = Sequential()
model2.add(GlobalAveragePooling2D(input_shape=(5, 5, 1024)))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(num_classes, activation='softmax'))

model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model2.summary()

In [None]:
# Train the classifier on the extracted features.
model2.fit(x=x_features_train, y=y_train, epochs=20, batch_size=128)

Finally, we evaluate the model:

In [None]:
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
train_score = model2.evaluate(x_features_train, y_train, verbose=0)
test_score = model2.evaluate(x_features_test, y_test, verbose=0)

for split_name, (split_loss, split_accuracy) in (('Train', train_score), ('Test', test_score)):
    print(split_name + ' loss:', split_loss)
    print(split_name + ' accuracy:', split_accuracy)

### Questions

1. What happens if you don't use the `GlobalAveragePooling2D` layer (note: you will need a `Flatten` layer instead)?  Hypothesize why the performance changes.
2. Why does the image need to be preprocessed via the `preprocess_input()` function?  Hypothesize what happens if we omit this step.
3. Why do we need to resize the image (via `zoom`)?

## Exercise 1.3: Using Frozen Layers in a Pre-trained Network

Instead of using pre-processed features, we add the pre-trained MobileNet network to our model (and freeze it, to avoid changing its weights).

In [None]:
# Recompute the preprocessed inputs from the original 32x32 images (copies, so
# x_train/x_test stay untouched); resizing now happens inside the network.
x_preprocess_train, x_preprocess_test = (
    preprocess_input(split.copy()) for split in (x_train, x_test)
)

In [None]:
# MobileNet base with ImageNet weights and no classification head, for 160x160 RGB inputs.
mobilenet = MobileNet(weights='imagenet', include_top=False, input_shape = (160,160,3))
# Freeze the base so its weights are not changed during training.
mobilenet.trainable = False

We integrate the `MobileNet` layers into our network. But before that, we need to resize the images. This time, we use "upsampling". Basically, we add additional pixels that are copies of the existing ones, i.e., we "stretch out" the pixels.

We will conduct upsampling directly in our neural network, so there is no need to do additional pre-processing.

In [None]:
# End-to-end model: upsample 32x32 inputs by 5x to 160x160, run the frozen
# MobileNet base, then the same pooled dense classifier as before.
model3 = Sequential()
model3.add(UpSampling2D(size=(5, 5), input_shape=(32, 32, 3)))
model3.add(mobilenet)
model3.add(GlobalAveragePooling2D())
model3.add(Dense(64, activation='relu'))
model3.add(Dense(num_classes, activation='softmax'))

model3.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model3.summary()

We train the model, as before. However (unless you are using a GPU), this is a lot slower. Hence, I recommend running for only a few epochs.

In [None]:
# Every batch now passes through the full MobileNet base, so keep the epoch count low.
model3.fit(x=x_preprocess_train, y=y_train, epochs=8, batch_size=128)

Once more, let's look at how the model performs:

In [None]:
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
train_score = model3.evaluate(x_preprocess_train, y_train, verbose=0)
test_score = model3.evaluate(x_preprocess_test, y_test, verbose=0)

for split_name, (split_loss, split_accuracy) in (('Train', train_score), ('Test', test_score)):
    print(split_name + ' loss:', split_loss)
    print(split_name + ' accuracy:', split_accuracy)

### Questions

1. Why does the network train so much more slowly?
2. What is the difference (if any) between the model in Exercise 1.2 and the model in Exercise 1.3?
3. How would you modify the above code if we wanted to train some of the MobileNet layers? Hint: you can iterate through the layers of a model with
```
for layer in model.layers:
```

# Part 2: Hot Dog or Not Dog

This problem's purpose is to build a neural network to classify images as hot dogs or not-hot dogs. This is the same problem as seen in the HBO TV show "Silicon Valley". We will be using the dataset put together by [a user on Kaggle](https://www.kaggle.com/dansbecker/hot-dog-not-hot-dog) which contains 498 training images and 500 test images.

A simple CNN is given below. Due to the small sample size it has a very poor test set accuracy. Your task is to build a CNN that can beat this test set accuracy by a large margin (get to at least 70% test set accuracy).

First, we need a few more packages. If you don't currently have skimage or cv2 installed, uncomment and run the lines below:

In [None]:
# %pip install scikit-image
# %pip install opencv-python

In [None]:
from PIL import Image
import os, sys
import cv2
import tarfile
from skimage.transform import resize
from tensorflow.keras.preprocessing.image import ImageDataGenerator

We start by loading the data:

In [None]:
# Download the archive (get_file returns the local path of the downloaded file).
path_to_data = tf.keras.utils.get_file('hotdog.tar', 'https://www.dropbox.com/s/9zx61bhlrjx135j/hotdog.tar?dl=1')

# Unpack next to the downloaded archive. The `with` block closes the archive
# even if extraction fails part-way.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a source you trust.
extract_dir = os.path.abspath(os.path.join(path_to_data, os.pardir))
with tarfile.open(path_to_data) as archive:
    archive.extractall(extract_dir)

Let's take a look at two example pictures:

In [None]:
img_size = 160


def _read_rgb_square(relative_path):
    """Read an image stored next to the downloaded archive, resize it to
    img_size x img_size and reverse the channel axis (OpenCV loads BGR)."""
    full_path = os.path.abspath(os.path.join(path_to_data, os.pardir, relative_path))
    loaded = cv2.imread(full_path)
    return cv2.resize(loaded, (img_size, img_size))[:, :, ::-1]


img_arr_hotdog = _read_rgb_square('hotdog/train/hot_dog/2417.jpg')
img_arr_notdog = _read_rgb_square('hotdog/train/not_hot_dog/197.jpg')

# Show the two examples side by side.
plt.figure(figsize=(5, 5))
examples = [(img_arr_hotdog, "Hot dog"), (img_arr_notdog, "Not dog")]
for panel, (picture, caption) in enumerate(examples, start=1):
    plt.subplot(1, 2, panel)
    plt.imshow(picture)
    plt.title(caption)
    plt.grid(False)

Instead of loading all the data in advance, we create a data pipeline using an `ImageDataGenerator`. The generator will load in the data files as needed and perform two transformations:
- Rescaling pixels to be between [0, 1]
- Resizing images to be in `img_size`x`img_size` (160x160)

During training for each batch, the images are read from disk on the fly, loaded into memory and then the transformations are applied.

In [None]:
# Train/test image folders inside the extracted archive (next to the downloaded tar).
train_data_dir = os.path.abspath(os.path.join(path_to_data, os.pardir,'hotdog/train'))
test_data_dir = os.path.abspath(os.path.join(path_to_data, os.pardir,'hotdog/test'))
# Number of images each generator yields per batch.
batch_size = 128

# Both generators only rescale pixel values by 1/255; no augmentation is configured.
train_datagen = ImageDataGenerator(rescale=1. / 255)
test_datagen = ImageDataGenerator(rescale=1. / 255)

# Data parameters (DO NOT MODIFY)
# Image counts of the two splits; used below to derive the number of steps.
num_train_samples = 498
num_test_samples = 500

# Data generators (DO NOT MODIFY)
# Images are resized to img_size x img_size on load; class_mode='binary'
# requests a single binary label per image.
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_size, img_size),
    batch_size=batch_size,
    class_mode='binary'
)

test_generator = test_datagen.flow_from_directory(
    test_data_dir,
    target_size=(img_size, img_size),
    batch_size=batch_size,
    class_mode='binary'
)

### Defining and running an initial model

We define a starting model, which you will need to improve upon.

In [None]:
# Baseline CNN: three conv + max-pool stages followed by a dense head with
# dropout and a single sigmoid unit for the binary label.
feature_layers = [
    Conv2D(32, (3, 3), padding='valid', activation='relu', input_shape=(img_size, img_size, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, (3, 3), padding='valid', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), padding='valid', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
]
head_layers = [
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]
model1 = Sequential(feature_layers + head_layers)

model1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model1.summary()

When training the model, there is a small detail to consider: since we generate data on the fly, the training process doesn't know the total number of data points. Normally, in each epoch, we would take as many steps as needed to get through the whole dataset, given our batch_size. Hence, we have "training samples" / "batch size" as the number of steps per epoch. Here, we have to manually define that number of steps instead (and we do it in exactly this way, for consistency of the meaning of "epoch").

In [None]:
# One "epoch" is defined as as many full batches as fit into the training set.
model1.fit(x=train_generator, epochs=20, steps_per_epoch=num_train_samples // batch_size)

As usual, we evaluate the model. Again, the use of a generator implies only a small change.

In [None]:
# Number of batches needed to see every image of a split exactly once.
# Ceil division keeps the final, partially filled batch, and the training
# split uses its own sample count (it previously used the test-set count).
train_eval_steps = int(np.ceil(num_train_samples / batch_size))
test_eval_steps = int(np.ceil(num_test_samples / batch_size))

# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
train_score = model1.evaluate(train_generator, steps=train_eval_steps, verbose=0)
test_score = model1.evaluate(test_generator, steps=test_eval_steps, verbose=0)

print('Train loss:', train_score[0])
print('Train accuracy:', train_score[1])
print('Test loss:', test_score[0])
print('Test accuracy:', test_score[1])

### Questions

1. Can you improve the model using Transfer Learning? You could use the MobileNet as before, or some completely different pre-trained model, such as one of the different [ResNet](https://www.tensorflow.org/api_docs/python/tf/keras/applications/resnet) implementations in TensorFlow.
2. Are there other things you can do to improve the model?

### Example answer, part 1

We use transfer learning (here: with `MobileNet`) because the training dataset is relatively small for this application.

Since hotdog images probably share the same low-level features as the images in the `ImageNet` dataset, such transfer learning should be feasible.

In [None]:
# MobileNet base with ImageNet weights and no classification head, for 160x160 RGB inputs.
mobilenet = MobileNet(weights='imagenet', include_top=False, input_shape = (160,160,3))
# Freeze the base so that only the layers added on top of it are trained.
mobilenet.trainable = False

In [None]:
# Transfer-learning model: frozen MobileNet base, pooled features, small dense
# head with a single sigmoid unit for the binary label.
model2 = Sequential()
model2.add(mobilenet)
model2.add(GlobalAveragePooling2D())
model2.add(Dense(64, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model2.summary()

In [None]:
# Same training schedule as the baseline model.
model2.fit(x=train_generator, epochs=20, steps_per_epoch=num_train_samples // batch_size)

In [None]:
# Number of batches needed to see every image of a split exactly once.
# Ceil division keeps the final, partially filled batch, and the training
# split uses its own sample count (it previously used the test-set count).
train_eval_steps = int(np.ceil(num_train_samples / batch_size))
test_eval_steps = int(np.ceil(num_test_samples / batch_size))

# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
train_score = model2.evaluate(train_generator, steps=train_eval_steps, verbose=0)
test_score = model2.evaluate(test_generator, steps=test_eval_steps, verbose=0)

print('Train loss:', train_score[0])
print('Train accuracy:', train_score[1])
print('Test loss:', test_score[0])
print('Test accuracy:', test_score[1])

Compared to the original model, which contains 1.2 million parameters to train on 498 images, the transfer learning model contains 3.2 million parameters, but only 65.7 thousand are trainable. As a result, the performance improves significantly (even though there is quite a bit of overfitting happening, which we may want to address in a second step).

### Example answer, part 2

We can try to unfreeze some layers close to the top of pre-trained `MobileNet`, to adapt the representation more closely to our hotdog/notdog images. In particular, we only freeze the layers up to the second-to-last, and keep the last two unfrozen:

In [None]:
# Mark the whole base model as trainable first; the loop below then
# re-freezes everything except the last two layers.
mobilenet.trainable = True
# Freeze layers in the base model except the last two
for layer in mobilenet.layers[:-2]:
    layer.trainable = False

The model can be defined as before. Notice how we now have around 2,000 more parameters that are trainable.

In [None]:
# Same architecture as the previous transfer-learning model, now built on the
# partially unfrozen MobileNet base.
model3 = Sequential()
model3.add(mobilenet)
model3.add(GlobalAveragePooling2D())
model3.add(Dense(64, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))

model3.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model3.summary()

In [None]:
# Same training schedule as the previous two models.
model3.fit(x=train_generator, epochs=20, steps_per_epoch=num_train_samples // batch_size)

In [None]:
# Number of batches needed to see every image of a split exactly once.
# Ceil division keeps the final, partially filled batch, and the training
# split uses its own sample count (it previously used the test-set count).
train_eval_steps = int(np.ceil(num_train_samples / batch_size))
test_eval_steps = int(np.ceil(num_test_samples / batch_size))

# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
train_score = model3.evaluate(train_generator, steps=train_eval_steps, verbose=0)
test_score = model3.evaluate(test_generator, steps=test_eval_steps, verbose=0)

print('Train loss:', train_score[0])
print('Train accuracy:', train_score[1])
print('Test loss:', test_score[0])
print('Test accuracy:', test_score[1])

By unfreezing layers close to the top, more parameters have become trainable. This can be helpful in training, but we don't see a vast improvement in this case (if any)

# Part 3: Non-sequential models

We will now take a first peek at running non-sequential models. An example application is object detection (we need to predict classes **and** bounding boxes). But there are many other scenarios where you want to tweak your model non-sequentially, e.g., to introduce skip connections (see the videos).

Here, we will train a model that predicts both the class of an image and a random number (this doesn't have much meaning; it is really just to show you how we can use the Functional API of TensorFlow).

Let's first create the secondary y's (we will use the data from the first part):

In [None]:
def _noisy_mean_targets(images):
    """Return each image's mean value and a normal draw centred on that mean
    with standard deviation |mean| / 2."""
    per_image_mean = np.mean(images, axis=(1, 2, 3))
    noisy_target = np.random.normal(per_image_mean, np.abs(per_image_mean / 2))
    return per_image_mean, noisy_target


train_means, y2_train = _noisy_mean_targets(x_preprocess_train)
test_means, y2_test = _noisy_mean_targets(x_preprocess_test)

The Functional API works very similarly to the Sequential API. But instead of having a list of layers, we just create layers and connect them arbitrarily. To do so, we just specify the previous layer that is supposed to flow into the current layer:

In [None]:
# We start with an input layer (we could have multiple inputs, too!)
model_input = Input(shape=(32, 32, 3))

# A convolutional layer that takes the input layer as its input,
# followed by a pooling layer that takes the convolutional layer as its input
x = Conv2D(32, kernel_size=(3,3), strides=(2,2), padding='same', activation="relu")(model_input)
x = MaxPool2D((2,2), strides=(2,2), padding='same')(x)

# A second convolution + pooling stage, connected in the same way
x = Conv2D(64, kernel_size=(3,3), strides=(2,2), padding='same', activation="relu")(x)
x = MaxPool2D((2,2), strides=(2,2), padding='same')(x)

# Flatten the feature maps and add the last shared hidden layer
x = Flatten()(x)
x = Dense(100,activation="relu")(x)

# First output layer: predicts the image class (one of `num_classes` categories).
# It uses whatever comes out of the network so far
model_output_1 = Dense(num_classes, activation='softmax', name = 'output_1')(x)
# Second output layer: a regression head for the secondary target. It does not
# connect to the other output layer, but directly to the last hidden layer.
# The activation is linear because the target can be negative, which a
# sigmoid (range 0..1) could never predict.
model_output_2 = Dense(1, activation="linear", name = 'output_2')(x)

We have all the same layers defined as before, just with a second output layer. Note that we give a specific name to our output-layers, so we can reference them later!

We combine our layers in a model. We just have to specify what our inputs are and what our outputs are. The remaining layers are added based on the structure above!

In [None]:
# Build the model from its input and its two outputs (class output first, then the regression output).
model_func = Model(inputs = [model_input], outputs=[model_output_1, model_output_2])

See for yourself:

In [None]:
# Print the layers of the two-output model.
model_func.summary()

Let's now compile our non-sequential model. We need to define our losses and metrics for each of our outputs! This is why we gave the output layers specific names, so we can use this here. The rest is as before:

In [None]:
# Losses, loss weights and metrics are all keyed by output-layer name.
# output_1 is a 10-way softmax trained on one-hot labels, so it needs
# categorical (not binary) cross-entropy; with a binary loss Keras would also
# report binary accuracy, which is misleadingly high for one-hot targets.
model_func.compile(loss={'output_1': 'categorical_crossentropy',
                         'output_2': 'mean_squared_error'},
                   # The regression loss is down-weighted relative to the classification loss.
                   loss_weights={'output_1': 1,
                                 'output_2': 0.01},
                   metrics={'output_1': 'accuracy',
                            'output_2': 'mean_squared_error'},
                   optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4))

Similar for fitting the model: We have to make clear that there are two different y values that need to be predicted. We will run the model only for a few epochs, to see how it works.

In [None]:
# Targets are listed in the same order as the model outputs:
# one-hot class labels first, then the secondary regression target.
log_func = model_func.fit(
    x=x_preprocess_train,
    y=[y_train, y2_train],
    epochs=20,
)

As always, we can evaluate the model:

In [None]:
# Evaluate on the test subset, passing both targets in the same order as the model outputs.
model_func.evaluate(x_preprocess_test, [y_test,y2_test])

And we can make predictions. We just have to note that two outputs are being predicted, the labels and the average values. But we can simply use list-indices to get the right ones.

In [None]:
# Predict both outputs for the test subset; the cells below index the result per output.
predictions = model_func.predict(x_preprocess_test)

In [None]:
# Shape of the first entry of the predictions (the model's first listed output).
predictions[0].shape

In [None]:
# Shape of the second entry of the predictions (the model's second listed output).
predictions[1].shape