In [None]:
import random
random.seed(112358)

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# TensorFlow and tf.keras
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras import callbacks

%matplotlib inline

## Overall Approach
I took a sequential approach: I changed one hyperparameter at a time and checked whether the change improved the model. If it did, I kept the change and moved on to the next hyperparameter. I worked through the options in a logical order, starting with the learning rate of the Adam optimizer. For L2 regularization I found that you must start with a small coefficient and increase it gradually, because a small coefficient means a weak penalty. Coefficients that were too large penalised the weights so heavily that the model could no longer fit the data, which lowered the scores, so I settled on a value between these extremes. My validation score is much higher than the test score on Kaggle, which may suggest that there is a lot of noise in the data and that the model is fitting to that noise. Data augmentation would therefore be useful in this instance.

## Data
The Kannada MNIST dataset was used, which is a large database of handwritten digits in the indigenous language Kannada.

This dataset consists of 60,000 28x28 grayscale images of the ten digits, along with a test set of 10,000 images.

For this homework, we will simplify the problem by only using the digits labeled 0 and 1 owing to the similarity of the two symbols, and we will use a total of 1,200 samples for training (this includes the data you will use for validation).

More details: https://arxiv.org/pdf/1908.01242.pdf

# 1. Get Data and visualize

In [None]:

# Load the train and test CSV files from a local data directory.
kmnist_test = pd.read_csv('/Users/phili/data/kmnist_test.csv')
kmnist_train = pd.read_csv('/Users/phili/data/kmnist_train.csv')

# Every column except 'output' is a feature; 'output' holds the label.
X_train = kmnist_train.drop('output', axis=1).values
y_train = kmnist_train['output'].values

seed = 7
np.random.seed(seed)

# Per-pixel min-max scaling to [0, 1].
# A pixel that has the same value in every training image has max == min, and
# dividing by that zero range would turn the whole column into NaN.  Such
# columns are divided by 1 instead, which leaves them at exactly 0.
pixel_min = X_train.min(axis=0)
pixel_range = X_train.max(axis=0) - pixel_min
pixel_range[pixel_range == 0] = 1
X_train = (X_train - pixel_min) / pixel_range

print(X_train.min(), X_train.max())

# Split the raw (unscaled) training frame by label so that one example of
# each digit can be displayed.
zero_df = kmnist_train[kmnist_train['output'] == 0]
zero_array = zero_df.drop('output', axis=1).values
one_df = kmnist_train[kmnist_train['output'] == 1]
one_array = one_df.drop('output', axis=1).values

# Each row is a flattened image; restore the 28x28 layout for plotting.
first_zero = zero_array[0].reshape(28, 28)
first_one = one_array[0].reshape(28, 28)

for image, caption in ((first_zero, 'First example of a zero'),
                       (first_one, 'First example of a one')):
    plt.figure()
    plt.imshow(image, cmap=plt.get_cmap('gray'))
    plt.xlabel(caption)
    plt.colorbar()

# 2. Fit initial overfit model

In [None]:
seed = 7
np.random.seed(seed)

# Add an explicit trailing channel axis: (n_samples, 28, 28, 1).
X_train = X_train.reshape(-1, 28, 28, 1)

tf.keras.backend.clear_session()  # For easy reset of notebook state.

n_neurons = 100
n_input = len(X_train[0])  # NOTE: this is 28 after the reshape above; the model does not use it
n_output = 1

# Unregularised baseline: three equally wide ReLU hidden layers followed by a
# single sigmoid unit for the binary (0 vs 1) target.
model_overfit = tf.keras.models.Sequential(name='Model_Overfit')

# The declared input shape has to match the reshaped data, including the
# channel axis; declaring (28, 28) while feeding (28, 28, 1) makes Keras
# report an incompatible input shape.
model_overfit.add(tf.keras.layers.Flatten(input_shape=(28, 28, 1)))

# Three hidden layers
for _ in range(3):
    model_overfit.add(tf.keras.layers.Dense(n_neurons, activation='relu'))

# Output layer, one neuron
model_overfit.add(tf.keras.layers.Dense(n_output, activation='sigmoid'))

model_overfit.summary()

# Do we need to reshape the data for Keras to use it properly?
# How do we choose the output activation (softmax vs. linear) and the optimiser / loss-function combination?

%%time
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.001)  # Default learning rate.

# Binary cross-entropy pairs with the single sigmoid output unit.
model_overfit.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# validation_split=0.3 holds out 30% of the training data for validation.
history_overfit = model_overfit.fit(
    X_train,
    y_train,
    epochs=1000,
    batch_size=128,
    verbose=1,
    validation_split=0.3,
)

#Should it be binary cross_entropy loss because we have a binary target variable? - is the target variable the response variable?

# Training vs. validation curves for the overfit model:
# accuracy on the left panel, loss on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))

panel_specs = (
    ('accuracy', 'val_accuracy', 'Model accuracy'),
    ('loss', 'val_loss', 'Model loss'),
)
for panel, (train_key, val_key, heading) in zip(ax, panel_specs):
    panel.plot(history_overfit.history[train_key])
    panel.plot(history_overfit.history[val_key])
    panel.set_title(heading)
    panel.set_ylabel(train_key)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'val'], loc='best')

**INTERPRETATION:**

You can tell that the model is overfitting because there is a clear gap between the training accuracy and the validation accuracy. This is further shown by the fact that the training accuracy moves to one almost immediately. This suggests that the model is finding features specific to the training data and learning them intensively, and so may not generalize as successfully to unseen data.

# 3. Regularised Model

In [None]:
# Tuning notes
# TO CHANGE: epochs, L2 norm, dropout, batch size, optimizer.
# ORDER: Adam, L1, L2, Epochs, Batch Size, Dropout.
# Adam: 0.2-0.4
n_neurons = 100
n_input = len(X_train[0])  # NOTE: the model does not use this value
n_output = 1

# Options that were tried and left disabled:
#kernel_weight = 0.005
#bias_weight = 0.00
#callback = callbacks.EarlyStopping(monitor='val_loss', patience=55)
#kernel_regularizer=myl1_reg

myl2_reg = regularizers.l2(0.08)
myl1_reg = regularizers.l1(0.004)  # L1 alternative; not attached to any layer below

# Same architecture as the overfit model, plus an L2 kernel penalty on every
# hidden layer and dropout before the output unit.
model_reg = tf.keras.models.Sequential(name='Model_reg')

# The declared input shape has to include the channel axis, because X_train
# is reshaped to (n_samples, 28, 28, 1) before the models are built.
model_reg.add(tf.keras.layers.Flatten(input_shape=(28, 28, 1)))

# Three L2-regularised hidden layers
for _ in range(3):
    model_reg.add(
        tf.keras.layers.Dense(
            n_neurons,
            kernel_regularizer=myl2_reg,
            activation='relu',
        )
    )

model_reg.add(tf.keras.layers.Dropout(0.2))

# Output layer, one neuron
model_reg.add(tf.keras.layers.Dense(n_output, activation='sigmoid'))

model_reg.summary()

# Do we need to reshape the data for Keras to use it properly?
# How do we choose the output activation (softmax vs. linear) and the optimiser / loss-function combination?

%%time
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = optimizers.Adam(learning_rate=0.00003)
#optimizer = optimizers.SGD(learning_rate=0.01, momentum=0.9)

# Binary cross-entropy pairs with the single sigmoid output unit.
model_reg.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# validation_split=0.3 holds out 30% of the training data for validation.
history_reg = model_reg.fit(
    X_train,
    y_train,
    epochs=2000,
    verbose=0,
    batch_size=32,
    validation_split=0.3,
)

#Should it be binary cross_entropy loss because we have a binary target variable? - is the target variable the response variable?

# Continue training model_reg on augmented images.
#
# A generator cannot be combined with `validation_split`, so the hold-out set
# is cut explicitly.  `validation_split=0.3` takes the LAST 30% of the samples,
# and the same slice is reused here so that images the model has already been
# trained on do not end up in the validation set.
val_fraction = 0.3
split_at = int(len(X_train) * (1.0 - val_fraction))
X_fit, y_fit = X_train[:split_at], y_train[:split_at]
X_val, y_val = X_train[split_at:], y_train[split_at:]

# Mild geometric augmentation.  These settings are starting values to tune,
# not tuned results.
datagen_train = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
)

# `fit` accepts generators directly; `fit_generator` is deprecated.
# Passing validation_data keeps 'val_loss' / 'val_accuracy' in the history,
# which the reporting and plotting code that follows reads.
history_reg = model_reg.fit(
    datagen_train.flow(X_fit, y_fit, batch_size=32),
    steps_per_epoch=len(X_fit) // 32,
    epochs=2000,
    validation_data=(X_val, y_val),
    verbose=2,
)

# Report the metrics recorded for the final epoch of the regularised model.
final_epoch_report = (
    ('Val accuracy in the model on the last epoch is', 'val_accuracy'),
    ('Train accuracy in the model on the last epoch is', 'accuracy'),
    ('Train loss in the model on the last epoch is', 'loss'),
    ('Val loss in the model on the last epoch is', 'val_loss'),
)
for message, metric_key in final_epoch_report:
    print(message, history_reg.history[metric_key][-1])

tf.keras.backend.clear_session()  # For easy reset of notebook state.

In [None]:
def array_exists(arr):
    """Return True when *arr* exposes a ``shape`` attribute, False otherwise.

    Used as a cheap "was a dataset actually passed in?" check: ``None`` and
    plain Python containers have no ``shape``.
    """
    try:
        arr.shape
    except AttributeError:
        return False
    return True

def reshape_if_exists(arr):
    """Return the 'x' and 'y' columns of *arr* as column vectors.

    Returns:
        A pair ``(x, y)``, each reshaped to ``(-1, 1)``, or ``(None, None)``
        when *arr* fails the ``array_exists`` check.
    """
    if not array_exists(arr):
        return None, None

    x_column = arr['x'].values.reshape(-1, 1)
    y_column = arr['y'].values.reshape(-1, 1)
    return x_column, y_column
    
def reshape_and_extract_sets(train_set, test_set):
    """
    Pull the x and y columns out of the train and test sets, reshaped for keras.

    Returns:
        The 4-tuple ``(x_train, y_train, x_test, y_test)``; the entries for a
        set are ``None`` when ``reshape_if_exists`` finds nothing to reshape.
    """
    train_pair = reshape_if_exists(train_set)
    test_pair = reshape_if_exists(test_set)
    return (*train_pair, *test_pair)


def plot_loss(model_history, rolling = None, title = "Loss vs Epoch "):
    """
    Plot training (and, when recorded, validation) loss per epoch on log-log axes.

    Arguments:
        model_history : the neural network model history to plot; its
                        ``history`` dict must contain 'loss' and may contain
                        'val_loss'.
        rolling : window size for a rolling average of the loss, drawn on top
                  of the raw curves (purely for visualization purposes).
                  Skipped when falsy.
        title   : plot title, if you want something other than the default.
    """
    def epoch_axis(values):
        # Epochs are numbered from 1: a log-scaled x-axis cannot display
        # x == 0, so a 0-based index would hide the first epoch.
        return range(1, len(values) + 1)

    train_loss = model_history.history['loss']
    # 'val_loss' is only recorded when the model was fit with validation data.
    val_loss = model_history.history.get('val_loss')

    palette = sns.color_palette()
    set_colors = {"train": palette[0], "val": palette[1]}

    # Fade the raw curves when rolling averages are drawn on top of them.
    alphas = [0.45, 0.35] if rolling else [0.8, 0.6]

    plt.figure(figsize = (12,5))
    plt.loglog(epoch_axis(train_loss), train_loss, linewidth=3,
               label = 'Training', alpha = alphas[0], color = set_colors["train"])
    if val_loss is not None:
        plt.loglog(epoch_axis(val_loss), val_loss, linewidth=3,
                   label = 'Validation', alpha = alphas[1], color = set_colors["val"])

    if rolling:
        plt.plot(epoch_axis(train_loss), pd.Series(train_loss).rolling(rolling).mean(),
                 linewidth=4, label = 'Train loss rolling avg.', color = set_colors["train"])
        if val_loss is not None:
            plt.plot(epoch_axis(val_loss), pd.Series(val_loss).rolling(rolling).mean(),
                     linewidth=4, label = 'Val loss rolling avg.', color = set_colors["val"])

    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Log-log loss curves with a 30-epoch rolling average.
plot_loss(history_reg, rolling=30)

# Training vs. validation curves for the regularised model:
# accuracy on the left panel, loss on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))

panel_specs = (
    ('accuracy', 'val_accuracy', 'Model accuracy'),
    ('loss', 'val_loss', 'Model loss'),
)
for panel, (train_key, val_key, heading) in zip(ax, panel_specs):
    panel.plot(history_reg.history[train_key])
    panel.plot(history_reg.history[val_key])
    panel.set_title(heading)
    panel.set_ylabel(train_key)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'val'], loc='best')