In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D , Dropout, Activation, Flatten, Input, MaxPooling2D
from keras.utils import to_categorical, plot_model
from sklearn.preprocessing import MinMaxScaler

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

ModuleNotFoundError: No module named 'tensorflow'

# The Goal

The goal of this notebook is to see how MLPs compare to CNNs in terms of digit recognition. 

Additionally, we will look at the generalization capabilities of CNNs. For this last step, we will train the CNN on digit data and see how well it generalizes when the same network is tested on the alphabet characters and on clothing items from the "Fashion MNIST" dataset.

### Import and analyse standard MNIST data

Let us start by importing the standard MNIST dataset, comprised of 60 000 train samples, with balanced data, for digits from 0 to 9.

In [None]:
# Resolve the EMNIST folder once: keep the original relative 'emnist' folder when it
# exists, otherwise fall back to the Kaggle input mount used by the letters cells.
EMNIST_DIR='emnist' if os.path.isdir('emnist') else '/kaggle/input/emnist'

# The CSV files have no header row; column 0 holds the label.
numbers_df=pd.read_csv(os.path.join(EMNIST_DIR,'emnist-mnist-train.csv'),header=None)
test_df=pd.read_csv(os.path.join(EMNIST_DIR,'emnist-mnist-test.csv'),header=None)
numbers_df.head()

In [None]:
# (rows, columns) of the training frame, label column included
numbers_df.shape

As expected, 60 000 samples, with 784 (28x28) features. The first column in the dataset is the label for the training.

How many samples of each digit class are there in the training and test sets?

In [None]:
# Both countplots are drawn on the same (current) axes, so the test-set bars are
# painted over the training-set bars: the figure is an overlay, not a stacked chart.
sns.countplot(data=numbers_df,x=0,palette='pastel')
sns.countplot(data=test_df,x=0)
plt.show()

There are 6000 of each class in the training set and 1000 of each in the test set.

!! Careful !! The picture above is not a stacked countplot: the test-set bars are drawn on top of the training-set bars.

We should try to visualize some of these digits. Let us make a function for that purpose, since we may want to reuse it.

In [None]:
def vis(dataset,nr_samples,label_col,legend=False,cmap='gray_r',cbar=False,transpose=True,prediction=False,dict_name=None):
    """Draw `nr_samples` randomly chosen 28x28 images from `dataset` on a two-row grid.

    Parameters
    ----------
    dataset : DataFrame with one row per image: 784 pixel columns plus `label_col`
        (and a 'Prediction' column when `prediction` is true).
    nr_samples : number of rows to draw, sampled with replacement. Odd counts are
        supported; the unused panel is hidden.
    label_col : name of the column holding the true label.
    legend : when true, print the labels (and predictions) of the drawn images.
    cmap, cbar : forwarded to `sns.heatmap`.
    transpose : transpose each 28x28 image before drawing it.
    prediction : when true, also read and report the 'Prediction' column.
    dict_name : optional mapping from label value to a display name.
    """
    # Picks n random samples from the dataset
    picked=np.random.randint(0,dataset.shape[0],size=nr_samples)
    samples=dataset.iloc[picked].reset_index(drop=True)
    labels=samples[label_col].values
    samples=samples.drop(label_col,axis=1)
    if prediction:
        preds=samples['Prediction'].values
        samples=samples.drop('Prediction',axis=1)
    # Round the column count up so an odd (or single) sample count still gets one
    # panel per image; squeeze=False keeps the axes in a 2-D array in every case.
    ncols=max(1,(nr_samples+1)//2)
    fig, ax = plt.subplots(2,ncols,sharey=True,sharex=True,squeeze=False)
    panels=ax.flatten()
    for i in range(nr_samples):
        pixels=samples.iloc[i].values.reshape((28,28))
        if transpose:
            pixels=pixels.transpose()
        sns.heatmap(pixels,cmap=cmap,cbar=cbar,ax=panels[i])
        panels[i].get_xaxis().set_visible(False)
        panels[i].get_yaxis().set_visible(False)
    # Hide the panels that did not receive an image
    for spare in panels[nr_samples:]:
        spare.set_visible(False)
    plt.tight_layout()
    plt.show()
    if legend:
        if dict_name is not None:
            labels=[dict_name[i] for i in labels]
        if prediction:
            print('The images represent items with labels {} which were predicted to be {}.'.format(labels,preds))
        else:
            print('The images represent items with labels {}.'.format(labels))

We can now see the representation of some samples from the numbers_df dataframe.

In [None]:
# Column 0 of the raw frame is the label; show ten random digits and print their labels
vis(numbers_df,10,label_col=0,legend=True)

**<font size="5">Data Processing</font>**

Having visualised the dataset, and checked some digits, we should start processing our data. The first step is to normalize the values of the greyscale. As can be seen in the scale of the previous Figure, values are ranging from 0 (for black) to 255 (for white). We'll normalize them, so they are kept in the interval $\left[0,1\right]$.

For this instance in particular, we will use a MinMaxScaler fit on the train set and then apply it to the testing set as well. This could be done instead by dividing by 255, but this procedure is more general. Further down in the notebook, we will just divide by 255.

In [None]:
# Split the raw frame into labels (column 0) and pixel features
labels_df=numbers_df[0]
features_df=numbers_df.drop(columns=0)

# Fit the [0, 1] scaler on the training pixels and rescale them in one step
scaler=MinMaxScaler(feature_range=(0,1))
scaled_pixels=scaler.fit_transform(features_df)
features_df=pd.DataFrame(scaled_pixels,columns=features_df.columns)

# Re-attach the labels under the name used by the rest of the notebook
features_df['Label']=labels_df
features_df.head()


Let us try to visualize the numbers with the rescaled features.

In [None]:
# Same visual check as before, now on the rescaled frame (labels live in 'Label')
vis(features_df,10,label_col='Label',legend=True)

The scaling was done correctly, as the numbers didn't get distorted.
We now use the scaler which was fitted to the training set to scale the test set as well.

In [None]:
# Apply the scaler that was fitted on the training pixels to the test pixels
test_labels_df=test_df[0]
test_pixels=test_df.drop(columns=0)
test_features_df=pd.DataFrame(scaler.transform(test_pixels),columns=test_pixels.columns)
test_features_df['Label']=test_labels_df

Having done this scaling, we are ready to use this to predict the handwritten digits!

## Multi-Layer Perceptron (MLP)

Here, we define the building of the MLP model using a function. This helps in testing different architectures in a faster and more efficient way. It will also allow for the specification of dropout layers and control the number of classes in an easy way, by specifying function arguments.

In [None]:
def MLP(input_shape,nodes=[128,32],dropout_chance=0.4,num_classes=10,produce_output=True):
    """Assemble a fully connected Sequential network.

    `nodes` lists the hidden layers in order. The first entry is always a ReLU
    Dense layer that also declares `input_shape`. For every later entry, a 0 adds
    a Dropout(dropout_chance) layer and any other value adds a ReLU Dense layer of
    that width. When `produce_output` is True a softmax Dense layer with
    `num_classes` units is appended. Returns the (uncompiled) model.
    """
    network=Sequential()
    network.add(Dense(nodes[0],input_shape=input_shape,activation='relu'))
    for width in nodes[1:]:
        if width==0:
            layer=Dropout(dropout_chance)
        else:
            layer=Dense(width,activation='relu')
        network.add(layer)
    if produce_output==True:
        network.add(Dense(num_classes,activation='softmax'))
    return network

Now we define the model. It will be a four-layer model. The first hidden layer has 256 nodes, with 'ReLu' as activation function, the second has 128 nodes, also with 'ReLu' and the third has 32 nodes. The output layer has 10 nodes and it uses softmax as activation, since it allows for a probabilistic interpretation of the outputs.

In [None]:
# Define the model
model=MLP((784,),nodes=[256,128,32])

# Compile and fit the model
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
# The last 20% of the training rows are held out for validation.
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
history = model.fit(features_df.drop('Label',axis=1),to_categorical(features_df['Label']),epochs=25,validation_split=0.2,batch_size=128,verbose=1)

Let us see the learning curve for this model. 
(As this will be used often, we'll define a function for it.)

In [None]:
def learning_curve(history,titles,legend,metrics=['accuracy','loss']):
    """Plot training and validation curves side by side, one panel per metric.

    Parameters
    ----------
    history : object whose `.history` dict maps each metric name, and its
        'val_'-prefixed counterpart, to a per-epoch sequence.
    titles : panel titles, one per entry of `metrics`.
    legend : labels for the two curves, in the order (training, validation).
    metrics : metric names to plot; a single metric is supported as well.
    """
    nplots=len(metrics)
    # squeeze=False keeps `ax` two-dimensional even when only one metric is
    # requested; otherwise matplotlib returns a bare Axes and indexing fails.
    fig , ax = plt.subplots(1,nplots,sharex=True,squeeze=False)
    fig.set_figheight(5)
    fig.set_figwidth(15)
    for i, metric in enumerate(metrics):
        panel=ax[0][i]
        panel.plot(history.history[metric])
        panel.plot(history.history['val_{}'.format(metric)])
        panel.set(xlabel='Epoch', ylabel=metric)
        panel.legend(legend)
        panel.title.set_text(titles[i])
    plt.tight_layout()
    plt.show()
    

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(history,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

Now that the model is fitted to the training data, we'll test it in the test data.

In [None]:
# `predict_classes` is no longer available on Keras models; the predicted class
# is the index of the largest softmax probability.
test_preds = np.argmax(model.predict(test_features_df.drop('Label',axis=1)),axis=-1)
score, acc = model.evaluate(test_features_df.drop('Label',axis=1),to_categorical(test_features_df['Label']))
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score, acc))

How many misclassified samples do we have?

In [None]:
# Pair every prediction with its true label, then keep the rows that disagree
test_preds_df=pd.DataFrame({'Prediction':test_preds,'Label':test_features_df['Label']})
is_wrong=test_preds_df['Prediction'].ne(test_preds_df['Label'])
missclassified=test_preds_df[is_wrong]
missclassified_index=missclassified.index.tolist()
print('{} images (out of 10 000) were missclassified!'.format(len(missclassified_index)))

We have some wrongly classified samples. Let's see some of them in order to try and understand why they occurred.

In [None]:
# Pixel rows of the misclassified test images, with the wrong prediction attached
# (aligned on the original row index) before the index is renumbered.
missclassified_features_df=(
    test_features_df.iloc[missclassified_index]
    .assign(Prediction=missclassified['Prediction'])
    .reset_index(drop=True)
)
missclassified_features_df.head()

In [None]:
# Show ten random misclassified digits together with their true and predicted labels
vis(missclassified_features_df,10,'Label',legend=True,prediction=True)

As we can see, some of the misclassified digits are not difficult to recognize by human standards. Let's see if a CNN can achieve better results through its pattern recognition.

## Convolutional Neural Network (CNN)

First, just like in the MLP, we create the model. Again, let's make it a function.

In [None]:
def CNN(input_shape,num_kernels=[20,20],kernel_shapes=[(3,3),(3,3)],dense_nodes=[128],dropout_chance=0.4,num_classes=10,produce_output=True):
    """Assemble a Sequential convolutional classifier.

    Convolutional part: the first entry of `num_kernels` / `kernel_shapes` becomes
    a ReLU Conv2D layer that declares `input_shape`. Every later entry is either
    a 0, which adds Dropout(dropout_chance), or a kernel count, which adds a ReLU
    Conv2D followed by 2x2 max pooling. A 0 still occupies a position, so
    `kernel_shapes` needs a placeholder at the same index.

    Dense part: after Flatten, each entry of `dense_nodes` adds Dropout (for 0) or
    a ReLU Dense layer of that width. A softmax layer with `num_classes` units is
    appended when `produce_output` is True. Returns the (uncompiled) model.
    """
    net = Sequential()
    net.add(Conv2D(num_kernels[0],kernel_size=kernel_shapes[0],activation='relu',input_shape=input_shape))
    for position, filters in enumerate(num_kernels[1:], start=1):
        if filters==0:
            net.add(Dropout(dropout_chance))
            continue
        net.add(Conv2D(filters,kernel_size=kernel_shapes[position],activation='relu'))
        net.add(MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid', data_format=None))
    net.add(Flatten())
    for width in dense_nodes:
        if width==0:
            net.add(Dropout(dropout_chance))
        else:
            net.add(Dense(width,activation='relu'))
    if produce_output==True:
        net.add(Dense(num_classes,activation='softmax'))
    return net
    

We now use a model which has the same structure in the dense part as the former MLP, but as a first step it encodes the features found by using convolutions.

Now, we prepare the data for training.

In [None]:
# Reshape the flat 784-pixel rows into (samples, 28, 28, 1) image tensors
num_images=len(features_df)
y=features_df['Label'].to_numpy()
X=features_df.drop(columns='Label').to_numpy().reshape(num_images,28,28,1)

Then we compile and fit the model with the training data.

In [None]:
# Define the model
cnn_model = CNN((28,28,1),num_kernels=[20,30],kernel_shapes=[(3,3),(4,4)],dense_nodes=[256,128,32])

# Compile and fit the model
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
cnn_model.compile(loss = 'categorical_crossentropy',optimizer = 'rmsprop',metrics = ['accuracy'])
cnn_history = cnn_model.fit(X,to_categorical(y),epochs = 25,validation_split = 0.2,batch_size = 128,verbose = 1)

Before proceeding, one should check for the possibility of overfitting.

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(cnn_history,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

How many digits were missclassified with the CNN?

In [None]:
# Same image-tensor layout for the test set; labels stay a pandas Series
test_images=len(test_features_df)
y_test=test_features_df['Label']
X_test=test_features_df.drop(columns='Label').to_numpy().reshape(test_images,28,28,1)

In [None]:
# `predict_classes` is no longer available on Keras models; the predicted class
# is the index of the largest softmax probability.
cnn_test_preds = np.argmax(cnn_model.predict(X_test),axis=-1)
score, acc = cnn_model.evaluate(X_test,to_categorical(y_test))
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score, acc))

In [None]:
# Pair every CNN prediction with its true label, then keep the rows that disagree
cnn_test_preds_df=pd.DataFrame({'Prediction':cnn_test_preds,'Label':test_features_df['Label']})
is_wrong=cnn_test_preds_df['Prediction'].ne(cnn_test_preds_df['Label'])
cnn_missclassified=cnn_test_preds_df[is_wrong]
cnn_missclassified_index=cnn_missclassified.index.tolist()
print('{} images (out of 10 000) were missclassified!'.format(len(cnn_missclassified_index)))

We have some improvements! The CNN is able to classify correctly more samples than the simple MLP. Let's check which were missed by the CNN.

In [None]:
# Pixel rows of the images the CNN got wrong, with the wrong prediction attached
# (aligned on the original row index) before the index is renumbered.
cnn_missclassified_features_df=(
    test_features_df.iloc[cnn_missclassified_index]
    .assign(Prediction=cnn_missclassified['Prediction'])
    .reset_index(drop=True)
)

In [None]:
# Show ten random digits the CNN misclassified, with true and predicted labels
vis(cnn_missclassified_features_df,10,'Label',legend=True,prediction=True)

By looking at some sample images, it is possible to notice that the images in which the CNN fails to classify the digit correctly are much more prone to be wrongly classified by humans too: the prediction and the label don't match, but many times the drawn digit resembles the prediction in some way. This does not happen so much with the MLP.

Additionally, we can notice some overfitting by looking at the learning curve of both models. This can be countered with the addition of Dropout layers. These were purposefully not added so one can get a sense of their effect.

Let us now add these Dropout layers. 

### MLP with Dropout

In the following model, we introduce some Dropout layers, to see if we can minimize the overfitting effect we have in the standard MLP model above.

In [None]:
# Define model (the 0 in `nodes` inserts a Dropout layer after the first Dense layer)
model_do=MLP((784,),nodes=[256,0,128,32])

# Compile and fit model
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
model_do.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
history_do = model_do.fit(features_df.drop('Label',axis=1),to_categorical(features_df['Label']),epochs=25,validation_split=0.2,batch_size=128,verbose=1)

Checking the learning curve.

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(history_do,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

Now it seems that both curves are converging, unlike what happened without Dropout layers. Furthermore, this comes at virtually no cost for the accuracy of the model and a slight decrease in loss. 

Let's see how the performance on the test set is with this model.

In [None]:
# `predict_classes` is no longer available on Keras models; the predicted class
# is the index of the largest softmax probability.
test_preds = np.argmax(model_do.predict(test_features_df.drop('Label',axis=1)),axis=-1)
score, acc = model_do.evaluate(test_features_df.drop('Label',axis=1),to_categorical(test_features_df['Label']))
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score, acc))

In [None]:
# Pair every prediction with its true label, then keep the rows that disagree
test_preds_df=pd.DataFrame({'Prediction':test_preds,'Label':test_features_df['Label']})
is_wrong=test_preds_df['Prediction'].ne(test_preds_df['Label'])
missclassified=test_preds_df[is_wrong]
missclassified_index=missclassified.index.tolist()
print('{} images (out of 10 000) were missclassified!'.format(len(missclassified_index)))

As seen in the validation scores, we get a similar result, but now with a far more consistent learning through the several epochs of training, with the validation accuracy and loss converging to that of the training ones.

### CNN with Dropout

In [None]:
# Define model (each 0 in `dense_nodes` inserts a Dropout layer at that position)
cnn_model_do = CNN((28,28,1),num_kernels=[20,30],kernel_shapes=[(3,3),(4,4)],dense_nodes=[0,256,0,128,32])

# Compile and fit model
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
cnn_model_do.compile(loss = 'categorical_crossentropy',optimizer = 'rmsprop',metrics = ['accuracy'])
cnn_history_do = cnn_model_do.fit(X,to_categorical(y),epochs = 25,validation_split = 0.2,batch_size = 128,verbose = 1)

We should look at the learning curves now.

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(cnn_history_do,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

How about the model performance on the test set?

In [None]:
# `predict_classes` is no longer available on Keras models; the predicted class
# is the index of the largest softmax probability.
cnn_test_preds = np.argmax(cnn_model_do.predict(X_test),axis=-1)
score, acc = cnn_model_do.evaluate(X_test,to_categorical(y_test))
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score, acc))

In [None]:
# Pair every CNN prediction with its true label, then keep the rows that disagree
cnn_test_preds_df=pd.DataFrame({'Prediction':cnn_test_preds,'Label':test_features_df['Label']})
is_wrong=cnn_test_preds_df['Prediction'].ne(cnn_test_preds_df['Label'])
cnn_missclassified=cnn_test_preds_df[is_wrong]
cnn_missclassified_index=cnn_missclassified.index.tolist()
print('{} images (out of 10 000) were missclassified!'.format(len(cnn_missclassified_index)))

As in the case of the MLP, the introduction of Dropout layers was able to solve the overfitting problem, as can be seen in the learning curves for the above model. In doing this, it improved the performance of our model on the test set as well, since it is now able to generalize consistently to unseen data.

# Adaptability

We will now check how adaptable CNNs are. For that, we will train these networks on the digits dataset and then take the learned weights and build a model for classifying alphabet characters with them.
For this purpose, the Sequential() model from Keras is not ideal. We will therefore use the Functional API instead.

#### Preprocessing

Let's start by importing the alphabet dataset.

In [None]:
# Prefer the Kaggle input mount; fall back to the relative 'emnist' folder that the
# MNIST cell reads from, so both loading cells work in the same environment.
LETTERS_DIR = '/kaggle/input/emnist' if os.path.isdir('/kaggle/input/emnist') else 'emnist'

# The CSV files have no header row; column 0 holds the label.
char_df = pd.read_csv(os.path.join(LETTERS_DIR,'emnist-letters-train.csv'),header=None)
test_char_df = pd.read_csv(os.path.join(LETTERS_DIR,'emnist-letters-test.csv'),header=None)

In [None]:
# Scale the pixel columns to [0, 1] and move the label (column 0) to a trailing
# column named 'Label', for both the training and the test frame.
labels=char_df[0].tolist()
char_df=char_df.drop(columns=0)/255
char_df['Label']=labels

test_labels=test_char_df[0].tolist()
test_char_df=test_char_df.drop(columns=0)/255
test_char_df['Label']=test_labels
test_char_df.head()

In [None]:
# One class-count panel per split, stacked vertically
fig, ax = plt.subplots(2)
for panel, frame, heading in zip(ax, (char_df, test_char_df), ('Training set', 'Test set')):
    sns.countplot(data=frame,x='Label',ax=panel)
    panel.title.set_text(heading)
plt.tight_layout()
plt.show()

Since the set of labels for the training set and the test set is not the same (there are some labels missing from the test set), we will have to increase the length of the one-hot encoded vectors for the test set before applying our model to them. This will be done at some later point in the notebook.

In [None]:
# Show ten random letter images and print their numeric labels
vis(char_df,10,'Label',legend=True)

## Model definition

In [None]:
# Vision part of the model
# Two ReLU convolutions followed by a flatten. Note that conv1, conv2 and output1
# are the tensors returned by calling the layers, not the layer objects themselves.
inputs=Input((28,28,1))
conv1=Conv2D(20,(3,3),activation='relu')(inputs)
conv2=Conv2D(30,(4,4),activation='relu')(conv1)
output1=Flatten()(conv2)

# Wrap the vision part as a reusable sub-model
vision_model=Model(inputs,output1)

# Output of the shared sub-model; every head below branches from this tensor
inputs2=vision_model(inputs)

# Specific part of the model for digits
# (Dropout -> Dense 256 -> Dropout -> Dense 128 -> Dense 32 -> softmax over 10 classes)

dropout1=Dropout(0.4)(inputs2)
dense1=Dense(256,activation='relu')(dropout1)
dropout2=Dropout(0.4)(dense1)
dense2=Dense(128,activation='relu')(dropout2)
dense3=Dense(32,activation='relu')(dense2)
output2=Dense(10,activation='softmax')(dense3)


# Specific part of the model for characters
# (same layout as the digit head, with a 27-way softmax output)

dropout1_c=Dropout(0.4)(inputs2)
dense1_c=Dense(256,activation='relu')(dropout1_c)
dropout2_c=Dropout(0.4)(dense1_c)
dense2_c=Dense(128,activation='relu')(dropout2_c)
dense3_c=Dense(32,activation='relu')(dense2_c)
output2_c=Dense(27,activation='softmax')(dense3_c)

# Specific part of the model for fashion MNIST
# (same layout as the digit head, with a 10-way softmax output)

dropout1_f=Dropout(0.4)(inputs2)
dense1_f=Dense(256,activation='relu')(dropout1_f)
dropout2_f=Dropout(0.4)(dense1_f)
dense2_f=Dense(128,activation='relu')(dropout2_f)
dense3_f=Dense(32,activation='relu')(dense2_f)
output2_f=Dense(10,activation='softmax')(dense3_f)

### Training on MNIST Data

The model layers are now created. Let us define the model for training on the digits dataset.

In [None]:
# Define the model: shared vision part + digit head. The dense part mirrors the
# previous CNN with Dropout, but this vision part has no MaxPooling layer.
training_model=Model(inputs,outputs=output2)

# Compile and fit the model
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
training_history = training_model.fit(X,to_categorical(y),epochs = 25,validation_split = 0.2,batch_size = 128,verbose = 1)

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(training_history,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

Let's check the performance on the test set.

In [None]:
# Class probabilities for every test image; the predicted class is the column with
# the highest probability (one vectorised argmax instead of a per-row Python loop).
digits_test_probs = training_model.predict(X_test)
digits_test_preds = np.argmax(digits_test_probs,axis=1)
score, acc = training_model.evaluate(X_test,to_categorical(y_test))
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score, acc))

In [None]:
# Pair every prediction with its true label, then keep the rows that disagree
digits_test_preds_df=pd.DataFrame({'Prediction':digits_test_preds,'Label':test_features_df['Label']})
is_wrong=digits_test_preds_df['Prediction'].ne(digits_test_preds_df['Label'])
digits_missclassified=digits_test_preds_df[is_wrong]
digits_missclassified_index=digits_missclassified.index.tolist()
print('{} images (out of 10 000) were missclassified!'.format(len(digits_missclassified_index)))

### Letters MNIST data

Preprocessing character image data.

In [None]:
# Letter images as (samples, 28, 28, 1) tensors plus their label vectors
num_chars=len(char_df)
y_char=char_df['Label'].to_numpy()
X_char=char_df.drop(columns='Label').to_numpy().reshape(num_chars,28,28,1)

num_test_chars=len(test_char_df)
test_y_char=test_char_df['Label'].to_numpy()
test_X_char=test_char_df.drop(columns='Label').to_numpy().reshape(num_test_chars,28,28,1)

Let's do the model for classifying the characters, without training the convolutional part of the network.

In [None]:
# Freeze the convolutional part learned on the digits. `conv1`, `conv2` and
# `output1` are tensors, so setting `.trainable` on them freezes nothing; the flag
# has to be set on the layers, here through the `vision_model` sub-model that owns
# them. This must happen before compile() to take effect.
vision_model.trainable = False

char_model=Model(inputs,output2_c)
plot_model(char_model,show_shapes=True,show_layer_names=True,expand_nested=True)
char_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
char_history = char_model.fit(X_char,to_categorical(y_char),epochs = 25,validation_split = 0.2,batch_size = 256,verbose = 1)

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(char_history,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

Let's check performance and missed samples.

In [None]:
# Predicted class = column with the highest probability (vectorised argmax)
char_test_probs = char_model.predict(test_X_char)
char_test_preds = np.argmax(char_test_probs,axis=1)
# The test set does not contain every label, so a plain to_categorical() would yield
# too few columns. Ask for exactly as many columns as the model has outputs instead
# of padding with a hard-coded number of zeros.
y_vals=to_categorical(test_y_char,num_classes=char_model.output_shape[-1])
score_char, acc_char = char_model.evaluate(test_X_char,y_vals)
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score_char, acc_char))

In [None]:
# Pair every prediction with its true label, then keep the rows that disagree
char_test_preds_df=pd.DataFrame({'Prediction':char_test_preds,'Label':test_char_df['Label']})
is_wrong=char_test_preds_df['Prediction'].ne(char_test_preds_df['Label'])
char_missclassified=char_test_preds_df[is_wrong]
char_missclassified_index=char_missclassified.index.tolist()
print('{} images (out of {}) were missclassified!'.format(len(char_missclassified_index),test_char_df.shape[0]))

### Fashion MNIST Data

Now we try to use the same convolution layers as trained for the digit dataset to try and make predictions about the Fashion MNIST data.

In [None]:
# Import data
fashion_df = pd.read_csv('/kaggle/input/fashionmnist/fashion-mnist_train.csv')
test_fashion_df = pd.read_csv('/kaggle/input/fashionmnist/fashion-mnist_test.csv')

In [None]:
# Peek at the first rows of the raw Fashion MNIST training frame
fashion_df.head()

In [None]:
# Change format: scale the pixel columns to [0, 1] and move the 'label' column to a
# trailing column named 'Label', for both the training and the test frame.
labels=fashion_df['label'].tolist()
fashion_df=fashion_df.drop(columns='label')/255
fashion_df['Label']=labels

test_labels_f=test_fashion_df['label'].tolist()
test_fashion_df=test_fashion_df.drop(columns='label')/255
test_fashion_df['Label']=test_labels_f
fashion_df.head()

In [None]:
# One class-count panel per split, stacked vertically
fig, ax = plt.subplots(2)
for panel, frame, heading in zip(ax, (fashion_df, test_fashion_df), ('Training set', 'Test set')):
    sns.countplot(data=frame,x='Label',ax=panel)
    panel.title.set_text(heading)
plt.tight_layout()
plt.show()

Here, both sets have the same labels, so there is no need to complete the one-hot encoded vectors for any of them.

In [None]:
# Fashion images as (samples, 28, 28, 1) tensors plus their label vectors
num_items=len(fashion_df)
y_fashion=fashion_df['Label'].to_numpy()
X_fashion=fashion_df.drop(columns='Label').to_numpy().reshape(num_items,28,28,1)

num_test_items=len(test_fashion_df)
test_y_fashion=test_fashion_df['Label'].to_numpy()
test_X_fashion=test_fashion_df.drop(columns='Label').to_numpy().reshape(num_test_items,28,28,1)

Let's visualize some of these images.

In [None]:
# Dictionary for converting items into names.
# Keys are the numeric class labels (0-9); it is passed to vis() as `dict_name`.

fashion_dict={0: 'T-shirt/top', 1:'Trouser', 2: 'Pullover', 3: 'Dress', 4: 'Coat', 5: 'Sandal', 6: 'Shirt', 7: 'Sneaker', 8: 'Bag', 9: 'Ankle boot'}

In [None]:
# Drawn without the transpose used for the EMNIST images; labels are printed as
# item names via fashion_dict
vis(fashion_df,10,'Label',legend=True,transpose=False,dict_name=fashion_dict)

Now we define and fit the specific part of the model.

In [None]:
# Freeze the convolutional part learned on the digits. `conv1`, `conv2` and
# `output1` are tensors, so setting `.trainable` on them freezes nothing; the flag
# has to be set on the layers, here through the `vision_model` sub-model that owns
# them. This must happen before compile() to take effect.
vision_model.trainable = False

fashion_model=Model(inputs,output2_f)
plot_model(fashion_model,show_shapes=True,show_layer_names=True,expand_nested=True)
fashion_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
fashion_history = fashion_model.fit(X_fashion,to_categorical(y_fashion),epochs = 25,validation_split = 0.2,batch_size = 128,verbose = 1)

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(fashion_history,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

In [None]:
# Class probabilities for every test image; the predicted class is the column with
# the highest probability (one vectorised argmax instead of a per-row Python loop).
fashion_test_probs = fashion_model.predict(test_X_fashion)
fashion_test_preds = np.argmax(fashion_test_probs,axis=1)
score_fashion, acc_fashion = fashion_model.evaluate(test_X_fashion,to_categorical(test_y_fashion))
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score_fashion, acc_fashion))

In [None]:
# Pair every prediction with its true label, then keep the rows that disagree
fashion_test_preds_df=pd.DataFrame({'Prediction':fashion_test_preds,'Label':test_fashion_df['Label']})
is_wrong=fashion_test_preds_df['Prediction'].ne(fashion_test_preds_df['Label'])
fashion_missclassified=fashion_test_preds_df[is_wrong]
fashion_missclassified_index=fashion_missclassified.index.tolist()
print('{} images (out of {}) were missclassified!'.format(len(fashion_missclassified_index),test_fashion_df.shape[0]))

By analysing the learning curves of the letters and fashion models, one is able to notice the existence of overfitting. Additionally, it is possible to see that there is some room for improvement, even in the training set for these models. In the next section this issue will be addressed by increasing the complexity of the Dense layers of the models.

## Improving Letter and Fashion classification

Let us modify slightly the model specific to each one of the latter datasets.

In [None]:
# Specific part of the model for characters 2.0
# Wider head on the same shared tensor `inputs2`: Dense 512 -> 256 -> 64, with a
# Dropout layer in front of each of the three Dense layers, then a 27-way softmax.

dropout1_c2=Dropout(0.4)(inputs2)
dense1_c2=Dense(512,activation='relu')(dropout1_c2)
dropout2_c2=Dropout(0.4)(dense1_c2)
dense2_c2=Dense(256,activation='relu')(dropout2_c2)
dropout3_c2=Dropout(0.4)(dense2_c2)
dense3_c2=Dense(64,activation='relu')(dropout3_c2)
output2_c2=Dense(27,activation='softmax')(dense3_c2)

# Specific part of the model for fashion MNIST 2.0
# Same wider layout as the characters 2.0 head, with a 10-way softmax output.

dropout1_f2=Dropout(0.4)(inputs2)
dense1_f2=Dense(512,activation='relu')(dropout1_f2)
dropout2_f2=Dropout(0.4)(dense1_f2)
dense2_f2=Dense(256,activation='relu')(dropout2_f2)
dropout3_f2=Dropout(0.4)(dense2_f2)
dense3_f2=Dense(64,activation='relu')(dropout3_f2)
output2_f2=Dense(10,activation='softmax')(dense3_f2)

### Letter Recognition

In [None]:
# Let us test the improved Letter recognition model.

In [None]:
# Freeze the convolutional part learned on the digits. `conv1`, `conv2` and
# `output1` are tensors, so setting `.trainable` on them freezes nothing; the flag
# has to be set on the layers, here through the `vision_model` sub-model that owns
# them. This must happen before compile() to take effect.
vision_model.trainable = False

char_model_2=Model(inputs,output2_c2)
plot_model(char_model_2,show_shapes=True,show_layer_names=True,expand_nested=True)
char_model_2.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
char_history_2 = char_model_2.fit(X_char,to_categorical(y_char),epochs = 25,validation_split = 0.2,batch_size = 128,verbose = 1)

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(char_history_2,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])

In [None]:
# Predicted class = column with the highest probability (vectorised argmax)
char_test_probs = char_model_2.predict(test_X_char)
char_test_preds = np.argmax(char_test_probs,axis=1)
# The test set does not contain every label, so a plain to_categorical() would yield
# too few columns. Ask for exactly as many columns as the model has outputs instead
# of padding with a hard-coded number of zeros.
y_vals=to_categorical(test_y_char,num_classes=char_model_2.output_shape[-1])
score_char, acc_char = char_model_2.evaluate(test_X_char,y_vals)
print('The loss of the test data is {:.3f} with an accuracy of {:.3f}.'.format(score_char, acc_char))

In [None]:
# Pair every prediction with its true label, then keep the rows that disagree
char_test_preds_df=pd.DataFrame({'Prediction':char_test_preds,'Label':test_char_df['Label']})
is_wrong=char_test_preds_df['Prediction'].ne(char_test_preds_df['Label'])
char_missclassified=char_test_preds_df[is_wrong]
char_missclassified_index=char_missclassified.index.tolist()
print('{} images (out of {}) were missclassified!'.format(len(char_missclassified_index),test_char_df.shape[0]))

### Fashion Recognition

In [None]:
# Freeze the convolutional part learned on the digits. `conv1`, `conv2` and
# `output1` are tensors, so setting `.trainable` on them freezes nothing; the flag
# has to be set on the layers, here through the `vision_model` sub-model that owns
# them. This must happen before compile() to take effect.
vision_model.trainable = False

fashion_model_2=Model(inputs,output2_f2)
plot_model(fashion_model_2,show_shapes=True,show_layer_names=True,expand_nested=True)
fashion_model_2.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
# `epochs` replaces the legacy `nb_epoch` keyword, which current Keras rejects.
fashion_history_2 = fashion_model_2.fit(X_fashion,to_categorical(y_fashion),epochs = 25,validation_split = 0.2,batch_size = 32,verbose = 1)

In [None]:
# The second curve comes from the validation split held out during fit, not from
# the separate test set, so it is labelled 'Validation'.
learning_curve(fashion_history_2,['Model Accuracy','Model Loss'],legend=['Train','Validation'],metrics=['accuracy','loss'])