## LOAD DATA and IMPORTING LIBRARIES

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [5]:
train_df = pd.read_csv("/kaggle/input/fashionmnist/fashion-mnist_train.csv")
test_df = pd.read_csv("/kaggle/input/fashionmnist/fashion-mnist_test.csv")

In [6]:
train_df.describe()

In [7]:
x_train = train_df.iloc[:, 1:785]
y_train = train_df.iloc[:, [0]]

x_test = test_df.iloc[:, 1:785]
y_test = test_df.iloc[:, [0]]

In [8]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

## MODEL 1


In [9]:
model = keras.Sequential([
    keras.layers.Dense(units=32, activation='relu', input_shape=[784]),
    keras.layers.Dense(units=16, activation='relu'),
    keras.layers.Dense(units=10, activation='softmax'),
])
model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model.fit(x_train, y_train, 
                    epochs=180, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model.evaluate(x_test, y_test, batch_size=2000)

## MODEL2
- single layer 1024 cells
- epochs = 50
- activation = relu 
- accuracy not converging indicating we might need more number of epochs
- _slight overfitting, possible that deeper networks will model better_
- _accuracy on training data = 90.19%, accuracy on testing data = 85.77%

In [10]:
model2 = keras.Sequential([
    keras.layers.Dense(units=1024, activation='relu', input_shape=[784]),
    keras.layers.Dense(units=10, activation='softmax'),
])
model2.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model2.fit(x_train, y_train, 
                    epochs= 50, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model2.evaluate(x_test, y_test, batch_size=2000)

## MODEL 3
- attempting at creating a deeper network with more number of nodes per hidden layer as compared to model 1
- activation = relu 
- _train accuracy = 89.24%, test accuracy = 84.18% -> overfitting_
- deeper net might be more useful

In [11]:
model3 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 128, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 64, activation='relu'),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model3.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model3.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model3.evaluate(x_test, y_test, batch_size=2000)

## MODEL 4
- 3 hidden layers - 128, 64, 32 units
- accuracy drops
- activation = relu 
- _train accuracy = 92.32%, test accuracy = 85.37%_

In [12]:
model4 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 128, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 64, activation='relu'),
    keras.layers.Dense(units= 32, activation='relu'),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model4.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model4.fit(x_train, y_train, 
                    epochs=180, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model4.evaluate(x_test, y_test, batch_size=2000)

## MODEL 5
- 2 layers = 256, 128
- activation = relu 
- _Overfitting, train accuracy : 94.32%, test accuracy : 84.88%

In [13]:
model5 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 256, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 128, activation='relu'),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model5.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model5.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model5.evaluate(x_test, y_test, batch_size=2000)

## MODEL 6
- making use of sigmoid activations
- _train accuracy = 91.21%, test accuracy = 87.60% (higher accuracy as compared to ReLU activation functions)

In [14]:
model6 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 128, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 64, activation='sigmoid'),
    keras.layers.Dense(units= 32, activation='sigmoid'),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model6.compile(optimizer='adam',
                loss= 'sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model6.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model6.evaluate(x_test, y_test, batch_size=2000)

## MODEL 7
- previous combo seemed to work so we try to increase number of nodes (in an attempt to increase train set accuracy) and to counter the overfitting we further add some regularization
- _train accuracy = 93.44%, test accuracy = 88.94%_

In [15]:
model7 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 128, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 64, activation='sigmoid', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dense(units= 64, activation='sigmoid', kernel_regularizer= keras.regularizers.L1L2(l1=0.001, l2=0.001)),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model7.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model7.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model7.evaluate(x_test, y_test, batch_size=2000)

## MODEL 8
- A deeper model with 512, 64, 32 nodes in the hidden layers and added regularization while maintaining a sigmoid activation function
- Also, validation and test accuracy difference is quite big signifying overfitting and thus Dropout should be used.
- _train accuracy = 97.17%, test accuracy = 90.02%_

In [16]:
model8 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 512, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 64, activation='sigmoid', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dense(units= 32, activation='sigmoid', kernel_regularizer= keras.regularizers.L1L2(l1=0.001, l2=0.001)),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model8.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model8.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model8.evaluate(x_test, y_test, batch_size=2000)

## MODEL 9
- KL divergence loss
- poor performance
- very small decrease in training and cross validation loss and almost no increase in accuracy
- thus we continue with categorical cross entropy loss function
- _train accuracy = 10.18%, test accuracy = 11.69%_

In [17]:
model9 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 512, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 64, activation='sigmoid', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dense(units= 32, activation='sigmoid', kernel_regularizer= keras.regularizers.L1L2(l1=0.001, l2=0.001)),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model9.compile(optimizer='adam', 
                loss= keras.losses.KLDivergence(),
                metrics=['accuracy'])

history = model9.fit(x_train, y_train, 
                    epochs= 50, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model9.evaluate(x_test, y_test, batch_size=2000)

## MODEL 10
- checking if small neural nets with less number of activation nodes but more layers provide simlilar performance
- 4 hidden layers with 32 nodes each and a ReLU activation function
- _train accuracy = 85.92%, test accuracy = 84.02%_

In [18]:
model10 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 32, activation='relu', input_shape=[784]),
    keras.layers.Dense(units= 32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dense(units= 32, activation='relu', kernel_regularizer= keras.regularizers.l2(0.001)),
    keras.layers.Dense(units= 32, activation='relu', kernel_regularizer= keras.regularizers.l2(0.001)),
    
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model10.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model10.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model10.evaluate(x_test, y_test, batch_size=2000)

## MODEL 11
- we have been getting similar performance in models 5-8. Hence we may attempt to use dropout regularization after the first 2 hidden layers in an attampt to reduce overfitting
- Dropout randomly sets some of the outgoing edges from neurons to zero so as to avoid overfitting
- _Success!_
- _Dropout 0.2 after 1st 2 hidden layers_
- _train accuracy = 94.3%, cross_val accuracy = 93.19%, test accuracy = 89.74%_

In [19]:
model11 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 512, activation='relu', input_shape=[784]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='sigmoid', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 32, activation='sigmoid', kernel_regularizer= keras.regularizers.L1L2(l1=0.001, l2=0.001)),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model11.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model11.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model11.evaluate(x_test, y_test, batch_size=2000)

## MODEL 12
- A deeper neural net with 512,256,256,128,64 nodes in the hidden layers with dropout on the first 4 layers
- Activation- ReLU
- We see that due to a deeper neural network, the weights are taking a longer time to converge, hence increasing the number of epochs may help
- _train accuracy = 94.3%, cross_val accuracy = 89.94%, test accuracy = 88.59%_

In [20]:
model12 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 512, activation='relu', input_shape=[784]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='relu', kernel_regularizer= keras.regularizers.L1L2(l1=0.001, l2=0.001)),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model12.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model12.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model12.evaluate(x_test, y_test, batch_size=2000)

## MODEL 13
- Similar architecture as Model 12, number of epochs increased to 400 
- Activation- ReLU
- _train accuracy = 97.75%,test accuracy = 89.61%, validation accuracy- 89.52%
- Validation Set accuracy hardly improves after the 100th epoch while training set accuracy keeps on increasing
- thus we observe that as we keep on increasing the complexity of the neural network, the difference between train and validation accuracy keeps increasing indicating overfitting.


In [21]:
model13 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 512, activation='relu', input_shape=[784]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='relu', kernel_regularizer= keras.regularizers.L1L2(l1=0.001, l2=0.001)),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model13.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model13.fit(x_train, y_train, 
                    epochs=400, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model13.evaluate(x_test, y_test, batch_size=2000)

## Model 14

- 3 hidden layers with 512,256,64 nodes
- Tanh activation function used with droupout in first 2 layers
- Train Accuracy- 84.07% Test Accuracy- 85.25%
- This is lesser than sigmoid and relu activation functions.

In [22]:
model14 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 512, activation='tanh', input_shape=[784]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 256, activation='tanh', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='tanh', kernel_regularizer= keras.regularizers.L1L2(l1=0.001, l2=0.001)),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model14.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model14.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model14.evaluate(x_test, y_test, batch_size=2000)

## Model 15

- Trying a neural network with 5 hidden layers with 64 nodes at each of the hidden layer.
- Training accuracy- 81.87% Test accuracy -83.04%
- Model is not able to converge even after 400 epochs due to so many hidden layers.

In [23]:
model15 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 64, activation='sigmoid', input_shape=[784]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='sigmoid'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='sigmoid'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='sigmoid'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='sigmoid'),
    keras.layers.Dropout(0.2),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model15.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history = model15.fit(x_train, y_train, 
                    epochs=400, batch_size = 5000, 
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model15.evaluate(x_test, y_test, batch_size=2000)

## Model 16
- tanh activation function and KL divergence loss function
- 128,64,32 nodes in the hidden layers
- Training accuracy 10.13% Test accuracy 5.57%
- Extremely low accuracy with a lot of spikes in the graph.

In [24]:
model16 = keras.Sequential([
    #hidden layers
    keras.layers.Dense(units= 128, activation='tanh', input_shape=[784]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 64, activation='tanh'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units= 32, activation='tanh'),
    keras.layers.Dropout(0.2),
    #output layer
    keras.layers.Dense(units=10, activation='softmax'),
])
model16.compile(optimizer='adam', 
                loss=keras.losses.KLDivergence(),
                metrics=['accuracy'])

history = model16.fit(x_train, y_train, 
                    epochs=100, batch_size = 5000,   
                    validation_split = 0.2
                    )

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()

model16.evaluate(x_test, y_test, batch_size=2000)