# Speech MNIST
We are operating on melspectrogram features. Due to the similarity with images, we can use a CNN to classify the spoken digits.

For the data, we are going to use the following repo.

In [None]:
!git clone https://github.com/jayrodge/AudioMNIST-using-PyTorch.git

## Importing libraries
First we are going to import the necessary libraries

In [None]:
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
os.chdir('AudioMNIST-using-PyTorch/')

## Data Preprocessing
We are going to load the data and preprocess it. Refer to the comment for each function for more details.

In [None]:
def load_and_preprocess_image(path):
    '''
    Read a PNG file and turn it into a normalized float image tensor.

    The file at `path` is decoded as a 3-channel image, resized to 224x224,
    and its pixel values are mapped from [0, 255] to [-0.5, 0.5].
    '''
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    # Scale to [0, 1] first, then center the values around zero.
    return (resized / 255.0) - 0.5

def get_filenames_and_labels(root_dir):
    '''
    Returns a list of filenames and a list of labels. The filenames and labels are matched.
    - Subfolders under `AudioMNIST-using-PyTorch/MNIST/` corresponds to the labels.
        - Eg. `AudioMNIST-using-PyTorch/MNIST/00/` contains all the files with label 0.
    - Labels should be integers from 0 to 9.

    Only subfolders whose name is a decimal number are treated as label folders,
    and only `.png` files inside them are collected (the loader decodes PNGs).
    Folders and files are visited in sorted order so the result is deterministic.
    '''
    filenames = []
    labels = []
    for entry in sorted(os.listdir(root_dir)):
        class_dir = os.path.join(root_dir, entry)
        # Skip stray files and any folder that is not named after a digit.
        if not (entry.isdecimal() and os.path.isdir(class_dir)):
            continue
        label = int(entry)  # '00' -> 0, '07' -> 7, ...
        for name in sorted(os.listdir(class_dir)):
            if name.lower().endswith('.png'):
                filenames.append(os.path.join(class_dir, name))
                labels.append(label)
    return filenames, labels

def split_data(filenames, labels, test_size=0.2, valid_size=0.2):
    '''
    We are going to split pairs of filenames and labels into train, test and valid sets.
        - For example, if `test_size=0.2` and `valid_size=0.2`, then 60% of the data will be used for training,
        and 20% each for testing and validation.
    - Please refer to the documentation of `train_test_split` in `sklearn.model_selection` for more information.
    - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    `test_size` and `valid_size` are fractions of the *whole* dataset. Both splits are
    stratified by label and use a fixed `random_state`, so the result is reproducible.

    Returns the tuple
    `(filenames_train, labels_train, filenames_test, labels_test, filenames_valid, labels_valid)`.
    '''
    holdout_size = test_size + valid_size

    # Split into train and temp (test + valid)
    filenames_train, filenames_temp, labels_train, labels_temp = train_test_split(
        filenames, labels, test_size=holdout_size, stratify=labels, random_state=42)

    # Split temp into test and valid. `valid_size` is relative to the full dataset,
    # so it has to be re-expressed as a fraction of the temp subset.
    filenames_test, filenames_valid, labels_test, labels_valid = train_test_split(
        filenames_temp, labels_temp, test_size=valid_size / holdout_size,
        stratify=labels_temp, random_state=42)

    return filenames_train, labels_train, filenames_test, labels_test, filenames_valid, labels_valid

def create_dataset(filenames, labels, batch_size=32):
    '''
    This function creates a `tf.data.Dataset` from a list of filenames and a list of labels.
    The function should work out of the box. You don't need to modify it.

    Each element of the returned dataset is a batch `(images, labels)` where the
    images come from `load_and_preprocess_image` and the labels are cast to int32.
    The data is reshuffled on every pass over the dataset.
    '''
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    # Shuffle the lightweight (path, label) pairs *before* decoding. Shuffling after
    # `map` with a buffer as large as the dataset would keep every decoded
    # 224x224x3 float image in memory at once.
    dataset = dataset.shuffle(buffer_size=len(filenames))
    dataset = dataset.map(lambda x, y: (load_and_preprocess_image(x), tf.cast(y, tf.int32)),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    # Overlap input preprocessing with model execution.
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

## Instantiation of Datasets
Now we are actually going to call the preprocessing functions and instantiate the datasets.

In [None]:
root_dir = 'MNIST'
batch_size = 128

# Get filenames and labels
filenames, labels = get_filenames_and_labels(root_dir)

# Split data into train / test / valid subsets using `split_data`
# (default fractions: 60% train, 20% test, 20% valid).
(filenames_train, labels_train,
 filenames_test, labels_test,
 filenames_valid, labels_valid) = split_data(filenames, labels)

# Create one batched `tf.data.Dataset` per subset using `create_dataset`.
train_dataset = create_dataset(filenames_train, labels_train, batch_size=batch_size)
test_dataset = create_dataset(filenames_test, labels_test, batch_size=batch_size)
valid_dataset = create_dataset(filenames_valid, labels_valid, batch_size=batch_size)

# Logging the dataset information along with the number of samples
print("Train Dataset:", train_dataset, "Number of Samples:", len(filenames_train))
print("Test Dataset:", test_dataset, "Number of Samples:", len(filenames_test))
print("Validation Dataset:", valid_dataset, "Number of Samples:", len(filenames_valid))

## Visualization
We can visualize the log-mel spectrograms to see what they look like.
This block also makes sure that data preprocessing was done correctly.

In [None]:
%matplotlib inline
classes = [str(i) for i in range(10)]  # If your classes are labeled 0 through 9
# Function to un-normalize and display an image
def imshow(img):
    img = img * 0.5 + 0.5  # unnormalize
    plt.imshow(img)  # No need to transpose

# Obtain one batch of training images.
# Dedicated names are used so the full `labels` list built during dataset
# instantiation is not overwritten by a single batch.
for batch_images, batch_labels in train_dataset.take(1):
    batch_images = batch_images.numpy()  # Convert images to numpy for display
    batch_labels = batch_labels.numpy()  # Convert labels to numpy for display

# Plot the images in the batch, along with the corresponding labels
fig = plt.figure(figsize=(25, 4))
# Display up to 20 images (fewer if the batch is smaller than that)
n_display = min(20, len(batch_images))
for idx in range(n_display):
    ax = fig.add_subplot(2, 10, idx + 1, xticks=[], yticks=[])
    imshow(batch_images[idx])
    ax.set_title(classes[batch_labels[idx]])

plt.show()

## The Model
We are going to use a simple CNN model to classify the spoken digits.
- You are welcome to define your own model architecture.
- The baseline approach is to translate the following model from PyTorch to TensorFlow.
```python
class Net(nn.Module):
    # Reference CNN: two conv + max-pool stages followed by three fully connected
    # layers, ending in a log-softmax over 10 outputs.
    def __init__(self):
        super(Net, self).__init__()
        # convolutional layer: 3 input channels -> 16 output channels, 5x5 kernel
        self.conv1 = nn.Conv2d(3, 16, 5)
        # max pooling layer: 2x2 window with stride 2 (shared by both conv stages)
        self.pool = nn.MaxPool2d(2, 2)
        # second convolutional layer: 16 -> 32 channels, 5x5 kernel
        self.conv2 = nn.Conv2d(16, 32, 5)
        # dropout with probability 0.2 (the same module is reused at several points)
        self.dropout = nn.Dropout(0.2)
        # fully connected layers; the 32*53*53 input size presumably corresponds to
        # 224x224 inputs (224 -> 220 -> 110 -> 106 -> 53) -- confirm against the data
        self.fc1 = nn.Linear(32*53*53, 256)
        self.fc2 = nn.Linear(256, 84)
        self.fc3 = nn.Linear(84, 10)
        # log-softmax over dimension 1 of the fc3 output
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        # add sequence of convolutional and max pooling layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.dropout(x)
        x = self.pool(F.relu(self.conv2(x)))
        x = self.dropout(x)
        # flatten each sample to a vector of length 32*53*53 for the linear layers
        x = x.view(-1, 32 * 53 * 53)
        # note: no dropout is applied after fc1, only after fc2
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.softmax(self.fc3(x))
        return x
```

In [None]:
class Net(models.Model):
    '''
    Simple CNN for classifying 224x224 RGB spectrogram images into 10 digits.

    This is a Keras translation of the PyTorch reference model: two
    conv + max-pool stages followed by three dense layers. The spatial size
    goes 224 -> 220 -> 110 -> 106 -> 53, so the flattened feature vector has
    53 * 53 * 32 entries.

    The output layer uses a softmax (not a log-softmax) because the loss used
    for training, `SparseCategoricalCrossentropy()`, expects probabilities by
    default (`from_logits=False`).
    '''
    def __init__(self):
        super(Net, self).__init__()
        # Convolutional layers (no padding, 5x5 kernels) with ReLU activations
        self.conv1 = layers.Conv2D(16, 5, activation='relu')
        self.conv2 = layers.Conv2D(32, 5, activation='relu')
        # Pooling and dropout have no weights, so one instance of each is reused
        self.pool = layers.MaxPooling2D(pool_size=2, strides=2)
        self.dropout = layers.Dropout(0.2)
        self.flatten = layers.Flatten()
        # Fully connected classifier head
        self.fc1 = layers.Dense(256, activation='relu')
        self.fc2 = layers.Dense(84, activation='relu')
        self.fc3 = layers.Dense(10, activation='softmax')

    def call(self, x, training=None):
        '''
        Forward pass. `training` is forwarded to the dropout layers so that
        dropout is only active while fitting.
        '''
        x = self.pool(self.conv1(x))
        x = self.dropout(x, training=training)
        x = self.pool(self.conv2(x))
        x = self.dropout(x, training=training)
        x = self.flatten(x)
        x = self.fc1(x)  # As in the reference model, no dropout after fc1
        x = self.dropout(self.fc2(x), training=training)
        return self.fc3(x)

# Instantiate the network
model = Net()

# Build the model for batches of 224x224 RGB images (the leading `None`
# leaves the batch size unspecified), then print the architecture summary.
model.build(input_shape=(None, 224, 224, 3))
model.summary()

## Loss Function and Optimizer
One final step before we can simply call `model.fit`

In [None]:
# Optimizer: stochastic gradient descent with momentum
optimizer = SGD(momentum=0.9, learning_rate=0.001)

# Loss function: cross-entropy on integer class labels
loss_function = SparseCategoricalCrossentropy()

# Compile the model, tracking accuracy alongside the loss
model.compile(
    loss=loss_function,
    optimizer=optimizer,
    metrics=['accuracy'],
)

## Train!
Ideally the model should produce an Acc. of 96% or more.

In [None]:
# Training parameters
n_epochs = 2  # You may increase this number, but 2 epochs works well enough

# Callback for saving the best model in the TensorFlow SavedModel format
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint('model_MNIST', save_best_only=True, save_format="tf")

# Fit the model
history = model.fit(train_dataset,
                    validation_data=valid_dataset,
                    epochs=n_epochs,
                    callbacks=[checkpoint_cb])

## Testing the CNN Model
Evaluate the model on the test set.

In [None]:
# Evaluate on the held-out test set; the compiled model reports the loss
# followed by the accuracy metric.
test_loss, test_accuracy = model.evaluate(x=test_dataset)

print(f'Test Loss: {test_loss:.6f}, Test Accuracy: {test_accuracy:.6f}')

## Bonus Points
- Try to report the accuracy of the model for each individual digit.
- Try to report a confusion matrix.