This is a starter notebook to get you familiar with how to browse the data and train a simple neural network using TensorFlow/Keras.

In [1]:
import numpy as np
import os
import pandas as pd 

from PIL import Image

import keras
import keras.layers
from keras.models import Sequential 
from keras.layers import Dense, Dropout, Flatten 
from keras.layers import Conv2D, MaxPooling2D, InputLayer
import keras.utils.all_utils as kr_utils
import keras.regularizers
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.layers import BatchNormalization
from sklearn.model_selection import train_test_split

# Show which TensorFlow version this kernel is running.
print(tf.__version__)

In [2]:
# Every dataset path lives under the same Kaggle input directory.
data_root = "/kaggle/input/csci-5622-spring-22"

# Image folders and the CSV files that list image names and labels.
train_images_folder = f"{data_root}/train/train/"
test_images_folder = f"{data_root}/test/test"
train_csv = f"{data_root}/train.csv"
submission_csv = f"{data_root}/sample_submission.csv"

# Side length (in pixels) of the square image buffers, and the width of the one-hot labels.
patch_size = 192
num_classes = 53

First, let's create a generator that will read the images and provide batches of samples and their corresponding labels.

We'll be building it using keras' Sequence

In [3]:
class RockGenerator(kr_utils.Sequence):
    """Keras ``Sequence`` yielding batches of normalized images and one-hot labels.

    Each batch is a tuple ``(images, labels)`` where ``images`` has shape
    ``(n, patch_size, patch_size, 3)`` and ``labels`` has shape
    ``(n, num_classes)``; ``n`` equals ``batch_size`` except possibly for the
    last batch. ``patch_size`` and ``num_classes`` are read from the
    notebook's configuration cell.

    Args:
        df: DataFrame with two columns, "image" (file name without the
            ".png" extension) and "label" (integer class id).
        path_to_images: Directory containing the ``<image>.png`` files.
        batch_size: Maximum number of samples per batch.
        shuffle: If True, the sample order is reshuffled at construction time
            and again at the end of every epoch.
        mean: Value subtracted from the pixels. A scalar gives a global mean;
            an array of shape ``(3,)`` gives a per-channel mean.
        std: Value the centered pixels are divided by (scalar or per-channel).
    """

    def __init__(self, df, # contains the images names and their labels
                 path_to_images,
                 batch_size=32,
                 shuffle=True, # to shuffle the data at the end of each epoch
                 mean=125.3,
                 std=63.5,
                ):

        self.df = df # dataframe with two columns "image" and "label"
        self.images_path = path_to_images
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.mean = mean
        self.std = std
        # Positional row numbers into ``df``; on_epoch_end() shuffles them
        # in place when shuffling is enabled, so one call is enough here.
        self.indexes = np.arange(self.df.shape[0])
        self.on_epoch_end()

    def on_epoch_end(self): # called at the end of each epoch
        """Reshuffle the sample order in place when ``shuffle`` is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Return the number of batches per epoch (the last one may be smaller)."""
        return int(np.ceil(self.df.shape[0] / self.batch_size))

    def __getitem__(self, index):
        """Load and return the batch at position ``index`` as ``(images, labels)``."""
        start = index * self.batch_size
        # Slicing clamps at the end of the array, so the last batch is simply shorter.
        positions = self.indexes[start:start + self.batch_size]
        # Positional lookup keeps working even if the DataFrame index has
        # duplicate or non-contiguous labels (e.g. after train_test_split).
        batch = self.df.iloc[positions]

        images = np.zeros((len(batch), patch_size, patch_size, 3))
        for i, name in enumerate(batch["image"]):
            image_path = os.path.join(self.images_path, "{}.png".format(name))
            # The context manager closes the file handle promptly; convert("RGB")
            # guarantees three channels for grayscale / palette / RGBA files.
            with Image.open(image_path) as img:
                pixels = np.asarray(img.convert("RGB"))
            images[i] = (pixels - self.mean) / self.std

        labels = np.zeros((len(batch), num_classes))
        # One-hot encode all labels of the batch at once.
        labels[:] = kr_utils.to_categorical(batch["label"].to_numpy(), num_classes=num_classes)
        return images, labels

In [4]:
# Read the labelled image list, then hold out 10% of it for validation
# (fixed random_state so the split is the same on every run).
labels_df = pd.read_csv(train_csv)
df_train, df_val = train_test_split(labels_df, test_size=0.1, random_state=5622)

# The sample submission lists the test images to predict.
df_test = pd.read_csv(submission_csv)

In [5]:
# (rows, columns) of the training and validation splits.
print(df_train.shape, df_val.shape)

In [6]:
# Training batches are shuffled (the default); validation and test keep the
# DataFrame order so predictions line up with their rows.
train_generator = RockGenerator(df=df_train, path_to_images=train_images_folder)
val_generator = RockGenerator(df=df_val, path_to_images=train_images_folder, shuffle=False)
test_generator = RockGenerator(df=df_test, path_to_images=test_images_folder, shuffle=False)

In [7]:
# let's examine a batch
batch_x, batch_y = train_generator[0]
print(batch_x.shape, batch_y.shape)
print(np.mean(batch_x), np.std(batch_x)) # not exactly 0 and 1, but close enough

Let's now define a simple neural network. We'll start with a stack of 2D convolutions followed by a few feed-forward (dense) layers.

In [8]:
# Small CNN: three (Conv2D -> MaxPooling2D -> BatchNormalization) stages
# followed by two dense layers and a softmax classifier.
model = keras.models.Sequential([
    InputLayer(input_shape=(patch_size, patch_size, 3)),  # batch dimension is implicit
    Conv2D(filters=32, kernel_size=(4, 4), strides=2, activation="relu"),
    MaxPooling2D(2, 2),
    BatchNormalization(),
    Conv2D(filters=32, kernel_size=(4, 4), strides=2, activation="relu"),
    MaxPooling2D(2, 2),
    BatchNormalization(),
    Conv2D(filters=64, kernel_size=(3, 3), strides=2, activation="relu"),
    MaxPooling2D(2, 2),
    BatchNormalization(),
    Flatten(),
    Dense(units=128, activation="relu"),
    Dropout(0.25),
    Dense(units=128, activation="relu"),
    # One output per class; tied to num_classes so it always matches the
    # width of the one-hot labels. Softmax makes the outputs probabilities.
    Dense(units=num_classes, activation="softmax"),
])
model.summary() # prints the output of each layer and number of trainable weights for each

All we have to do at this step is to choose the proper loss function, optimizer and metric.

In [9]:
# The model's last layer already applies softmax, so the loss receives
# probabilities, not logits. from_logits must therefore be False; with True,
# softmax would effectively be applied twice and distort the loss/gradients.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              optimizer=keras.optimizers.adam_v2.Adam(learning_rate=4e-4), # Adam optimizer with a small learning rate
              metrics=["accuracy",
                       tfa.metrics.F1Score(num_classes=num_classes,average="macro", name="macroF1")]) # We want to track accuracy and MacroF1

We would also like to store the model from the epoch with the best validation metric. We can easily achieve this using "Callbacks".

In [10]:
# Keep only the weights from the epoch with the highest validation macro-F1.
checkpoint_callbk = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_tiny_model",  # where the best weights are written
    monitor="val_macroF1",       # the "val_" prefix selects the metric computed on the validation data
    mode="max",                  # a higher macro-F1 is better
    save_best_only=True,         # overwrite the file only when the monitored metric improves
    save_weights_only=True,      # store weights only; the architecture must be rebuilt before loading
    save_freq="epoch",           # evaluate the saving condition once per epoch
    verbose=1,                   # log a message whenever a new best is saved
)


We're all set now to run our tiny neural network. Training using a CPU gets slower the more convolutions we use.
Feed-forward layers are much faster on a CPU compared to convolutions.

In [None]:
# Train for up to 100 epochs, checkpointing the best epoch on the validation set.
model.fit(
    x=train_generator,
    validation_data=val_generator,
    epochs=100,
    callbacks=[checkpoint_callbk],
)

Once the training is done, we can load the weights from the epoch with the best validation metric. If you run it for more epochs, you'll notice that it overfits quickly, reaching around 0.91 F1 on the training set vs. ~0.25 F1 on the validation set.

The partitions are fairly similar, so the estimates from cross validation can be reliable.

In [None]:
# Restore the weights stored under "best_tiny_model" (the path given to the checkpoint callback).
model.load_weights("best_tiny_model")

In [None]:
# The network ends in a softmax layer, so predict() returns one row of class
# probabilities (not logits) per test image.
y_prob = model.predict(test_generator)
y_hat = np.argmax(y_prob, axis=1) # index of the most probable class for each image
# Use bracket assignment: attribute assignment (df.label = ...) only works when
# the column already exists and otherwise silently creates a plain attribute.
test_generator.df["label"] = y_hat
test_generator.df.to_csv("start_here_submission.csv", index=False) # we don't want to add the column of indices