# Multimodal Classification of sound and pictures
ATSI Project : Machine Learning
Group : Pierre-Antoine COMBY & Maxime ZAGAR
  
This project aims to create an learning algorithm dedicated to recognize an environment from pictures and sounds. In our case study, we have to differentiate those data in 9 different classes. 
  
The provided data have the following shape :  
- Pictures are at the format 256x256
- Each sound is described by 104 mffc coefficients

**Useful links :**  
Competition address : https://www.kaggle.com/c/iogs-atsi-multimodal/leaderboard  
Keras : https://www.kaggle.com/sinkie/keras-data-augmentation-with-multiple-inputs


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

import tensorflow as tf
tf.test.gpu_device_name()

Mounted at /content/gdrive


'/device:GPU:0'


## Dataset pre processing

The data (sound+picture) describe these 9 different locations :
- Forest
- An urban place (city)
- Beach
- Classroom
- River
- Jungle
- Restaurant
- Football match
- Grocery-Store

In [4]:
DATA_DIR = 'gdrive/My Drive/Colab_Notebooks/multimodal_classification/data'
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os 
import skimage
from skimage import io

#!pip install keras-rectified-adam

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator

#from keras_radam import RAdam

from PIL import Image


CLASS_NAME=[ 'FOREST', 'CITY', 'BEACH', 'CLASSROOM', 'RIVER', 'JUNGLE', 'RESTAURANT', 'GROCERY-STORE', 'FOOTBALL-MATCH']
train_df = pd.read_csv(os.path.join(DATA_DIR,'data_train.csv'), delimiter=',', nrows=None)


print(train_df.head())
dataTrain, dataValid = train_test_split(train_df,train_size=.80,random_state=42)

test_df = pd.read_csv(os.path.join(DATA_DIR,'data_test_novt.csv'), delimiter=',', nrows = None)

print(f"Number of labelled samples: {len(train_df)}")
print(f"Number of labelled validation samples: {len(dataValid)}")
print(f"Number of labelled train samples: {len(dataTrain)}")
print(f"Number of unlabelled samples {len(test_df)}")
print(f"Number of classes: {len(CLASS_NAME)}")

# Test dataset, nolabel is provided. 


                IMAGE     mfcc_1     mfcc_2  ...   mfcc_103   mfcc_104  CLASS
0  trainimg_00000.png  11.112999   4.439105  ... -11.933302 -14.578534      3
1  trainimg_00001.png  13.567897  -1.470553  ...  14.197092  -9.513413      0
2  trainimg_00002.png  12.585137   1.143402  ...   9.582895   0.731367      3
3  trainimg_00003.png  17.783320   4.445305  ...  -5.714941  -6.054448      4
4  trainimg_00004.png  12.348299 -11.143099  ...  21.121838   5.865456      5

[5 rows x 106 columns]
Number of labelled samples: 13802
Number of labelled validation samples: 2761
Number of labelled train samples: 11041
Number of unlabelled samples 3450
Number of classes: 9


## Audio classification

To see what's possible with only  the audio data, let's try a classical MLP. With only  a few thousand of parameters, the classification is already pretty good. The training is also very fast. 

As we have a 

In [5]:
dataTrainArray = np.array(dataTrain)
dataValidArray = np.array(dataValid)
audioTrain,yTrain = dataTrainArray[:,1:-1].astype('float32'), dataTrainArray[:,-1].astype('int')
audioValid,yValid = dataValidArray[:,1:-1].astype('float32'), dataValidArray[:,-1].astype('int')

def create_mlp():
    audio_inputs = keras.Input(shape=audioTrain.shape[1])
    audio1 = layers.Dense(64,activation="relu")(audio_inputs)
    audio2 = layers.Dense(32,activation="relu")(audio1)
    audio3 = layers.Dense(16,activation="relu")(audio2)
    audio_outputs = layers.Dense(9,activation="softmax")(audio3)
    MLP = keras.Model(inputs=audio_inputs, outputs=audio_outputs)
    return MLP, audio_inputs, audio_outputs

MLP, audio_inputs, audio = create_mlp()
MLP.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 104)]             0         
_________________________________________________________________
dense (Dense)                (None, 64)                6720      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 9)                 153       
Total params: 9,481
Trainable params: 9,481
Non-trainable params: 0
_________________________________________________________________


In [8]:
MLP.compile(
     loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
history = MLP.fit(audioTrain,yTrain,batch_size=16,epochs=50,validation_split=0.2,verbose=0)

test_scores = MLP.evaluate(audioValid, yValid)
print("Loss",test_scores[0])
print("accuracy:", test_scores[1])

Loss 2.211484670639038
accuracy: 0.8949655890464783


The training is fast, and the accuracy is already relevant. 

## Image Classification

In the same way, one can classify using only the image information , which is a classic ML problem. The following CNN, inspired by AlexNet and VGG famous network, provides a relative good accuracy, but is however very slow to train (due to a increase number of parameters).

In [None]:
def small_cnn(input_shape):
    params = {'activation':'relu','kernel_initializer':'he_uniform', 'padding':'same'}
    X_input = keras.Input(input_shape)
    X = layers.AveragePooling2D((4,4))(X_input) # downscale to 64x64

    X = layers.Conv2D(32,(3,3), **params, input_shape=(64,64,3))(X)
    X = layers.BatchNormalization()(X)   
    X = layers.Conv2D(32,(3,3), **params)(X)
    X = layers.BatchNormalization()(X)   
    X = layers.MaxPooling2D((2,2))(X)
    X = layers.Dropout(0.2)(X)

    X = layers.Conv2D(64,(3,3), **params, input_shape=(64,64,3))(X)
    X = layers.BatchNormalization()(X)   
    X = layers.Conv2D(64,(3,3), **params)(X)
    X = layers.BatchNormalization()(X)   
    X = layers.MaxPooling2D((2,2))(X)
    X = layers.Dropout(0.3)(X)

    X = layers.Conv2D(128,(3,3), **params, input_shape=(64,64,3))(X)
    X = layers.BatchNormalization()(X)   
    X = layers.Conv2D(128,(3,3), **params)(X)
    X = layers.BatchNormalization()(X)   
    X = layers.MaxPooling2D((2,2))(X)
    X = layers.Dropout(0.4)(X)
    
    X = layers.Flatten()(X)
    X = layers.Dense(128, activation = 'relu',kernel_initializer='he_uniform')(X)
    X = layers.BatchNormalization()(X)   
    X = layers.Dropout(0.5)(X)
    X = layers.Dense(9,activation='softmax')(X)
    model = keras.Model(inputs = X_input, outputs = X, name='smallCNN')
    return model,X_input,X

## Joint Classification
To improve the Classification, we have to use the complete information at our disposal, here the image joint with the audio data. 


### Custom DataGeneration

The dataset of image is relatively small, and a closer look shows that it is in fact built from a few different videos. 

To provides the training data together (image file, sound, label) to the model, it is required to create a DataGenerator, this generator is also used to create some augmentations on images (small rotations, translations, zooms and flip), with the aim of making the image classification more robust. 
No transformation is applied to the sound data, as no trivial augmentation is possible with mfcc coefficients.


The same is done for the test data, only providing sound and image, with no extra transformations. 

In [None]:
class CustomDataGenerator(keras.utils.Sequence):
  def __init__(self,dataframe,batch_size=32,shuffle=True,augment=False):
    self.batch_size=batch_size
    self.df = dataframe
    self.indices = self.df.index.tolist()
    self.shuffle = shuffle
    if augment:
     self.augmentor = ImageDataGenerator(
        rescale=1./255,
        rotation_range=10,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest')
    else:
      self.augmentor = ImageDataGenerator(rescale=1./255)
    self.on_epoch_end()

  def on_epoch_end(self):
    self.index = np.arange(len(self.indices))
    if self.shuffle:
      np.random.seed(42)
      np.random.shuffle(self.index)
  
  def __len__(self):
    return len(self.indices)//self.batch_size
  
  def __getitem__(self,index):
    index = self.index[index*self.batch_size:(index+1)*self.batch_size]
    batch_indices = [self.indices[k] for k in index]
    X,y = self.__data_generation(batch_indices)
    return X,y

  def __data_generation(self,batch_indices):
    Ximage = np.empty((self.batch_size,256,256,3),dtype='float32')
    Xaudio = np.empty((self.batch_size, 104),dtype='float32')
    y = np.empty(self.batch_size,dtype=int)

    for i, idx in enumerate(batch_indices):
      Ximage[i,:] = keras.preprocessing.image.load_img(os.path.join(DATA_DIR,self.df.at[idx,'IMAGE']))
      Xaudio[i,:] = np.array(self.df.loc[idx])[1:-1]
      y[i] = np.array(self.df.at[idx,'CLASS'])
    X_gen = self.augmentor.flow([Ximage,Xaudio], y)
    return next(X_gen)

class CustomTestDataGenerator(keras.utils.Sequence):
  def __init__(self,dataframe,batch_size=1):
    self.batch_size=batch_size
    self.df = dataframe
    self.indices = self.df.index.tolist()
    self.index = np.arange(len(self.indices))
    self.augmentor = ImageDataGenerator(rescale=1./255)
  def __len__(self):
    return len(self.indices)//self.batch_size
  
  def __getitem__(self,index):
    index = self.index[index*self.batch_size:(index+1)*self.batch_size]
    batch_indices = [self.indices[k] for k in index]
    X = self.__data_generation(batch_indices)
    return X

  def __data_generation(self,batch_indices):
    Ximage = np.empty((self.batch_size,256,256,3),dtype='float32')
    Xaudio = np.empty((self.batch_size, 104),dtype='float32')
    for i, idx in enumerate(batch_indices):
      Ximage[i,:] = keras.preprocessing.image.load_img(os.path.join(DATA_DIR,self.df.at[idx,'IMAGE']))
      Xaudio[i,:] = np.array(self.df.loc[idx])[1:]
    X_gen = self.augmentor.flow([Ximage,Xaudio])
    return next(X_gen)
  


## Split Network

A first approach to train is to use to parallel network (possibly trained separatly) and combine their decision output.

In [None]:
Xgen = CustomDataGenerator(dataTrain,augment=True)
XValid=  CustomDataGenerator(dataValid,augment=False)

def multimodal_network():
  MLP, audio_input, audio_output = create_mlp()
  #CNN, image_input, image_output = AlexNet((256,256,3))
  CNN, image_input, image_output = small_cnn((256,256,3))

  combined = layers.concatenate([audio_output, image_output])
  z = layers.Dense(15,activation='relu')(combined)
  output = layers.Dense(9,activation='softmax')(z)

  return keras.Model(inputs=[image_input, audio_input],outputs=output)

MLM = multimodal_network()
MLM.summary()
keras.utils.plot_model(MLM, show_shapes=True, show_layer_names=True, to_file='model.png')

from IPython.display import Image
Image(retina=True, filename='model.png')


In [None]:
MLM.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  optimizer= keras.optimizers.Adam(),
  metrics=["accuracy"],
)
history = MLM.fit(Xgen,epochs=50,validation_data=XValid,workers=2,verbose=0)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend('accuracy','validation_accuracy')

In [None]:
scores = MLM.evaluate(XValid)
print(scores)
MLM.save(os.path.join(DATA_DIR,'multi_mod_class'))

[0.045679979026317596, 0.9876453280448914]
INFO:tensorflow:Assets written to: gdrive/My Drive/Colab_Notebooks/multimodal_classification/data/multi_mod_class/assets


## A better Approach

The previous network was the simplest merge of two independant network. In particular, the last classifying layers of each indepedant network were used(9+9 -> 15 -> 9) this caused a lot of compression, with potentially a loss of information, which could be usefull for the classification. 

In the following NN, the last independant layers are not classifying anymore. Furthermore, Drop-out layers with increasing rates have been added to make the network more robust to the data presented. 

In [None]:
def direct_mix():
    image_input_shape=(256,256,3)
    audio_input_shape=104

    params = {'activation':'relu','kernel_initializer':'he_uniform', 'padding':'same'}
    X_image_input = keras.Input(image_input_shape)
    X = layers.AveragePooling2D((4,4))(X_image_input) # downscale to 64x64

    X = layers.Conv2D(32,(3,3), **params, input_shape=(64,64,3))(X)
    X = layers.BatchNormalization()(X)   
    X = layers.Conv2D(32,(3,3), **params)(X)
    X = layers.BatchNormalization()(X)   
    X = layers.MaxPooling2D((2,2))(X)
    X = layers.Dropout(0.2)(X)

    X = layers.Conv2D(64,(3,3), **params, input_shape=(64,64,3))(X)
    X = layers.BatchNormalization()(X)   
    X = layers.Conv2D(64,(3,3), **params)(X)
    X = layers.BatchNormalization()(X)   
    X = layers.MaxPooling2D((2,2))(X)
    X = layers.Dropout(0.3)(X)

    X = layers.Conv2D(128,(3,3), **params, input_shape=(64,64,3))(X)
    X = layers.BatchNormalization()(X)   
    X = layers.Conv2D(128,(3,3), **params)(X)
    X = layers.BatchNormalization()(X)   
    X = layers.MaxPooling2D((2,2))(X)
    X = layers.Dropout(0.4)(X)
    
    X = layers.Flatten()(X)
    X = layers.Dense(128, activation = 'relu',kernel_initializer='he_uniform')(X)
    X = layers.BatchNormalization()(X)  
    X = layers.Dropout(0.5)(X)
    # audio net
    X_audio_input = keras.Input(audio_input_shape)
    X1 = layers.Dense(256,activation = 'relu',kernel_initializer='he_uniform')(X_audio_input)
    X1 = layers.Dense(128,activation = 'relu',kernel_initializer='he_uniform')(X1)

    X = layers.concatenate([X, X1]) # 256 final parameters
    output = layers.Dense(9,activation="softmax")(X)

    return keras.Model(inputs=[X_image_input, X_audio_input],outputs=output) 


### Training

The training has been realised only on Google Colab GPU (Nivdia K100), and the lack of computing power has limited the search for optimal meta parameters (number of Epoch, data augmentation variations, etc...). 

The use of data augmentation has not shown any particular benefit with this dataset, and has thus been disable. We assume that this mostly due to the nature of the validation and test data, both extracted from the same handfull of (relatively stabled) videos. In a more heteregeneous dataset (or with more class ) it may be reused. 

In [None]:

XgenFull = CustomDataGenerator(train_df,augment=False)
Xgen = CustomDataGenerator(dataTrain,augment=False)
XValid=  CustomDataGenerator(dataValid,augment=False)
XTest =  CustomTestDataGenerator(test_df)

directMix = direct_mix()
directMix.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  optimizer= keras.optimizers.Adam(),
  metrics=["accuracy"],
)

checkpoint_filepath = 'gdrive/My Drive/Colab_Notebooks/best_weight/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)
# save the best model
history = directMix.fit(Xgen,epochs=50,validation_data=XValid,workers=2,verbose=1,callbacks=[model_checkpoint_callback])
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend('accuracy','validation_accuracy')
directMix.load_weights(checkpoint_filepath)

print(directMix.evaluate(XValid))

## Use of Full data training

In [None]:

XgenFull = CustomDataGenerator(train_df,augment=False)
XTest =  CustomTestDataGenerator(test_df)

directMix = direct_mix()
directMix.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  optimizer= keras.optimizers.Adam(),
  metrics=["accuracy"],
)

checkpoint_filepath = 'gdrive/My Drive/Colab_Notebooks/best_weight_full/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='accuracy',
    mode='max',
    save_best_only=True)
# save the best model
history = directMix.fit(XgenFull,epochs=50,workers=2,verbose=1,callbacks=[model_checkpoint_callback])
plt.plot(history.history['accuracy'])
plt.legend('accuracy')

## Prediction on Test data and Submission

In [None]:
XTest =  CustomTestDataGenerator(test_df)
directMix.load_weights(checkpoint_filepath)

testPred = directMix.predict(XTest)


In [None]:
y_classes = testPred.argmax(axis=-1)
with open(os.path.join(DATA_DIR,'test.npy'), 'wb') as f:
  np.save(f,y_classes)

import csv
header = ['id','CLASS']
with open(os.path.join(DATA_DIR,'out.csv'), 'w') as fh:
    writer = csv.writer(fh, delimiter=',')
    writer.writerow(h for h in header)
    writer.writerows(enumerate(y_classes))