In [18]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import plot_model
from utilities import f1_m, recall_m, precision_m

from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow import keras
import numpy as np
import os 
import glob
import cv2

In [6]:
# Hyperparameters
IMG_SIZE = 224  # side length of the square VGG19 input; spectrogram images are resized to IMG_SIZE x IMG_SIZE
EPOCHS = 10  # number of epochs passed to `model.fit` in `run_experiment`
BATCH_SIZE = 32  # intended mini-batch size -- NOTE(review): confirm it is actually passed to `fit`/`evaluate`


# Settings consumed by the transformer model
MAX_SEQ_LENGTH = 128  # number of positions in the positional-embedding table
FRAME_GAP = 11  # NOTE(review): not referenced in the cells visible here; presumably a frame-sampling stride from feature extraction -- confirm
NUM_FEATURES = 1024  # used as the transformer `embed_dim`


In [4]:
def get_cnn_model():
    """Build and compile a VGG19-based image classifier with a 4-unit sigmoid head.

    The ImageNet-pretrained VGG19 convolutional base (without its top classifier)
    is frozen, so only the new dense head is trainable. The model summary is
    printed as a side effect.

    Returns:
        A compiled `keras.Model` that takes `(IMG_SIZE, IMG_SIZE, 3)` inputs.
    """
    num_outputs = 4

    # VGG19 backbone without its 1000-way ImageNet classifier; the head below
    # takes its place.
    backbone = VGG19(
        weights='imagenet',
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    # Freeze every backbone layer.
    backbone.trainable = False

    # Flatten the VGG19 feature maps and attach the classification head.
    flat_features = layers.Flatten()(backbone.output)
    head = layers.Dense(num_outputs, activation="sigmoid")(flat_features)

    model = keras.Model(inputs=backbone.input, outputs=head)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy', f1_m, precision_m, recall_m],
    )
    model.summary()
    return model

In [7]:
# Embedding Layer
class PositionalEmbedding(layers.Layer):
    """Adds a learned position embedding to each step of an input sequence.

    Args:
        sequence_length: Number of rows in the position-embedding table.
        output_dim: Width of each position embedding.
    """

    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        # Stored so `get_config` can report the constructor arguments.
        self.sequence_length = sequence_length
        self.output_dim = output_dim
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )

    def call(self, inputs):
        """Return `inputs` plus the embedding of each position along axis 1."""
        # `inputs` has shape `(batch_size, frames, num_features)`.
        num_frames = tf.shape(inputs)[1]
        frame_indices = tf.range(start=0, limit=num_frames, delta=1)
        return inputs + self.position_embeddings(frame_indices)

    def compute_mask(self, inputs, mask=None):
        """Mark a step as valid when any value on its last axis is non-zero.

        The incoming `mask` argument is ignored.
        """
        nonzero_features = tf.cast(inputs, "bool")
        return tf.reduce_any(nonzero_features, axis=-1)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config["sequence_length"] = self.sequence_length
        config["output_dim"] = self.output_dim
        return config


# Subclassed layer
class TransformerEncoder(layers.Layer):
    """Encoder block: self-attention then a two-layer projection, each followed
    by a residual add and layer normalization.

    Args:
        embed_dim: Used as the attention `key_dim` and as the output width of
            the projection.
        dense_dim: Width of the hidden (GELU) layer of the projection.
        num_heads: Number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Stored so `get_config` can report the constructor arguments.
        self.embed_dim, self.dense_dim, self.num_heads = embed_dim, dense_dim, num_heads

        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
            dropout=0.3,
        )
        projection_layers = [
            layers.Dense(dense_dim, activation=tf.nn.gelu),
            layers.Dense(embed_dim),
        ]
        self.dense_proj = keras.Sequential(projection_layers)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply the encoder block to `inputs`, optionally honouring `mask`."""
        # Insert an axis at position 1 so the mask can be passed as the
        # attention mask.
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]

        attended = self.attention(inputs, inputs, attention_mask=attention_mask)
        normed = self.layernorm_1(inputs + attended)
        projected = self.dense_proj(normed)
        return self.layernorm_2(normed + projected)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config["embed_dim"] = self.embed_dim
        config["num_heads"] = self.num_heads
        config["dense_dim"] = self.dense_dim
        return config

In [8]:
def get_transformer_model():
    """Build and compile the transformer classifier with a 4-unit sigmoid head.

    Pipeline: positional embedding -> one `TransformerEncoder` block ->
    GELU dense + layer norm -> global max pooling over axis 1 -> dropout ->
    sigmoid head. The model summary is printed as a side effect.

    Returns:
        A compiled `keras.Model`.
    """
    feature_dim = NUM_FEATURES
    ff_units = 4
    heads = 1
    num_outputs = 4

    sequence_in = keras.Input(shape=(None, None), name="input")

    embedded = PositionalEmbedding(
        MAX_SEQ_LENGTH, feature_dim, name="frame_position_embedding"
    )(sequence_in)
    encoded = TransformerEncoder(
        feature_dim, ff_units, heads, name="transformer_layer"
    )(embedded)

    projected = layers.Dense(units=feature_dim, activation='gelu')(encoded)
    normed = layers.LayerNormalization()(projected)

    # Collapse the sequence axis before classifying.
    pooled = layers.GlobalMaxPooling1D()(normed)
    regularized = layers.Dropout(0.5)(pooled)
    scores = layers.Dense(num_outputs, activation="sigmoid")(regularized)

    model = keras.Model(sequence_in, scores)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy', f1_m, precision_m, recall_m],
    )

    model.summary()
    return model

In [10]:

# Build the CNN and restore the weights stored under ./temp/audio_classifier.
filepath = os.getcwd() + "/temp/audio_classifier"
cnn = get_cnn_model()
# Last expression of the cell, so its return value is shown as the cell output.
cnn.load_weights(filepath)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0   

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x145652db400>

In [11]:

# Build the transformer and restore the weights stored under ./tmp_3_4/video_classifier.
filepath = os.getcwd() + "/tmp_3_4/video_classifier"
transformer = get_transformer_model()
# Last expression of the cell, so its return value is shown as the cell output.
transformer.load_weights(filepath)

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None, None)]      0         
_________________________________________________________________
frame_position_embedding (Po (None, None, 1024)        131072    
_________________________________________________________________
transformer_layer (Transform (None, None, 1024)        4211716   
_________________________________________________________________
dense_8 (Dense)              (None, None, 1024)        1049600   
_________________________________________________________________
layer_normalization_5 (Layer (None, None, 1024)        2048      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0   

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x145653a4a90>

In [13]:
def get_late_fusion_model(model_1,model_2):
    """Fuse two models by concatenating their outputs and adding a new sigmoid head.

    The summary of the fused model is printed as a side effect.

    Args:
        model_1: Keras model whose `.input` / `.output` form the first branch.
        model_2: Keras model whose `.input` / `.output` form the second branch.

    Returns:
        A compiled `keras.Model` that takes `[model_1.input, model_2.input]`
        and produces 4 sigmoid scores.
    """
    classes = 4

    # LATE FUSION: join the two branch outputs along the last axis.
    fused = layers.concatenate([model_1.output, model_2.output])

    fused = layers.Dropout(0.5)(fused)
    outputs = layers.Dense(classes, activation="sigmoid")(fused)

    # Each branch keeps its own input, so the fused model takes two inputs.
    model = keras.Model(inputs=[model_1.input, model_2.input], outputs=outputs)

    optimizer = keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', f1_m, precision_m, recall_m],
    )

    model.summary()
    return model


## Train multimodal video classification model

In [15]:
# Load the saved data / label arrays for the train, validation and test splits.
train_image_data = np.load("extracted_data/train_data.npy")
train_labels = np.load("extracted_data/train_labels.npy")

val_image_data = np.load("extracted_data/val_data.npy")
val_labels = np.load("extracted_data/val_labels.npy")

test_image_data = np.load("extracted_data/test_data.npy")
test_labels = np.load("extracted_data/test_labels.npy")

In [17]:
# Collect every entry of each split's spectrogram directory.
# NOTE(review): `glob.glob` does not guarantee any ordering, yet the images read
# from these lists are later paired index-by-index with the `*_image_data` and
# `*_labels` arrays -- confirm the file order matches the order those arrays
# were saved in.
train_spectrograms = glob.glob('extracted_train_spectrogram/*')
val_spectrograms = glob.glob('extracted_val_spectrogram/*')
test_spectrograms = glob.glob('extracted_test_spectrogram/*')

In [26]:
def _load_spectrograms(paths):
    """Read each image in `paths`, resize it to IMG_SIZE x IMG_SIZE and stack the results.

    Args:
        paths: Iterable of image file paths.

    Returns:
        `np.ndarray` holding one resized image per path, in the order of `paths`.

    Raises:
        ValueError: If OpenCV cannot read one of the files.
    """
    images = []
    for path in paths:
        img = cv2.imread(path)
        # `cv2.imread` reports failure by returning None rather than raising;
        # fail here with the offending path instead of inside `cv2.resize`.
        if img is None:
            raise ValueError(f"Could not read spectrogram image: {path}")
        images.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)))
    return np.array(images)


train_audio_data = _load_spectrograms(train_spectrograms)
val_audio_data = _load_spectrograms(val_spectrograms)
test_audio_data = _load_spectrograms(test_spectrograms)

In [30]:
def recall_m(y_true, y_pred):
    """Recall: rounded true positives divided by rounded actual positives.

    NOTE(review): a `recall_m` is also imported from `utilities` in the import
    cell; this definition replaces it for every cell executed afterwards --
    confirm the two agree.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    # Epsilon keeps the ratio finite when there are no positive labels.
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives.

    NOTE(review): a `precision_m` is also imported from `utilities` in the import
    cell; this definition replaces it for every cell executed afterwards --
    confirm the two agree.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    # Epsilon keeps the ratio finite when nothing is predicted positive.
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m`.

    NOTE(review): an `f1_m` is also imported from `utilities` in the import
    cell; this definition replaces it for every cell executed afterwards --
    confirm the two agree.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # Epsilon guards the division when precision and recall are both zero.
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [33]:
def run_experiment():
    """Train the late-fusion model, reload its checkpointed weights and print test metrics.

    Builds the fusion model from the module-level `transformer` and `cnn`
    models, trains it on the CPU, restores the weights written by the
    checkpoint callback, then evaluates on the test split.

    Returns:
        The fusion `keras.Model` with the checkpointed weights loaded.
    """
    log_dir = "logs/fit/fusion_temp"
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    filepath = os.getcwd() + "/fusion_temp/classifier"
    # `save_best_only=True`: the file is only overwritten when the monitored
    # quantity improves, so it ends up holding the best epoch's weights.
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    with tf.device('/device:CPU:0'):
        model = get_late_fusion_model(transformer,cnn)
        model.fit(
            [train_image_data, train_audio_data],
            train_labels,
            validation_data=([val_image_data, val_audio_data],val_labels),
            epochs=EPOCHS,
            # Pass the configured batch size explicitly so editing BATCH_SIZE
            # actually takes effect.
            batch_size=BATCH_SIZE,
            callbacks=[checkpoint, tensorboard_callback],
        )

    # Evaluate the checkpointed weights rather than the last epoch's.
    model.load_weights(filepath)
    # `evaluate` returns the loss followed by the compiled metrics, in order.
    _, accuracy, f1_score, precision, recall = model.evaluate(
        [test_image_data, test_audio_data],
        test_labels,
        batch_size=BATCH_SIZE,
        verbose=0,
    )
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"F1 score: {round(f1_score, 2)}")
    print(f"Precision: {round(precision, 2)}")
    print(f"Recall: {round(recall, 2)}")

    return model

In [34]:
# Train and evaluate the fusion model, keeping a handle on the returned model.
trained_model = run_experiment()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 224, 64) 1792        input_2[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 224, 224, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 112, 112, 64) 0           block1_conv2[0][0]               
____________________________________________________________________________________________