In [11]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import plot_model
from utilities import f1_m, recall_m, precision_m

from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow import keras

from sklearn.metrics import accuracy_score, classification_report

import numpy as np
import os 
import glob

## Early Fusion

In [2]:
# Hyperparameters
IMG_SIZE = 224
EPOCHS = 30
BATCH_SIZE = 32


MAX_SEQ_LENGTH = 128
FRAME_GAP = 11
NUM_FEATURES = 1024


In [3]:
def get_cnn_model():

    #Create CNN model
    model = keras.Sequential()
    model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(IMG_SIZE,IMG_SIZE,3)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Dropout(0.25))
    model.add(layers.Conv2D(64, (3, 3), padding='same', activation='relu'))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Conv2D(128, (3, 3), padding='same', activation='relu'))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Flatten())
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(4, activation='sigmoid'))


    # # compile the model
    # optimizer = keras.optimizers.SGD(learning_rate=0.0000001, decay=1e-6, momentum=0.9, nesterov=True)
    # model.compile(
    #     optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m]
    # )

    model.summary()
    return model

In [5]:
# Embedding Layer
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask
    
    def get_config(self):
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "output_dim": self.output_dim
        })
        return config


# Subclassed layer
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [5]:
def get_transformer_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 4
    num_heads = 1
    classes = 4

    inputs = keras.Input(shape=(None, None), name="input")
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)

    x = layers.Dense(units=embed_dim, activation='gelu')(x)
    x = layers.LayerNormalization()(x)


    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(classes, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)


    # optimizer = keras.optimizers.Adam(learning_rate=1e-4)
    # # compile the model
    # model.compile(
    #     optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m]
    # )
    
    model.summary()
    return model

In [17]:
def get_early_fusion_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 4
    num_heads = 1
    classes = 4

    # Create Transformer-based model
    inputs_rgb = keras.Input(shape=(None, None), name="input_image")
    x1 = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs_rgb)
    x1 = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x1)
    x1 = layers.Dense(units=embed_dim, activation='gelu')(x1)
    x1 = layers.LayerNormalization()(x1)
    x1 = layers.GlobalMaxPooling1D()(x1)
    x1 = layers.Dropout(0.5)(x1)


    #Create CNN model
    inputs_spec = keras.Input(shape=(IMG_SIZE,IMG_SIZE,3), name="input_spectrogram")
    # x2 = keras.Sequential()(inputs_spec)
    x2 = layers.Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(IMG_SIZE,IMG_SIZE,3))(inputs_spec)
    x2 = layers.Conv2D(64, (3, 3), activation='relu')(x2)
    x2 = layers.MaxPooling2D(pool_size=(2, 2))(x2)
    x2 = layers.Dropout(0.25)(x2)
    x2 = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(x2)
    x2 = layers.Conv2D(64, (3, 3), activation='relu')(x2)
    x2 = layers.MaxPooling2D(pool_size=(2, 2))(x2)
    x2 = layers.Dropout(0.5)(x2)
    x2 = layers.Conv2D(128, (3, 3), padding='same', activation='relu')(x2)
    x2 = layers.Conv2D(128, (3, 3), activation='relu')(x2)
    x2 = layers.MaxPooling2D(pool_size=(2, 2))(x2)
    x2 = layers.Dropout(0.5)(x2)
    x2 = layers.Flatten()(x2)
    x2 = layers.Dense(512, activation='relu')(x2)
    x2 = layers.Dropout(0.5)(x2)

    # LATE FUSION
    x = layers.concatenate([x1, x2])
    x = keras.Sequential()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(classes, activation="sigmoid")(x)

    
    model = keras.Model(inputs=[inputs_rgb, inputs_spec], outputs=outputs) # Inputs go into two different layers

    optimizer = keras.optimizers.Adam(learning_rate=1e-4)
    # compile the model
    model.compile(
        optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m]
    )
    
    model.summary()
    return model

In [18]:
# def get_early_fusion_model(model_1,model_2):
#     x1 = model_1.output
#     x2 = model_2.output
#     classes = 4

#     # LATE FUSION
#     x = layers.concatenate([x1, x2])
#     x = keras.Sequential()(x)
#     x = layers.Dropout(0.5)(x)
#     outputs = layers.Dense(classes, activation="sigmoid")(x)

#     model = keras.Model(inputs=[model_1.input, model_2.input], outputs=outputs) # Inputs go into two different layers

#     optimizer = keras.optimizers.Adam(learning_rate=1e-4)
#     # compile the model
#     model.compile(
#         optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m]
#     )
    
#     model.summary()
#     return model


### Train multimodal video classification model

In [6]:
train_image_data, train_labels = np.load("extracted_data/train_data.npy"), np.load("extracted_data/train_labels.npy")
val_image_data, val_labels = np.load("extracted_data/val_data.npy"), np.load("extracted_data/val_labels.npy")
test_image_data, test_labels = np.load("extracted_data/test_data.npy"), np.load("extracted_data/test_labels.npy")

In [7]:
train_spectrograms = glob.glob('extracted_train_spectrogram/*')
val_spectrograms = glob.glob('extracted_val_spectrogram/*')
test_spectrograms = glob.glob('extracted_test_spectrogram/*')

In [12]:
from keras_preprocessing import image
train_audio_data = []
val_audio_data = []
test_audio_data = []

for f in train_spectrograms:
    img = image.load_img(f, target_size= (IMG_SIZE,IMG_SIZE))
    img = image.img_to_array(img)
    train_audio_data.append(img)
    
train_audio_data = np.array(train_audio_data)

for f in val_spectrograms:
    img = image.load_img(f, target_size= (IMG_SIZE,IMG_SIZE))
    img = image.img_to_array(img)
    val_audio_data.append(img)
    
val_audio_data = np.array(val_audio_data)

for f in test_spectrograms:
    img = image.load_img(f, target_size= (IMG_SIZE,IMG_SIZE))
    img = image.img_to_array(img)
    test_audio_data.append(img)
    
test_audio_data = np.array(test_audio_data)

In [13]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [14]:
transformer = get_transformer_model()
cnn = get_cnn_model()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None, None)]      0         
_________________________________________________________________
frame_position_embedding (Po (None, None, 1024)        131072    
_________________________________________________________________
transformer_layer (Transform (None, None, 1024)        4211716   
_________________________________________________________________
dense_2 (Dense)              (None, None, 1024)        1049600   
_________________________________________________________________
layer_normalization_2 (Layer (None, None, 1024)        2048      
_________________________________________________________________
global_max_pooling1d (Global (None, 1024)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0     

In [23]:
def run_experiment():
    log_dir = "logs/fit/fusion_temp" 
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    filepath = os.getcwd() + "/early_fusion_temp/classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, monitor='val_f1_m',
        mode='max',
        save_best_only=True,
        verbose = True
    )

    with tf.device('/device:CPU:0'):
        # model = get_early_fusion_model(transformer,cnn)
        model = get_early_fusion_model()
        history = model.fit(
            [train_image_data, train_audio_data],
            train_labels,
            validation_data=([val_image_data, val_audio_data],val_labels),
            epochs=EPOCHS,
            callbacks=[checkpoint, tensorboard_callback],
        )

    model.load_weights(filepath)
    # _, accuracy = model.evaluate(test_data, test_labels)
    # evaluate the model
    loss, accuracy, f1_score, precision, recall = model.evaluate([test_image_data, test_audio_data], test_labels, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"F1 score: {round(f1_score, 2)}")
    print(f"Precision: {round(precision, 2)}")
    print(f"Recall: {round(recall, 2)}")

    return model

In [24]:
trained_model = run_experiment()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_spectrogram (InputLayer)  [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv2d_6 (Conv2D)               (None, 224, 224, 32) 896         input_spectrogram[0][0]          
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 222, 222, 64) 18496       conv2d_6[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_3 (MaxPooling2D)  (None, 111, 111, 64) 0           conv2d_7[0][0]                   
____________________________________________________________________________________________

## Late Fusion

In [21]:
def get_cnn_model():

    #Create CNN model
    model = keras.Sequential()
    model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(IMG_SIZE,IMG_SIZE,3)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Dropout(0.25))
    model.add(layers.Conv2D(64, (3, 3), padding='same', activation='relu'))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Conv2D(128, (3, 3), padding='same', activation='relu'))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Flatten())
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(4, activation='sigmoid'))


    # compile the model
    optimizer = keras.optimizers.SGD(learning_rate=0.0000001, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(
        optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m]
    )

    model.summary()
    return model

def get_transformer_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 4
    num_heads = 1
    classes = 4

    inputs = keras.Input(shape=(None, None), name="input")
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)

    x = layers.Dense(units=embed_dim, activation='gelu')(x)
    x = layers.LayerNormalization()(x)


    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(classes, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)


    optimizer = keras.optimizers.Adam(learning_rate=1e-4)
    # compile the model
    model.compile(
        optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m]
    )
    
    model.summary()
    return model

In [22]:
# filepath = os.getcwd() + "/tmp_3_4/video_classifier"
filepath = os.getcwd() + "/video_chkpt/video_classifier"
transformer = get_transformer_model()
transformer.load_weights(filepath)

# evaluate the transformer model
loss, accuracy, f1_score, precision, recall = transformer.evaluate(test_image_data, test_labels, verbose=0)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
print(f"F1 score: {round(f1_score, 2)}")
print(f"Precision: {round(precision, 2)}")
print(f"Recall: {round(recall, 2)}")

# filepath = os.getcwd() + "/temp/audio_classifier"
filepath = os.getcwd() + "/audio_chkpt/audio_classifier"
cnn = get_cnn_model()
cnn.load_weights(filepath)

# evaluate the cnn model
loss, accuracy, f1_score, precision, recall = cnn.evaluate(test_audio_data, test_labels, verbose=0)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
print(f"F1 score: {round(f1_score, 2)}")
print(f"Precision: {round(precision, 2)}")
print(f"Recall: {round(recall, 2)}")

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None, None)]      0         
_________________________________________________________________
frame_position_embedding (Po (None, None, 1024)        131072    
_________________________________________________________________
transformer_layer (Transform (None, None, 1024)        4211716   
_________________________________________________________________
dense_29 (Dense)             (None, None, 1024)        1049600   
_________________________________________________________________
layer_normalization_16 (Laye (None, None, 1024)        2048      
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 1024)              0   

In [83]:
def get_late_fusion():
    ## Extract the probabilities from each classifier for the late fusion
    res1 = transformer.predict(test_image_data)
    # print(res1)
    res2 = cnn.predict(test_audio_data)
    # print(res2)
    all_res = np.array([res1,res2])
    all_res = all_res.sum(0)
    return all_res

In [99]:


## Computing final prediction with late fusion without training
# results1 = all_res.sum(0).argmax(1)
# results2 = all_res.prod(0).argmax(1)
# results3 = np.median(all_res, 0).argmax(1)
# results4 = np.max(all_res, 0).argmax(1)

# results = all_res.sum(0)


def predictLabelForGivenThreshold(results, threshold):
    # y_pred=[]
    # for sample in results:
    #     y_pred.append([1 if i>=threshold else 0 for i in sample ] )
    # return np.array(y_pred)


    predictions = []
    for key,values in enumerate(list(results)):
        temp = []
        for v in values:
            v = (v >= threshold).astype(int)
            temp.append(v)
        predictions.append(temp) 
    predictions = np.array(predictions)

    return predictions

In [120]:
label_names = ['Mature', 'Slapstick', 'Gory', 'Sarcasm']
results = get_late_fusion()
y_pred  = predictLabelForGivenThreshold(results,0.7)
print(classification_report(test_labels, y_pred,target_names=label_names))

              precision    recall  f1-score   support

      Mature       0.24      1.00      0.38        24
   Slapstick       0.42      0.28      0.33        18
        Gory       0.30      0.50      0.37         6
     Sarcasm       0.54      0.86      0.67        43

   micro avg       0.36      0.76      0.49        91
   macro avg       0.37      0.66      0.44        91
weighted avg       0.42      0.76      0.51        91
 samples avg       0.36      0.55      0.42        91



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [121]:
from sklearn import metrics
print("F1 of each label: {}".format(metrics.f1_score(test_labels, y_pred, average=None)))

F1 of each label: [0.38095238 0.33333333 0.375      0.66666667]


In [122]:
def Accuracy(y_true, y_pred):
    temp = 0
    for i in range(y_true.shape[0]):
        temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))
    return temp / y_true.shape[0]

Accuracy(test_labels, y_pred)

0.34112149532710273

In [102]:
#Exact match ratio
MR = np.all(y_pred == test_labels, axis=1).mean()
print(MR)
print(accuracy_score(test_labels,y_pred))

0.12149532710280374
0.12149532710280374


In [108]:
img_res = transformer.predict(test_image_data)
img_y_pred = predictLabelForGivenThreshold(img_res,0.6)
print(classification_report(test_labels, img_y_pred,target_names=label_names))
Accuracy(test_labels, img_y_pred)
# print(img_y_pred)

              precision    recall  f1-score   support

      Mature       0.48      0.54      0.51        24
   Slapstick       1.00      0.11      0.20        18
        Gory       0.27      0.50      0.35         6
     Sarcasm       0.72      0.72      0.72        43

   micro avg       0.59      0.54      0.56        91
   macro avg       0.62      0.47      0.45        91
weighted avg       0.68      0.54      0.54        91
 samples avg       0.41      0.39      0.40        91



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))


nan

In [119]:
audio_res = cnn.predict(test_audio_data)
audio_y_pred = predictLabelForGivenThreshold(audio_res,0.1)
print(classification_report(test_labels, audio_y_pred,target_names=label_names))
Accuracy(test_labels, audio_y_pred)
# print(res1)

              precision    recall  f1-score   support

      Mature       0.22      1.00      0.37        24
   Slapstick       0.18      1.00      0.31        18
        Gory       0.00      0.00      0.00         6
     Sarcasm       0.40      1.00      0.57        43

   micro avg       0.27      0.93      0.42        91
   macro avg       0.20      0.75      0.31        91
weighted avg       0.28      0.93      0.43        91
 samples avg       0.27      0.68      0.38        91



  _warn_prf(average, modifier, msg_start, len(result))


0.26791277258566965

## Inference

In [None]:
img = cv2.imread('extracted_test_spectrogram/al-TxOuSqc8.02.jpg')
img = cv2.resize(img, (IMG_SIZE,IMG_SIZE))
img = np.array(img)
# img = test_data[104]
img = img.reshape((1,IMG_SIZE,IMG_SIZE,3))
# print(img.shape)

y_pred = cnn.predict(img)[0]

# # round probabilities to class labels
# y_pred = y_pred.round()


print(y_pred)