<a href="https://www.kaggle.com/code/vincemarcs/mvsa-fusion-models-ml?scriptVersionId=101037728" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
SEED = 61

import os
import re
import gc
import h5py
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
import random as python_random
import tensorflow_addons as tfa
import matplotlib.pyplot as plt

from tqdm import tqdm
from nltk import tokenize
from IPython.display import display_html
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from tensorflow.python.keras.layers import Layer, InputSpec, Lambda

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import RepeatedKFold, KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

from keras import backend as K
from keras import initializers,regularizers,constraints
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Reshape, Input, Embedding, Flatten, Dense, Dropout, BatchNormalization, Activation #, merge
from keras.layers import TimeDistributed, LSTM, GRU, Bidirectional, Convolution1D, MaxPooling1D, MaxPooling2D
from keras.layers.core import RepeatVector #, Reshape
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.models import Sequential, Model, load_model

def reset_seeds():
    np.random.seed(SEED) 
    python_random.seed(SEED)
    tf.random.set_seed(SEED)
    os.environ["PYTHONHASHSEED"] = str(SEED)

# from tensorflow.keras import Model
# from attention import Attention_input1, Attention_input2
# from keras.optimizers import SGD, RMSprop, Adagrad

In [2]:
def read_hdf5(path):
    read_file = h5py.File(path, 'r')
    
    feature_names = list(read_file.keys())
    loaded_data = []
    
    for name in feature_names:
        dataset = read_file[name][:]
        if dataset.dtype == np.dtype('object'):
            dataset = np.array([x.decode('UTF-8') for x in dataset])            
        loaded_data.append((name, dataset))

    return loaded_data

def loadz(path):
    data = np.load(path)['arr_0']
    return data

In [3]:
def load_labels(path):
    data = read_hdf5(path)

    for x in data:
        if x[0] == 'multimodal-labels':
            labels = x[1]
        if x[0] == 'text-labels':
            text_labels = x[1]
        if x[0] == 'image-labels':
            image_labels = x[1]
        
    return labels, text_labels, image_labels

def merge_mvsa(mvsa_single, mvsa_multiple):
    mvsa = np.concatenate((mvsa_single, mvsa_multiple), axis=0)
    return mvsa

def load_mvsa_feature(feature_name, merge=False):
    folder_path = os.path.join('../input/mvsa-features/', feature_name)
    single_file = 'mvsa-single-{}.npz'.format(feature_name)
    multiple_file = 'mvsa-multiple-{}.npz'.format(feature_name)
    mvsa_single = loadz(os.path.join(folder_path, single_file))
    mvsa_multiple = loadz(os.path.join(folder_path, multiple_file))
    
    if merge == True:
        return merge_mvsa(mvsa_single, mvsa_multiple)
    
    return mvsa_single, mvsa_multiple

In [4]:
# e.g. validation_split=0.1 -----> 8:1:1 ratio of train, val, test
def split_data(data, validation_split):
    num_val = int(validation_split * data.shape[0])
    data_train = data[:-(num_val*2)]
    data_val = data[-(num_val*2):-(num_val)]
    data_test = data[-num_val:]
    return data_train, data_val, data_test

In [5]:
def weighted_average(weights, probs):
    ''' Calculate the weighted average probability distribution from all input probs and its weights 
    weights: weights list (or array)
    probs: probability distributions array list
    '''
    output_probs = []
    weighted_probs = [probs[i] * weights[i] for i in range(len(weights))]
    for i in range(len(probs[0])):
        sum_prob = np.zeros(len(probs[0][0]))
        for j in range(len(weights)):
            sum_prob = np.sum((sum_prob, weighted_probs[j][i]), axis=0)
        output_probs.append(sum_prob)
    return np.asarray(output_probs, dtype='float32')

def get_average_weights(*scores, inverse=False):
    ''' Get the corresponding weight of each input score 
    inverse: (bool) get inverse weights value in case of the smaller score value, the bigger weight value (such as model loss)
    '''
    
    weights = []
    for score in scores:
        weights.append(score/np.sum(scores))
    
    if inverse == True:
        inverse_weights = []
        inverse = [1/weight for weight in weights]
        for inv in inverse:
            inverse_weights.append(inv/np.sum(inverse))
        weights = inverse_weights

    return weights

In [6]:
NUM_CLASSES = 3
f1_macro = tfa.metrics.F1Score(num_classes=NUM_CLASSES, average='macro', name='f1_macro')
f1_weighted = tfa.metrics.F1Score(num_classes=NUM_CLASSES, average='weighted', name='f1_weighted')
    
def create_model_text(input_shape, lstm=True):
    text_input = Input(shape=input_shape)
    if lstm == True:
        text_reshape = Reshape((1, -1)) (text_input)
        text_lstm = LSTM(NUM_LSTM) (text_reshape)
        outputs = Dense(NUM_CLASSES, activation='softmax') (text_lstm)
    else:
        outputs = Dense(NUM_CLASSES, activation='softmax') (text_input)
    model = Model(text_input, outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1_macro, f1_weighted])
    return model

def create_model_image(input_shape, lstm=True):
    image_input = Input(shape=input_shape)
    if lstm == True:
        image_reshape = Reshape((1, -1)) (image_input)
        image_lstm = LSTM(NUM_LSTM_IMG) (image_reshape)
        outputs = Dense(NUM_CLASSES, activation='softmax') (image_lstm)
    else:
        outputs = Dense(NUM_CLASSES, activation='softmax') (image_input)
    model = Model(image_input, outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1_macro, f1_weighted])
    return model

def create_model_IF(text_shape, image_shape, lstm=True):
    image_input = Input(shape=image_shape)
    text_input = Input(shape=text_shape)
    if lstm == True:
        text_reshape = Reshape((1, -1)) (text_input)
        text_lstm = LSTM(NUM_LSTM) (text_reshape)
        image_reshape = Reshape((1, -1)) (image_input)
        image_lstm = LSTM(NUM_LSTM) (image_reshape)
        text_image_concat = tf.keras.layers.Concatenate(axis=1)([text_lstm, image_lstm])
    else:
        text_image_concat = tf.keras.layers.Concatenate(axis=1)([text_input, image_input])
    concat_self_attention = tf.keras.layers.Attention() ([text_image_concat, text_image_concat])
#     concat_self_attention = Dropout(DROPOUT_RATE) (concat_self_attention)
    outputs = Dense(NUM_CLASSES, activation='softmax') (concat_self_attention)
    model = Model([text_input, image_input], outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1_macro, f1_weighted])
    return model

2022-07-17 09:35:05.973509: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 09:35:05.974574: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 09:35:05.975245: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 09:35:05.976056: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [7]:
def preprocess_inputs(X1, X2, y):
    y = le.fit_transform(y)
    y = to_categorical(np.asarray(y))
    
    random_idx = np.random.permutation(len(y))
    X1, X2, y = X1[random_idx], X2[random_idx], y[random_idx]
    
    X1_train, X1_val, X1_test = split_data(X1, VALIDATION_SPLIT)
    X2_train, X2_val, X2_test = split_data(X2, VALIDATION_SPLIT)
    y_train, y_val, y_test = split_data(y, VALIDATION_SPLIT)
    
    oversample = BorderlineSMOTE(sampling_strategy='minority', random_state=SEED, kind='borderline-2')
#     oversample = SMOTE(sampling_strategy='minority', random_state=SEED)
    X1_train, _ = oversample.fit_resample(X1_train, y_train)
    X2_train, y_train = oversample.fit_resample(X2_train, y_train)

    return {'texts': [X1_train, X1_val, X1_test], 'images': [X2_train, X2_val, X2_test], 'labels':[y_train, y_val, y_test]}

In [8]:
def run_and_evaluate_IF(name, data, verbose=0, lstm=True):
    '''  Train and test Intermediate Fusion model Multimodal Labels 
    X1: text input
    X2: image input
    y: multimodal labels
    '''
    
    X1_train = data['texts'][0]
    X1_val = data['texts'][1]
    X1_test = data['texts'][2]
    
    X2_train = data['images'][0]
    X2_val = data['images'][1]
    X2_test = data['images'][2]
    
    y_train = data['labels'][0]
    y_val = data['labels'][1]
    y_test = data['labels'][2]

    
#     y = le.fit_transform(y)
#     y = to_categorical(np.asarray(y))
    
#     random_idx = np.random.permutation(len(y))
#     X1, X2, y = X1[random_idx], X2[random_idx], y[random_idx]
    
#     X1_train, X1_val, X1_test = split_data(X1, VALIDATION_SPLIT)
#     X2_train, X2_val, X2_test = split_data(X2, VALIDATION_SPLIT)
#     y_train, y_val, y_test = split_data(y, VALIDATION_SPLIT)
    
#     oversample = BorderlineSMOTE(sampling_strategy='minority', random_state=SEED, kind='borderline-2')
# #     oversample = SMOTE(sampling_strategy='minority', random_state=SEED)
#     X1_train, _ = oversample.fit_resample(X1_train, y_train)
#     X2_train, y_train = oversample.fit_resample(X2_train, y_train)

    
    if lstm == False:
        model = create_model_IF(X1_train.shape[1:], X2_train.shape[1:], lstm=False)
    else:
        model = create_model_IF(X1_train.shape[1:], X2_train.shape[1:])
        
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=EARLY_STOPPING)
    checkpoint = ModelCheckpoint('./model_checkpoint/{}.h5'.format(name), save_best_only=True, verbose=verbose)
    
    if 'multiple' in name:
        batch_size = 256
    else:
        batch_size = BATCH_SIZE # 128
    
    history = model.fit([X1_train, X2_train], y_train, validation_data=([X1_val, X2_val], y_val), 
                        epochs=EPOCHS, batch_size=batch_size, verbose=verbose,
                        callbacks=[checkpoint, early_stopping])
    if verbose == 1:
        best_epoch = np.argmin(history.history['val_loss'])
        print('\nCheckpoint loaded at epoch:', best_epoch)
        
    return history, evaluate_model_IF(model, X1_test, X2_test, y_test, checkpoint=name, verbose=verbose)

def evaluate_model_IF(model, X_texts, X_images, y_test, checkpoint=None, verbose=1):
    if checkpoint is not None:
        model = load_model('./model_checkpoint/{}.h5'.format(checkpoint))
    
    loss, acc, f1_macro, f1_weighted = model.evaluate([X_texts, X_images], y_test, verbose=verbose)

    if verbose == 1:
        print('Loss:', loss)
        print('Accuracy:', acc)
        print('Macro F1-score:', f1_macro)
        print('Weighted F1-score:', f1_weighted)

        y_pred = model.predict([X_texts, X_images])
        matrix = confusion_matrix(le.inverse_transform(y_test.argmax(axis=1)), le.inverse_transform(y_pred.argmax(axis=1)), 
                                  labels=list(le.classes_))
        cm_disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                  display_labels=list(le.classes_))
        cm_disp.plot()
        plt.show()
        
    return loss, acc, f1_macro, f1_weighted

In [9]:
def run_and_evaluate_LF(name, data, verbose=0, lstm=True):
    ''' Train and test Late Fusion model only with Multimodal Labels 
    X1: text input
    X2: image input
    y: multimodal labels
    y1: text labels
    y2: image labels
    verbose: 0 or 1 to print tracking on progress
    '''
#     y = le.fit_transform(y)
#     y = to_categorical(np.asarray(y))

#     random_idx = np.random.permutation(len(y))
#     X1, X2, y = X1[random_idx], X2[random_idx], y[random_idx]
    
#     X1_train, X1_val, X1_test = split_data(X1, VALIDATION_SPLIT)
#     X2_train, X2_val, X2_test = split_data(X2, VALIDATION_SPLIT)
#     y_train, y_val, y_test = split_data(y, VALIDATION_SPLIT)
    
#     oversample = BorderlineSMOTE(sampling_strategy='minority', random_state=SEED, kind='borderline-2')
# #     oversample = SMOTE(sampling_strategy='minority', random_state=SEED)
#     X1_train, _ = oversample.fit_resample(X1_train, y_train)
#     X2_train, y_train = oversample.fit_resample(X2_train, y_train)

    X1_train = data['texts'][0]
    X1_val = data['texts'][1]
    X1_test = data['texts'][2]
    
    X2_train = data['images'][0]
    X2_val = data['images'][1]
    X2_test = data['images'][2]
    
    y_train = data['labels'][0]
    y_val = data['labels'][1]
    y_test = data['labels'][2]

    if lstm == False:
        model_text = create_model_text(X1_train.shape[1:], lstm=False)
        model_image = create_model_image(X2_train.shape[1:], lstm=False)
    else:
        model_text = create_model_text(X1_train.shape[1:])
        model_image = create_model_image(X2_train.shape[1:])

    early_stopping1 = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=EARLY_STOPPING)
    early_stopping2 = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=EARLY_STOPPING)
    checkpoint_text = ModelCheckpoint('./model_checkpoint/{}-text.h5'.format(name), save_best_only=True, verbose=verbose)
    checkpoint_image = ModelCheckpoint('./model_checkpoint/{}-image.h5'.format(name), save_best_only=True, verbose=verbose)
    
    if 'multiple' in name:
        batch_size = 256
    else:
        batch_size = BATCH_SIZE # 128
    
    history_text = model_text.fit(X1_train, y_train, validation_data=(X1_val, y_val), 
                                  epochs=EPOCHS, batch_size=batch_size, verbose=verbose,
                                  callbacks=[checkpoint_text, early_stopping1])
    
    history_image = model_image.fit(X2_train, y_train, validation_data=(X2_val, y_val), 
                                    epochs=EPOCHS, batch_size=batch_size, verbose=verbose,
                                    callbacks=[checkpoint_image, early_stopping2])
    
    best_val_acc_text = np.max(history_text.history['val_accuracy'])
    best_val_acc_image = np.max(history_image.history['val_accuracy'])

    weights = get_average_weights(best_val_acc_text, best_val_acc_image)

    model_text = load_model('./model_checkpoint/{}-text.h5'.format(name))
    model_image = load_model('./model_checkpoint/{}-image.h5'.format(name))

    y_pred_text = model_text.predict(X1_test)
    y_pred_image = model_image.predict(X2_test)

    y_pred = weighted_average(weights, np.asarray([y_pred_text, y_pred_image], dtype='float32'))

    best_epoch_text = np.argmin(history_text.history['val_loss'])
    best_epoch_image = np.argmin(history_image.history['val_loss'])

    if verbose == 1:
        print('Checkpoint of text model loaded at epoch:', best_epoch_text)
        print('Checkpoint of image model loaded at epoch:', best_epoch_image)

    return evaluate_model_LF(y_test, y_pred, verbose=verbose)

In [10]:
def run_and_evaluate_HF(name, data, verbose=0, lstm=True):
    ''' Train Hybrid Fusion model and test only with Multimodal Labels
    X1: text input
    X2: image input
    y: labels
    verbose: 0 or 1 to print tracking on progress
    '''
#     y = le.fit_transform(y)
#     y = to_categorical(np.asarray(y))
    
#     random_idx = np.random.permutation(len(y))
#     X1, X2, y = X1[random_idx], X2[random_idx], y[random_idx]
    
#     X1_train, X1_val, X1_test = split_data(X1, VALIDATION_SPLIT)
#     X2_train, X2_val, X2_test = split_data(X2, VALIDATION_SPLIT)
#     y_train, y_val, y_test = split_data(y, VALIDATION_SPLIT)
    
#     oversample = BorderlineSMOTE(sampling_strategy='minority', random_state=SEED, kind='borderline-2')
# #     oversample = SMOTE(sampling_strategy='minority', random_state=SEED)
#     X1_train, _ = oversample.fit_resample(X1_train, y_train)
#     X2_train, y_train = oversample.fit_resample(X2_train, y_train)

    X1_train = data['texts'][0]
    X1_val = data['texts'][1]
    X1_test = data['texts'][2]
    
    X2_train = data['images'][0]
    X2_val = data['images'][1]
    X2_test = data['images'][2]
    
    y_train = data['labels'][0]
    y_val = data['labels'][1]
    y_test = data['labels'][2]
    
    if lstm == False:
        model_text = create_model_text(X1_train.shape[1:], lstm=False)
        model_image = create_model_image(X2_train.shape[1:], lstm=False)
        model_IF = create_model_IF(X1_train.shape[1:], X2_train.shape[1:], lstm=False)
    else:
        model_text = create_model_text(X1_train.shape[1:])
        model_image = create_model_image(X2_train.shape[1:])
        model_IF = create_model_IF(X1_train.shape[1:], X2_train.shape[1:])

    early_stopping1 = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=EARLY_STOPPING)
    early_stopping2 = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=EARLY_STOPPING)
    early_stopping3 = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=EARLY_STOPPING)

    checkpoint_text = ModelCheckpoint('./model_checkpoint/{}-text.h5'.format(name), save_best_only=True, verbose=verbose)
    checkpoint_image = ModelCheckpoint('./model_checkpoint/{}-image.h5'.format(name), save_best_only=True, verbose=verbose)
    checkpoint_IF = ModelCheckpoint('./model_checkpoint/{}-IF.h5'.format(name), save_best_only=True, verbose=verbose)

    if 'multiple' in name:
        batch_size = 256
    else:
        batch_size = BATCH_SIZE # 128
    
    history_text = model_text.fit(X1_train, y_train, validation_data=(X1_val, y_val), 
                        epochs=EPOCHS, batch_size=batch_size, verbose=verbose,
                        callbacks=[checkpoint_text, early_stopping1])
    
    history_image = model_image.fit(X2_train, y_train, validation_data=(X2_val, y_val), 
                        epochs=EPOCHS, batch_size=batch_size, verbose=verbose,
                        callbacks=[checkpoint_image, early_stopping2])
    
    history_IF = model_IF.fit([X1_train, X2_train], y_train, validation_data=([X1_val, X2_val], y_val), 
                        epochs=EPOCHS, batch_size=batch_size, verbose=verbose,
                        callbacks=[checkpoint_IF, early_stopping3])
    
    best_val_acc_text = np.max(history_text.history['val_accuracy'])
    best_val_acc_image = np.max(history_image.history['val_accuracy'])
    best_val_acc_IF = np.max(history_IF.history['val_accuracy'])
    
    weights = get_average_weights(best_val_acc_text, best_val_acc_image, best_val_acc_IF)

    model_text = load_model('./model_checkpoint/{}-text.h5'.format(name))
    model_image = load_model('./model_checkpoint/{}-image.h5'.format(name))
    model_IF = load_model('./model_checkpoint/{}-IF.h5'.format(name))

    y_pred_text = model_text.predict(X1_test)
    y_pred_image = model_image.predict(X2_test)
    y_pred_IF = model_IF.predict([X1_test, X2_test])
    
    y_pred = weighted_average(weights, np.asarray([y_pred_text, y_pred_image, y_pred_IF], dtype='float32'))
    
    best_epoch_text = np.argmin(history_text.history['val_loss'])
    best_epoch_image = np.argmin(history_image.history['val_loss'])
    best_epoch_IF = np.argmin(history_IF.history['val_loss'])
    
    if verbose == 1:
        print('Checkpoint of text model loaded at epoch:', best_epoch_text)
        print('Checkpoint of image model loaded at epoch:', best_epoch_image)
        print('Checkpoint of IF model loaded at epoch:', best_epoch_IF)

    return evaluate_model_LF(y_test, y_pred, verbose=verbose)

In [11]:
def evaluate_model_LF(y_true, y_pred, verbose=0):
    
    y_pred = le.inverse_transform(y_pred.argmax(axis=1))
    y_true = le.inverse_transform(y_true.argmax(axis=1))

    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    if verbose == 1:
        print(classification_report(y_true, y_pred))
        matrix = confusion_matrix(y_true, y_pred,
                                  labels=list(le.classes_))
        cm_disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                  display_labels=list(le.classes_))
        cm_disp.plot()
        plt.show()

    return acc, f1_macro, f1_weighted

In [12]:
def plot_metrics(history):
    fig = plt.figure(figsize=(20, 5))

    fig.add_subplot(1, 4, 1)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('LOSS')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='best')

    fig.add_subplot(1, 4, 2)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('ACCURACY')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='best')

    fig.add_subplot(1, 4, 3)
    plt.plot(history.history['f1_macro'])
    plt.plot(history.history['val_f1_macro'])
    plt.title('Macro F1-SCORE')
    plt.ylabel('f1-macro')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='best')
    
    fig.add_subplot(1, 4, 4)
    plt.plot(history.history['f1_weighted'])
    plt.plot(history.history['val_f1_weighted'])
    plt.title('Weighted F1-SCORE')
    plt.ylabel('f1-weighted')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='best')

    plt.show()

In [13]:
def style_dataframe(dataframe):
    return dataframe.style.highlight_max(subset=['Accuracy', 'F1-macro', 'F1-weighted'], props='color:lawngreen', axis=0)\
                          .highlight_min(subset=['Accuracy', 'F1-macro', 'F1-weighted'], props='color:tomato', axis=0)

def highlight_neg(cell):
    if type(cell) != str and cell < 0 :
        return 'color: tomato'
    else:
        return 'color: lawngreen'

def display_dataframes(dfs, names=[], index=False):
    def to_df(x):
        if isinstance(x, pd.Series):
            return pd.DataFrame(x)
        else:
            return x
    html_str = ''
    if names:
        html_str += ('<tr>' + 
                     ''.join(f'<td style="text-align:center">{name}</td>' for name in names) + 
                     '</tr>')
    html_str += ('<tr>' + 
                 ''.join(f'<td style="vertical-align:top"> {to_df(df).to_html()}</td>' 
                         for df in dfs) + 
                 '</tr>')
    html_str = f'<table>{html_str}</table>'
    html_str = html_str.replace('table','table style="display:inline"')
    display_html(html_str, raw=True)

# Load data

In [14]:
# ## Choose best
# # Load text feature
# mvsa_single_bert, mvsa_multiple_bert = load_mvsa_feature('bert-base')
# mvsa_single_pos_bow, mvsa_multiple_pos_bow = load_mvsa_feature('pos-bow')
# mvsa_single_pos_tfidf, mvsa_multiple_pos_tfidf = load_mvsa_feature('pos-tfidf')
# mvsa_single_ner_bow, mvsa_multiple_ner_bow = load_mvsa_feature('ner-bow')
# mvsa_single_ner_tfidf, mvsa_multiple_ner_tfidf = load_mvsa_feature('ner-tfidf')

# ## Load image feature
# mvsa_single_vgg16, mvsa_multiple_vgg16 = load_mvsa_feature('vgg16')
# mvsa_single_vgg19, mvsa_multiple_vgg19 = load_mvsa_feature('vgg19')
# mvsa_single_resnet50, mvsa_multiple_resnet50 = load_mvsa_feature('resnet50')
# mvsa_single_resnet101, mvsa_multiple_resnet101 = load_mvsa_feature('resnet101')
# mvsa_single_resnet152, mvsa_multiple_resnet152 = load_mvsa_feature('resnet152')
# mvsa_single_densenet121, mvsa_multiple_densenet121 = load_mvsa_feature('densenet121')
# mvsa_single_densenet169, mvsa_multiple_densenet169 = load_mvsa_feature('densenet169')
# mvsa_single_densenet201, mvsa_multiple_densenet201 = load_mvsa_feature('densenet201')

In [15]:
# mvsa_single_bert_pos = np.concatenate((mvsa_single_bert, mvsa_single_pos_tfidf), axis=1)
# mvsa_single_bert_ner = np.concatenate((mvsa_single_bert, mvsa_single_ner_tfidf), axis=1)
# mvsa_single_bert_pos_ner = np.concatenate((mvsa_single_bert, mvsa_single_pos_tfidf, mvsa_single_ner_tfidf), axis=1)

# mvsa_multiple_bert_pos = np.concatenate((mvsa_multiple_bert, mvsa_multiple_pos_tfidf), axis=1)
# mvsa_multiple_bert_ner = np.concatenate((mvsa_multiple_bert, mvsa_multiple_ner_tfidf), axis=1)
# mvsa_multiple_bert_pos_ner = np.concatenate((mvsa_multiple_bert, mvsa_multiple_pos_tfidf, mvsa_multiple_ner_tfidf), axis=1)

In [16]:
mvsa_single_multimodal_labels, mvsa_single_text_labels, mvsa_single_image_labels = load_labels('../input/mvsa-features/labels/mvsa-single-labels.hdf5')
mvsa_multiple_multimodal_labels, mvsa_multiple_text_labels, mvsa_multiple_image_labels = load_labels('../input/mvsa-features/labels/mvsa-multiple-labels.hdf5')

le = LabelEncoder()
le.fit(mvsa_multiple_multimodal_labels)
NUM_CLASSES = len(le.classes_) # = 3

In [17]:
# prepare all features data

feature_names = ['resnet101-bert', 'resnet101-bert-lstm', 'densenet201-bert-lstm', 'densenet201-bert-pos-lstm', 'densenet201-bert-ner-lstm', 'densenet201-bert-pos-ner-lstm']
mvsa_single_features = []
mvsa_multiple_features = []

for name in feature_names:
    textual = name.split('-')[1]
    visual = name.split('-')[0]
    
    if textual == 'bert':
        textual = 'bert-base'
        
    textual_features = load_mvsa_feature(textual)
    visual_features = load_mvsa_feature(visual)

    if 'pos' in name and 'ner' not in name:
        pos_features = load_mvsa_feature('pos-tfidf')
        mvsa_single_features.append([np.concatenate((textual_features[0], pos_features[0]), axis=1), visual_features[0]])
        mvsa_multiple_features.append([np.concatenate((textual_features[1], pos_features[1]), axis=1), visual_features[1]])

    elif 'pos' not in name and 'ner' in name:
        ner_features = load_mvsa_feature('ner-tfidf')
        mvsa_single_features.append([np.concatenate((textual_features[0], ner_features[0]), axis=1), visual_features[0]])
        mvsa_multiple_features.append([np.concatenate((textual_features[1], ner_features[1]), axis=1), visual_features[1]])

    elif 'pos' in name and 'ner' in name:
        pos_features = load_mvsa_feature('pos-tfidf')
        ner_features = load_mvsa_feature('ner-tfidf')
        mvsa_single_features.append([np.concatenate((textual_features[0], pos_features[0], ner_features[0]), axis=1), visual_features[0]])
        mvsa_multiple_features.append([np.concatenate((textual_features[1], pos_features[1], ner_features[1]), axis=1), visual_features[1]])
    
    else:
        mvsa_single_features.append([textual_features[0], visual_features[0]])
        mvsa_multiple_features.append([textual_features[1], visual_features[1]])

    
# mvsa_single_features = [[mvsa_single_bert, mvsa_single_resnet101],
#                         [mvsa_single_bert, mvsa_single_resnet101],
#                         [mvsa_single_bert, mvsa_single_densenet201],
#                         [mvsa_single_bert_pos, mvsa_single_densenet201],
#                         [mvsa_single_bert_ner, mvsa_single_densenet201],                        
#                         [mvsa_single_bert_pos_ner, mvsa_single_densenet201]]

# mvsa_multiple_features = [[mvsa_multiple_bert, mvsa_multiple_resnet101],
#                           [mvsa_multiple_bert, mvsa_multiple_resnet101],
#                           [mvsa_multiple_bert, mvsa_multiple_densenet201],
#                           [mvsa_multiple_bert_pos, mvsa_multiple_densenet201],
#                           [mvsa_multiple_bert_ner, mvsa_multiple_densenet201],
#                           [mvsa_multiple_bert_pos_ner, mvsa_multiple_densenet201]]

In [18]:
reset_seeds()
EPOCHS = 100
BATCH_SIZE = 128
VALIDATION_SPLIT = 0.1
EARLY_STOPPING = 10
NUM_LSTM = 64
NUM_LSTM_IMG = 128
DROPOUT_RATE = 0.1

In [19]:
mvsa_single_features_split = []
mvsa_multiple_features_split = []

for x in mvsa_single_features:
    splits = preprocess_inputs(x[0], x[1], mvsa_single_multimodal_labels)
    mvsa_single_features_split.append(splits)
for x in mvsa_multiple_features:
    splits = preprocess_inputs(x[0], x[1], mvsa_multiple_multimodal_labels)
    mvsa_multiple_features_split.append(splits)

# Intermediate Fusion

In [20]:
print('MVSA-Single with Intermediate Fusion\n')
scores = []
for i in range(len(feature_names)):
    print('MVSA-Single:', feature_names[i])
    if 'lstm' in feature_names[i]:
        _, score = run_and_evaluate_IF('single-IF-' + feature_names[i], mvsa_single_features_split[i],#[0], mvsa_single_features[i][1], 
#                                      mvsa_single_multimodal_labels,
                                     verbose=0)
    else:
        _, score = run_and_evaluate_IF('single-IF-' + feature_names[i], mvsa_single_features_split[i],#[0], mvsa_single_features[i][1],
# #                                      mvsa_single_multimodal_labels,
                                     verbose=0, lstm=False)
    scores.append(score)
    print()
df_single_scores_IF = pd.DataFrame(scores, columns=['Loss', 'Accuracy', 'F1-macro', 'F1-weighted'], index=feature_names)


print('MVSA-Multiple with Intermediate Fusion\n')
scores = []
for i in range(len(feature_names)):
    print('MVSA-Multiple:', feature_names[i])
    if 'lstm' in feature_names[i]:
        _, score = run_and_evaluate_IF('multiple-IF-' + feature_names[i], mvsa_multiple_features_split[i],#[0], mvsa_multiple_features[i][1], 
#                                      mvsa_multiple_multimodal_labels,
                                     verbose=0)
    else:
        _, score = run_and_evaluate_IF('multiple-IF-' + feature_names[i], mvsa_multiple_features_split[i],#[0], mvsa_multiple_features[i][1],
#                                      mvsa_multiple_multimodal_labels,
                                     verbose=0, lstm=False)
    scores.append(score)
    print()
df_multiple_scores_IF = pd.DataFrame(scores, columns=['Loss', 'Accuracy', 'F1-macro', 'F1-weighted'], index=feature_names)

MVSA-Single with Intermediate Fusion

MVSA-Single: resnet101-bert


2022-07-17 09:35:37.787520: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)



MVSA-Single: resnet101-bert-lstm


2022-07-17 09:35:47.275914: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005



MVSA-Single: densenet201-bert-lstm

MVSA-Single: densenet201-bert-pos-lstm

MVSA-Single: densenet201-bert-ner-lstm

MVSA-Single: densenet201-bert-pos-ner-lstm

MVSA-Multiple with Intermediate Fusion

MVSA-Multiple: resnet101-bert

MVSA-Multiple: resnet101-bert-lstm

MVSA-Multiple: densenet201-bert-lstm

MVSA-Multiple: densenet201-bert-pos-lstm

MVSA-Multiple: densenet201-bert-ner-lstm

MVSA-Multiple: densenet201-bert-pos-ner-lstm



# Late Fusion

In [21]:
print('MVSA-Single with Late Fusion\n')
scores = []
for i in range(len(feature_names)):
    print('MVSA-Single:', feature_names[i])
    if 'lstm' in feature_names[i]:
        score = run_and_evaluate_LF('single-LF-' + feature_names[i], mvsa_single_features_split[i],#[0], mvsa_single_features[i][1], 
#                                      mvsa_single_multimodal_labels,
                                     verbose=0)
    else:
        score = run_and_evaluate_LF('single-LF-' + feature_names[i], mvsa_single_features_split[i],#[0], mvsa_single_features[i][1],
#                                      mvsa_single_multimodal_labels,
                                     verbose=0, lstm=False)
    scores.append(score)
    print()
df_single_scores_LF = pd.DataFrame(scores, columns=['Accuracy', 'F1-macro', 'F1-weighted'], index=feature_names)


print('MVSA-Multiple with Late Fusion\n')
scores = []
for i in range(len(feature_names)):
    print('MVSA-Multiple:', feature_names[i])
    if 'lstm' in feature_names[i]:
        score = run_and_evaluate_LF('multiple-LF-' + feature_names[i], mvsa_multiple_features_split[i],#[0], mvsa_multiple_features[i][1], 
#                                      mvsa_multiple_multimodal_labels,
                                     verbose=0)
    else:
        score = run_and_evaluate_LF('multiple-LF-' + feature_names[i], mvsa_multiple_features_split[i],#[0], mvsa_multiple_features[i][1],
#                                      mvsa_multiple_multimodal_labels,
                                     verbose=0, lstm=False)
    scores.append(score)
    print()
df_multiple_scores_LF = pd.DataFrame(scores, columns=['Accuracy', 'F1-macro', 'F1-weighted'], index=feature_names)

MVSA-Single with Late Fusion

MVSA-Single: resnet101-bert

MVSA-Single: resnet101-bert-lstm

MVSA-Single: densenet201-bert-lstm

MVSA-Single: densenet201-bert-pos-lstm

MVSA-Single: densenet201-bert-ner-lstm

MVSA-Single: densenet201-bert-pos-ner-lstm

MVSA-Multiple with Late Fusion

MVSA-Multiple: resnet101-bert

MVSA-Multiple: resnet101-bert-lstm

MVSA-Multiple: densenet201-bert-lstm

MVSA-Multiple: densenet201-bert-pos-lstm

MVSA-Multiple: densenet201-bert-ner-lstm

MVSA-Multiple: densenet201-bert-pos-ner-lstm



# Hybrid Fusion

In [22]:
print('MVSA-Single with Hybrid Fusion\n')
scores = []
for i in range(len(feature_names)):
    print('MVSA-Single:', feature_names[i])
    if 'lstm' in feature_names[i]:
        score = run_and_evaluate_HF('single-HF-' + feature_names[i], mvsa_single_features_split[i],#[0], mvsa_single_features[i][1], 
#                                      mvsa_single_multimodal_labels,
                                     verbose=0)
    else:
        score = run_and_evaluate_HF('single-HF-' + feature_names[i], mvsa_single_features_split[i],#[0], mvsa_single_features[i][1],
#                                      mvsa_single_multimodal_labels,
                                     verbose=0, lstm=False)
    scores.append(score)
    print()
df_single_scores_HF = pd.DataFrame(scores, columns=['Accuracy', 'F1-macro', 'F1-weighted'], index=feature_names)


print('MVSA-Multiple with Hybrid Fusion\n')
scores = []
for i in range(len(feature_names)):
    print('MVSA-Multiple:', feature_names[i])
    if 'lstm' in feature_names[i]:
        score = run_and_evaluate_HF('multiple-HF-' + feature_names[i], mvsa_multiple_features_split[i],#[0], mvsa_multiple_features[i][1], 
#                                      mvsa_multiple_multimodal_labels,
                                     verbose=0)
    else:
        score = run_and_evaluate_HF('multiple-HF-' + feature_names[i], mvsa_multiple_features_split[i],#[0], mvsa_multiple_features[i][1],
#                                      mvsa_multiple_multimodal_labels,
                                     verbose=0, lstm=False)
    scores.append(score)
    print()
df_multiple_scores_HF = pd.DataFrame(scores, columns=['Accuracy', 'F1-macro', 'F1-weighted'], index=feature_names)

MVSA-Single with Hybrid Fusion

MVSA-Single: resnet101-bert

MVSA-Single: resnet101-bert-lstm

MVSA-Single: densenet201-bert-lstm

MVSA-Single: densenet201-bert-pos-lstm

MVSA-Single: densenet201-bert-ner-lstm

MVSA-Single: densenet201-bert-pos-ner-lstm

MVSA-Multiple with Hybrid Fusion

MVSA-Multiple: resnet101-bert

MVSA-Multiple: resnet101-bert-lstm

MVSA-Multiple: densenet201-bert-lstm

MVSA-Multiple: densenet201-bert-pos-lstm

MVSA-Multiple: densenet201-bert-ner-lstm

MVSA-Multiple: densenet201-bert-pos-ner-lstm



# Display results

In [23]:
df_single_scores_IF = df_single_scores_IF.drop(columns=['Loss'])
df_multiple_scores_IF = df_multiple_scores_IF.drop(columns=['Loss'])

In [24]:
print('Intermediate Fusion')
display_dataframes((style_dataframe(df_single_scores_IF), style_dataframe(df_multiple_scores_IF)), 
                   names=['MVSA-Single', 'MVSA-Multiple'])

Intermediate Fusion


Unnamed: 0_level_0,Accuracy,F1-macro,F1-weighted
Unnamed: 0_level_1,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.578714,0.279562,0.46222
resnet101-bert-lstm,0.623060,0.25592,0.47836
densenet201-bert-lstm,0.707317,0.613189,0.70757
densenet201-bert-pos-lstm,0.678492,0.595506,0.674061
densenet201-bert-ner-lstm,0.651885,0.549898,0.645954
densenet201-bert-pos-ner-lstm,0.671840,0.595302,0.670598
resnet101-bert,0.634548,0.298537,0.529801
resnet101-bert-lstm,0.679201,0.489964,0.640661
densenet201-bert-lstm,0.680964,0.471986,0.636375
densenet201-bert-pos-lstm,0.698002,0.469049,0.643169

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.578714,0.279562,0.46222
resnet101-bert-lstm,0.62306,0.25592,0.47836
densenet201-bert-lstm,0.707317,0.613189,0.70757
densenet201-bert-pos-lstm,0.678492,0.595506,0.674061
densenet201-bert-ner-lstm,0.651885,0.549898,0.645954
densenet201-bert-pos-ner-lstm,0.67184,0.595302,0.670598

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.634548,0.298537,0.529801
resnet101-bert-lstm,0.679201,0.489964,0.640661
densenet201-bert-lstm,0.680964,0.471986,0.636375
densenet201-bert-pos-lstm,0.698002,0.469049,0.643169
densenet201-bert-ner-lstm,0.692127,0.517109,0.661135
densenet201-bert-pos-ner-lstm,0.683901,0.506193,0.631262


In [25]:
print('Late Fusion with Multimodal Labels')
display_dataframes((style_dataframe(df_single_scores_LF), style_dataframe(df_multiple_scores_LF)), 
                   names=['MVSA-Single', 'MVSA-Multiple'])

Late Fusion with Multimodal Labels


Unnamed: 0_level_0,Accuracy,F1-macro,F1-weighted
Unnamed: 0_level_1,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.631929,0.510017,0.616717
resnet101-bert-lstm,0.662971,0.500791,0.623552
densenet201-bert-lstm,0.733925,0.655188,0.728949
densenet201-bert-pos-lstm,0.662971,0.560756,0.642657
densenet201-bert-ner-lstm,0.645233,0.530882,0.631838
densenet201-bert-pos-ner-lstm,0.689579,0.595487,0.67327
resnet101-bert,0.631610,0.424405,0.577364
resnet101-bert-lstm,0.673325,0.452449,0.613624
densenet201-bert-lstm,0.684489,0.446921,0.625657
densenet201-bert-pos-lstm,0.682726,0.450127,0.625696

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.631929,0.510017,0.616717
resnet101-bert-lstm,0.662971,0.500791,0.623552
densenet201-bert-lstm,0.733925,0.655188,0.728949
densenet201-bert-pos-lstm,0.662971,0.560756,0.642657
densenet201-bert-ner-lstm,0.645233,0.530882,0.631838
densenet201-bert-pos-ner-lstm,0.689579,0.595487,0.67327

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.63161,0.424405,0.577364
resnet101-bert-lstm,0.673325,0.452449,0.613624
densenet201-bert-lstm,0.684489,0.446921,0.625657
densenet201-bert-pos-lstm,0.682726,0.450127,0.625696
densenet201-bert-ner-lstm,0.697415,0.486694,0.651322
densenet201-bert-pos-ner-lstm,0.676263,0.4631,0.614444


In [26]:
print('Hybrid Fusion with Multimodal Labels')
display_dataframes((style_dataframe(df_single_scores_HF), style_dataframe(df_multiple_scores_HF)), 
                   names=['MVSA-Single', 'MVSA-Multiple'])

Hybrid Fusion with Multimodal Labels


Unnamed: 0_level_0,Accuracy,F1-macro,F1-weighted
Unnamed: 0_level_1,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.631929,0.461134,0.568088
resnet101-bert-lstm,0.682927,0.54037,0.660534
densenet201-bert-lstm,0.720621,0.638432,0.716096
densenet201-bert-pos-lstm,0.682927,0.599115,0.676018
densenet201-bert-ner-lstm,0.651885,0.536162,0.645467
densenet201-bert-pos-ner-lstm,0.705100,0.620957,0.686851
resnet101-bert,0.649824,0.426304,0.57934
resnet101-bert-lstm,0.686839,0.471838,0.629208
densenet201-bert-lstm,0.683314,0.441569,0.619165
densenet201-bert-pos-lstm,0.695652,0.463463,0.637544

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.631929,0.461134,0.568088
resnet101-bert-lstm,0.682927,0.54037,0.660534
densenet201-bert-lstm,0.720621,0.638432,0.716096
densenet201-bert-pos-lstm,0.682927,0.599115,0.676018
densenet201-bert-ner-lstm,0.651885,0.536162,0.645467
densenet201-bert-pos-ner-lstm,0.7051,0.620957,0.686851

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.649824,0.426304,0.57934
resnet101-bert-lstm,0.686839,0.471838,0.629208
densenet201-bert-lstm,0.683314,0.441569,0.619165
densenet201-bert-pos-lstm,0.695652,0.463463,0.637544
densenet201-bert-ner-lstm,0.696827,0.495576,0.654102
densenet201-bert-pos-ner-lstm,0.680376,0.486308,0.627641


In [27]:
# get max values of late and intermediate fusion
df_single_higher_fusion = pd.DataFrame(np.where(df_single_scores_LF.gt(df_single_scores_IF.values), df_single_scores_LF.values, df_single_scores_IF.values),
                 columns=['Accuracy','F1-macro','F1-weighted'], index=feature_names)

df_multiple_higher_fusion = pd.DataFrame(np.where(df_single_scores_LF.gt(df_single_scores_IF.values), df_multiple_scores_LF.values, df_multiple_scores_IF.values),
                 columns=['Accuracy','F1-macro','F1-weighted'], index=feature_names)

df_single_subtract = df_single_scores_HF.subtract(df_single_higher_fusion)
df_multiple_subtract = df_multiple_scores_HF.subtract(df_multiple_higher_fusion)

In [28]:
print('Compare Hybrid Fusion scores up')
display_dataframes((df_single_subtract.style.applymap(highlight_neg), df_multiple_subtract.style.applymap(highlight_neg)), 
                   names=['MVSA-Single', 'MVSA-Multiple'])

Compare Hybrid Fusion scores up


Unnamed: 0_level_0,Accuracy,F1-macro,F1-weighted
Unnamed: 0_level_1,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.000000,-0.048883,-0.048629
resnet101-bert-lstm,0.019956,0.03958,0.036982
densenet201-bert-lstm,-0.013304,-0.016756,-0.012853
densenet201-bert-pos-lstm,0.004435,0.00361,0.001956
densenet201-bert-ner-lstm,0.000000,-0.013737,-0.000487
densenet201-bert-pos-ner-lstm,0.015521,0.02547,0.013581
resnet101-bert,0.018214,0.001899,0.001976
resnet101-bert-lstm,0.013514,0.019389,0.015584
densenet201-bert-lstm,-0.001175,-0.005352,-0.006492
densenet201-bert-pos-lstm,-0.002350,-0.005586,-0.005625

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.0,-0.048883,-0.048629
resnet101-bert-lstm,0.019956,0.03958,0.036982
densenet201-bert-lstm,-0.013304,-0.016756,-0.012853
densenet201-bert-pos-lstm,0.004435,0.00361,0.001956
densenet201-bert-ner-lstm,0.0,-0.013737,-0.000487
densenet201-bert-pos-ner-lstm,0.015521,0.02547,0.013581

Unnamed: 0,Accuracy,F1-macro,F1-weighted
resnet101-bert,0.018214,0.001899,0.001976
resnet101-bert-lstm,0.013514,0.019389,0.015584
densenet201-bert-lstm,-0.001175,-0.005352,-0.006492
densenet201-bert-pos-lstm,-0.00235,-0.005586,-0.005625
densenet201-bert-ner-lstm,0.0047,-0.021532,-0.007033
densenet201-bert-pos-ner-lstm,0.004113,0.023209,0.013197
