In [3]:
import pandas as pd
import pickle
import numpy as np

# Load dataset

In [49]:
# Name of the dataset sub-directory under ../data/ whose .npy arrays are loaded in the next cell.
dataset = 'youtube' #@param ['youtube', 'mmmo', 'moud', 'pom'] {type:"string"}

In [51]:
# Load the pre-split arrays of the selected dataset: two feature views (X1, X2)
# and the labels (y), each with train / val / test partitions.
# NOTE(review): later cells use X1 for the 'txt_emb' model and X2 for the
# 'image_emb' model, so X1 is presumably text and X2 image features — confirm.
X1_train, X1_val, X1_test = (
    np.load(f'../data/{dataset}/X1_train.npy'),
    np.load(f'../data/{dataset}/X1_val.npy'),
    np.load(f'../data/{dataset}/X1_test.npy'),
)
X2_train, X2_val, X2_test = (
    np.load(f'../data/{dataset}/X2_train.npy'),
    np.load(f'../data/{dataset}/X2_val.npy'),
    np.load(f'../data/{dataset}/X2_test.npy'),
)
y_train, y_val, y_test = (
    np.load(f'../data/{dataset}/y_train.npy'),
    np.load(f'../data/{dataset}/y_val.npy'),
    np.load(f'../data/{dataset}/y_test.npy'),
)

# Classificação apenas com features de imagem

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier


def modal_image_classifier(df_data_train,df_data_test):
  """Train an image-only classifier and report its score on the test frame.

  A StandardScaler + MLPClassifier pipeline is fitted on the 'img_features'
  column of `df_data_train` against its 'intent' column, then scored on the
  same two columns of `df_data_test`.

  Args:
    df_data_train: frame-like object with 'img_features' and 'intent' columns.
    df_data_test: frame-like object with the same two columns.

  Returns:
    The value of the fitted pipeline's `score` on the test data.
  """
  train_features = np.array(df_data_train['img_features'].to_list())
  train_labels = np.array(df_data_train['intent'].to_list())

  # Fixed random_state keeps the MLP initialisation repeatable between calls.
  image_pipeline = make_pipeline(
      StandardScaler(),
      MLPClassifier(random_state=1, max_iter=300),
  )
  image_pipeline.fit(train_features, train_labels)

  test_features = np.array(df_data_test['img_features'].to_list())
  test_labels = np.array(df_data_test['intent'].to_list())
  return image_pipeline.score(test_features, test_labels)


# Classificação apenas com features textuais

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

def modal_text_classifier(df_data_train,df_data_test):
  """Train a text-only classifier and return its score on the test frame.

  A StandardScaler + MLPClassifier pipeline is fitted on the 'text_features'
  column of `df_data_train` against its 'intent' column, then scored on the
  same two columns of `df_data_test`.

  Args:
    df_data_train: frame-like object with 'text_features' and 'intent' columns.
    df_data_test: frame-like object with the same two columns.

  Returns:
    The value of the fitted pipeline's `score` on the test data.
  """
  X = np.array( df_data_train['text_features'].to_list() )
  y = np.array( df_data_train['intent'].to_list() )
  # Fixed random_state keeps the MLP initialisation repeatable between calls.
  clf_text = make_pipeline(StandardScaler(),MLPClassifier(random_state=1, max_iter=300))
  clf_text.fit(X, y)

  X_test = np.array( df_data_test['text_features'].to_list() )
  y_test = np.array( df_data_test['intent'].to_list() )
  # The score used to be computed and discarded, so callers always got None;
  # return it, as modal_image_classifier does.
  return clf_text.score(X_test,y_test)

# Multimodal Fusion Model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dot,Lambda,Input, Activation, Dense, Concatenate, Dropout, GlobalAveragePooling1D
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l1
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
 
def autoencoder_att_labels(num_classes, size_1, size_2):
    """Build an attention-fusion autoencoder with a classifier head.

    Each input is passed through tanh, projected to a 512-d embedding and
    L2-normalised. One scalar score per modality is softmax-normalised into
    two attention weights, which mix the two embeddings into a single latent
    vector. The latent vector feeds a decoder that rebuilds both raw inputs
    and a softmax classifier.

    Args:
        num_classes: number of units in the classifier's softmax output.
        size_1: width of the `input_img` vector.
        size_2: width of the `input_txt` vector.

    Returns:
        Tuple `(clf, encoder, decoder, autoencoder)`:
          * clf: latent vector -> class probabilities.
          * encoder: `[input_txt, input_img]` -> `[latent, alphas]`
            (note that the text input comes first).
          * decoder: latent vector -> `[txt_rebuild, im_rebuild]`.
          * autoencoder: `[input_txt, input_img]` ->
            `[[txt_rebuild, im_rebuild], class_probs]`, compiled with Adamax
            (lr 5e-4), losses mse / mse / categorical_crossentropy weighted
            2.0 / 2.0 / 0.1 and an accuracy metric.
    """
    input_img = Input(shape=(size_1,))
    input_txt = Input(shape=(size_2,))
    fusion_dim = 512

    # Project each modality to a common fusion_dim-sized embedding.
    im_emb = Activation('tanh')(input_img)
    im_emb = Dense(fusion_dim, activation='tanh')(im_emb)

    txt_emb = Activation('tanh')(input_txt)
    txt_emb = Dense(fusion_dim, activation='tanh')(txt_emb)

    # Attention over the two modalities.
    #[input_1, input_2] = [visual_embd, average_seq]
    input_1 = Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=-1))(im_emb) # (bs, ndim)
    input_2 = Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=-1))(txt_emb) # (bs, ndim)

    output_size=1
    # Step 1. Get scalar weights
    scalar_input_1 = Dense(output_size)(input_1)  # (batch_size, output_size)
    scalar_input_2 = Dense(output_size)(input_2)  # (batch_size, output_size)
    scalar_input_1_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(scalar_input_1)  # (batch_size, output_size, 1)
    scalar_input_2_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(scalar_input_2)  # (batch_size, output_size, 1)
    # The Concatenate layer class is imported in this cell; the functional
    # `concatenate` helper is not, so using the class keeps the cell self-contained.
    scalars = Concatenate(name='concat')([scalar_input_1_exp, scalar_input_2_exp])  # (batch_size, output_size, 2)

    # Step 2. Normalize weights - softmax
    alphas = Activation('softmax')(scalars)  # (batch_size, output_size, 2)

    # Step 3. Weighted average
    input_1_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(input_1)  # (batch_size, nb_feats, 1)
    input_2_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(input_2)  # (batch_size, nb_feats, 1)
    features = Concatenate(name='concat_feats')([input_1_exp, input_2_exp])  # (batch_size, nb_feats, 2)

    latent = Dot(axes=[-1, -1])([alphas, features])  # (batch_size, output_size, nb_feats)
    # Collapse the output_size axis so the latent code is (batch_size, nb_feats).
    latent = tf.reduce_mean(latent, axis=1)

    encoder = Model([input_txt, input_img], [latent,alphas], name='encoder')

    # Classifier head operating on the latent vector.
    clf_in = Input(shape=(fusion_dim,))
    clf_probs = Dropout(0.4)(clf_in)
    clf_probs = Dense(num_classes, activation='softmax')(clf_probs) # (batch_size, nb_labels)
    clf = Model(clf_in, clf_probs, name='clf')

    # Decoder: one reconstruction branch per modality, both fed by the latent vector.
    decoder_in = Input(shape=(fusion_dim,))
    im_rebuild = Dense(fusion_dim, activation='tanh')(decoder_in)
    im_rebuild = Dense(size_1, name='img_reb')(im_rebuild)

    txt_rebuild = Dense(fusion_dim, activation='tanh')(decoder_in)
    txt_rebuild = Dense(size_2, name='txt_reb')(txt_rebuild)

    decoder = Model(decoder_in, outputs=[txt_rebuild,im_rebuild], name='decoder')

    # End-to-end model: the encoder's latent output (index 0) feeds both heads.
    autoencoder = Model([input_txt, input_img], [decoder(encoder([input_txt, input_img])[0]), clf(encoder([input_txt, input_img])[0]) ] )

    autoencoder.compile(optimizer=tf.keras.optimizers.Adamax(learning_rate=5e-4),
                    loss=['mse','mse','categorical_crossentropy'],
                    metrics=['accuracy'],
                    loss_weights=[2.0,2.0,0.1]) # max_losses(512,512,~15) decoder_loss: 0.0021 - decoder_1_loss: 0.8902 - clf_loss: 1.8469

    return clf, encoder, decoder, autoencoder

In [None]:
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops, usable as a Keras metric.

    Both the element-wise product `y_true * y_pred` and `y_true` are clipped
    to [0, 1] and rounded before being summed; `K.epsilon()` is added to the
    denominator so an all-zero `y_true` does not divide by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positive_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_positive_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops, usable as a Keras metric.

    Both the element-wise product `y_true * y_pred` and `y_pred` are clipped
    to [0, 1] and rounded before being summed; `K.epsilon()` is added to the
    denominator so an all-zero rounded `y_pred` does not divide by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positive_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (predicted_positive_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from `precision_m` and `recall_m`.

    Returns `2 * p * r / (p + r + K.epsilon())`; the epsilon keeps the result
    finite when both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

In [None]:
from tensorflow.keras.layers import Dot,Activation,Dense, Input, concatenate, multiply, average, subtract, add, Dropout, Lambda, Flatten
from tensorflow.keras.models import Model
import tensorflow as tf

def clf_model(num_classes, size=256):
    """Build and compile a single-modality MLP classifier.

    Architecture: Dense(size//2, relu) -> Dropout(0.5) -> Dense(size//4, relu)
    -> Dropout(0.1) -> Dense(num_classes, softmax).

    Args:
        num_classes: number of units in the softmax output layer.
        size: width of the input feature vector (also sets the hidden widths).

    Returns:
        A Keras Model compiled with the Adam optimizer, categorical
        cross-entropy loss and the metrics 'accuracy' and `f1_m`.
    """
    # `shape` must be a tuple: `(size)` is just the int `size`, which some
    # Keras versions tolerate and others may reject.
    inp = Input(shape=(size,))
    x = Dense(size//2, activation='relu')(inp)
    x = Dropout(0.5)(x)
    x = Dense(size//4, activation='relu')(x)
    x = Dropout(0.1)(x)
    output = Dense(num_classes,activation='softmax')(x)

    model = Model(inputs=inp, outputs=output)

    model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy', f1_m])
    return model

def multimodal_text_image(X_1,X_2,num_classes,operator='concatenate',verbose=0):
  """Build and compile a two-input fusion classifier.

  Each input is projected by a relu Dense layer to `X_1.shape[1]` units, the
  two projections are merged according to `operator`, dropout (0.5) is applied
  and a softmax layer produces the class probabilities.

  Args:
    X_1: array-like for the first input; only `X_1.shape[1]` is read.
    X_2: array-like for the second input; only `X_2.shape[1]` is read.
    num_classes: number of output classes.
    operator: fusion strategy, one of 'concatenate', 'multiply', 'average',
      'subtract', 'add', 'att' (one attention weight per modality) or
      'att_labels' (one pair of attention weights per class).
    verbose: accepted for backward compatibility; currently not used (the
      operator banner and `model.summary()` are always printed).

  Returns:
    Tuple `(model, fusion_layer)`: the compiled Keras Model (Adam, categorical
    cross-entropy, metrics 'accuracy' and `f1_m`) and the post-dropout fused
    tensor.

  Raises:
    ValueError: if `operator` is not one of the supported strategies
      (previously this surfaced as an UnboundLocalError on `w`).
  """
  # fusion_dim = X_1.shape[1]+X_2.shape[1]
  fusion_dim = X_1.shape[1]

  # `shape` must be a tuple; `(n)` without a trailing comma is just the int n.
  inp1 = Input(shape=(X_1.shape[1],))
  inp2 = Input(shape=(X_2.shape[1],))

  # Named emb_* rather than l1/l2 to avoid shadowing the imported `l1` regularizer.
  emb_1 = Dense(fusion_dim, activation='relu')(inp1)
  emb_2 = Dense(fusion_dim, activation='relu')(inp2)

  # Element-wise / concatenation merges share the same call shape.
  simple_merges = {
      'concatenate': concatenate,
      'multiply': multiply,
      'average': average,
      'subtract': subtract,
      'add': add,
  }

  # fusion layer
  print('------->',operator)
  if operator in simple_merges:
    w = simple_merges[operator]([emb_1, emb_2])
  elif operator == 'att':
    visual_embd = Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=-1))(emb_1) # (bs, ndim)
    average_seq = Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=-1))(emb_2) # (bs, ndim)
    # Step 1. One scalar score per modality
    scalar_visual = Dense(1)(visual_embd) # (bs, 1)
    scalar_text = Dense(1)(average_seq) # (bs, 1)
    scalars = concatenate([scalar_visual, scalar_text], name='concat')  # (bs, 2)

    # Step 2. Normalize weights - softmax
    alphas = Activation('softmax')(scalars) # (bs, 2)

    # Step 3. Weighted average
    visual_embd_2 = Lambda( lambda x: tf.keras.backend.expand_dims(x) ) (visual_embd) # (bs, ndim, 1)
    average_seq_2 = Lambda( lambda x: tf.keras.backend.expand_dims(x) )(average_seq) # (bs, ndim, 1)
    features = concatenate([visual_embd_2, average_seq_2], name='concat_feats') # (bs, ndim, 2)
    w = Dot(axes=[-1, -1])([alphas, features]) # (bs, ndim)
  elif operator == 'att_labels':
    #[input_1, input_2] = [visual_embd, average_seq]
    input_1 = Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=-1))(emb_1) # (bs, ndim)
    input_2 = Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=-1))(emb_2) # (bs, ndim)
    output_size=num_classes
    # Step 1. Get scalar weights
    scalar_input_1 = Dense(output_size)(input_1)  # (batch_size, nb_labels)
    scalar_input_2 = Dense(output_size)(input_2)  # (batch_size, nb_labels)
    scalar_input_1_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(scalar_input_1)  # (batch_size, nb_labels, 1)
    scalar_input_2_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(scalar_input_2)  # (batch_size, nb_labels, 1)
    scalars = concatenate([scalar_input_1_exp, scalar_input_2_exp], name='concat')  # (batch_size, nb_labels, 2)

    # Step 2. Normalize weights - softmax
    alphas = Activation('softmax')(scalars)  # (batch_size, nb_labels, 2)

    # Step 3. Weighted average
    input_1_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(input_1)  # (batch_size, nb_feats, 1)
    input_2_exp = Lambda(lambda x: tf.keras.backend.expand_dims(x))(input_2)  # (batch_size, nb_feats, 1)
    features = concatenate([input_1_exp, input_2_exp], name='concat_feats')  # (batch_size, nb_feats, 2)
    w = Dot(axes=[-1, -1])([alphas, features])  # (batch_size, nb_labels, nb_feats)
  else:
    raise ValueError(
        f"Unsupported fusion operator {operator!r}; expected one of "
        f"{sorted(simple_merges) + ['att', 'att_labels']}"
    )

  w = Dropout(0.5)(w)
  # fusion_layer = Dense(fusion_dim, activation='relu')(w)
  fusion_layer = w

  if (operator == 'att_labels'): # nm: new
    # One fused vector per class: score each with a shared Dense(1), then softmax over classes.
    output = Dense(1)(fusion_layer)  # (batch_size, nb_labels, 1)
    output = Lambda(lambda x: tf.keras.backend.squeeze(x, axis=-1))(output)  # (batch_size, nb_labels)
    output = Activation('softmax')(output)  # (batch_size, nb_labels)
  else:
    output = Dense(num_classes,activation='softmax')(fusion_layer)

  model = Model(inputs=[inp1, inp2], outputs=output)

  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1_m])

  model.summary()

  return model, fusion_layer


### Treinando e Avaliando o Multimodal Fusion Model

Plot loss weights/accuracy

In [None]:
import keras 
from tqdm.notebook import tqdm
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Train and evaluate every fusion strategy on the arrays loaded earlier and
# collect one metrics tuple per (fold, strategy, run) in `results`.
# NOTE(review): dataset_fold_path, seed and nb_folds are assigned here but are
# not read anywhere in this cell.
dataset_fold_path = 'documentIntent_emnlp19/splits/train_split_'
# Strategies to evaluate: 'autoencoder', 'image_emb' and 'txt_emb' have
# dedicated branches below; every other name is passed to
# multimodal_text_image as its `operator`.
merging_layers = ['autoencoder','image_emb', 'txt_emb', 'add', 'att_labels','att','concatenate','subtract','average','multiply']
results = []
seed = 0

nb_folds = 5 # 5 | 1 (test)
nb_runs = 1 # 11 | 2(test)

# Only a single pass (fold == 0) is executed, and the body always uses the same
# X1_*/X2_*/y_* arrays rather than a per-fold split.
for fold in tqdm(range(0,1)):

  # assumes y_train is one-hot encoded as (n_samples, n_classes); the argmax
  # calls further down rely on the same layout — TODO confirm
  num_classes = y_train.shape[1]

  
  for merge in tqdm(merging_layers):
    #X_train, X_test = None, NoneW
    for run in range(0,nb_runs):
        # Seed TensorFlow with the run index so each run number is repeatable.
        tf.random.set_seed(run)
        #for lw in lws:
        # Default early stopping for the single-output models; the autoencoder
        # branch replaces it with one that monitors the classifier head.
        early_stopping = EarlyStopping(monitor='val_accuracy', patience=50,restore_best_weights=True, mode='max')
        model,probs = None,None
        if merge == 'autoencoder':
          early_stopping = EarlyStopping(monitor='val_clf_accuracy', patience=100,restore_best_weights=True, mode='max')
          # size_1 (input_img) gets the width of X2 and size_2 (input_txt) the
          # width of X1; the model inputs are ordered [input_txt, input_img],
          # which matches the [X1, X2] lists fed below.
          clf, enc, dec, autoencoder = autoencoder_att_labels(num_classes, X2_train.shape[1], X1_train.shape[1])
          # Targets: the two inputs themselves (reconstruction) plus the labels.
          autoencoder.fit([X1_train, X2_train], [[X1_train, X2_train], y_train], validation_data=([X1_val, X2_val], [[X1_val, X2_val], y_val]), epochs=1000, batch_size=64, verbose=0, callbacks=[early_stopping])

          # Encode the test set, discard the attention weights, classify the latent vectors.
          encoded, _ = enc([X1_test, X2_test])
          probs = clf.predict(encoded)
        elif merge == 'txt_emb':
          # Single-modality baseline on X1 only.
          model = clf_model(num_classes, size=X1_train.shape[1])
          model.fit(X1_train, y_train, epochs=1000,batch_size=16, verbose=0, validation_data=(X1_val, y_val), callbacks=[early_stopping])
          probs = model.predict(X1_test)
        elif merge == 'image_emb':
          # Single-modality baseline on X2 only.
          model = clf_model(num_classes, size=X2_train.shape[1])
          model.fit(X2_train, y_train, epochs=1000,batch_size=16, verbose=0, validation_data=(X2_val, y_val), callbacks=[early_stopping])
          probs = model.predict(X2_test)
        else:
          # Any remaining name is used as the fusion operator.
          model, fusion_layer = multimodal_text_image(X1_train,X2_train,num_classes,operator=merge)
          model.fit([X1_train,X2_train], y_train, validation_data=([X1_val, X2_val], y_val),
                          epochs=1000,
                          batch_size=16,
                          shuffle=True,verbose=0, callbacks=[early_stopping])
          probs = model.predict([X1_test,X2_test])

        # Convert label rows and predicted probabilities to class indices.
        y_true = np.argmax(y_test,axis=1)
        y_pred = np.argmax(probs,axis=1)



        f1_macro = f1_score(y_true, y_pred, average='macro')
        f1_micro = f1_score(y_true, y_pred, average='micro')
        acc = accuracy_score(y_true, y_pred)
        # AUC is computed from the raw label matrix and the probabilities, not the argmax indices.
        auc = roc_auc_score(y_test, probs, average='macro', multi_class='ovr')

        print(fold,merge, acc,f1_micro,f1_macro,auc)
        # NOTE(review): the trailing 10. is a hard-coded constant that a later
        # cell labels 'clf_weight'; the run index is not stored. Confirm the
        # constant reflects the classifier loss weight actually used.
        results.append((fold,merge, acc, f1_micro,f1_macro,auc,10.))

In [None]:
# Display the predicted class indices left in `y_pred` by the last iteration of the training loop.
y_pred

In [None]:
# One row per evaluated (fold, strategy, run). Passing `columns=` to the
# constructor, instead of assigning `.columns` afterwards, also lets the frame
# be built when `results` is empty (the assignment raised a length mismatch).
df_results = pd.DataFrame(
    results,
    columns=['fold','merging','acc','f1-micro','f1-macro','auc', 'clf_weight'],
)
# Mean and standard deviation of the accuracy per strategy and fold.
df = df_results.groupby(['merging','fold'], as_index=False).agg(
                      {'acc':['mean','std']})
df

In [None]:
# Fold-0 runs, best accuracy first (top 50). The fold mask is evaluated on the
# sorted frame itself, so pandas does not have to re-align a mask built from
# the unsorted frame; rows and order are unchanged.
df_results.sort_values(by='acc', ascending=False).loc[lambda ranked: ranked.fold == 0].head(50)

In [None]:
# Fold-1 runs, best accuracy first (top 50). The fold mask is evaluated on the
# sorted frame itself, so pandas does not have to re-align a mask built from
# the unsorted frame; rows and order are unchanged.
df_results.sort_values(by='acc', ascending=False).loc[lambda ranked: ranked.fold == 1].head(50)

In [None]:
# Fold-2 runs, best accuracy first (top 50). The fold mask is evaluated on the
# sorted frame itself, so pandas does not have to re-align a mask built from
# the unsorted frame; rows and order are unchanged.
df_results.sort_values(by='acc', ascending=False).loc[lambda ranked: ranked.fold == 2].head(50)

In [None]:
# Fold-3 runs, best accuracy first (top 50). The fold mask is evaluated on the
# sorted frame itself, so pandas does not have to re-align a mask built from
# the unsorted frame; rows and order are unchanged.
df_results.sort_values(by='acc', ascending=False).loc[lambda ranked: ranked.fold == 3].head(50)

In [None]:
# Fold-4 runs, best accuracy first (top 50). The fold mask is evaluated on the
# sorted frame itself, so pandas does not have to re-align a mask built from
# the unsorted frame; rows and order are unchanged.
df_results.sort_values(by='acc', ascending=False).loc[lambda ranked: ranked.fold == 4].head(50)

In [None]:
# All result rows as plain lists, ordered by the value at position 3
# (the 'f1-micro' column) from highest to lowest.
sorted(
    df_results.to_numpy().tolist(),
    key=lambda row: row[3],
    reverse=True,
)

In [None]:
# Write the results table to a spreadsheet in the current working directory,
# with the dataset name as file-name prefix.
# NOTE(review): the legacy `.xls` extension needs an xls-capable writer engine,
# which recent pandas releases may no longer provide — confirm this still runs
# in the target environment, or consider switching to `.xlsx`.
df_results.to_excel(f'{dataset}_fusion_models_seeded.xls')