In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive (only works on Colab).
# NOTE(review): the pickles loaded further down are read from /content/, not from
# this mount point - confirm whether the mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [21]:
import gc, numpy as np, pickle
import tensorflow as tf
from keras.models import Model
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, Bidirectional, GRU, Masking, Dense, Dropout, Lambda, Activation, dot, multiply, concatenate
#from keras.layers import TimeDistributed
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [53]:
# Load the pre-extracted features.
#   train_text / test_text: [num of video, max num of utterance = max_utt_len, dim of representation]
#   train_len / test_len:   num of utterance in every video
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.
with open('/content/mytext.pickle', 'rb') as text_file:
    (train_text, train_label, test_text, test_label, max_utt_len, train_len, test_len) = pickle.load(text_file)
#(train_audio, _, test_audio, _, _, _, _) = pickle.load(open('./input/audio.pickle', 'rb'))

#original one
#(train_video, _, test_video, _, _, _, _) = pickle.load(open('/content/video.pickle', 'rb'))
with open('/content/video_train.pickle', 'rb') as video_train_file:
    train_video = pickle.load(video_train_file)
with open('/content/video_test.pickle', 'rb') as video_test_file:
    test_video = pickle.load(video_test_file)

In [54]:
# Move the first 10 test videos into the training split, keeping features,
# labels and utterance counts aligned.
# NOTE: every statement rebinds its own inputs, so running this cell a second
# time moves another 10 videos; re-run the loading cell before re-running it.
train_text, test_text = np.concatenate([train_text, test_text[:10]], axis=0), test_text[10:]
train_video, test_video = np.concatenate([train_video, test_video[:10]], axis=0), test_video[10:]
train_label, test_label = np.concatenate([train_label, test_label[:10]], axis=0), test_label[10:]
# The length containers are joined with `+` rather than np.concatenate.
train_len, test_len = train_len + test_len[:10], test_len[10:]

In [4]:
# Build 0/1 masks over utterances: 1.0 marks a real utterance, 0.0 marks padding

def create_mask(train_data, test_data, train_length, test_length):
    '''
    Build 0/1 utterance masks for the train and test splits.

    # Arguments
        train_data, test_data: data of any one modality (text, audio or video);
            only their first two dimensions are read, to size the masks
        train_length, test_length: utterance counts, one entry per video
    # Returns
        (train_mask, test_mask): float arrays shaped like the first two
        dimensions of the data; row i holds 1.0 in its first length[i]
        positions and 0.0 everywhere else
    '''

    def _mask_for(data, lengths):
        # Start from all zeros and switch on the leading positions of each row.
        mask = np.zeros((data.shape[0], data.shape[1]), dtype='float')
        for row, n_valid in enumerate(lengths):
            mask[row, :n_valid] = 1.0
        return mask

    return _mask_for(train_data, train_length), _mask_for(test_data, test_length)

In [55]:
# Build the utterance masks, sized from the text features and filled from the per-video utterance counts.
train_mask, test_mask = create_mask(train_text, test_text, train_len, test_len)

In [6]:
def bi_modal_attention(x, y): #x=V, y=T
  '''
  Attend each modality over the other and fuse the two attended views.

  # Arguments
      x, y: Keras tensors for the two modalities (the model passes video as x
          and text as y). Axis 2 of both is contracted below, so they must
          agree in size there.
          NOTE(review): assumes a (batch, max_len, dim) layout - confirm.
  # Returns
      concatenation of the x-side and y-side attended representations
  '''
  # Cross-modal score matrices: x against y, and y against x.
  scores_xy = dot([x, y], axes=(2, 2))
  scores_yx = dot([y, x], axes=(2, 2))

  # Softmax turns the scores into attention weights.
  weights_xy = Activation('softmax')(scores_xy)
  weights_yx = Activation('softmax')(scores_yx)

  # Weighted sums of the *other* modality.
  context_from_y = dot([weights_xy, y], axes=(2, 1))
  context_from_x = dot([weights_yx, x], axes=(2, 1))

  # Element-wise product of each context with the modality that queried it.
  attended_x = multiply([context_from_y, x])
  attended_y = multiply([context_from_x, y])

  return concatenate([attended_x, attended_y])

In [19]:
def MMMUBA():
  '''
  Build the (uncompiled) two-modality model.

  Each modality goes through Masking -> bidirectional GRU -> Dropout ->
  Dense(tanh) -> Dropout. The two streams are fused with bi_modal_attention,
  concatenated with the un-attended streams, and fed to a 2-way softmax.

  Input shapes are read from the module-level train_text and train_video
  arrays, so those must exist before this is called.

  # Returns
      keras Model mapping [text, video] to the softmax output
  '''
  # Hyperparameters
  rnn_units, rnn_drop = 300, 0.5
  fc_units, fc_drop = 100, 0.5

  def new_bigru():
    # Fresh bidirectional GRU that returns the whole sequence, forward and backward outputs concatenated.
    return Bidirectional(
        GRU(rnn_units, return_sequences=True, dropout=rnn_drop, recurrent_dropout=rnn_drop),
        merge_mode='concat')

  # One input per modality, shaped like a single sample (all but the first axis).
  text = Input(shape=(train_text.shape[1], train_text.shape[2]))
  video = Input(shape=(train_video.shape[1], train_video.shape[2]))

  # Masking layers with mask value 0, so padded positions can be skipped downstream.
  text_seq = Masking(mask_value=0)(text)
  video_seq = Masking(mask_value=0)(video)

  # Recurrent encoders, each followed by dropout.
  text_seq = new_bigru()(text_seq)
  video_seq = new_bigru()(video_seq)

  text_seq = Dropout(rnn_drop)(text_seq)
  video_seq = Dropout(rnn_drop)(video_seq)

  # Dense projection applied directly to the sequences (no TimeDistributed wrapper).
  text_feat = Dropout(fc_drop)(Dense(fc_units, activation='tanh')(text_seq))
  video_feat = Dropout(fc_drop)(Dense(fc_units, activation='tanh')(video_seq))

  # Cross-modal attention, then concatenate it with both plain streams.
  fused = concatenate([bi_modal_attention(video_feat, text_feat), video_feat, text_feat])

  # Two-class softmax head.
  output = Dense(2, activation='softmax')(fused)

  return Model([text, video], output)

In [8]:
# Build the model once as a smoke test; the instance is not kept, only its repr is displayed.
MMMUBA()

<keras.engine.functional.Functional at 0x7f85a4543700>

In [9]:
def calc_test_result(result, test_label, test_mask):
    '''
    Accuracy over the unmasked positions only.

    # Arguments
        result: predicted test labels (class scores at every [i, j])
        test_label: gold test labels (one-hot at every [i, j])
        test_mask: test mask; only positions whose mask value equals 1 count
    # Returns
        accuracy of the predicted labels
    '''
    # Collect the (video, utterance) positions that the mask keeps.
    kept = [(i, j)
            for i, j in np.ndindex(result.shape[0], result.shape[1])
            if test_mask[i, j] == 1]

    # argmax collapses a score / one-hot vector to its class index,
    # e.g. [0.9, 0.1] -> 0 and [0.1, 0.9] -> 1.
    gold = [np.argmax(test_label[i, j]) for i, j in kept]
    guessed = [np.argmax(result[i, j]) for i, j in kept]

    return accuracy_score(gold, guessed)


In [10]:
# Why this exists: the model ends in a softmax over classes, so integer class
# labels have to be one-hot encoded, e.g. 0 -> [1, 0] and 1 -> [0, 1].
def create_one_hot_labels(train_label, test_label):
    '''
    One-hot encode the train and test label matrices with a shared depth.

    # Arguments
        train and test labels (2D integer matrices)
    # Returns
        one hot encoded train and test labels (3D matrices); the last axis
        has (largest label found in either split) + 1 entries
    '''
    # Both splits share one depth so their encodings are compatible.
    depth = int(max(train_label.max(), test_label.max())) + 1

    def _encode(labels):
        n_rows, n_cols = labels.shape[0], labels.shape[1]
        encoded = np.zeros((n_rows, n_cols, depth))
        # Integer-array indexing sets encoded[i, j, labels[i, j]] = 1 for every (i, j) at once.
        encoded[np.arange(n_rows)[:, None], np.arange(n_cols)[None, :], labels] = 1
        return encoded

    return _encode(train_label), _encode(test_label)

In [56]:
# One-hot encode the integer labels (cast to int first so they can be used as indices).
# NOTE: this rebinds train_label/test_label, so re-running the cell would feed the
# already-encoded arrays back in; re-run the loading cells first.
train_label, test_label = create_one_hot_labels(train_label.astype('int'), test_label.astype('int'))

In [57]:
# Sanity check: the one-hot labels should now be 3D.
train_label.shape

(71, 63, 2)

In [58]:
# Sanity check: the mask should match the first two dimensions of train_label.
train_mask.shape

(71, 63)

In [83]:
def train():
  '''
  Build, fit and evaluate one fresh MMMUBA model.

  Reads the module-level arrays train_text, train_video, train_label,
  train_mask, test_text, test_video, test_label and test_mask. The test split
  doubles as the validation set. The checkpoint with the highest val_accuracy
  is written to /content/model.hdf5 and reloaded before the final prediction.

  # Returns
      single-element list holding the masked test accuracy
  '''
  checkpoint_path = '/content/model.hdf5'

  model = MMMUBA()
  #lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
   # [5,23],
    #[1e-2,1e-3,1e-4])
  optimizer = tf.keras.optimizers.Adamax(learning_rate=1e-3, weight_decay=1e-5)
  model.compile(
      optimizer=optimizer,
      loss='categorical_crossentropy',
      weighted_metrics=[],
      sample_weight_mode='temporal',
      metrics=['accuracy'])

  # Early stopping watches val_loss; the checkpoint keeps the best val_accuracy.
  callbacks = [
      EarlyStopping(monitor='val_loss', patience=10, verbose=0),
      ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, mode='max', verbose=0),
  ]

  model.fit(
      [train_text, train_video], train_label,
      epochs=50,
      batch_size=8,
      sample_weight=train_mask,  # padded utterances carry weight 0
      shuffle=True,
      callbacks=callbacks,
      # the test split is used directly as the validation set
      validation_data=([test_text, test_video], test_label, test_mask),
      verbose=1)

  # Score the best checkpoint, not the weights from the last epoch.
  model.load_weights(checkpoint_path)
  test_predictions = model.predict([test_text, test_video])
  print(test_predictions[0][0])

  return [calc_test_result(test_predictions, test_label, test_mask)]

In [None]:
# First training run; keeps the single-element accuracy list returned by train().
# NOTE(review): the saved output below logs "Epoch n/15" while train() now sets
# epochs=50, so this output appears to come from an earlier version - re-run to refresh.
aaa = train()

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
[0.9984988  0.00150112]


In [None]:
# Show the masked test accuracy of the first run.
aaa
# paper: 81.51 (presumably the accuracy in percent reported by the reference paper - confirm)

[0.7938829787234043]

In [None]:
#time: 21:41

0

In [84]:
# Second training run; train() builds a fresh model on every call.
bbb = train()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
[0.36904162 0.6309584 ]


In [85]:
# Show the masked test accuracy of the second run.
bbb

[0.6538461538461539]

In [86]:
# Total utterance count across the test videos (after 10 videos were moved to train).
sum(test_len)

468

In [87]:
# Total utterance count across the training videos (including the 10 moved from test).
sum(train_len)

1716