<a href="https://colab.research.google.com/github/manashpratim/Bosch-Summer-Internship/blob/master/A%2BV%2BT_V4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that files stored under "My Drive"
# (the pickled features and the GloVe archive read by later cells) are reachable by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the raw CMU-MOSI archive and store it as /content/mosi.zip.
# NOTE(review): the URL is plain http, so --no-check-certificate presumably has no effect here — confirm.
!wget --no-check-certificate \
      "http://immortal.multicomp.cs.cmu.edu/raw_datasets/CMU_MOSI.zip"\
      -O "/content/mosi.zip"

In [None]:
# Unzip the MOSI archive quietly (-q) into the current working directory.
# NOTE(review): a later cell reads /content/Raw/Transcript/Segmented, so this presumably
# runs with /content as the working directory — confirm.
!unzip -q '/content/mosi.zip'

In [None]:
!pip install adversarial-robustness-toolbox

In [None]:
# Turn off eager execution so that the rest of the notebook runs in TF1-style graph mode.
# This has to run before any tensor, op or model is created.
# NOTE(review): presumably required by a downstream component of the notebook — confirm which one.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

In [None]:
#Function to extract file names
def get_file_names(mypath,savefile):
  """List the regular files of a directory by base name and save the list.

  Args:
    mypath: directory to scan (sub-directories are ignored, no recursion).
    savefile: path of a text file that receives one base name per line.

  Returns:
    List of base names, i.e. each file name cut at its first '.'.
    The order is whatever ``os.listdir`` returns.
  """
  from os import listdir
  from os.path import isfile, join
  # split('.', 1)[0] keeps the text before the first dot. Unlike slicing with
  # str.find (which returns -1 when there is no dot and then chops the last
  # character), a name without any extension is returned unchanged.
  onlyfiles = [f.split('.', 1)[0] for f in listdir(mypath) if isfile(join(mypath, f))]
  with open(savefile, 'w') as f:
    f.writelines(item + '\n' for item in onlyfiles)
  return onlyfiles

In [None]:
# Get the names of the Transcript files. These names are the keys used to look up
# the text, audio, video and label dictionaries in the cells below, so their order
# fixes the order of every sample. The same list is also written to 'textfile.txt'.
mypath = '/content/Raw/Transcript/Segmented'
files = get_file_names(mypath,'textfile.txt')

In [None]:
# Load the labels and the saved audio / video / text data from Drive
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as stream:
        return pickle.load(stream)


# NOTE: unpickling can execute arbitrary code; only load files you trust.
dic = _read_pickle('/content/drive/My Drive/mosi_data/audio_features_joined.pickle')    # audio features
dic2 = _read_pickle('/content/drive/My Drive/mosi_data/video_features_dense.pickle')    # video features
dic3 = _read_pickle('/content/drive/My Drive/mosi_data/text_data_joined.pickle')        # text data
labels = _read_pickle('/content/drive/My Drive/mosi_data/labels_joined.pickle')         # labels

In [None]:
# data normalization
def preprocessing(arr,flag=False):
    """Center (and optionally scale) an array along axis 0.

    Args:
        arr: array-like; statistics are computed over axis 0.
        flag: if True, also divide by the standard deviation along axis 0
            (z-score); if False, only subtract the mean.

    Returns:
        The centered (flag=False) or standardized (flag=True) array.
    """
    centered = arr - np.mean(arr,axis=0)
    if not flag:
        return centered
    std = np.std(arr,axis=0)
    # A constant column has std == 0 and dividing by it would produce NaN/inf,
    # which then poisons everything downstream. Such a column is already all
    # zeros after centering, so dividing by 1 instead leaves it at zero.
    safe_std = np.where(std == 0, np.ones_like(std), std)
    return centered/safe_std

In [None]:
#Text
import numpy as np
# Concatenate the entries of dic3 for every file, in the order given by `files`,
# into one flat array.
review = np.array([entry for key in files for entry in dic3[key]])

In [None]:
# Standardize every audio segment, then bring all segments to the same length
# (1639 steps) by zero padding / truncating at the end of the sequence.
audio_data = [preprocessing(segment,flag=True) for key in files for segment in dic[key]]

import tensorflow as tf
audio_data = tf.keras.preprocessing.sequence.pad_sequences(
    audio_data,
    maxlen=1639,
    dtype='float32',
    padding='post',
    truncating='post',
    value=0.0,
)
# Keep only the feature columns from index 7 onwards (last axis).
audio_data = audio_data[:,:,7:]

In [None]:
# Collect every video segment, then bring all segments to the same length
# (142 steps) by zero padding / truncating at the end of the sequence.
video_data = [segment for key in files for segment in dic2[key]]

import tensorflow as tf
video_data = tf.keras.preprocessing.sequence.pad_sequences(
    video_data,
    maxlen=142,
    dtype='float32',
    padding='post',
    truncating='post',
    value=0.0,
)
# Keep only the first 500 feature columns (last axis).
video_data = video_data[:,:,:500]

In [None]:
# Join all the labels into a numpy array (same file order as the features)
import numpy as np
y = np.array([value for key in files for value in labels[key]])

#For regression, do not execute any of the lines below

# Convert labels to binary integers: strictly positive -> 1, zero or negative -> 0
y = (y > 0).astype(int)

#ref = {-3:0,-2:1,-1:2,0:3,1:4,2:5,3:6}         #Uncomment the following three lines for 7 class classification
#for i,num in enumerate(y):
#  y[i] = ref[num]

In [None]:
# Function to generate a shuffled train/validation split of all modalities.
# Arguments are audio features, video features, text data, labels and split_size (0.8 means 80:20 split)
def split_data(audio,video,text,labels,split_size=0.8,seed=None):
  """Shuffle the four arrays with one shared permutation and split them.

  Args:
    audio, video, text, labels: numpy arrays with the same number of samples
      along axis 0; sample i of each array belongs together.
    split_size: fraction of the samples that goes into the training part.
    seed: optional integer. When given, the permutation comes from a private
      ``RandomState(seed)`` so the split is reproducible. When None, the
      global numpy random state is used.

  Returns:
    Tuple ``(audio_train, audio_val, video_train, video_val,
    text_train, text_val, labels_train, labels_val)``.

  Raises:
    ValueError: if the arrays do not all have the same number of samples.
  """
  import numpy as np
  n_samples = len(labels)
  # A length mismatch would silently pair features with the wrong labels.
  for name, arr in (('audio', audio), ('video', video), ('text', text)):
    if len(arr) != n_samples:
      raise ValueError(
          f"{name} has {len(arr)} samples but labels has {n_samples}")

  train_length = int(n_samples*split_size)
  if seed is None:
    idx = np.random.permutation(n_samples)
  else:
    idx = np.random.RandomState(seed).permutation(n_samples)
  train_idx, val_idx = idx[:train_length], idx[train_length:]

  return (audio[train_idx], audio[val_idx],
          video[train_idx], video[val_idx],
          text[train_idx], text[val_idx],
          labels[train_idx], labels[val_idx])

In [None]:
# Shuffle once and split audio, video, text and labels 80:20 into train / validation.
(audio_train, audio_val,
 video_train, video_val,
 train_reviews, val_reviews,
 labels_train, labels_val) = split_data(audio_data, video_data, review, y, split_size=0.8)

In [None]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 581          # every text is padded / truncated to this many tokens
trunc_type='post'         # cut over-long texts at the end
padding_type='post'       # pad short texts at the end
oov_tok = "<OOV>"         # token used for words outside the vocabulary
vocab_len=5000            # upper bound on the number of words the tokenizer keeps

# The tokenizer is fitted on all texts (training and validation together).
tokenizer = Tokenizer(num_words=vocab_len+1,oov_token=oov_tok)
tokenizer.fit_on_texts(review)

word_index = tokenizer.word_index
vocab_size=len(word_index)
print('Size of Vocabulary: ',vocab_size)

train_sequences = tokenizer.texts_to_sequences(train_reviews)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_reviews)
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Give the labels an explicit trailing axis: (N,) -> (N, 1).
# The ndim guard makes this cell safe to re-run; an unconditional
# np.expand_dims would add another axis on every execution ((N, 1, 1), ...).
if labels_train.ndim == 1:
    labels_train = np.expand_dims(labels_train, axis=1)
if labels_val.ndim == 1:
    labels_val = np.expand_dims(labels_val, axis=1)

In [None]:
# Extract the GloVe archive stored on Drive, quietly (-q), into the current working directory.
# NOTE(review): the next cell opens /content/glove.6B.300d.txt, so the archive presumably
# contains that file and the working directory is /content — confirm.
!unzip -q '/content/drive/My Drive/mosi_data/globe6B.zip'

In [None]:
# Parse the GloVe text file into a {word: float32 vector} dictionary.
# Each line holds a word followed by its whitespace-separated coefficients.
embeddings_index = {}
# The encoding is named explicitly: the file contains non-ASCII tokens, and relying on
# the platform default can raise UnicodeDecodeError outside a UTF-8 locale.
with open('/content/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:

embedding_dim = 300
# Row i of the matrix receives the GloVe vector of the word whose tokenizer index is i.
# Rows of words without a GloVe entry (and row 0, which no word maps to) stay all-zero.
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embeddings_matrix[i] = embedding_vector

In [None]:
# Data Statistics: print the shape of every train / validation array
for description, array in (
    ('Dimension of Training  Audio Data: ', audio_train),
    ('Dimension of Validation Audio Data: ', audio_val),
    ('Dimension of Training  Video Data: ', video_train),
    ('Dimension of Validation Video Data: ', video_val),
    ('Dimension of Training  Text Data: ', train_padded),
    ('Dimension of Validation Text Data: ', val_padded),
    ('Dimension of Training Labels: ', labels_train),
    ('Dimension of Validation Labels: ', labels_val),
):
    print(description, array.shape)

## **Late Fusion (Weighted Sum of Logits)**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with three Dense
    layers, split into `num_heads` heads of size `embed_dim // num_heads`,
    attended per head, merged again and passed through a final Dense layer.
    Input and output both have shape (batch_size, seq_len, embed_dim).

    Raises:
        ValueError: if `embed_dim` is not divisible by `num_heads`.
    """

    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.projection_dim = embed_dim // num_heads
        # Creation order is kept: query, key, value, output projection.
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return (attended values, attention weights) for already-split heads."""
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        # Scale the dot products by sqrt(key depth) before the softmax.
        logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(depth)
        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, value), weights

    def separate_heads(self, x, batch_size):
        """(batch, seq_len, embed_dim) -> (batch, num_heads, seq_len, projection_dim)."""
        per_head = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.projection_dim)
        )
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs: (batch_size, seq_len, embed_dim)
        batch_size = tf.shape(inputs)[0]

        # Project and split into heads: each is (batch, num_heads, seq_len, projection_dim)
        query, key, value = (
            self.separate_heads(projection(inputs), batch_size)
            for projection in (self.query_dense, self.key_dense, self.value_dense)
        )

        attended, _ = self.attention(query, key, value)

        # Back to (batch, seq_len, num_heads, projection_dim), then merge the heads.
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged = tf.reshape(attended, (batch_size, -1, self.embed_dim))

        # Final linear projection: (batch, seq_len, embed_dim)
        return self.combine_heads(merged)

In [None]:
# Audio model
class Transformer1(layers.Layer):
    """Transformer encoder block followed by a small classification head.

    Pipeline: self-attention -> dropout -> residual + LayerNorm ->
    feed-forward -> dropout -> residual + LayerNorm -> global average
    pooling over the sequence axis -> Dense(64, relu) -> Dense(2).
    The final Dense layer has no activation, so the output is 2 raw logits.

    Args:
        maxlen: accepted for signature compatibility; not used by the layer.
        embed_dim: feature size of the input sequence (must be divisible by
            `num_heads`).
        num_heads: number of attention heads.
    """
    def __init__(self, maxlen, embed_dim,num_heads):
        super(Transformer1, self).__init__()
        
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(64, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(0.3)
        self.dropout2 = layers.Dropout(0.5)
        self.dropout3 = layers.Dropout(0.5)   # currently not used in call()
        self.pool = tf.keras.layers.GlobalAveragePooling1D()
        self.dense = tf.keras.layers.Dense(64, activation="relu")
        self.out = tf.keras.layers.Dense(2)

    def call(self, inputs):

        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        # Fix: the dropped-out tensor used to be stored in a variable that was
        # overwritten on the next line, so dropout2 never affected the model.
        # It now feeds the residual connection.
        ffn_output = self.dropout2(ffn_output)

        out =  self.layernorm2(out1 + ffn_output)
        out =  self.pool(out)
        out = self.dense(out)
        return self.out(out)

In [None]:
#Video Model
class Transformer2(layers.Layer):
    """Self-attention with a residual connection, two BiLSTMs and a dense head.

    Pipeline: input + self-attention -> dropout -> BiLSTM -> BiLSTM ->
    dropout -> global average pooling -> dropout -> Dense(128, relu) ->
    dropout -> Dense(64, relu) -> Dense(2) (no activation, i.e. 2 logits).

    Args:
        maxlen: accepted for signature compatibility; not used by the layer.
        embed_dim: feature size of the input sequence.
        num_heads: number of attention heads.
    """
    def __init__(self, maxlen, embed_dim,num_heads):
        super().__init__()

        def bidirectional_lstm():
            # CuDNNLSTM is the GPU-only LSTM kernel from the TF1 compat API.
            return tf.keras.layers.Bidirectional(
                tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))

        self.att = MultiHeadSelfAttention(embed_dim, num_heads)

        self.lstm1 = bidirectional_lstm()
        self.lstm2 = bidirectional_lstm()

        self.dropout1 = tf.keras.layers.Dropout(0.2)
        self.dropout2 = tf.keras.layers.Dropout(0.2)
        self.dropout3 = tf.keras.layers.Dropout(0.2)
        self.dropout4 = tf.keras.layers.Dropout(0.4)
        self.dropout5 = tf.keras.layers.Dropout(0.5)   # not used in call()

        self.pool = tf.keras.layers.GlobalAveragePooling1D()

        self.dense1 = tf.keras.layers.Dense(128, activation="relu")
        self.dense2 = tf.keras.layers.Dense(64, activation="relu")
        self.out = tf.keras.layers.Dense(2)

    def call(self, inputs):
        # Residual connection around the attention block.
        x = inputs + self.att(inputs)
        # The remaining stages are applied strictly one after the other.
        stages = (
            self.dropout1,
            self.lstm1,
            self.lstm2,
            self.dropout2,
            self.pool,
            self.dropout3,
            self.dense1,
            self.dropout4,
            self.dense2,
            self.out,
        )
        for stage in stages:
            x = stage(x)
        return x

In [None]:
#Text Model
class Transformer3(layers.Layer):
    """Frozen embedding, self-attention, two BiLSTMs and a dense head.

    Pipeline: token ids -> Embedding (initialised from `embeddings_matrix`,
    not trainable) -> dropout -> input + self-attention -> dropout ->
    BiLSTM -> BiLSTM -> dropout -> global average pooling -> dropout ->
    Dense(128, relu) -> dropout -> Dense(64, relu) -> Dense(2)
    (no activation, i.e. 2 logits).

    Args:
        maxlen: length of the padded token sequences.
        embed_dim: size of one embedding vector (must be divisible by
            `num_heads`).
        vocab_size: number of words; the embedding table has
            `vocab_size + 1` rows.
        embeddings_matrix: initial embedding weights of shape
            (vocab_size + 1, embed_dim).
        num_heads: number of attention heads.
    """
    def __init__(self, maxlen, embed_dim, vocab_size,embeddings_matrix,num_heads):
        super(Transformer3, self).__init__()
        
        self.embed = tf.keras.layers.Embedding(vocab_size+1, embed_dim,  input_length=maxlen, weights = [embeddings_matrix], trainable = False)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)

        self.lstm1 = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))
        self.lstm2 = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))
        self.dropout1 = tf.keras.layers.Dropout(0.2)
        self.dropout2 = tf.keras.layers.Dropout(0.2)
        self.dropout3 = tf.keras.layers.Dropout(0.2)
        self.dropout4 = tf.keras.layers.Dropout(0.4)
        self.dropout5 = tf.keras.layers.Dropout(0.5)   # not used in call()
        self.dropout6 = tf.keras.layers.Dropout(0.4)   # applied to the embeddings
        self.pool     =  tf.keras.layers.GlobalAveragePooling1D()
        self.dense1 = tf.keras.layers.Dense(128, activation="relu")
        self.dense2 = tf.keras.layers.Dense(64, activation="relu")
        self.out = tf.keras.layers.Dense(2)
    
    def call(self, inputs):
        
        inputs = self.embed(inputs) 
        inputs = self.dropout6(inputs)
        attn_output = self.att(inputs)
        x = inputs + attn_output
        x = self.dropout1(x)
        x = self.lstm1(x)
        x = self.lstm2(x)
        x = self.dropout2(x)
        x = self.pool(x)
        x = self.dropout3(x)
        x = self.dense1(x)
        # Fix: dropout3 used to be applied a second time here while dropout4
        # was defined but never used. The video model (Transformer2) uses
        # dropout4 at this position, so the text model now does the same.
        x = self.dropout4(x)
        x = self.dense2(x)
        
        return self.out(x)

In [None]:
#Customized layer for weighted sum
class WeightedSum(tf.keras.layers.Layer):
    """Fixed (non-trainable) weighted sum of three tensors of equal shape.

    Args:
        a, b, c: scalar weights applied to the first, second and third input.
        **kwargs: forwarded to the base Layer constructor (e.g. ``name``).

    Call input: a list of three tensors. Output: ``a*x0 + b*x1 + c*x2``,
    with the shape of the first input.
    """
    def __init__(self, a,b,c, **kwargs):
        # Initialise the base layer first, as Keras recommends, then store the weights.
        super(WeightedSum, self).__init__(**kwargs)
        self.a = a
        self.b = b
        self.c = c
    def call(self, model_outputs):
        return self.a * model_outputs[0] + self.b * model_outputs[1] + self.c * model_outputs[2]
    def compute_output_shape(self, input_shape):
        return input_shape[0]
    def get_config(self):
        # A layer with constructor arguments must report them, otherwise
        # saving / cloning a model that contains this layer fails.
        config = super(WeightedSum, self).get_config()
        config.update({'a': self.a, 'b': self.b, 'c': self.c})
        return config

In [None]:

num_heads = 5  # Number of attention heads (must divide 50, 500 and embedding_dim)

# Audio branch: padded sequences of 1639 steps x 50 features -> 2 logits
inputs1 = layers.Input(shape=(1639,50))
transformer_block = Transformer1(1639, 50,num_heads)
outputs1 = transformer_block(inputs1)
model1 = keras.Model(inputs=inputs1, outputs=outputs1)

# Video branch: padded sequences of 142 steps x 500 features -> 2 logits
inputs2 = layers.Input(shape=(142,500))
transformer_block1 = Transformer2(142, 500,num_heads)
outputs2 = transformer_block1(inputs2)
model2 = keras.Model(inputs=inputs2, outputs=outputs2)

# Text branch: padded token ids -> 2 logits.
# The sequence length, embedding size and vocabulary size come from the
# tokenizer / embedding cells instead of being hard-coded (581, 300, 3108),
# so the embedding table always matches embeddings_matrix.
inputs3 = layers.Input(shape=(max_length,))
transformer_block3 = Transformer3(max_length, embedding_dim, vocab_size,embeddings_matrix,num_heads)
outputs3 = transformer_block3(inputs3)
model3 = keras.Model(inputs=inputs3, outputs=outputs3)

# Late fusion: 0.2*audio + 0.3*video + 0.5*text logits, then softmax
fusion = WeightedSum(0.2,0.3,0.5)([model1.output, model2.output,model3.output]) 
out = tf.keras.layers.Activation('softmax')(fusion)   
model = keras.Model([model1.input,model2.input,model3.input],out)

In [None]:
# Callbacks, both driven by validation accuracy:
#  - reduce: multiply the learning rate by 0.1 after 10 epochs without improvement
#  - early:  stop after 12 epochs without improvement and restore the best weights
reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.1, patience=10, mode='auto')
early = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', min_delta=1e-4, patience=12, mode='max',
    restore_best_weights=True)

#Uncomment one of the next three lines at a time
#model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])                #Binary classification
model.compile(
    loss='sparse_categorical_crossentropy',   #Classification with integer class labels (original note: 7 class)
    optimizer='rmsprop',
    metrics=['accuracy'],
)
#model.compile(loss="mean_absolute_error",optimizer='adam')             #Regression

# The last 10% of the training arrays are held out by Keras for validation.
history = model.fit(
    [audio_train, video_train, train_padded],
    labels_train,
    batch_size=32,
    epochs=50,
    #validation_data=(audio_val,labels_val),
    validation_split=0.1,
    shuffle=True,
    callbacks=[reduce, early],
)


#model_save_filename = "audio_model.h5"

#earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
#mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
#    model_save_filename, monitor="val_accuracy", save_best_only=True
#)

In [None]:
# Evaluate the fused model on the held-out split produced by split_data.
results = model.evaluate(
    x=[audio_val, video_val, val_padded],
    y=labels_val,
)
print('Test Set Performance: ',results)