<a href="https://colab.research.google.com/github/manashpratim/Bosch-Summer-Internship/blob/master/A%2BV%2BT_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data is available at https://drive.google.com/drive/folders/1NFYIaXjL8V5kvZo3g9JEafLQ3scslWic?usp=sharing

## **Loading Data**

In [None]:
# Mount Google Drive at /content/drive; the later cells read the dataset from 'My Drive/mosi_data' under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the mosi.zip archive from Drive into the current working directory (-q keeps unzip quiet).
!unzip -q '/content/drive/My Drive/mosi_data/mosi.zip'

In [None]:
#Function to get the file names. Inputs are path and name of the file to be saved
def get_file_names(mypath,savefile):
  """Collect the base names of the regular files in a directory and save them to a text file.

  The base name is everything before the first '.' in the file name, so
  'abc.annot.txt' becomes 'abc'. A name without any '.' is kept unchanged.

  Args:
    mypath: directory to scan. Sub-directories are ignored.
    savefile: path of the text file to write, one base name per line.

  Returns:
    The base names as a list sorted alphabetically.
  """
  from os import listdir
  from os.path import isfile, join
  # str.find returns -1 when there is no '.', which made the old slice drop the last
  # character of extension-less names; partition leaves such names intact.
  # Sorting makes the order deterministic, since listdir returns entries in arbitrary order.
  onlyfiles = sorted(f.partition('.')[0] for f in listdir(mypath) if isfile(join(mypath, f)))
  with open(savefile, 'w') as out:
    for item in onlyfiles:
      out.write(item)
      out.write('\n')
  return onlyfiles

In [None]:
# List the segmented-transcript directory. The resulting names are written to textfile.txt and
# are used by the following cells as keys into the label/text/audio/video dictionaries.
mypath = '/content/Raw/Transcript/Segmented'
files = get_file_names(mypath,'textfile.txt')

In [None]:
# Load the data
# NOTE(security): pickle.load can run arbitrary code stored in the file, so only load pickles from a trusted source.
import pickle
# label: indexed by file name in the label-joining cell
with open('/content/drive/My Drive/mosi_data/labels_joined.pickle', 'rb') as handle:
    label= pickle.load(handle)

# dic: indexed by file name to obtain the text segments
with open('/content/drive/My Drive/mosi_data/text_data_joined.pickle', 'rb') as handle:
    dic = pickle.load(handle)

# dic2: indexed by file name to obtain the audio feature segments
with open('/content/drive/My Drive/mosi_data/audio_features_joined.pickle', 'rb') as handle:
    dic2 = pickle.load(handle)

# dic3: indexed by file name to obtain the video feature segments
with open('/content/drive/My Drive/mosi_data/video_features.pickle', 'rb') as handle:
    dic3 = pickle.load(handle)

## **Preprocessing Data**

In [None]:
# Flatten the per-file text segments into a single numpy array, visiting the files in the order given by `files`
import numpy as np
review = []
for key in files:
  review.extend(dic[key])
review = np.asarray(review)

In [None]:
# Gather the per-file labels into a single numpy array, visiting the files in the order given by `files`
import numpy as np
y = []
for key in files:
    y.extend(label[key])
y = np.asarray(y)

# Convert labels to binary: strictly positive -> 1, strictly negative -> 0 (an exact 0 is left as it is)
y[np.greater(y, 0)] = 1
y[np.less(y, 0)] = 0

# Execute the cast below for classification. Comment it for regression.
# NOTE(review): the two assignments above have already collapsed the scores to 0/1, so regression and
# the 7 class mapping below presumably need them disabled as well - confirm.
y = y.astype(int)
#ref = {-3:0,-2:1,-1:2,0:3,1:4,2:5,3:6}         #Uncomment the following three lines for 7 class classification
#for i,num in enumerate(y):
#  y[i] = ref[num]

In [None]:
# This block pads the audio features so that every segment has the same length. I use zero padding.
# Each segment is transposed and padded along its second axis up to max_pad_len, giving one
# (n_audio_features, max_pad_len) array per segment.
audio_data = []
maximum = float('-inf')                       #only needed by the commented-out line below, used to find max_pad_len
n_audio_features = 57                         #feature rows per segment after the transpose. For audio_features_joined (MFCC), use this
max_pad_len = 1639                            #max length of a sequence. For audio_features_joined (MFCC), use this
#n_audio_features = 128                       #For audio_pretrained_features_joined (VGGish), use these two lines
#max_pad_len = 858                            #and comment the two MFCC lines above

for key in files:
  for l in dic2[key]:

    if len(l)>0:
      #maximum = max(maximum,l.shape[0])
      pad_width = max_pad_len - l.shape[0]
      if pad_width < 0:
        # np.pad rejects negative widths with an unhelpful message; say what is actually wrong
        raise ValueError('audio segment of length %d in %r exceeds max_pad_len=%d'
                         % (l.shape[0], key, max_pad_len))
      mfcc = np.pad(l.T, pad_width=((0, 0), (0, pad_width)), mode='constant')
      audio_data.append(mfcc)
    else:
      # Empty segment (the original note says the VGGish data has 28 bad frames; the MFCC data has none).
      # Substitute uniform random values. The shape follows the configured dimensions so the
      # placeholder always matches the padded segments instead of being fixed to (128, 858).
      f = np.random.rand(n_audio_features, max_pad_len)
      audio_data.append(f)

audio_data = np.array(audio_data)
# The reshape also acts as a shape check: it fails if any segment does not have n_audio_features rows
audio_data= audio_data.reshape(audio_data.shape[0], n_audio_features, max_pad_len)
#audio_data = audio_data.swapaxes(1,-1)

In [None]:
# Bring every video segment to a common length by zero padding. Each segment is transposed and
# padded along its second axis up to max_pad_len.
video_data = []
#maximum = float('-inf')
max_pad_len = 77                            #max length of a sequence


for key in files:
  for l in dic3[key]:
    l = np.array(l)
    if len(l)==0:
      # Empty segment: use uniform random values of the padded shape as a placeholder
      filler = np.random.rand(512,77)
      video_data.append(filler)
      continue
    #maximum = max(maximum,l.shape[0])
    pad_width = max_pad_len - l.shape[0]
    padded = np.pad(l.T, pad_width=((0, 0), (0, pad_width)), mode='constant')
    video_data.append(padded)

# Stack the segments and add a trailing axis of size 1 (the Conv2D video branch takes (512,77,1) inputs)
video_data = np.array(video_data)
video_data = video_data.reshape(video_data.shape[0], 512, 77, 1)
#video_data = video_data.swapaxes(1,-1)

In [None]:
# Function to generate train-test split. Arguments are text data, audio features, video features, labels and split_size (0.8 mean 80:20 train-test split)
def split_data(text,audio,video,labels,split_size=0.8,seed=None):
  """Shuffle all modalities with one shared permutation and split them into train/validation parts.

  Args:
    text, audio, video: numpy arrays whose first axis indexes the samples.
    labels: numpy array of labels, one per sample.
    split_size: fraction of the samples assigned to the training part.
    seed: optional integer. When given, the permutation is drawn from a private
      generator seeded with it, so the split is reproducible. When None, numpy's
      global random state is used, as before.

  Returns:
    (text_train, text_val, audio_train, audio_val, video_train, video_val,
     labels_train, labels_val)

  Raises:
    ValueError: if a modality does not have the same number of samples as labels.
  """
  import numpy as np
  n_samples = labels.shape[0]
  # Indexing a longer array with the label permutation would silently drop samples, so check first
  for name, arr in (('text', text), ('audio', audio), ('video', video)):
    if len(arr) != n_samples:
      raise ValueError('%s has %d samples but labels has %d' % (name, len(arr), n_samples))

  train_length = int(n_samples*split_size)
  if seed is None:
    idx = np.random.permutation(n_samples)
  else:
    idx = np.random.RandomState(seed).permutation(n_samples)
  train_idx = idx[:train_length]
  val_idx = idx[train_length:]

  return (text[train_idx], text[val_idx],
          audio[train_idx], audio[val_idx],
          video[train_idx], video[val_idx],
          labels[train_idx], labels[val_idx])

In [None]:
# Get train-test split
# 0.8 sends 80% of the shuffled segments to training and the rest to validation. split_data applies the
# same permutation to text, audio, video and labels, so the modalities stay aligned.
train_reviews, val_reviews, train_audio, val_audio, train_video,val_video,train_labels, val_labels = split_data(review,audio_data,video_data,y,0.8)

In [None]:
# Preprocess the text data. Similar to the audio data, segments of the text data are padded to have same length
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 581            # every token sequence is padded/truncated to this length
trunc_type='post'           # cut over-long sequences at the end
padding_type='post'         # append the padding zeros at the end
oov_tok = "<OOV>"           # token substituted for out-of-vocabulary words
vocab_len=5000

# NOTE(review): the tokenizer is fitted on every segment in `review`, validation ones included - confirm this is intended.
tokenizer = Tokenizer(num_words=vocab_len+1,oov_token=oov_tok)
tokenizer.fit_on_texts(review)

# vocab_size counts every entry of word_index; it is what sizes the embedding matrix later on
word_index = tokenizer.word_index
vocab_size=len(word_index)
print('Size of Vocabulary: ',vocab_size)

train_sequences = tokenizer.texts_to_sequences(train_reviews)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_reviews)
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Turn the labels into column vectors of shape (n, 1). reshape is used rather than expand_dims so that
# re-running this cell leaves the shape at (n, 1) instead of adding one more axis each time.
train_labels=np.reshape(train_labels, (-1, 1))
val_labels=np.reshape(val_labels, (-1, 1))

In [None]:
# Unzip the downloaded embeddings archive from Drive into the current working directory (-q keeps unzip quiet)
!unzip -q '/content/drive/My Drive/mosi_data/globe6B.zip'

In [None]:
# Load the embeddings. There are 4 dimensions to choose from. I used 300 dimensional embeddings.
# Each line of the file is a word followed by its coefficients; embeddings_index maps word -> float32 vector.
embeddings_index = {}
# The GloVe file is UTF-8 text. Without an explicit encoding, open() uses the locale's default,
# which can garble or reject non-ASCII words on some systems.
with open('/content/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue                      # skip blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the embedding lookup table: row i receives the pretrained vector of the word whose tokenizer index is i
embedding_dim = 300
embeddings_matrix = np.zeros(shape=(vocab_size+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue                          # no pretrained vector: the row stays all zeros
    embeddings_matrix[i] = embedding_vector

In [None]:
# Function to compute F1 score. I use it as a metrics for Binary Classification.
from keras.callbacks import Callback,ModelCheckpoint
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K
def f1_score(y_true, y_pred): #taken from old keras source code
    """F1 score computed from predictions rounded to 0/1, using Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to every denominator to avoid division by zero.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A scalar tensor 2*P*R / (P + R + eps) for the values passed in.
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))        # true positives
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))           # positives in the ground truth
    flagged_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))          # positives in the predictions
    precision = hits / (flagged_pos + eps)
    recall = hits / (actual_pos + eps)
    return 2*(precision*recall)/(precision+recall+eps)

## **Baseline Late Fusion using Concatenation**

In [None]:
# text+audio+video Concatenation fusion model
# Three branches (text, audio, video) each end in a 64-unit Dense layer; their outputs are
# concatenated, passed through one Dense(192) layer and then the task-specific output layer.
# Provide the suitable units inside the Dense layer.
# For Binary classification, use 1 and 'sigmoid' as activation
# For 7 class classification, use 7 and 'softmax' as activation
# For Regression, use 1 and remove activation

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# text model
# Token ids -> frozen embedding initialised from embeddings_matrix -> two stacked bidirectional LSTMs -> Dense 128 -> Dense 64
inp1 = Input(max_length)   
layer = tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights = [embeddings_matrix], trainable = False)(inp1)            
layer = tf.keras.layers.Dropout(0.4)(layer)
layer = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))(layer)
layer = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128))(layer)
layer = tf.keras.layers.Dropout(0.2)(layer)
layer = tf.keras.layers.Dense(128, activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.4)(layer)
layer = tf.keras.layers.Dense(64, activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.5)(layer)
#out1 = tf.keras.layers.Dense(1, activation='sigmoid')(layer)                 
model1= Model(inp1,layer)                                 

# audio model
# Input is the padded audio array of shape (57,1639) per segment.
# NOTE(review): with Keras' default channels-last layout Conv1D slides along the first axis (size 57)
# and treats the 1639 values as channels. If convolving along the padded sequence axis was intended,
# the axes would need swapping (see the commented-out swapaxes in the padding cell) - confirm.
inp2 = Input((57,1639))           
layer2 = tf.keras.layers.Conv1D(64,3,activation='relu',input_shape=(57,1639))(inp2)
layer2 = tf.keras.layers.Conv1D(128,3,activation='relu')(layer2)
layer2 = tf.keras.layers.MaxPool1D(2)(layer2)
layer2 = tf.keras.layers.Conv1D(128,3,activation='relu',padding='same')(layer2)
layer2 = tf.keras.layers.Conv1D(256,3,activation='relu')(layer2)
layer2 = tf.keras.layers.MaxPool1D(2)(layer2)
layer2 = tf.keras.layers.Dropout(0.3)(layer2)
layer2 = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))(layer2)
layer2 = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128))(layer2)
layer2 = tf.keras.layers.Dropout(0.2)(layer2)
layer2 = tf.keras.layers.Dense(256,activation='relu')(layer2)
layer2 = tf.keras.layers.Dropout(0.2)(layer2)
layer2 = tf.keras.layers.Dense(64,activation='relu')(layer2)
#layer2 = tf.keras.layers.Dropout(0.5)(layer2)
model2 = Model(inp2,layer2)

# video model
# Input is the padded video array of shape (512,77,1) per segment: four Conv2D + MaxPool2D stages,
# then Flatten -> Dense 256 -> Dense 64
inp3 = Input((512,77,1))           
layer3 = tf.keras.layers.Conv2D(16,3,activation='relu',use_bias=False,input_shape=(512,77,1))(inp3)
layer3 = tf.keras.layers.MaxPool2D(2)(layer3)
layer3 = tf.keras.layers.Conv2D(32,3,use_bias=False,activation='relu')(layer3)
layer3 = tf.keras.layers.MaxPool2D(2)(layer3)
layer3 = tf.keras.layers.Conv2D(64,3,use_bias=False,activation='relu')(layer3)
layer3 = tf.keras.layers.MaxPool2D(2)(layer3)
layer3 = tf.keras.layers.Conv2D(128,3,use_bias=False,activation='relu')(layer3)
layer3 = tf.keras.layers.MaxPool2D(2)(layer3)
layer3 = tf.keras.layers.Dropout(0.3)(layer3)
layer3 = tf.keras.layers.Flatten()(layer3)
layer3 = tf.keras.layers.Dense(256,activation='relu')(layer3)
layer3 = tf.keras.layers.Dropout(0.2)(layer3)
layer3 = tf.keras.layers.Dense(64,activation='relu')(layer3)
#layer2 = tf.keras.layers.Dropout(0.5)(layer2)
model3 = Model(inp3,layer3)

# Fusion of the three models. I concatenate the three branch outputs and pass them through a projection layer. All three models output 64 dimensional vectors
# So the dense layer has 192 units
fusion = tf.keras.layers.Concatenate(axis=1)([model1.output, model2.output,model3.output])   
fusion = tf.keras.layers.Dense(192,activation='relu')(fusion)

out = tf.keras.layers.Dense(1, activation='sigmoid')(fusion)        #For binary classification.  
#out = tf.keras.layers.Dense(1)(fusion)                             #For regression. comment the above line
#out = tf.keras.layers.Dense(7, activation='softmax')(fusion)       #For 7 class classification 
model = Model([model1.input,model2.input,model3.input],out)         #the fused model

In [None]:
# Multiply the learning rate by 0.1 when 'val_accuracy' has not improved for 10 (patience) epochs.
# NOTE(review): the regression compile line below only tracks mean_absolute_error, so 'val_accuracy'
# would presumably not be available to monitor in that mode - confirm.
reduce =tf. keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=10, mode='auto')
#early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='auto')  #early stopping if performance of model degrades for 10 epochs

#Uncomment one of the next three lines at a time
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',f1_score])             #Binary classification
#model.compile(loss='sparse_categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])            #7 class classification
#model.compile(loss="mean_absolute_error",optimizer='adam',metrics=["mean_absolute_error"])             #Regression

# I am training for 50 epochs with a batch size of 64.
# verbose=1 shows a progress bar; per the Keras docs 0 is silent and 2 prints one line per epoch.
# The three inputs are passed in the same order as the model's inputs: text, audio, video.
num_epochs = 50
history=model.fit([train_padded,train_audio,train_video], 
                    train_labels, 
                    epochs=num_epochs, 
                    batch_size=64, 
                    shuffle = True,
                    validation_data=([val_padded,val_audio,val_video],val_labels),
                    callbacks=[reduce],
                    verbose=1)

## **Baseline A+T using Weighted Sum of Logits**

In [None]:
#Customized layer for weighted sum
class WeightedSum(tf.keras.layers.Layer):
    """Blend two tensors as a * first + (1 - a) * second.

    Args:
        a: weight given to the first input; the second input receives 1 - a.
        **kwargs: forwarded to the base Layer constructor (e.g. name).

    The layer is called on a list/tuple of two tensors of the same shape and
    returns a tensor of that shape.
    """
    def __init__(self, a, **kwargs):
        # Initialise the base layer first so its bookkeeping exists before attributes are set
        super(WeightedSum, self).__init__(**kwargs)
        self.a = a
    def call(self, model_outputs):
        return self.a * model_outputs[0] + (1 - self.a) * model_outputs[1]
    def compute_output_shape(self, input_shape):
        # input_shape is the list of the two input shapes; the output has the shape of the first
        return input_shape[0]
    def get_config(self):
        # Include the constructor argument so a model containing this layer can be saved and re-created
        config = super(WeightedSum, self).get_config()
        config.update({'a': self.a})
        return config

In [None]:
# text+audio weighted sum of logits fusion model
# Each branch ends in a Dense layer without activation (its logits). The logits are blended by
# WeightedSum and the activation is applied once to the blended value.

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

#text model
# Token ids -> frozen embedding initialised from embeddings_matrix -> two stacked bidirectional LSTMs -> Dense 128 -> Dense 64 -> logits
inp1 = Input(max_length)   
layer = tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights = [embeddings_matrix], trainable = False)(inp1)            
layer = tf.keras.layers.Dropout(0.4)(layer)
layer = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))(layer)
layer = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128))(layer)
layer = tf.keras.layers.Dropout(0.2)(layer)
layer = tf.keras.layers.Dense(128, activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.4)(layer)
layer = tf.keras.layers.Dense(64, activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.5)(layer)
out1 = tf.keras.layers.Dense(1)(layer)                  #Use 7 for 7 class classification. Do not change for binary classification and regression
model1= Model(inp1,out1)

#audio model
# NOTE(review): the input shape (128,858) matches the VGGish option that is commented out in the audio
# padding cell. The audio array that cell builds by default is (57,1639), so one of the two presumably
# has to be switched before this model can be fed - confirm.
inp2 = Input((128,858))
layer2 = tf.keras.layers.Conv1D(64,3,activation='relu',input_shape=(128,858))(inp2)
layer2 = tf.keras.layers.Conv1D(128,3,activation='relu')(layer2)
layer2 = tf.keras.layers.MaxPool1D(2)(layer2)
layer2 = tf.keras.layers.Conv1D(128,3,activation='relu',padding='same')(layer2)
layer2 = tf.keras.layers.Conv1D(256,3,activation='relu')(layer2)
layer2 = tf.keras.layers.Dropout(0.3)(layer2)
layer2 = tf.keras.layers.MaxPool1D(2)(layer2)
layer2 = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))(layer2)
layer2 = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128))(layer2)
layer2 = tf.keras.layers.Dropout(0.2)(layer2)
layer2 = tf.keras.layers.Dense(256,activation='relu')(layer2)
layer2 = tf.keras.layers.Dropout(0.2)(layer2)
layer2 = tf.keras.layers.Dense(64,activation='relu')(layer2)
layer2 = tf.keras.layers.Dropout(0.3)(layer2)
out2 = tf.keras.layers.Dense(1)(layer2)                         #Use 7 for 7 class classification. Do not change for binary classification and regression
model2 = Model(inp2,out2)

# 0.5 in the next line mean 50% weightage to both modalities (the first input gets a, the second 1 - a)
fusion = WeightedSum(0.5)([model1.output, model2.output])

#For regression, comment both the lines
out = tf.keras.layers.Activation('sigmoid')(fusion)             
#out = tf.keras.layers.Activation('softmax')(fusion)            #uncomment for 7 class classification


# Note: this rebinds model3, which the concatenation-fusion cell used for its video branch
model3 = Model([model1.input,model2.input],out)             # for regression, change 'out' to 'fusion'