<a href="https://colab.research.google.com/github/manashpratim/Bosch-Summer-Internship/blob/master/VideoV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data is available at https://drive.google.com/drive/folders/1NFYIaXjL8V5kvZo3g9JEafLQ3scslWic?usp=sharing

## **Loading Data**

In [None]:
# Mount Google Drive into the Colab runtime so that the files under
# '/content/drive/My Drive/...' read by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip -q '/content/drive/My Drive/mosi_data/mosi.zip'

In [None]:
def get_file_names(mypath,savefile):
  """List the regular files of a directory without their extensions and save the list.

  Args:
    mypath: Directory to scan. Sub-directories are skipped (no recursion).
    savefile: Path of a text file that is (over)written with one name per line.

  Returns:
    List of file names, each cut at its first '.', in the order returned by
    os.listdir (which is arbitrary).
  """
  from os import listdir
  from os.path import isfile, join
  # split('.', 1)[0] keeps everything before the first dot. Unlike slicing with
  # str.find (which returns -1 when there is no dot and would chop off the last
  # character), it leaves names without an extension intact.
  onlyfiles = [f.split('.', 1)[0] for f in listdir(mypath) if isfile(join(mypath, f))]
  with open(savefile, 'w') as f:
    f.writelines(item + '\n' for item in onlyfiles)
  return onlyfiles

In [None]:
# Directory with the segmented transcripts. The extension-less file names found
# there are used as dictionary keys when the features and labels are gathered below;
# they are also written to 'textfile.txt'.
mypath = '/content/Raw/Transcript/Segmented'
files = get_file_names(mypath=mypath, savefile='textfile.txt')

In [None]:
import pickle


def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code -- only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw frame sequences (consumed by the Model 2 data cell)
nd = _load_pickle('/content/drive/My Drive/mosi_data/video_frames_joined.pickle')
# Video features (consumed by the Model 1 data cell)
dic3 = _load_pickle('/content/drive/My Drive/mosi_data/video_features.pickle')
# Dense video features (consumed by the Model 3 data cell)
dic2 = _load_pickle('/content/drive/My Drive/mosi_data/video_features_dense.pickle')
# Labels, looked up with the same keys as the features
labels = _load_pickle('/content/drive/My Drive/mosi_data/labels_joined.pickle')

## **Processing Data**

In [None]:
# This block zero-pads the video features so that every segment has the same length. This is the data for model 1.
# The videos were sampled at 0.5s. The features were extracted using MTCNN and VGG Face 2. The dimension of frames passed to the models were 100x100x3
import numpy as np                          # imported here so the cell also runs on a fresh kernel (numpy is otherwise only imported in a later cell)

video_data = []
max_pad_len = 77                            #max length of a sequence
feature_dim = 512                           #feature size per time step (matches the reshape below and the Model 1 input shape)

for key in files:
  for l in dic3[key]:
    l = np.array(l)
    if len(l)>0:
      l = l[:max_pad_len]                     # guard: a longer segment would give a negative pad width and make np.pad raise
      pad_width = max_pad_len - l.shape[0]
      # transpose to (features, time) and append zeros along the time axis only
      padded = np.pad(l.T, pad_width=((0, 0), (0, pad_width)), mode='constant')
      video_data.append(padded)
    else:                                     # segment without any features (the original note mentions 28 bad frames): use a random placeholder so features and labels stay aligned
      f = np.random.rand(feature_dim,max_pad_len)
      video_data.append(f)

video_data = np.array(video_data)
# add a trailing channel axis for the Conv2D layers of Model 1
video_data= video_data.reshape(video_data.shape[0], feature_dim, max_pad_len,1)

In [None]:
# Data for Model 2: collect the frame sequence of every segment, then zero-pad
# (or cut) each sequence at its end so that all of them hold exactly 40 frames.
import tensorflow as tf

maximum = float('-inf')
video_data = [l for key in files for l in nd[key]]
video_data = tf.keras.preprocessing.sequence.pad_sequences(
    video_data,
    maxlen=40,
    dtype='float32',
    padding='post',
    truncating='post',
    value=0.0,
)

In [None]:
# Data for Model 3: zero-pad the video features so that every segment has the same length.
# The videos were sampled at 0.03s. The features were extracted using VGG Face 2.
# The dimension of frames passed to the models were 224x224x3.
import tensorflow as tf

video_data = []
for key in files:
  video_data.extend(dic2[key])

# pad / truncate at the end of each sequence to 142 time steps
video_data = tf.keras.preprocessing.sequence.pad_sequences(
    video_data,
    maxlen=142,
    dtype='float32',
    padding='post',
    truncating='post',
    value=0.0,
)
# keep only the first 500 entries of the last axis
video_data = video_data[:,:,:500]

In [None]:
# Concatenate the labels of every key, in the same key order as the features, into one numpy array
import numpy as np

y = np.array([value for key in files for value in labels[key]])

#For regression, do not execute any of the lines below

# Binary labels: strictly positive -> 1, strictly negative -> 0 (zeros are left as 0)
positive = y > 0
negative = y < 0
y[positive] = 1
y[negative] = 0

y = y.astype(int)

# NOTE(review): the 7-class mapping below looks like it expects the original -3..3
# labels, i.e. the binary conversion above would have to be skipped -- confirm before use.
#ref = {-3:0,-2:1,-1:2,0:3,1:4,2:5,3:6}         #Uncomment the following three lines for 7 class classification
#for i,num in enumerate(y):
#  y[i] = ref[num]

In [None]:
def split_data(data,labels,split_size=0.8,seed=None):
  """Shuffle data and labels together and split them into a training and a validation part.

  Args:
    data: Array-like of samples, indexed along its first axis.
    labels: Array-like of labels with the same length as `data`.
    split_size: Fraction of samples that goes into the training part; the training
      part holds int(len(labels) * split_size) samples, the rest is validation.
    seed: Optional seed for a reproducible shuffle. With None (default) the global
      NumPy random state is used.

  Returns:
    Tuple (data_train, data_val, labels_train, labels_val) of NumPy arrays.

  Raises:
    ValueError: If `data` and `labels` do not have the same length.
  """
  import numpy as np
  # accept plain lists as well as arrays; index-array selection below needs ndarrays
  data = np.asarray(data)
  labels = np.asarray(labels)
  if len(data) != len(labels):
    raise ValueError(
        "data and labels must have the same length, got %d and %d" % (len(data), len(labels)))

  train_length = int(len(labels)*split_size)
  if seed is None:
    idx = np.random.permutation(len(labels))
  else:
    idx = np.random.default_rng(seed).permutation(len(labels))

  # apply the same permutation to both arrays so every sample keeps its label
  data = data[idx]
  labels = labels[idx]
  data_train, data_val = data[:train_length], data[train_length:]
  labels_train, labels_val = labels[:train_length], labels[train_length:]

  return data_train,data_val,labels_train,labels_val

In [None]:
# 80/20 train/validation split; the unsplit array is deleted afterwards, so the
# data cell has to be re-run before this cell can be executed again.
(video_train, video_val,
 labels_train, labels_val) = split_data(data=video_data, labels=y, split_size=0.8)
del video_data

## **Model 1 (Baseline)**

In [None]:
import tensorflow as tf

# Baseline: a 2-D CNN over the (512, 77, 1) padded feature maps followed by a dense head.
conv_stack = [
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=(512, 77, 1)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Flatten(),
]

dense_head = [
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Alternative output layers (swap in exactly one):
    #tf.keras.layers.Dense(1)                         # regression
    #tf.keras.layers.Dense(1,activation='sigmoid')    # binary classification
    tf.keras.layers.Dense(7, activation='softmax'),  # 7 class classification
]

model = tf.keras.models.Sequential(conv_stack + dense_head)

In [None]:
# Multiply the learning rate by 0.1 when val_loss has not improved for 10 (patience) epochs.
reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=10, mode='auto')
#early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='auto')  #stop early when val_loss has not improved for 10 epochs

#Uncomment one of the next three lines at a time (the loss must match the output layer of the model)
#model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])                         #Binary classification (the original also listed an f1_score metric, which is not defined in the visible cells)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])            #7 class classification
#model.compile(loss="mean_absolute_error",optimizer='adam',metrics=["mean_absolute_error"])              #Regression

# Train for 50 epochs with a batch size of 256.
# verbose: 0 = silent, 1 = progress bar, 2 = one line per epoch.
num_epochs = 50
history = model.fit(
    x=video_train,
    y=labels_train,
    batch_size=256,
    epochs=num_epochs,
    verbose=1,
    callbacks=[reduce],
    validation_data=(video_val, labels_val),
    shuffle=True,
)

## **Model 2 (RAW)**

In [None]:
# Video model on raw frames: Conv3D front-end, two bidirectional LSTM layers and a dense head
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model


inp = Input(shape=(40, 64, 64, 3))

# Two 3-D convolutions that halve height and width each (stride 1 along the frame axis)
x = inp
for n_filters in (32, 64):
    x = tf.keras.layers.Conv3D(
        filters=n_filters,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        activation="relu",
        padding="same",
    )(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.4)(x)
# 64x64 frames -> 16x16 maps with 64 channels; flatten them per frame for the recurrent layers
x = tf.keras.layers.Reshape((40, 16*16*64))(x)

# Two stacked bidirectional CuDNN LSTMs that keep the full sequence
for _ in range(2):
    x = tf.keras.layers.Bidirectional(
        tf.compat.v1.keras.layers.CuDNNLSTM(128, return_sequences=True))(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.4)(x)

# Average over the frame axis, then classify
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.4)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dropout(0.5)(x)

# Output layer -- keep exactly one of the three active
out = tf.keras.layers.Dense(1, activation='sigmoid')(x)       # binary classification
#out   = tf.keras.layers.Dense(7, activation='softmax')(x)    # 7 class classification
#out   = tf.keras.layers.Dense(1)(x)                          # regression

model = Model(inp, out)

In [None]:
# Multiply the learning rate by 0.1 when val_loss has not improved for 10 epochs.
reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, mode='auto')
#early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='auto')

#Uncomment one of the next three lines at a time (the loss must match the output layer of the model)
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])         #Binary classification
#model.compile(loss='sparse_categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])            #7 class classification
#model.compile(loss="mean_absolute_error",optimizer='rmsprop')             #Regression

# Train for 80 epochs with a batch size of 32.
# use_multiprocessing is not passed: Keras documents it as applying to generator /
# Sequence input only, so it had no effect on these in-memory arrays, and newer
# Keras versions no longer accept it in fit().
history = model.fit(
    video_train,
    labels_train,
    batch_size=32,
    epochs=80,
    shuffle=True,
    validation_data=(video_val, labels_val),
    callbacks=[reduce],
)

## **Model 3 (Transformer)**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values of width `embed_dim`,
    split into `num_heads` heads of width `embed_dim // num_heads`, attended
    per head, re-assembled and passed through a final dense projection.

    Raises:
        ValueError: If `embed_dim` is not divisible by `num_heads`.
    """

    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        # One dense projection each for queries, keys and values, plus the output projection
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return (softmax(Q K^T / sqrt(d_k)) V, attention weights)."""
        key_depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_depth)
        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, value), weights

    def separate_heads(self, x, batch_size):
        """Reshape x to (batch_size, -1, num_heads, projection_dim) and move the head axis to position 1."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to inputs; per the shape notes: (batch_size, seq_len, embed_dim) in and out."""
        batch_size = tf.shape(inputs)[0]

        # Project first (each result: batch_size, seq_len, embed_dim) ...
        projections = [
            dense(inputs)
            for dense in (self.query_dense, self.key_dense, self.value_dense)
        ]
        # ... then split into heads (each: batch_size, num_heads, seq_len, projection_dim)
        query, key, value = [
            self.separate_heads(projected, batch_size) for projected in projections
        ]

        attended, _ = self.attention(query, key, value)
        # back to (batch_size, seq_len, num_heads, projection_dim)
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        # merge the heads again: (batch_size, seq_len, embed_dim)
        merged = tf.reshape(attended, (batch_size, -1, self.embed_dim))
        return self.combine_heads(merged)


In [None]:
class Transformer(layers.Layer):
    """Self-attention with a residual connection, two bidirectional LSTMs and a dense head.

    Args:
        maxlen: Accepted for the caller's convenience; not used inside the layer.
        embed_dim: Width of the input features, forwarded to the attention layer.
        num_heads: Number of attention heads, forwarded to the attention layer.
    """

    def __init__(self, maxlen, embed_dim,num_heads):
        super().__init__()

        self.att = MultiHeadSelfAttention(embed_dim, num_heads)

        self.lstm1, self.lstm2 = (
            tf.keras.layers.Bidirectional(
                tf.compat.v1.keras.layers.CuDNNLSTM(128, return_sequences=True))
            for _ in range(2)
        )

        self.dropout1, self.dropout2, self.dropout3 = (
            tf.keras.layers.Dropout(0.2) for _ in range(3)
        )
        self.dropout4 = tf.keras.layers.Dropout(0.4)
        self.dropout5 = tf.keras.layers.Dropout(0.5)

        self.pool = tf.keras.layers.GlobalAveragePooling1D()

        # The batch-normalisation layers are created but never applied in call()
        self.bn1, self.bn2, self.bn3, self.bn4, self.bn5 = (
            tf.keras.layers.BatchNormalization() for _ in range(5)
        )

        self.dense1 = tf.keras.layers.Dense(128, activation="relu")
        self.dense2 = tf.keras.layers.Dense(64, activation="relu")

        # Output layer -- keep exactly one of the three active
        self.out = tf.keras.layers.Dense(1, activation="sigmoid")      # binary classification
        #self.out = tf.keras.layers.Dense(7, activation="softmax")     # 7 class classification
        #self.out = tf.keras.layers.Dense(1)                           # regression

    def call(self, inputs):
        """Run the block on inputs and return the output of the final dense layer."""
        # residual connection around the attention layer
        x = self.dropout1(inputs + self.att(inputs))

        # recurrent encoder, then average over the sequence axis
        x = self.dropout2(self.lstm2(self.lstm1(x)))
        x = self.dropout3(self.pool(x))

        # dense head
        x = self.dropout4(self.dense1(x))
        x = self.dropout5(self.dense2(x))
        return self.out(x)

In [None]:
maxlen = 142       # time steps per segment (same value as the maxlen used when padding the Model 3 data)
embed_dim = 500    # features per time step (the Model 3 data is sliced to its first 500 features)
num_heads = 10     # has to divide embed_dim; gives 50 dimensions per head

inputs = layers.Input(shape=(maxlen, embed_dim))
transformer_block = Transformer(maxlen=maxlen, embed_dim=embed_dim, num_heads=num_heads)
outputs = transformer_block(inputs)
model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Multiply the learning rate by 0.1 when val_loss has not improved for 10 epochs.
reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, mode='auto')
#early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='auto')

#Uncomment one of the next three lines at a time (the loss must match the output layer of the model)
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])         #Binary classification
#model.compile(loss='sparse_categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])            #7 class classification
#model.compile(loss="mean_absolute_error",optimizer='rmsprop')             #Regression

# Train for 80 epochs with a batch size of 32.
# use_multiprocessing is not passed: Keras documents it as applying to generator /
# Sequence input only, so it had no effect on these in-memory arrays, and newer
# Keras versions no longer accept it in fit().
history = model.fit(
    video_train,
    labels_train,
    batch_size=32,
    epochs=80,
    shuffle=True,
    validation_data=(video_val, labels_val),
    callbacks=[reduce],
)