<a href="https://colab.research.google.com/github/manashpratim/Bosch-Summer-Internship/blob/master/AudioV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Data is available at https://drive.google.com/drive/folders/1NFYIaXjL8V5kvZo3g9JEafLQ3scslWic?usp=sharing

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under 'My Drive/mosi_data'
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np

In [None]:
# Unzip the MOSI data
!unzip -q '/content/drive/My Drive/mosi_data/mosi.zip'

# **Audio Features Extraction**

The extracted features are available in the Google Drive folder linked at the top of this notebook.

In [None]:
#Function to extract file names
def get_file_names(mypath,savefile):
    """Collect the base names of the files in a directory and save them to a text file.

    Args:
        mypath: Directory to scan. Only regular files directly inside it are
            considered; sub-directories are skipped.
        savefile: Path of the text file to (over)write with one name per line.

    Returns:
        List of names in ``os.listdir`` order, each cut at its first ``'.'``.
        A name that contains no dot is returned unchanged.
    """
    from os import listdir
    from os.path import isfile, join

    onlyfiles = []
    for entry in listdir(mypath):
        if not isfile(join(mypath, entry)):
            continue
        # str.partition keeps the whole name when there is no '.', whereas slicing
        # with str.find() == -1 would silently drop the last character.
        onlyfiles.append(entry.partition('.')[0])

    with open(savefile, 'w') as f:
        f.writelines(name + '\n' for name in onlyfiles)
    return onlyfiles

In [None]:
# Text file that will receive the list of segment names, and the directory holding the segmented WAV clips
savefile = 'audiofiles.txt'
mypath = '/content/Raw/Audio/WAV_16000/Segmented'

In [None]:
# Names of the audio segment files (extension stripped); the list is also written to `savefile`
audiofiles = get_file_names(mypath=mypath, savefile=savefile)

In [None]:
#Function to extract audio features. Arguments are path of the audio data and the names of the audiofiles 
def get_audio_features(mypath,audiofiles):
    """Extract frame-level acoustic features for every audio segment.

    Args:
        mypath: Directory containing the ``.wav`` files.
        audiofiles: Iterable of file names without the ``.wav`` extension.

    Returns:
        Dict mapping each name to a 2-D array (one row per analysis frame) whose
        columns are stacked in this order: chroma, RMS energy, spectral centroid,
        spectral bandwidth, spectral roll-off, zero-crossing rate, 40 MFCCs.
    """
    import librosa
    import numpy as np

    # Newer librosa releases expose the RMS energy feature as `feature.rms`;
    # older ones only have `feature.rmse`. Use whichever is available.
    rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse

    mydic = {}
    for file in audiofiles:
        audiofile = mypath + '/' + file + '.wav'
        x, sr = librosa.load(audiofile, sr=16000, res_type='kaiser_fast')   # Sampling frequency 16KHz
        per_frame = (
            librosa.feature.chroma_stft(y=x, sr=sr),
            rms_fn(y=x),
            librosa.feature.spectral_centroid(y=x, sr=sr),
            librosa.feature.spectral_bandwidth(y=x, sr=sr),
            librosa.feature.spectral_rolloff(y=x, sr=sr),
            librosa.feature.zero_crossing_rate(x),
            librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40),                    # 40 MFCC features
        )
        # each librosa feature is (n_features, n_frames); transpose so rows are frames
        mydic[file] = np.hstack([feat.T for feat in per_frame])
    return mydic

In [None]:
# Feature matrix for every audio segment, keyed by segment file name
audio = get_audio_features(mypath=mypath, audiofiles=audiofiles)

In [None]:
# This block changes the format of the audio features data to match the other data modalities:
# segment files named '<video>_<index>' are regrouped per video and ordered by segment index.

dic = {}        # segment file name -> video id
newdic = {}     # video id -> {segment index (int) -> feature matrix}
newdic1 = {}    # video id -> {segment file name -> feature matrix}
for key, features in audio.items():
    # Split on the LAST underscore; video ids may themselves contain underscores.
    video_id, sep, segment = key.rpartition('_')
    if not sep:
        # The old slice arithmetic silently produced a wrong id/index for such names.
        raise ValueError(f"segment name {key!r} is not of the form '<video>_<index>'")
    dic[key] = video_id
    newdic.setdefault(video_id, {})[int(segment)] = features
    newdic1.setdefault(video_id, {})[key] = features

# video id -> list of feature matrices in ascending segment order
nd = {
    video_id: [segments[index] for index in sorted(segments)]
    for video_id, segments in newdic.items()
}

In [None]:
# Persist the per-video feature lists (`nd`) to Google Drive as a pickle
import pickle

with open('/content/drive/My Drive/mosi_data/audio_features_joined.pickle', 'wb') as out_file:
    pickle.dump(nd, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# **Preprocessing the features and labels**

In [None]:
# Read back the saved audio features and the labels from Google Drive.
# NOTE: pickle.load can execute arbitrary code - only open pickles you created or trust.
import pickle

with open('/content/drive/My Drive/mosi_data/audio_features_joined.pickle', 'rb') as features_file:
    dic2 = pickle.load(features_file)

with open('/content/drive/My Drive/mosi_data/labels_joined.pickle', 'rb') as labels_file:
    labels = pickle.load(labels_file)

In [None]:
# The Transcript file names are the reference key set: every other data format is adjusted to follow them
mypath = '/content/Raw/Transcript/Segmented'
files = get_file_names(mypath=mypath, savefile='textfile.txt')

In [None]:
def preprocessing(arr,flag=False):
    """Centre each column of ``arr`` and optionally scale it to unit variance.

    Args:
        arr: Array-like; statistics are computed along axis 0.
        flag: If True, divide the centred values by the per-column standard
            deviation; if False, only subtract the mean.

    Returns:
        New array with the same shape as ``arr``. With ``flag=True``, columns
        whose standard deviation is zero come back as zeros instead of NaN/inf.
    """
    centred = arr - np.mean(arr, axis=0)
    if not flag:
        return centred
    std = np.std(arr, axis=0)
    # A constant column has std == 0; divide by 1 there so it stays at 0.
    safe_std = np.where(std == 0, 1.0, std)
    return centred / safe_std

In [None]:
# This block zero-pads (or truncates) the audio features so that every segment has the same length.
max_pad_len = 1639          # max length of a sequence. For audio_features_joined (MFCC), use this
#max_pad_len = 858          # For audio_pretrained_features_joined (VGGish), use this and comment out the line above
n_skip = 7                  # leading feature columns dropped from every segment
n_features = 50             # feature rows kept per segment; must match the model input (128 for VGGish)
# NOTE(review): for the VGGish features n_skip presumably has to be 0 so that 128 rows remain - confirm.

audio_data = []
for key in files:
    for l in dic2[key]:
        if len(l) > 0:
            # Normalise on the full segment, then clip to max_pad_len frames:
            # np.pad raises on a negative pad width, so longer segments must be truncated.
            l = preprocessing(l, flag=True)[:max_pad_len]
            pad_width = max_pad_len - l.shape[0]
            mfcc = np.pad(l.T[n_skip:], pad_width=((0, 0), (0, pad_width)), mode='constant')
            audio_data.append(mfcc)
        else:
            # Empty segment. The VGGish features data has 28 such bad frames; it does not affect the MFCC data.
            # The random placeholder uses the configured shape so it stacks with the real segments.
            audio_data.append(np.random.rand(n_features, max_pad_len))

audio_data = np.array(audio_data).reshape(len(audio_data), n_features, max_pad_len)

In [None]:
# Join all the labels into a numpy array
import numpy as np
# Choose the task here instead of commenting lines in and out.
label_mode = 'binary'        # one of 'binary', 'seven_class', 'regression'

# Concatenate the per-video label lists in the same order used to build audio_data
y = []
for key in files:
    y.extend(labels[key])
y = np.array(y)

if label_mode == 'binary':
    # scores > 0 become 1; scores <= 0 (including exactly 0) become 0
    y = (y > 0).astype(int)
elif label_mode == 'seven_class':
    # round each score to the nearest integer in [-3, 3], then shift to class ids 0..6
    # (-3 -> 0, ..., 0 -> 3, ..., 3 -> 6); rounding avoids a failed lookup on non-integer scores
    y = (np.clip(np.rint(y), -3, 3) + 3).astype(int)
elif label_mode != 'regression':
    raise ValueError(f"unknown label_mode {label_mode!r}; expected 'binary', 'seven_class' or 'regression'")
# 'regression' keeps the raw scores untouched

In [None]:
# Function to generate a shuffled train-validation split. Arguments are the audio features data, the labels,
# split_size (0.8 means 80:20 train-validation split) and an optional seed for a reproducible shuffle
def split_data(audio,labels,split_size=0.8,seed=None):
    """Shuffle features and labels together and split them into train and validation parts.

    Args:
        audio: Array-like of samples, indexed along axis 0.
        labels: Array-like of labels, one per sample.
        split_size: Fraction of the samples placed in the training part.
        seed: Optional integer. When given, the shuffle is reproducible; when
            None, NumPy's global random state is used.

    Returns:
        Tuple ``(audio_train, audio_val, labels_train, labels_val)``.

    Raises:
        ValueError: If ``audio`` and ``labels`` do not have the same length.
    """
    import numpy as np

    audio = np.asarray(audio)
    labels = np.asarray(labels)
    n_samples = len(labels)
    if len(audio) != n_samples:
        raise ValueError(f"audio has {len(audio)} samples but labels has {n_samples}")

    train_length = int(n_samples * split_size)
    if seed is None:
        idx = np.random.permutation(n_samples)
    else:
        idx = np.random.RandomState(seed).permutation(n_samples)

    # One permutation indexes both arrays so each sample stays paired with its label
    train_idx, val_idx = idx[:train_length], idx[train_length:]
    return audio[train_idx], audio[val_idx], labels[train_idx], labels[val_idx]

In [None]:
# 80:20 shuffled split of the padded audio features and their labels
audio_train, audio_val, labels_train, labels_val = split_data(audio=audio_data, labels=y, split_size=0.8)

In [None]:
# Data Statistics (these arrays hold audio features, so the messages say "Audio", not "Text")
print('Dimension of Training Audio Data: ',audio_train.shape)
print('Dimension of Validation Audio Data: ',audio_val.shape)
print('Dimension of Training Labels: ',labels_train.shape)
print('Dimension of Validation Labels: ',labels_val.shape)

# **Training**

## **Baseline**

In [None]:
# audio model
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Baseline: two Conv1D layers + max-pooling -> two bidirectional LSTMs -> dense head with dropout
inp = Input((50,1639))           # Dimensions for MFCC data. For VGGish, change it to (128,858)
# NOTE(review): Conv1D/LSTM treat axis 1 (length 50) as the sequence axis and the 1639 padded frames
# as channels, whereas the Transformer section permutes its input to (1639, 50) first - confirm the intended layout.
layer = tf.keras.layers.Conv1D(16,3,activation='relu')(inp)      # input shape comes from `inp`; no input_shape needed
layer = tf.keras.layers.Conv1D(32,3,activation='relu')(layer)
layer = tf.keras.layers.MaxPool1D(2)(layer)
layer = tf.keras.layers.Dropout(0.5)(layer)
# tf.keras.layers.LSTM replaces tf.compat.v1 CuDNNLSTM: with its default arguments it uses the
# cuDNN kernel when a GPU is available and still runs on CPU-only runtimes.
layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True))(layer)
layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(layer)
layer = tf.keras.layers.Dropout(0.5)(layer)
layer = tf.keras.layers.Dense(64,activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.5)(layer)
layer = tf.keras.layers.Dense(32,activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.5)(layer)
# Output head: keep exactly one of the three lines below active, matching the loss chosen at compile time
#out = tf.keras.layers.Dense(1,activation='sigmoid')(layer)      # binary classification
out = tf.keras.layers.Dense(7,activation='softmax')(layer)       # 7 class classification
#out = tf.keras.layers.Dense(1)(layer)                           # regression
# NOTE(review): the label cell binarises y by default while this head has 7 outputs - confirm the intended task.
model = Model(inp,out)


In [None]:
# Multiply the learning rate by 0.1 when val_loss has not improved for 10 epochs
reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=10, mode='auto'
)
#early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='auto')

# Keep exactly one of the three compile calls active at a time
#model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])                #Binary classification
model.compile(                                                                                  #7 class classification
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
#model.compile(loss="mean_absolute_error",optimizer='adam',metrics=["mean_absolute_error"])     #Regression

history = model.fit(
    audio_train,
    labels_train,
    batch_size=256,
    epochs=50,
    validation_data=(audio_val, labels_val),
    callbacks=[reduce],
)

## **Transformer**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention over a sequence.

    Args:
        embed_dim: Size of the last axis of the input and of the output.
            Must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def get_config(self):
        """Return the constructor arguments so Keras can serialise the layer."""
        config = super(MultiHeadSelfAttention, self).get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

    def attention(self, query, key, value):
        """Return ``(softmax(Q K^T / sqrt(d_k)) V, attention weights)``."""
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq_len, embed_dim) to (batch, num_heads, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention; ``inputs`` and the result are (batch_size, seq_len, embed_dim)."""
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        # each becomes (batch_size, num_heads, seq_len, projection_dim)
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output



In [None]:
class Transformer(layers.Layer):
    """Single transformer encoder block followed by pooling and a prediction head.

    The layer permutes its input from (batch, embed_dim, seq_len) to
    (batch, seq_len, embed_dim), applies self-attention and a feed-forward
    network (each with a residual connection and layer normalisation),
    averages over the sequence axis and predicts through two dense layers.

    Args:
        maxlen: Sequence length. Kept for interface compatibility; not used
            inside the layer.
        embed_dim: Feature size of every sequence step.
        num_heads: Number of attention heads.
    """

    def __init__(self, maxlen, embed_dim,num_heads):
        super(Transformer, self).__init__()

        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.pur1 = tf.keras.layers.Permute((2, 1))
        self.ffn = keras.Sequential(
            [layers.Dense(64, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(0.2)
        self.dropout2 = layers.Dropout(0.5)
        self.dropout3 = layers.Dropout(0.5)
        self.pool = tf.keras.layers.GlobalAveragePooling1D()
        self.dense = tf.keras.layers.Dense(64, activation="relu")
        # Output head: keep exactly one of the three lines below active
        self.out = tf.keras.layers.Dense(1, activation="sigmoid")      # binary classification
        #self.out = tf.keras.layers.Dense(7, activation="softmax")     # 7 class classification
        #self.out = tf.keras.layers.Dense(1)                           # regression

    def call(self, inputs, training=None):
        """Run the block; ``training`` is forwarded to the dropout layers."""
        x = self.pur1(inputs)                                   # swap the two non-batch axes
        attn_output = self.dropout1(self.att(x), training=training)
        out1 = self.layernorm1(x + attn_output)
        # Bug fix: the dropout2 result used to be overwritten before use, so the
        # feed-forward branch was never regularised. Feed it into the residual sum.
        ffn_output = self.dropout2(self.ffn(out1), training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        pooled = self.pool(out2)
        hidden = self.dropout3(self.dense(pooled), training=training)
        return self.out(hidden)



In [None]:
# Hyper-parameters of the transformer model
num_heads = 10  # Number of attention heads
embed_dim = 50  # Embedding size for each token
maxlen = 1639

# Wire the custom block between an Input tensor and a Keras Model
inputs = layers.Input(shape=(50, maxlen))
transformer_block = Transformer(maxlen=maxlen, embed_dim=embed_dim, num_heads=num_heads)
outputs = transformer_block(inputs)
model = keras.Model(inputs=inputs, outputs=outputs)


In [None]:
# Multiply the learning rate by 0.1 when val_loss has not improved for 10 epochs
reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=10, mode='auto'
)
#early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='auto')

# Keep exactly one of the three compile calls active at a time
model.compile(                                                                                  #Binary classification
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
#model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])    #7 class classification
#model.compile(loss="mean_absolute_error",optimizer='adam')                                     #Regression

history = model.fit(
    audio_train,
    labels_train,
    batch_size=12,
    epochs=50,
    validation_data=(audio_val, labels_val),
    callbacks=[reduce],
)


#model_save_filename = "audio_model.h5"

#earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
#mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
#    model_save_filename, monitor="val_accuracy", save_best_only=True
#)
