<a href="https://colab.research.google.com/github/manashpratim/Bosch-Summer-Internship/blob/master/Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# The data is available at https://drive.google.com/drive/folders/1NFYIaXjL8V5kvZo3g9JEafLQ3scslWic?usp=sharing

In [None]:
# Mount Google Drive at /content/drive (needs the google.colab package).
# Later cells read the dataset from, and write artifacts to, '/content/drive/My Drive/mosi_data'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Unzip the dataset archive stored on Drive into the current working directory (-q = quiet).
# Later cells read '/content/Raw/...', so the working directory is presumably /content.
!unzip -q '/content/drive/My Drive/mosi_data/mosi.zip'

In [None]:
# Function to get the file names. Inputs are the directory path and the name of the file to be saved
def get_file_names(mypath,savefile):
  """Collect the base names of the regular files in `mypath` and save them to `savefile`.

  The base name is everything before the first '.' of the file name; a name
  without any '.' is kept whole. Sub-directories are ignored.

  Args:
    mypath: directory to scan (not recursive).
    savefile: path of the text file to write, one base name per line.

  Returns:
    The list of base names, in the order returned by os.listdir.
  """
  from os import listdir
  from os.path import isfile, join
  # partition() leaves a dot-less name intact. The previous f[:f.find('.')]
  # slice turned find()'s -1 into "drop the last character".
  onlyfiles = [f.partition('.')[0] for f in listdir(mypath) if isfile(join(mypath, f))]
  with open(savefile, 'w') as out:
    out.writelines(name + '\n' for name in onlyfiles)
  return onlyfiles

In [None]:
# Directory with the segmented transcripts (presumably created by the unzip step).
# `files` holds the transcript base names; the same list is also written to 'textfile.txt'.
mypath = '/content/Raw/Transcript/Segmented'
files = get_file_names(mypath,'textfile.txt')

# **Extracting Text Data from Transcripts**

The extracted data is available at the link provided above

In [None]:
# Function to process the transcripts and save them as a pickle file.
# Arguments are location of the transcripts, name of the files and the name of the file to be saved.
def get_text_data_joined(mypath,files,savefile):
  """Read every '<name>.annotprocessed' transcript and pickle the cleaned segments.

  For each line, the text after the second '_' is kept (the whole remainder is
  kept when there are fewer underscores), stripped of surrounding whitespace
  and lower-cased.

  Args:
    mypath: directory containing the transcript files.
    files: iterable of transcript base names (without extension).
    savefile: path of the pickle file to write.

  Returns:
    Dict mapping each base name to the list of its cleaned segments, in file order.
  """
  import pickle
  dic = {}
  for file in files:
      filename = mypath+'/' + file + '.annotprocessed'
      # The context manager closes the handle; previously one handle per
      # transcript was left open.
      with open(filename, "r") as text_file:
          lines = text_file.read().split('\n')
      # Drop only the empty string produced by a trailing newline. The old
      # unconditional lines[:-1] also discarded the last real segment when the
      # file did not end with a newline.
      if lines and lines[-1] == '':
          lines.pop()
      # split('_', 2)[-1] is the text after the second '_' (or after the only
      # one, or the whole line when there is none).
      dic[file] = [line.split('_', 2)[-1].strip().lower() for line in lines]
  with open(savefile, 'wb') as handle:
      pickle.dump(dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
  return dic

In [None]:
# Run the extraction over all transcripts and store the result on Drive.
# The "Data Preprocessing" section reloads this pickle from the same path.
savefile = '/content/drive/My Drive/mosi_data/text_data_joined.pickle'
dic = get_text_data_joined(mypath,files,savefile)

# **Data Preprocessing**

In [None]:
# Load the labels and the processed transcripts saved on Drive.
# `files` is not reloaded here; the cells below still rely on the list built by get_file_names.
# NOTE(review): pickle.load can execute arbitrary code, so only load files you created or trust.
import pickle
with open('/content/drive/My Drive/mosi_data/labels_joined.pickle', 'rb') as handle:
    label= pickle.load(handle)

with open('/content/drive/My Drive/mosi_data/text_data_joined.pickle', 'rb') as handle:
    dic = pickle.load(handle)

In [None]:
# Flatten the per-file segment lists into a single numpy array,
# visiting the files in the order given by `files`.
import numpy as np
review = np.array([segment for key in files for segment in dic[key]])

In [None]:
# Flatten the per-file label lists into a single numpy array. It stays aligned
# with `review` because both walk `files` in the same order.
import numpy as np
y = np.array([value for key in files for value in label[key]])

# Binary alternative (currently disabled):
#y[y>0]=1        #Convert labels to binary
#y[y<0]=0

# Classification: cast the labels to integers. Comment this line for regression.
# NOTE(review): astype(int) truncates toward zero rather than rounding - confirm this is intended.
y=y.astype(int)
# 7-class classification (currently active): shift -3..3 to class indices 0..6.
# Comment the two lines below for binary classification or regression.
ref = {-3:0,-2:1,-1:2,0:3,1:4,2:5,3:6}
y[:] = [ref[num] for num in y]

In [None]:
# Function to generate train-test split. Arguments are text data, labels, split_size (0.8 means 80:20 train-test split) and an optional seed
def split_data(text,labels,split_size=0.8,seed=None):
  """Shuffle `text` and `labels` together and cut them into train / validation parts.

  Args:
    text: array-like of samples.
    labels: array-like of targets, same length as `text`.
    split_size: fraction of the samples placed in the training part.
    seed: optional seed for a reproducible shuffle. With the default None the
      global numpy random state is used, exactly as before.

  Returns:
    Tuple (text_train, text_val, labels_train, labels_val) of numpy arrays.

  Raises:
    ValueError: if `text` and `labels` do not have the same length.
  """
  import numpy as np
  text = np.asarray(text)
  labels = np.asarray(labels)
  # A length mismatch used to pair samples with the wrong labels silently
  # (or fail with an unrelated IndexError).
  if len(text) != len(labels):
    raise ValueError(f"text and labels must have the same length, got {len(text)} and {len(labels)}")
  train_length = int(len(labels)*split_size)
  if seed is None:
    idx = np.random.permutation(len(labels))
  else:
    idx = np.random.default_rng(seed).permutation(len(labels))
  # The same permutation is applied to both arrays so pairs stay aligned.
  text = text[idx]
  labels = labels[idx]
  return text[:train_length], text[train_length:], labels[:train_length], labels[train_length:]

In [None]:
# Get the train / validation split (80% / 20%).
# The shuffle inside split_data uses numpy's global random state, so the split changes between runs unless that state is seeded.
train_reviews,  val_reviews, train_labels, val_labels = split_data(review,y,0.8)

In [None]:
# Longest entry of `review`, measured with len(). The entries are the transcript
# strings, so this counts characters, not words or tokens.
maximum = max((len(s) for s in review), default=float('-inf'))
print(maximum)

In [None]:
# Preprocess the text data: turn each segment into a sequence of word indices and
# pad / truncate every sequence to the same length.
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NOTE(review): 581 is presumably the value printed by the previous cell, which
# measures characters, while the sequences padded here are word indices and are
# likely much shorter. The Transformer cell hard-codes the same 581, so change
# both together - TODO confirm.
max_length = 581
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
vocab_len=5000

# The tokenizer is fitted on all segments (train and validation).
tokenizer = Tokenizer(num_words=vocab_len+1,oov_token=oov_tok)
tokenizer.fit_on_texts(review)

word_index = tokenizer.word_index
vocab_size=len(word_index)
print('Size of Vocabulary: ',vocab_size)

train_sequences = tokenizer.texts_to_sequences(train_reviews)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_reviews)
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Column vectors of shape (N, 1). reshape is idempotent, whereas expand_dims added
# another axis every time this cell was re-run.
train_labels=np.reshape(train_labels, (-1, 1))
val_labels=np.reshape(val_labels, (-1, 1))

In [None]:
# Data statistics: shapes of the padded inputs and of the label arrays.
for caption, array in (
    ('Dimension of Training  Text Data: ', train_padded),
    ('Dimension of Validation Text Data: ', val_padded),
    ('Dimension of Training Labels: ', train_labels),
    ('Dimension of Validation Labels: ', val_labels),
):
    print(caption, array.shape)

In [None]:
# Download the GloVe 6B embeddings archive and store it on Drive.
# The archive is saved as 'globe6B.zip'; the unzip cell below expects that exact name.
!wget --no-check-certificate \
      "http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip"\
      -O "/content/drive/My Drive/mosi_data/globe6B.zip"

In [None]:
# Unzip the downloaded embeddings into the current working directory (-q = quiet).
# The next cell reads '/content/glove.6B.300d.txt', so the working directory is presumably /content.
!unzip -q '/content/drive/My Drive/mosi_data/globe6B.zip'

In [None]:
# Load the embeddings. There are 4 dimensions to choose from. I used 300 dimensional embeddings.
# Each line is '<word> <v1> <v2> ...'; the result maps word -> float32 vector.
embeddings_index = {}
# The encoding is explicit because the file is UTF-8 and contains non-ASCII tokens;
# relying on the platform default can raise UnicodeDecodeError outside UTF-8 locales.
with open('/content/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows without a GloVe entry stay all-zero.
embedding_dim = 300
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embeddings_matrix[row] = vector

# **Training**

In [None]:
# Function to compute F1 score. I use it as a metrics for Binary Classification.
from keras.callbacks import Callback,ModelCheckpoint
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K
def f1_score(y_true, y_pred): #taken from old keras source code
    """F1 score computed over the tensors passed in, for binary targets.

    Both inputs are clipped to [0, 1] and rounded, so predictions count as
    positive when they round to 1. K.epsilon() guards every division.
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2*(precision*recall)/(precision+recall+eps)

## **Baseline**

In [None]:
# Provide the suitable units inside the Dense layer.
# For Binary classification, use 1 and 'sigmoid' as activation
# For 7 class classification, use 7 and 'softmax' as activation
# For Regression, use 1 and remove activation

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# text only model
inp = Input(max_length)   
layer = tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights = [embeddings_matrix], trainable = False)(inp)            
layer = tf.keras.layers.Dropout(0.4)(layer)
layer = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.LSTM(128,return_sequences=True))(layer)
layer = tf.keras.layers.Bidirectional(tf.compat.v1.keras.layers.LSTM(128))(layer)
layer = tf.keras.layers.Dropout(0.2)(layer)
layer = tf.keras.layers.Dense(128, activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.4)(layer)
layer = tf.keras.layers.Dense(64, activation='relu')(layer)
layer = tf.keras.layers.Dropout(0.5)(layer)
out = tf.keras.layers.Dense(1, activation='sigmoid')(layer) 
#out = tf.keras.layers.Dense(7, activation='softmax')(layer) 
#out = tf.keras.layers.Dense(1)(layer)                 
model= Model(inp,out)                                 



In [None]:
reduce =tf. keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, mode='auto')  #to reduce learning rate by factor of 0.1 if model performance degrades for 10 (patience) epochs.  
#early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='auto')  #early stopping if performance of model degrades for 10 epochs

#Uncomment one of the next three lines at a time
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',f1_score])                #Binary classification
#model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])            #7 class classifiaction
#model.compile(loss="mean_absolute_error",optimizer='adam',metrics=["mean_absolute_error"])             #Regression

# I am training for 50 epochs with a batch size of 256. Set verbose to 2 for no training details and 0 for more training details.
num_epochs = 50
history=model.fit(train_padded, 
                    train_labels, 
                    epochs=num_epochs, 
                    batch_size=256, 
                    validation_data=(val_padded,val_labels),
                    callbacks=[reduce],
                    verbose=1)

## **Transformer**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    The input (batch_size, seq_len, embed_dim) is projected to queries, keys and
    values, split into `num_heads` heads of size embed_dim // num_heads, attended
    per head, merged back and passed through a final dense projection.
    """

    def __init__(self, embed_dim, num_heads=8):
        """Create the four dense projections.

        Raises:
            ValueError: if `embed_dim` is not divisible by `num_heads`.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return (softmax(Q K^T / sqrt(d)) V, attention weights), d being the last dim of `key`."""
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(depth)
        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, value), weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, embed_dim) to (batch_size, num_heads, seq_len, projection_dim)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to `inputs` of shape (batch_size, seq_len, embed_dim); the output has the same shape."""
        batch_size = tf.shape(inputs)[0]
        # Project, then split into heads: (batch_size, num_heads, seq_len, projection_dim)
        query = self.separate_heads(self.query_dense(inputs), batch_size)
        key = self.separate_heads(self.key_dense(inputs), batch_size)
        value = self.separate_heads(self.value_dense(inputs), batch_size)
        context, _ = self.attention(query, key, value)
        # Back to (batch_size, seq_len, num_heads, projection_dim), then merge the heads.
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.embed_dim))
        return self.combine_heads(merged)



In [None]:

class Transformer(layers.Layer):
    """Frozen-embedding -> self-attention (residual) -> 2 x BiLSTM -> pooled dense classifier.

    The output head is a 7-way softmax; the binary and regression heads are kept
    as commented alternatives in __init__.

    NOTE(review): `dropout5` is created but never used in call(), and `dropout3`
    is applied both before and after `dense1` - confirm this is intended.
    """

    def __init__(self, maxlen, embed_dim, vocab_size,embeddings_matrix,num_heads):
        """Create the sub-layers.

        Args:
            maxlen: input sequence length, passed to the Embedding as input_length.
            embed_dim: embedding width; also the width of the attention block.
            vocab_size: the Embedding is created with vocab_size+1 rows.
            embeddings_matrix: initial (non-trainable) embedding weights.
            num_heads: number of attention heads.
        """
        super().__init__()

        self.embed = layers.Embedding(vocab_size+1, embed_dim,  input_length=maxlen, weights = [embeddings_matrix], trainable = False)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)

        self.lstm1 = layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))
        self.lstm2 = layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(128,return_sequences=True))
        self.dropout1 = layers.Dropout(0.2)
        self.dropout2 = layers.Dropout(0.2)
        self.dropout3 = layers.Dropout(0.2)
        self.dropout4 = layers.Dropout(0.4)
        self.dropout5 = layers.Dropout(0.5)
        self.dropout6 = layers.Dropout(0.4)
        self.pool = layers.GlobalAveragePooling1D()
        self.dense1 = layers.Dense(128, activation="relu")
        self.dense2 = layers.Dense(64, activation="relu")
        #self.out = tf.keras.layers.Dense(1, activation="sigmoid")
        self.out = layers.Dense(7, activation="softmax")
        #self.out = tf.keras.layers.Dense(1)

    def call(self, inputs):
        """Map a batch of padded index sequences to the 7-way softmax output."""
        embedded = self.dropout6(self.embed(inputs))
        # Residual connection around the attention block.
        x = self.dropout1(embedded + self.att(embedded))
        x = self.lstm2(self.lstm1(x))
        x = self.pool(self.dropout2(x))
        x = self.dense1(self.dropout3(x))
        x = self.dense2(self.dropout3(x))
        return self.out(self.dropout4(x))



In [None]:
# Hyper-parameters are derived from the preprocessing artifacts instead of being
# hard-coded (previously 3108 / 581 / 300), so the Embedding always matches the
# shape of `embeddings_matrix` and the Input always matches `train_padded`.
vocab_size = embeddings_matrix.shape[0] - 1  # the Embedding is created with vocab_size+1 rows
maxlen = train_padded.shape[1]  # padded sequence length
embed_dim = embeddings_matrix.shape[1]  # Embedding size for each token
num_heads = 10  # Number of attention heads; must divide embed_dim

inputs = layers.Input(shape=(maxlen,))
transformer_block = Transformer(maxlen, embed_dim, vocab_size,embeddings_matrix,num_heads)
outputs = transformer_block(inputs)
model = keras.Model(inputs=inputs, outputs=outputs)



In [None]:
# Compile and train the attention model for 7-class classification.
# The learning rate is multiplied by 0.1 when val_loss has not improved for 10 epochs.
reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, mode='auto')
#model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
#model.compile(optimizer="adam", loss="mean_absolute_error")
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    train_padded,
    train_labels,
    batch_size=256,
    epochs=30,
    validation_data=(val_padded, val_labels),
    callbacks=[reduce],
)
