In [None]:
# Mount Google Drive at /content/drive (Colab only); the next cell changes into a folder under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/'Colab_Notebooks'/mmser

In [None]:
!pip install transformers
!pip install transformers[torch]
!pip install tensorboardX
!pip install pytorch_lightning

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from transformers import BertTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data
from tensorboardX import SummaryWriter
from torchvggish import vggish, vggish_input
import sys
import random
import csv
from transformers import BertForSequenceClassification
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from datetime import datetime

# Take one snapshot so all six fields describe the same instant; separate
# now() calls could straddle a minute/day/year boundary.
_now = datetime.now()

currentSecond = _now.second
currentMinute = _now.minute
currentHour = _now.hour

currentDay = _now.day
currentMonth = _now.month
currentYear = _now.year

In [None]:
def file_search(dirname, ret, list_avoid_dir=None):
    '''
    Recursively collect the path of every file under a directory.

    dirname        : path that need to be searched
    ret            : list that receives the files found in dirname (recursive);
                     it is modified in place, nothing is returned
    list_avoid_dir : names of sub-directories that are skipped entirely
                     (default None: skip nothing)
    usage          :
        list_files = []
        file_search(dirname, list_files)
    '''
    # None instead of a shared mutable [] default.
    avoid = () if list_avoid_dir is None else list_avoid_dir

    for filename in os.listdir(dirname):
        full_filename = os.path.join(dirname, filename)

        if os.path.isdir(full_filename):
            # Compare the entry name directly; no need to split the joined
            # path on '/', which also would not work with other separators.
            if filename not in avoid:
                file_search(full_filename, ret, list_avoid_dir)
        else:
            ret.append(full_filename)

            

def find_encoding(filename):
    '''
    Guess the character encoding of a file with chardet.

    filename : filename (inc. path) that will be inspected
    returns  : the 'encoding' entry of chardet's detection result
    '''
    # chardet is not part of the notebook's import cell, so the original body
    # raised NameError; import it where it is used.
    import chardet

    # Read inside a context manager so the handle is closed.
    with open(filename, 'rb') as f:
        rawdata = f.read()

    return chardet.detect(rawdata)['encoding']
            
def create_folder(dir_name):
    '''
    Create a directory, including missing parents, unless the path already exists.

    dir_name : dir_name (inc. path) that will be created ( full-path name )
    '''
    if os.path.exists(dir_name):
        return
    os.makedirs(dir_name)

In [None]:
def extract_trans(list_in_file, out_file):
    '''
    Append (utterance ID, transcription) rows to a CSV file.

    list_in_file : transcription files to read; each line is expected to look
                   like "<ID> <anything>: <text>"
    out_file     : CSV file the rows are appended to (no header row is written)

    Lines whose ID does not start with 'Ses', whose ID has 'XX' just before its
    last character, or that contain no ':' at all are skipped.
    '''
    # newline='' is what the csv module asks for on files given to csv.writer.
    with open(out_file, 'a', newline='') as f2:
        csv_writer = csv.writer(f2)

        for in_file in list_in_file:
            with open(in_file, 'r') as f1:
                lines = sorted(f1.readlines())      # sort based on first element

            for line in lines:
                # Split on the first ':' only, so a transcription that itself
                # contains ':' is kept whole instead of being truncated.
                head, sep, trans = line.partition(':')
                name = head.split(' ')[0].strip()

                # unwanted case
                if name[:3] != 'Ses':             # noise transcription such as reply  M: sorry
                    continue
                if name[-3:-1] == 'XX':           # we don't have matching pair in label
                    continue
                if not sep:                       # no ':' -> no transcription on this line
                    continue

                csv_writer.writerow([name, trans.strip()])

In [None]:
# [schema] ID, transcriptions [csv]

TRANS_CSV = 'processed_trans.csv'

list_files = []

for x in range(5):
    sess_name = 'Session' + str(x+1)

    path = 'data/IEMOCAP_full_release/' + sess_name + '/dialog/transcriptions/'
    file_search(path, list_files)
    list_files = sorted(list_files)

    # list_files accumulates across sessions, so this is a running total
    print (sess_name + ", #sum files: " + str(len(list_files)))

# extract_trans appends to its output file. Remove the output of any previous
# run first, otherwise re-running this cell duplicates every row.
if os.path.exists(TRANS_CSV):
    os.remove(TRANS_CSV)

extract_trans(list_files, TRANS_CSV)

In [None]:
# processed_trans.csv is written without a header row, so say so explicitly and
# name the columns here. With the default header inference the first utterance
# was consumed as column names and silently dropped.
headerList = ['sessionID', 'text']
file = pd.read_csv("processed_trans.csv", header=None, names=headerList)

# write the same rows back out, now with a header line
file.to_csv("processed_trans_head.csv", index=False)

In [None]:
# Emotion codes that are accepted as labels.
list_category = ['ang', 'hap', 'sad', 'neu', 'fru', 'exc',
                 'fea', 'sur', 'dis', 'oth', 'xxx']

# Map each code to the position at which it was first seen.
category = {}
for c_type in list_category:
    category.setdefault(c_type, len(category))

In [None]:
def find_category(lines):
    '''
    Extract [ID, category] pairs from the lines of one evaluation file.

    The first line of each blank-line-separated block is read as tab-separated
    fields: field 1 is the ID and field 2 is the category. All other lines of
    a block are ignored.

    lines   : list of text lines (header already removed by the caller)
    returns : list of [id, category] lists, in file order

    Prints an error and exits (sys.exit) when a block's first line has fewer
    than three tab-separated fields or its category is not a key of `category`.
    '''
    list_ret = []
    is_target = True      # True while the next non-blank line starts a new block

    for line in lines:
        is_separator = (line == '\n')

        if not is_target:
            # inside a block: skip everything up to the blank separator line
            if is_separator:
                is_target = True
            continue

        if is_separator:
            # tolerate extra blank lines between blocks instead of aborting
            continue

        fields = line.split('\t')
        try:
            utt_id = fields[1].strip()     #  extract ID
            c_label = fields[2].strip()    #  extract category
        except IndexError:
            # Only the malformed-line case is caught here. A bare `except:`
            # would also intercept the SystemExit raised below.
            print("ERROR ", line)
            sys.exit()

        if c_label not in category:
            print("ERROR nokey ", c_label)
            sys.exit()

        list_ret.append([utt_id, c_label])
        is_target = False

    return list_ret

In [None]:
def extract_labels(list_in_file, out_file):
    '''
    Append the [ID, category] rows of every evaluation file to a CSV file.

    list_in_file : evaluation files to read; the first two lines of each file
                   are dropped as header before parsing with find_category
    out_file     : CSV file the rows are appended to (no header row is written)

    Rows are sorted within each input file before they are written.
    '''
    # newline='' is what the csv module asks for on files given to csv.writer.
    with open(out_file, 'a', newline='') as f2:
        csv_writer = csv.writer(f2)

        for in_file in list_in_file:
            with open(in_file, 'r') as f1:
                lines = f1.readlines()[2:]                # remove head

            # sort based on first element
            csv_writer.writerows(sorted(find_category(lines)))

In [None]:
# [schema] ID, label [csv]

LABELS_CSV = "processed_labels.csv"

list_files = []
list_avoid_dir = ['Attribute', 'Categorical', 'Self-evaluation']

for x in range(5):
    sess_name = 'Session' + str(x+1)

    path = 'data/IEMOCAP_full_release/' + sess_name + '/dialog/EmoEvaluation/'
    file_search(path, list_files, list_avoid_dir)
    list_files = sorted(list_files)

    # list_files accumulates across sessions, so this is a running total
    print(sess_name + ", #sum files: " + str(len(list_files)))

# extract_labels appends to its output file. Remove the output of any previous
# run first, otherwise re-running this cell duplicates every row.
if os.path.exists(LABELS_CSV):
    os.remove(LABELS_CSV)

extract_labels(list_files, LABELS_CSV)

In [None]:
# processed_labels.csv is written without a header row, so say so explicitly
# and name the columns here. With the default header inference the first
# labelled utterance was consumed as column names and silently dropped.
headerList = ['sessionID', 'label']
file = pd.read_csv("processed_labels.csv", header=None, names=headerList)

# write the same rows back out, now with a header line
file.to_csv("processed_labels_head.csv", index=False)

In [None]:
dfl = pd.read_csv('processed_labels_head.csv')

# Emotion code -> numeric class, applied in this order.
# 'hap' and 'exc' share class 1; -1 marks the codes that are not kept as classes.
label_to_class = {
    "ang": 0,
    "hap": 1,
    "exc": 1,
    "sad": 2,
    "neu": 3,
    "fru": -1,
    "fea": -1,
    "sur": -1,
    "dis": -1,
    "oth": -1,
    "xxx": -1,
}
for emotion_code, class_id in label_to_class.items():
    dfl.loc[dfl["label"] == emotion_code, "label"] = class_id
dfl.head(10)

In [None]:
# Persist the numerically encoded labels (header row included, no index column).
dfl.to_csv("processed_digital_labels_head.csv", index=False)

In [None]:
# Load the transcription table and the numeric-label table.
data1 = pd.read_csv('processed_trans_head.csv')
data2 = pd.read_csv('processed_digital_labels_head.csv')

# Inner join on sessionID: only IDs present in both tables survive.
translabels = data1.merge(data2, how='inner', on='sessionID')

translabels.to_csv("processed_trans_labels_head.csv", index=False)

In [None]:
# Collect every file under the sentence-level wav folder of the five sessions.
list_files = []
for x in range(5):
    sess_name = 'Session{}'.format(x + 1)
    path = 'data/IEMOCAP_full_release/{}/sentences/wav/'.format(sess_name)
    file_search(path, list_files)
    list_files = sorted(list_files)
    # running total, since list_files accumulates across sessions
    print ("{}, #sum files: {}".format(sess_name, len(list_files)))

In [None]:
# Merged table with one row per utterance: sessionID, text, label.
df=pd.read_csv('processed_trans_labels_head.csv')
df.head()

In [None]:
# Build one record per usable utterance: file name, text, VGGish input batch, label.
docs = []
index = 0          # utterances kept
extraLabel = 0     # utterances dropped because their label is -1
missingRows = 0    # wav files that have no row in df

for everyFile in list_files:
    base_name = os.path.basename(everyFile)
    if not base_name.endswith('.wav'):
        continue

    # splitext removes exactly the extension. str.strip('.wav') strips any of
    # the characters '.', 'w', 'a', 'v' from both ends and can eat into the ID.
    filename = os.path.splitext(base_name)[0]

    # One lookup instead of two, and no IndexError when the wav has no row.
    rows = df.loc[df['sessionID'] == filename]
    if rows.empty:
        missingRows += 1
        continue
    lable = rows['label'].values[0]
    text = rows['text'].values[0]

    if lable == -1:
        extraLabel = extraLabel + 1
        continue

    input_batch = vggish_input.wavfile_to_examples(everyFile)

    # Keep only 4-D batches holding more than one example; everything else is skipped.
    if (len(input_batch.size()) == 4) and (input_batch.size(dim=0) > 1):
        docs.append({
            'fileName': filename,
            'text': text,
            'sprectrome': input_batch,
            'label': lable
        })
        index += 1

In [None]:
# Shuffle once with a dedicated, seeded generator so the 80/20 split is
# reproducible across runs and the global `random` state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(docs)

total_length=len(docs)
train_length=int(.8*total_length)
train_list=docs[0:train_length]
test_list=docs[train_length:]
print('no of items for train ',len(train_list))
print('no of items for test ',len(test_list))
# no of items for train  4424
# no of items for test  1107

In [None]:
# Write data
# Context managers close the files even if pickling raises.
with open("train_data.pkl", "wb") as train_file:
    pickle.dump(train_list, train_file)

with open("test_data.pkl", "wb") as test_file:
    pickle.dump(test_list, test_file)

In [None]:
# Read data
# Context managers close the handles, which the original never did.
# NOTE: pickle.load executes code from the file; only load pickles written by this notebook.
with open("train_data.pkl", "rb") as train_file:
    train_list = pickle.load(train_file)

print(train_list[0])

with open("test_data.pkl", "rb") as test_file:
    test_list = pickle.load(test_file)

print(test_list[0])

In [None]:
# Single-slot buffer that always holds the most recent output seen by hook_text.
outputs_text = []

def hook_text(module, input, output):
    """Forward hook: store `output` as the only element of `outputs_text`.

    The list object is updated in place and nothing is returned.
    """
    outputs_text[:] = [output]

In [None]:
# # Create Text-only model
# class Text_Only(nn.Module):
#     def __init__(self, num_classes=4):
#         super(MMSER, self).__init__()
#         self.num_classes=num_classes
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         self.text_model= BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes, return_dict=True)        
        
#         self.text_model.bert.pooler.register_forward_hook(hook_text)
        
#         for param in self.text_model.parameters():
#           param.requires_grad = False

#         self.dropout = nn.Dropout(.5)
#         self.linear1 = nn.Linear(768, 64)
#         self.linear2 = nn.Linear(64, num_classes)

#         self.softmax = nn.Softmax(dim=1)
        
#     def forward(self,text,audio):
#         self.text_model(text)
#         text_embed=outputs_text[0]
#         text_embed = self.linear1(text_embed)
#         # print("T", text_embed.shape)
#         x=self.dropout(concat_embded)
#         x=self.linear2(x)
#         return x

In [None]:
# # Create Audio-only model
# class Audio_Only(nn.Module):
#     def __init__(self, num_classes=4):
#         super(MMSER, self).__init__()
#         self.num_classes=num_classes
#         self.audio_model= vggish()
        
#         self.linear1 = nn.Linear(128, 64)
#         self.linear2 = nn.Linear(64, num_classes)

#         self.softmax = nn.Softmax(dim=1)
        
#     def forward(self,text,audio):
#         audio_embed = self.audio_model(audio)
#         # print("A1", audio_embed.shape)
#         if (len(audio_embed.size()) == 1):
#             audio_embed = torch.unsqueeze(audio_embed, dim=0)
#         elif (audio_embed.shape[0] == 1) and (audio_embed.shape[1] == 128):
#             audio_embed = audio_embed
#         elif (audio_embed.shape[0] > 1) and (audio_embed.shape[1] == 128):
#             audio_embed=torch.sum(audio_embed, dim=0)
#             audio_embed = torch.unsqueeze(audio_embed, dim=0)
#         # print("A2", audio_embed.shape)
#         x=self.linear1(audio_embed)
#         x=self.linear2(x)
#         return x

In [None]:
# Create Multi-modal model
class MMSER(nn.Module):
    """Emotion classifier that fuses a BERT text embedding with a VGGish audio embedding.

    The pooled BERT output (768-d) is projected to 128-d, concatenated with a
    128-d audio embedding, passed through dropout and two linear layers, and
    returned as `num_classes` raw scores (no softmax is applied in forward).

    All parameters of the BERT model are frozen (requires_grad=False).
    """

    def __init__(self, num_classes=4):
        super(MMSER, self).__init__()
        self.num_classes=num_classes
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.text_model= BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes, return_dict=True)
        self.audio_model= vggish()

        # Freeze the text encoder; only the audio model and the layers below can train.
        # NOTE(review): model.train() still switches BERT's dropout on - confirm that is intended.
        for param in self.text_model.parameters():
          param.requires_grad = False

        self.dropout = nn.Dropout(.5)
        self.linear1 = nn.Linear(768, 128)     # text: 768 -> 128
        self.linear2 = nn.Linear(256, 64)      # fused: 128 (text) + 128 (audio) -> 64
        self.linear3 = nn.Linear(64, num_classes)

        # Kept for compatibility; forward() does not apply it.
        self.softmax = nn.Softmax(dim=1)

    def forward(self,text,audio):
        """Return class scores for one utterance.

        text  : token-id tensor passed to the BERT encoder
        audio : input batch passed to the VGGish model
        """
        # Read the pooled output straight from the encoder instead of running
        # the unused classification head and passing the tensor through a
        # module-global list filled by a forward hook.
        text_embed = self.text_model.bert(text, return_dict=True).pooler_output
        text_embed = self.linear1(text_embed)

        audio_embed = self.audio_model(audio)
        if audio_embed.dim() == 1:
            # a single embedding vector: add the batch dimension
            audio_embed = torch.unsqueeze(audio_embed, dim=0)
        elif (audio_embed.shape[0] > 1) and (audio_embed.shape[1] == 128):
            # several embeddings for one utterance: sum them into one row
            audio_embed = torch.sum(audio_embed, dim=0)
            audio_embed = torch.unsqueeze(audio_embed, dim=0)
        # any other shape (including a single (1, 128) row) is used unchanged

        concat_embded=torch.cat((text_embed,audio_embed),1)
        x=self.dropout(concat_embded)
        x=self.linear2(x)
        x=self.linear3(x)
        return x

In [None]:
# Pick the model variant to train; only the multi-modal one is active.
# model = Text_Only(num_classes=4) # Only text embedding
# model = Audio_Only(num_classes=4) # Only audio embedding
model=MMSER(num_classes=4) # Multi-modal with both text and audio embeddings
# Move the model to the selected device (as the last expression this also displays it).
model.to(device)

In [None]:
# # Check the designed model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# label1=train_list[100]['label']
# # label1 = int(label1)
# text=train_list[100]['text']
# input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
# input_ids=input_ids.to(device)
# label1=torch.tensor([label1])
# label1=label1.to(device)
# sprectrome=train_list[100]['sprectrome']
# sprectrome=sprectrome.to(device)
# model.to(device)
# model.eval()
# with torch.no_grad():
#     output = model(input_ids,sprectrome)
#     print(output)

In [None]:
import torch.optim as optim
# Adam over all model parameters, learning rate 1e-4.
optimizer = optim.Adam(params=model.parameters(), lr=0.0001)
# Cross-entropy loss on the model's output scores.
criterion = nn.CrossEntropyLoss()
# Multiply the learning rate by 0.1 every 30 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
# tensorboardX writer; scalars are logged under logs/.
writer = SummaryWriter(log_dir='logs/')

In [None]:
# # Train Text-only model
# start_epoch = 0
# total_steps = 1
# NUM_EPOCHS=101
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model.train()
# model.to(device)
# for epoch in range(start_epoch, NUM_EPOCHS):
#   print("*"*80)
#   print("Epochs:", epoch)
#   print("*"*80)
#   lr_scheduler.step()
#   random.shuffle(train_list)
#   for every_trainlist in train_list:
#     label1=every_trainlist['label']
#     label1 = int(label1)
#     text=every_trainlist['text']
#     label1=torch.tensor([label1])
#     optimizer.zero_grad()
#     input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0) 
#     label1=label1.to(device)
#     input_ids=input_ids.to(device)
#     output = model(input_ids)
#     loss = criterion(output, label1)
#     print('loss',loss.item())
#     loss.backward()
#     optimizer.step()
#     _, preds = torch.max(output, 1)
#     accuracy = torch.sum(preds == label1)
#     print('accuracy.item()',accuracy.item())
#     if total_steps % 10 == 0:
#       with torch.no_grad():
#         _, preds = torch.max(output, 1)
#         accuracy = torch.sum(preds == label1)
#         writer.add_scalar('loss', loss.item(), total_steps)
#         writer.add_scalar('accuracy', accuracy.item(), total_steps)                     
#     total_steps+=1
#   if epoch % 5 == 0:
#     model_version = "bert_fc768-64_text_{}-{}-{}-{}.pt".format(epoch, currentMonth, currentDay, currentYear)
#     torch.save(model, os.path.join("models", model_version))

In [None]:
# # Train Audio-only model
# start_epoch = 0
# total_steps = 1
# NUM_EPOCHS=101
# model.train()
# model.to(device)
# for epoch in range(start_epoch, NUM_EPOCHS):
#   print("*"*80)
#   print("Epochs:", epoch)
#   print("*"*80)
#   lr_scheduler.step()
#   random.shuffle(train_list)
#   for every_trainlist in train_list:
#     label1=every_trainlist['label']
#     label1 = int(label1)
#     label1=torch.tensor([label1])
#     sprectrome=every_trainlist['sprectrome']
#     if(sprectrome.shape[2]>65):
#       optimizer.zero_grad()
#       sprectrome = sprectrome.to(device)
#       label1=label1.to(device)
#       output = model(sprectrome)
#       loss = criterion(output, label1)
#       print('loss',loss.item())
#       loss.backward()
#       optimizer.step()
#       _, preds = torch.max(output, 1)
#       accuracy = torch.sum(preds == label1)
#       print('accuracy.item()',accuracy.item())
#       if total_steps % 10 == 0:
#         with torch.no_grad():
#           _, preds = torch.max(output, 1)
#           accuracy = torch.sum(preds == label1)
#           writer.add_scalar('loss', loss.item(), total_steps)
#           writer.add_scalar('accuracy', accuracy.item(), total_steps)                     
#       total_steps+=1
#   if epoch % 5 == 0:
#     model_version = "vggish_fc128-64_audio_{}-{}-{}-{}.pt".format(epoch, currentMonth, currentDay, currentYear)
#     torch.save(model, os.path.join("models", model_version))

In [None]:
# Train Multi-modal model
start_epoch = 0
total_steps = 1
NUM_EPOCHS=101
MODEL_DIR = "models"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# torch.save below fails if the checkpoint folder does not exist yet.
os.makedirs(MODEL_DIR, exist_ok=True)

model.train()
model.to(device)
for epoch in range(start_epoch, NUM_EPOCHS):
  print("*"*80)
  print("Epochs:", epoch)
  print("*"*80)
  random.shuffle(train_list)
  for every_trainlist in train_list:
    sprectrome=every_trainlist['sprectrome']
    # Samples whose third dimension is 65 or less are skipped.
    # NOTE(review): presumably too small for the audio model - confirm the threshold.
    if sprectrome.shape[2] <= 65:
      continue

    label1=torch.tensor([int(every_trainlist['label'])]).to(device)
    text=every_trainlist['text']
    input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0).to(device)
    sprectrome = sprectrome.to(device)

    optimizer.zero_grad()
    output = model(input_ids,sprectrome)
    loss = criterion(output, label1)
    print('loss',loss.item())
    loss.backward()
    optimizer.step()

    _, preds = torch.max(output, 1)
    accuracy = torch.sum(preds == label1)
    print('accuracy.item()',accuracy.item())

    # Log every 10th step, reusing the values computed above.
    if total_steps % 10 == 0:
      writer.add_scalar('loss', loss.item(), total_steps)
      writer.add_scalar('accuracy', accuracy.item(), total_steps)
    total_steps+=1

  # Step the scheduler once per epoch, after the optimizer updates. Stepping
  # before them triggers a PyTorch warning and brings the decay forward by one epoch.
  lr_scheduler.step()

  if epoch % 5 == 0:
    model_version = "vggish_bert_f4fc364_{}-{}-{}-{}.pt".format(epoch, currentMonth, currentDay, currentYear)
    torch.save(model, os.path.join(MODEL_DIR, model_version))

In [None]:
# Load trained model
# ## Load Text-only model
# model_version = "bert_fc768-64_text_{}-{}-{}-{}.pt".format(epoch, currentMonth, currentDay, currentYear)
# model=torch.load(os.path.join("models", model_version))
# ## Load Audio-only model
# model_version = "vggish_fc128-64_audio_{}-{}-{}-{}.pt".format(epoch, currentMonth, currentDay, currentYear)
# model=torch.load(os.path.join("models", model_version))
## Load Multi-modal model
# Uses `epoch` as left by the training loop and the date fields captured when
# the notebook started, so this reloads the last checkpoint saved in this session.
model_version = "vggish_bert_f4fc364_{}-{}-{}-{}.pt".format(epoch, currentMonth, currentDay, currentYear)
# The checkpoint is a fully pickled nn.Module, not a state_dict. Recent PyTorch
# defaults torch.load to weights_only=True, which rejects it, so opt out explicitly.
# Only do this for checkpoints written by this notebook.
# map_location lets a GPU-trained checkpoint load on a CPU-only runtime.
model=torch.load(os.path.join("models", model_version), map_location=device, weights_only=False)

In [None]:
# # Text-only prediction
# y_actu=[]
# y_pred=[]
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model.to(device)
# model.eval()
# for every_test_list in test_list:
#     label1=every_test_list['label']
#     label1=torch.tensor([label1])
#     label1 = label1.to(device)
#     text=every_test_list['text']
#     input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
#     input_ids = input_ids.to(device)
#     with torch.no_grad():
#       output = model(input_ids)
#       _, preds = torch.max(output, 1)
#       y_actu.append(label1.cpu().numpy()[0])
#       y_pred.append(preds.cpu().numpy()[0])

In [None]:
# # Audio-only prediction
# y_actu=[]
# y_pred=[]
# model.to(device)
# model.eval()
# for every_test_list in test_list:
#     label1=every_test_list['label']
#     label1=torch.tensor([label1])
#     label1 = label1.to(device)
#     sprectrome=every_test_list['sprectrome']
#     sprectrome = sprectrome.to(device)
#     with torch.no_grad():
#       if(sprectrome.shape[2]>65):
#         output = model(sprectrome)
#         _, preds = torch.max(output, 1)
#         y_actu.append(label1.cpu().numpy()[0])
#         y_pred.append(preds.cpu().numpy()[0])

In [None]:
# Multi-modal prediction
# Collect ground-truth and predicted class ids for every usable test sample.
y_actu, y_pred = [], []
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()
for every_test_list in test_list:
    label1 = torch.tensor([every_test_list['label']]).to(device)
    sprectrome = every_test_list['sprectrome'].to(device)
    text = every_test_list['text']
    input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0).to(device)

    # same size filter as used during training
    if sprectrome.shape[2] <= 65:
        continue

    with torch.no_grad():
        output = model(input_ids, sprectrome)
        preds = torch.max(output, 1)[1]
        y_actu.append(label1.cpu().numpy()[0])
        y_pred.append(preds.cpu().numpy()[0])

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Fix the class order and size explicitly (0=ang, 1=hap/exc, 2=sad, 3=neu).
# Without `labels`, a class missing from y_actu and y_pred shrinks the matrix
# and it no longer lines up with the four tick labels used in the plot.
cm = confusion_matrix(y_actu, y_pred, labels=[0, 1, 2, 3])
print(cm)

In [None]:
# Row-normalise the confusion matrix to percentages of each ground-truth class.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cmn = (cm.astype('float') / row_totals)*100

emotion_names = ["Anger", "Happiness", "Sadness", "Neutral"]

_, ax = plt.subplots(figsize=(8, 5.5))
sns.heatmap(
    cmn,
    cmap='flare',
    annot=True,
    square=True,
    linecolor='black',
    linewidths=0.75,
    ax=ax,
    fmt='.2f',
    annot_kws={'size': 16},
)

# x axis: predictions
ax.set_xlabel('Predicted', fontsize=18, fontweight='bold')
ax.xaxis.set_label_position('bottom')
ax.xaxis.set_ticklabels(emotion_names, fontsize=16)

# y axis: ground truth
ax.set_ylabel('Ground Truth', fontsize=18, fontweight='bold')
ax.yaxis.set_ticklabels(emotion_names, fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score

In [None]:
# balanced_accuracy_score is the mean of the per-class recalls, i.e. the
# unweighted accuracy (UA), not the weighted one.
ua = balanced_accuracy_score(y_actu, y_pred)
print("UA (balanced accuracy):", ua)

In [None]:
# accuracy_score is the overall fraction of correct predictions, i.e. the
# weighted accuracy (WA), not the unweighted one.
wa = accuracy_score(y_actu, y_pred)
print("WA (overall accuracy):", wa)

In [None]:
# End-of-notebook marker, useful to confirm that a full run reached the last cell.
print("Done!!!")