In [98]:
import numpy as np
import librosa
import pickle
import pandas as pd
import tqdm

In [99]:
import torch
from torch import nn
from torch.utils.data import DataLoader,Dataset
from torch.utils.tensorboard import SummaryWriter

# Dataset

In [3]:
final_data_train = {}

with open("./final_data_train_concat.pickle","rb") as fp:
    final_data_train = pickle.load(fp)
    del fp

In [4]:
final_data_train

{'Ses04M_script03_2_F052': {'segments': [array([-0.03213338, -0.02077687, -0.01333584, ..., -0.00077339,
          -0.00042809,  0.        ], dtype=float32)],
  'text': ['Swine'],
  'label': 1},
 'Ses05M_script02_2_F018': {'segments': [array([-0.0003475 , -0.00065664, -0.00104443, ..., -0.00012552,
           0.00022793,  0.00048957], dtype=float32)],
  'text': ['No'],
  'label': 2},
 'Ses04F_impro03_F010': {'segments': [array([0.00287569, 0.00692671, 0.00875427, ..., 0.02788722, 0.02835071,
          0.03040545], dtype=float32),
   array([-0.00261466,  0.00051322,  0.00024712, ...,  0.08525971,
           0.06879666,  0.04203715], dtype=float32),
   array([0.08482464, 0.09976883, 0.114141  , ..., 0.03879187, 0.03429671,
          0.02896774], dtype=float32),
   array([-0.02770461, -0.04016051, -0.03893214, ..., -0.00411754,
          -0.00448682, -0.00343156], dtype=float32),
   array([-0.11022681, -0.12808537, -0.13872465, ...,  0.07116222,
           0.04363715, -0.02430514], dtype=

In [70]:
max_size = 0
min_size = float("inf")
for value in final_data_train.values():
    for segment in value["segments"]:
        max_size = max(segment.shape[0],max_size)
        min_size = min(segment.shape[0],min_size)

In [71]:
max_size,min_size

(113501, 6)

In [5]:
final_data_train["Ses05M_script02_2_F018"]["segments"][0].shape

(27200,)

In [6]:
final_data_test = {}

with open("./final_data_test_concat.pickle","rb") as fp:
    final_data_test = pickle.load(fp)
    del fp

In [67]:
max_size = 0
for value in final_data_test.values():
    for segment in value["segments"]:
        max_size = max(segment.shape[0],max_size)

In [68]:
max_size

74320

In [7]:
df_train = pd.read_csv("./data/train.csv")
train_Ids = df_train["ID"].tolist()
train_sentences = df_train["Text"].tolist()
train_labels = df_train["Emotion"].tolist()

In [127]:
for id in train_Ids:
    if isinstance(final_data_train[id]["label"],str):
        print(final_data_train[id]["label"])

In [292]:
df_test = pd.read_csv("./data/test.csv")
test_ids = df_test["ID"].to_list()
test_sentences = df_test["Text"].to_list()
test_labels = df_train["Emotion"].tolist()

In [291]:
final_data_test["Ses02F_impro04_M000"]["label"]

'Ses04M_script03_2_F052'

## Glove word embeddings

In [9]:
import gensim.downloader as api

In [10]:
wv = api.load('word2vec-google-news-300')

In [161]:
train_embeddings = []
train_audio_features = []
new_shape = (int(113501/2),)

for id in tqdm.tqdm(train_Ids):
    audio_data = final_data_train[id]["segments"]
    sentence = final_data_train[id]["text"]
    features = []

    embeddings = [wv[token] if token in wv else [0.0]*300 for token in sentence]
    for audio in audio_data:
        pad_width = new_shape[0] - audio.shape[0]
        if pad_width > 0:
            y = np.pad(audio, (0, pad_width), mode='constant', constant_values=0)
        elif pad_width<0:
            y = audio[:new_shape[0]]
        features.append(librosa.feature.mfcc(y=y, sr=16000, n_mfcc=128))
    
    train_embeddings.append(embeddings)
    train_audio_features.append(features)
    




100%|██████████| 3592/3592 [03:08<00:00, 19.10it/s]


In [162]:
test_embeddings = []
test_audio_features = []
new_shape = (int(113501/2),)

for id in tqdm.tqdm(test_ids):
    audio_data = final_data_test[id]["segments"]
    sentence = final_data_test[id]["text"]
    features = []

    embeddings = [wv[token] if token in wv else [0.0]*300 for token in sentence]
    for audio in audio_data:
        pad_width = new_shape[0] - audio.shape[0]
        if pad_width > 0:
            y = np.pad(audio, (0, pad_width), mode='constant', constant_values=0)
        elif pad_width<0:
            y = audio[:new_shape[0]]
        features.append(librosa.feature.mfcc(y=y, sr=16000, n_mfcc=128))
    
    test_embeddings.append(embeddings)
    test_audio_features.append(features)
    




100%|██████████| 898/898 [00:53<00:00, 16.64it/s]


In [164]:
train_data_concatenated = {"embeddings":train_embeddings,"features":train_audio_features}
test_data_concatenated = {"embeddings":test_embeddings,"features":test_audio_features}

In [165]:
with open("train_data_concatenated.pickle","wb") as fp:
    pickle.dump(train_data_concatenated,fp)
    del fp

In [166]:
with open("test_data_concatenated.pickle","wb") as fp:
    pickle.dump(test_data_concatenated,fp)
    del fp

In [273]:
for label in train_labels:
    if isinstance(label,str):
        print(label)

In [293]:
class ConcatenatedDataset(Dataset):
    def __init__(self,audio_data:list,embeddings:list,ids:list,audio_dict:dict,labels:list) -> None:
        self.audio_data = audio_data
        self.ids = ids
        self.embeddings = embeddings
        self.audio_dict = audio_dict
        self.labels = labels
    
    def __getitem__(self, index):
        id_index = self.ids[index]
        embeddings = self.embeddings[index]
        embeddings = torch.tensor(embeddings,dtype=torch.float)
        features = self.audio_data[index]
        features = torch.tensor(features,dtype=torch.float)
        features = features.view(features.shape[0],-1)
        label = self.labels[index]
        

        return torch.cat([embeddings,features],dim=1),torch.tensor(label,dtype=torch.long)
    

    def __len__(self):
        return len(self.ids)        
        

In [294]:
train_dataset = ConcatenatedDataset(train_audio_features,train_embeddings,train_Ids,final_data_train,train_labels)

torch.Tensor

In [296]:
len(train_dataset)

3592

In [297]:
test_dataset = ConcatenatedDataset(test_audio_features,test_embeddings,test_ids,final_data_test,test_labels)

In [316]:
type(test_dataset[0][1])

torch.Tensor

# Model

In [417]:
def get_device():
    if torch.cuda.is_available():
        dev = 'cuda:0'
    elif torch.backends.mps.is_available():
        dev = 'mps:0'
    else:
        dev = 'cpu'
    device = torch.device(dev)
    return device

In [418]:
class LSTM(nn.Module):
    def __init__(self,input_size, hidden_size, output_size) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        device = get_device()
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(1, x.size(0), self.hidden_size).to(device)

     
        out, _ = self.lstm(x, (h0, c0))

     
        out = self.fc(out[:, -1, :])

        return out


In [419]:
model_lstm = LSTM(14508,20,4)

In [420]:
def collate_function(batch):
    (xx, yy) = zip(*batch)
    xx_pad = nn.utils.rnn.pad_sequence(xx, batch_first=True, padding_value=0)
    yy_pad = torch.tensor(yy, dtype=torch.long)

    return xx_pad,yy_pad

In [421]:
train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True,collate_fn=collate_function)
test_dataloader = DataLoader(test_dataset,batch_size=32,shuffle=True,collate_fn=collate_function)

In [404]:
test_dataset[1][0]

tensor([[ 0.0791, -0.0050,  0.1118,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1050, -0.0330,  0.1240,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1973,  0.2500,  0.0522,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.1030, -0.1523,  0.0259,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1387, -0.0918,  0.0349,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1270,  0.0220,  0.2871,  ...,  0.0000,  0.0000,  0.0000]])

In [405]:
for x,y in test_dataloader:
    print(x)
    print(y)
    break

tensor([[[ 0.0679,  0.1953,  0.1206,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.1318,  0.0835, -0.0659,  ...,  0.0000,  0.0000,  0.0000],
         [-0.1387,  0.1260,  0.3262,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0571,  0.1621,  0.1177,  ...,  0.0000,  0.0000,  0.0000],
         [-0.1045,  0.0030, -0.1011,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0986,  0.0547,  0.1836,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0864,  0.0101,  0.0579,  ...,  0

In [416]:
def get_device():
    if torch.cuda.is_available():
        dev = 'cuda:0'
    elif torch.backends.mps.is_available():
        dev = 'mps:0'
    else:
        dev = 'cpu'
    device = torch.device(dev)
    return device

In [425]:
def calculate_accuracy(model,loader):
    device = get_device()

    model_device = model.to(device)
    with torch.no_grad():
        n_samples = 0
        n_correct  = 0
        for x,y in loader:

            x_device = x.to(device)
            y_device = y.to(device)

            ypred = model_device(x_device)
            _, ypred_labels = torch.max(ypred,1)
            n_correct += (ypred_labels == y_device).sum().item()
            n_samples+= x.shape[0]


        accuracy = (100*n_correct)/n_samples
        return accuracy


def train_loop(model,train_dataloader,test_dataloader,epochs,loss_fn,optimizer,scheduler,model_name:str,comment:str):
    device = get_device()
    model_device = model.to(device)
    max_accuracy = calculate_accuracy(model,test_dataloader)

    # log_dir = "runs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # writer = SummaryWriter(log_dir=log_dir,comment=comment)

    for epoch in range(epochs):
        t = tqdm.tqdm(train_dataloader)
        
        total_loss = 0
        index = 0
        for x,y in t:
            x_device = x.to(device)
            y_device = y.to(device)
            predictions = model_device(x_device)
            loss = loss_fn(predictions,y_device)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss+=loss.item()
            
            t.set_description(f"Epoch: {epoch+1}/{epochs} Loss: {total_loss}")

            index+=1

            if index == len(train_dataloader)-1:
                accuracy = calculate_accuracy(model,test_dataloader)
                # writer.add_scalar(f"{model_name}/Loss",total_loss,epoch)
                # writer.add_scalar(f"{model_name}/Accuracy",accuracy,epoch)
                if accuracy > max_accuracy:
                    torch.save(model.state_dict(), f"{model_name}_{accuracy}.pt")
                    max_accuracy = accuracy
                t.set_postfix({"accuracy":accuracy})
        if scheduler:
            scheduler.step()


            


In [426]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_lstm.parameters(),1e-5)

In [427]:
train_loop(model_lstm,train_dataloader,test_dataloader,10,loss_fn,optimizer,None,"concatenated_input_model","Testing")

Epoch: 1/10 Loss: 153.0138509273529: 100%|██████████| 113/113 [01:00<00:00,  1.88it/s, accuracy=35.7] 
Epoch: 2/10 Loss: 153.04687309265137: 100%|██████████| 113/113 [00:58<00:00,  1.93it/s, accuracy=35.5]
Epoch: 3/10 Loss: 152.80183935165405: 100%|██████████| 113/113 [00:58<00:00,  1.93it/s, accuracy=36]
Epoch: 4/10 Loss: 152.46134078502655: 100%|██████████| 113/113 [00:58<00:00,  1.95it/s, accuracy=34.7]
Epoch: 5/10 Loss: 152.41316485404968: 100%|██████████| 113/113 [00:58<00:00,  1.94it/s, accuracy=36.1]
Epoch: 6/10 Loss: 152.37616884708405: 100%|██████████| 113/113 [00:58<00:00,  1.93it/s, accuracy=35.6]
Epoch: 7/10 Loss: 152.19992625713348: 100%|██████████| 113/113 [00:59<00:00,  1.89it/s, accuracy=35.5]
Epoch: 8/10 Loss: 152.22015273571014: 100%|██████████| 113/113 [00:58<00:00,  1.93it/s, accuracy=35.9]
Epoch: 9/10 Loss: 152.1271027326584: 100%|██████████| 113/113 [00:58<00:00,  1.92it/s, accuracy=36.2] 
Epoch: 10/10 Loss: 152.00944185256958: 100%|██████████| 113/113 [00:58<00:0