In [1]:
import subprocess
import os
import glob
import numpy as np
import pandas as pd
from time import process_time as ptime
import time
from datetime import timedelta
import librosa

In [2]:
from tqdm import tqdm

# PyTorch libraries and modules
import torch
from torch.autograd import Variable
from torch.nn import Linear, ReLU, CrossEntropyLoss
from torch.nn import Sequential, Conv2d, MaxPool2d, Module
from torch.nn import Softmax, BatchNorm2d, Dropout, Flatten
from torch.optim import Adam

In [3]:
# ffmpeg binary
ffmpeg = "/usr/bin/ffmpeg"

# text directory
text_dir = "../data/raw/text"
labels_dir = "../data/raw/labels"

# labels file
labels_file = "../data/interim/labels/labels.csv"

# audio directories
audio_dir = "../data/raw/audio"
audio_out_dir = "../data/interim/audio"
audio_features_dir = "../data/processed/audio"

In [4]:
def load_all_ratings(labels_dir, split_id_clip = False):
    
    label_files = glob.glob(f"{labels_dir}/*.csv")

    df_list = []

    for filename in label_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        df_list.append(df)


    df_labels = pd.concat(df_list, axis=0, ignore_index=True)

    label_cols = ['Input.VIDEO_ID', 'Input.CLIP',
              'Answer.anger', 'Answer.disgust',
              'Answer.fear', 'Answer.happiness',
              'Answer.sadness', 'Answer.surprise',
              'Answer.sentiment']

    label_new_cols = ['id', 'clip',
                      'anger', 'disgust',
                      'fear', 'happiness',
                      'sadness', 'surprise',
                      'sentiment']
    df_labels = df_labels[label_cols]
    df_labels.columns = label_new_cols

    # drop row all nan
    isna_idx = \
        df_labels.index[df_labels[df_labels.columns[2:]].isna().all(axis=1)]
    df_labels.drop(index=isna_idx, inplace=True)
    # replace remaining nan's with 0
    df_labels = df_labels.replace({np.nan : 0})
    # convert ratings to int
    df_labels[label_new_cols[2:]] = df_labels[label_new_cols[2:]].astype('Int64')
    # set emotions to 0 or 1
    df_labels[label_new_cols[2:-1]] = \
        df_labels[label_new_cols[2:-1]].applymap(lambda x : 1 if x > 0 else 0)

    # if sentiment > 0 convert to positive = 1, elif < 0 convert to negative = 1
    # if none of emotion or sentiment == 1, set none to 1

    df_labels['positive'] = \
        df_labels['sentiment'].map(lambda x : 1 if x > 0 else 0)
    df_labels['negative'] = \
        df_labels['sentiment'].map(lambda x : 1 if x < 0 else 0)

    # drop sentiment column (now in positive/negative)
    df_labels.drop(columns='sentiment', inplace=True)
    
    df_labels['none'] = 0
    none_idx = \
        df_labels[df_labels[df_labels.columns[2:]].sum(axis=1) == 0].index

    df_labels.loc[none_idx,'none'] = 1

    label_new_cols = ['id', 'clip',
                      'anger', 'disgust',
                      'fear', 'happiness',
                      'sadness', 'surprise',
                      'sentiment']
    # df_labels = \
        # df_labels.
    
    # remove '/' from id's
    df_labels['id'] = df_labels['id'].map(lambda x : str(x).split("/")[-1])
    if not split_id_clip:
        df_labels['id'] = df_labels['id'] + '_' + df_labels['clip'].astype(str)
        df_labels.drop(columns = 'clip', inplace = True)
        
        label_new_cols = ['id',
                          'none', 'positive', 'negative',
                          'anger', 'disgust',
                          'fear', 'happiness',
                          'sadness', 'surprise'
                         ]
    else:
        label_new_cols = ['id', 'clip',
                          'none', 'positive', 'negative',
                          'anger', 'disgust',
                          'fear', 'happiness',
                          'sadness', 'surprise'
                         ]
        
    df_labels = df_labels[label_new_cols]
    return df_labels

In [6]:
# def get_all_rated_clips_ids(labels_dir):
#    ratings = load_all_ratings(labels_dir)

In [5]:
def aggregate_ratings(ratings):
    '''
        aggregate labels to 1 if 2+ ratings of 3 aggree
    '''
    grp_labels = ratings.groupby('id').sum()
    display(grp_labels.head(5))
    grp_labels = grp_labels.applymap(lambda x : 1 if x > 1 else 0)
    display(grp_labels.head(5))
    # drop rows where all == 0
    idx = grp_labels[grp_labels.sum(axis =1) == 0].index
    grp_labels.drop(index = idx, inplace=True)
    print(f"{len(idx)} rows dropped")
    print(f"{grp_labels.shape[0]} grouped labels")
    return grp_labels

In [6]:
def gen_mfcc_spectro(audio_file, len_secs = 2,
                     fmax=8000, hop_len = 1024, end_pad_secs=0.1):
    full_samples, srate = \
        librosa.load(audio_file, sr=None)
    # standardize
    # full_samples = (full_samples - full_samples.mean()) / full_samples.std()
    
    num_samples = srate * len_secs
    end_pad = int(srate * end_pad_secs)
    
    if len(full_samples) < num_samples:
        zero_pad = np.zeros(num_samples - len(full_samples))
        samples = np.concatenate((zero_pad, full_samples), axis=0)
    elif len(full_samples) > num_samples + end_pad:
        samples = full_samples[len(full_samples) - num_samples - end_pad:-end_pad]
    else:
        # print("else")
        samples = full_samples[-num_samples:]
    # print(len(samples))
    S = librosa.feature.melspectrogram(y=samples,
                                   sr=srate, n_mels=64, #128,
                                   fmax=fmax, hop_length=hop_len)
    mfcc_spectro = librosa.feature.mfcc(S=librosa.power_to_db(S), n_mfcc=20)#,
                                    #hop_length=1024)#, htk=True)
    #mfcc_spectro = sklearn.preprocessing.scale(mfcc_spectro, axis=1)
    # plt.figure(figsize=(10,8))
    # librosa.display.specshow(mfcc_spectro[:, :], sr=test_sr / (hop_len/512),
    #                         x_axis='time')
    #plt.show()
    # print(mfcc_spectro.shape)
    return mfcc_spectro

In [7]:
def extract_mfcc_spectros_from_dir(audio_dir, file_names=None,
                                    len_secs=3, show_progress=True):
    not_files = []
    audio_features = {}
    stime = time.time()# ptime()
    cnt = 0
    if file_names is None:
        file_names = glob.glob(f"{audio_dir}/*.wav")
        # print(file_names[:5])
        # file_names = (os.listdir(audio_dir))
    else:
        file_names = [audio_dir + '/' + f + '.wav' for f in file_names]
    num_files = len(file_names) # len(os.listdir(audio_dir))
    # for i, f in enumerate(os.listdir(audio_dir)):
    mfcc_spectros = []
    clip_ids = []
    for i, f in enumerate(file_names):
        # print(f)
        clip_id = f.rsplit('.', maxsplit = 1)[0].rsplit('/', maxsplit = 1)[-1]
        # try:
        clip_mfcc_spectro = \
                    gen_mfcc_spectro(f, len_secs = len_secs,
                                     fmax=8000, hop_len = 1024, #512,
                                     end_pad_secs=0.1)
        clip_ids.append(clip_id)
        mfcc_spectros.append(clip_mfcc_spectro)
        #except:
        #    print(f"error : {clip_id}")
        
        
        if show_progress:
            if i % 10 == 0 and i != 0:
                print('.', end = '')
                if i % 500 == 0:
                    print(f" {i} de {num_files} fichiers")

    mfcc_spectros = np.array(mfcc_spectros)
    etime = time.time() # ptime()
    proc_time = timedelta(seconds = round(etime - stime))
    print(f"\n{mfcc_spectros.shape[0]} fichiers extraits: {proc_time} (h:mm:ss)")
    return mfcc_spectros, np.array(clip_ids)

# CNN model

In [29]:
class Net(Module):   
    def __init__(self):
        super(Net, self).__init__()

        self.cnn_layers = Sequential(
            # Defining a 2D convolution layer
            Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
            BatchNorm2d(4),
            ReLU(inplace=True),
            MaxPool2d(kernel_size=2, stride=2),
            # Defining another 2D convolution layer
            Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
            BatchNorm2d(4),
            ReLU(inplace=True),
            MaxPool2d(kernel_size=2, stride=2),
            Flatten()
        )

        n_channels = self.cnn_layers(torch.empty(1, 1, 20, 63)).size(-1)
        self.linear_layers = Sequential(
            Linear(n_channels, 300),
            Linear(300, 3)
            # Linear(4 * 7 * 7, 10)
        )

    # Defining the forward pass    
    def forward(self, x):
        x = self.cnn_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x

In [32]:
# defining the model
model = Net()
# defining the optimizer
optimizer = Adam(model.parameters(), lr=0.07)
# defining the loss function
criterion = CrossEntropyLoss()
# checking if GPU is available
#if torch.cuda.is_available():
#    model = model.cuda()
#    criterion = criterion.cuda()
    
print(model)

Net(
  (cnn_layers): Sequential(
    (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Flatten(start_dim=1, end_dim=-1)
  )
  (linear_layers): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): Linear(in_features=300, out_features=3, bias=True)
  )
)


In [33]:
del model

In [13]:
def train(epoch, Xtrain, ytrain, Xtest, ytest):
    model.train()
    tr_loss = 0
    # getting the training set
    x_train, y_train = Variable(Xtrain), Variable(ytrain)
    # x_train, y_train = Xtrain, ytrain
    # getting the validation set
    x_val, y_val = Variable(Xtest), Variable(ytest)
    # x_val, y_val = Xtest, ytest
    # converting the data into GPU format
    #if torch.cuda.is_available():
    #    x_train = x_train.cuda()
    #    y_train = y_train.cuda()
    #    x_val = x_val.cuda()
    #    y_val = y_val.cuda()

    # clearing the Gradients of the model parameters
    optimizer.zero_grad()
    
    # prediction for training and validation set
    output_train = model(x_train.float())
    output_val = model(x_val.float())

    # computing the training and validation loss
    loss_train = criterion(output_train, y_train)
    loss_val = criterion(output_val, y_val)
    train_losses.append(loss_train)
    val_losses.append(loss_val)

    # computing the updated weights of all the model parameters
    loss_train.backward()
    optimizer.step()
    tr_loss = loss_train.item()
    if epoch%2 == 0:
        # printing the validation loss
        print('Epoch : ',epoch+1, '\t', 'loss :', loss_val)

In [31]:
# defining the number of epochs
n_epochs = 15
# empty list to store training losses
train_losses = []
# empty list to store validation losses
val_losses = []

stime = time.time()
# training the model
for epoch in range(n_epochs):
    train(epoch, Xtrain, ytrain, Xtest, ytest)
print(f"Train time: {timedelta(seconds = round(time.time() - stime))}")

Epoch :  1 	 loss : tensor(1.1178, grad_fn=<NllLossBackward0>)
Epoch :  3 	 loss : tensor(124.2669, grad_fn=<NllLossBackward0>)
Epoch :  5 	 loss : tensor(104.0745, grad_fn=<NllLossBackward0>)
Epoch :  7 	 loss : tensor(35.5201, grad_fn=<NllLossBackward0>)
Epoch :  9 	 loss : tensor(25.4375, grad_fn=<NllLossBackward0>)
Epoch :  11 	 loss : tensor(10.8557, grad_fn=<NllLossBackward0>)
Epoch :  13 	 loss : tensor(8.5423, grad_fn=<NllLossBackward0>)
Epoch :  15 	 loss : tensor(4.9201, grad_fn=<NllLossBackward0>)
Train time: 0:00:36


In [None]:
from sklearn.metrics import accuracy_score
with torch.no_grad():
    output = model(Variable(Xtrain).float())
    
softmax = torch.exp(output).cpu()
prob = list(softmax.numpy())
predictions = np.argmax(prob, axis=1)

# accuracy on training set
accuracy_score(Variable(ytrain), predictions)

In [38]:
Xtrain.shape

torch.Size([14450, 1, 20, 63])

# Load sentiment labels

In [14]:
all_ratings = load_all_ratings(labels_dir)
print(all_ratings.shape)
print(all_ratings.columns.tolist())

(70764, 10)
['id', 'none', 'positive', 'negative', 'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']


In [15]:
mask_positive = all_ratings['positive'] == 1
mask_negative = all_ratings['negative'] == 1
mask_none = all_ratings['none'] == 1
only_sentiment_ratings = \
    all_ratings[(mask_positive) | (mask_negative) | (mask_none)]\
        [['id','none','positive','negative']].copy()
only_sentiment_ratings.shape

(60743, 4)

In [16]:
sentiment_labels = aggregate_ratings(only_sentiment_ratings)

Unnamed: 0_level_0,none,positive,negative
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
--qXJuDtHPw_5,1,2,0
-3g5yACwYnA_10,0,2,0
-3g5yACwYnA_13,1,2,0
-3g5yACwYnA_2,1,0,0
-3g5yACwYnA_3,2,0,0


Unnamed: 0_level_0,none,positive,negative
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
--qXJuDtHPw_5,0,1,0
-3g5yACwYnA_10,0,1,0
-3g5yACwYnA_13,0,1,0
-3g5yACwYnA_2,0,0,0
-3g5yACwYnA_3,1,0,0


5062 rows dropped
18319 grouped labels


In [17]:
audio_clip_names = glob.glob(f"{audio_out_dir}/*.wav")
audio_clip_names = \
    [cn.rsplit('.', maxsplit = 1)[0].rsplit('/', maxsplit = 1)[-1] for \
         cn in audio_clip_names]
audio_clip_names[:5]

['MPRqaQqrd9Y_7', 'UlTJmndbGHM_4', 'hjBQmIWiWgw_2', '9zWeMrfr-l0_0', '31197_2']

In [18]:
print(len(audio_clip_names))

23259


In [19]:
labeled_clips_no_audio = []
for idx in sentiment_labels.index:
    if idx not in audio_clip_names:
        labeled_clips_no_audio.append(idx)

In [20]:
print(len(labeled_clips_no_audio))

256


In [21]:
sentiment_labels.drop(index=labeled_clips_no_audio, inplace=True)
print(sentiment_labels.shape)

(18063, 3)


In [22]:
mfcc_4sec_spec_sent, _ = \
    extract_mfcc_spectros_from_dir(audio_out_dir,
                                   file_names=sentiment_labels.index.tolist(),
                                    len_secs=4, show_progress=True)

.................................................. 500 de 18063 fichiers
.................................................. 1000 de 18063 fichiers
.................................................. 1500 de 18063 fichiers
.................................................. 2000 de 18063 fichiers
.................................................. 2500 de 18063 fichiers
.................................................. 3000 de 18063 fichiers
.................................................. 3500 de 18063 fichiers
.................................................. 4000 de 18063 fichiers
.................................................. 4500 de 18063 fichiers
.................................................. 5000 de 18063 fichiers
.................................................. 5500 de 18063 fichiers
.................................................. 6000 de 18063 fichiers
.................................................. 6500 de 18063 fichiers
.......................................

In [24]:
labels = sentiment_labels.copy()

In [25]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = \
    train_test_split(mfcc_4sec_spec_sent, labels, test_size=0.2,
                     stratify=labels.values.argmax(axis=1))
print(Xtrain.shape)
print(Xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(14450, 20, 63)
(3613, 20, 63)
(14450, 3)
(3613, 3)


# Reshape for pytorch

In [26]:
Xtrain = \
    Xtrain.reshape(Xtrain.shape[0], 1, Xtrain.shape[1], Xtrain.shape[2]).astype(float)
Xtrain = torch.from_numpy(Xtrain)

Xtest = \
    Xtest.reshape(Xtest.shape[0], 1, Xtest.shape[1], Xtest.shape[2]).astype(float)
Xtest = torch.from_numpy(Xtest)

# converting the target into torch format
ytrain = ytrain.values.argmax(axis=1).astype(int)
ytrain = torch.from_numpy(ytrain)

ytest = ytest.values.argmax(axis=1).astype(int)
ytest = torch.from_numpy(ytest)


# shape of training data
print(Xtrain.shape)
print(Xtest.shape)
print(ytrain.shape)
print(ytest.shape)

torch.Size([14450, 1, 20, 63])
torch.Size([3613, 1, 20, 63])
torch.Size([14450])
torch.Size([3613])
