In [1]:
# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torchvision
import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst.dl import SupervisedRunner
from catalyst.dl.utils import set_global_seed, prepare_cudnn
from catalyst.dl.callbacks import AccuracyCallback, AUCCallback, PrecisionRecallF1ScoreCallback, VerboseLogger

# Other  
from tqdm import tqdm, tqdm_pandas
import scipy
from scipy.stats import skew
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import sys
import IPython.display as ipd  # To play sound in the notebook
import warnings
# Silence every Python warning for the rest of the session, unless the
# interpreter was started with explicit -W options (sys.warnoptions non-empty),
# in which case the user's own warning configuration is left alone.
if not sys.warnoptions:
    warnings.simplefilter("ignore")


[1mAn import was requested from a module that has moved location.
Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m


[1mAn import was requested from a module that has moved location.
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m



In [2]:
# Reference table of the audio clips: the displayed columns are the label,
# the source corpus and the path to each file.
DATA_PATH_CSV = "../data/processed/Data_path.csv"
ref = pd.read_csv(DATA_PATH_CSV)
ref.head()

Unnamed: 0,labels,source,path
0,male_disgust,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
1,male_disgust,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
2,male_sad,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
3,male_neutral,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
4,male_fear,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...


In [3]:
'''
1. Data Augmentation method   
'''
def speedNpitch(data, low=0.8, high=1.0, base_speed=1.2):
    """
    Speed and Pitch Tuning.

    Resamples a 1-D signal by linear interpolation at a randomly chosen
    speed factor and returns a signal of the same length as the input.
    When the resampled signal is shorter than the input the tail is
    zero-filled; when it is longer it is truncated.

    Parameters
    ----------
    data : 1-D array-like
        The audio samples. The input is NOT modified; a new array is returned.
    low, high : float
        Bounds of the uniform distribution the length-change ratio is drawn from.
    base_speed : float
        Numerator of the speed factor: ``speed_fac = base_speed / length_change``.
        With the defaults the factor lies between 1.2 and 1.5.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``data``.
    """
    n_samples = len(data)
    if n_samples == 0:
        # np.interp cannot resample an empty signal; there is nothing to augment.
        return np.array(data, copy=True)

    length_change = np.random.uniform(low=low, high=high)
    speed_fac = base_speed / length_change

    # Sample the original signal at positions spaced `speed_fac` apart.
    positions = np.arange(0, n_samples, speed_fac)
    tmp = np.interp(positions, np.arange(0, n_samples), data)

    # Write into a fresh zero buffer rather than zeroing and overwriting the
    # caller's array, so the un-augmented signal stays usable.
    minlen = min(n_samples, tmp.shape[0])
    out = np.zeros_like(data)
    out[:minlen] = tmp[:minlen]
    return out

'''
2. Extracting the MFCC feature as an image (Matrix format).  
'''
def prepare_data(df, n, aug, mfcc):
    """
    Load every audio file listed in ``df.path`` and turn it into a 2-D
    feature "image" (MFCC or log-mel spectrogram).

    Relies on the module-level globals ``sampling_rate`` and
    ``audio_duration``, which must be defined before calling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``path`` column with one audio file path per row.
    n : int
        Number of feature rows per clip: the number of MFCC coefficients when
        ``mfcc == 1``, otherwise the number of mel bands.
    aug : int
        ``1`` applies ``speedNpitch`` augmentation to each clip; any other
        value leaves the audio unchanged.
    mfcc : int
        ``1`` extracts MFCCs; any other value extracts a log-mel spectrogram.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), n, 216, 1)``. The 216 time frames are
        hard-coded; presumably they correspond to 2.5 s of audio at 44.1 kHz
        with librosa's default hop length -- TODO confirm before changing
        ``sampling_rate`` or ``audio_duration``.
    """
    # Slice bounds and pad widths must be integers, but
    # sampling_rate * audio_duration is a float for a fractional duration.
    input_length = int(sampling_rate * audio_duration)
    X = np.empty(shape=(df.shape[0], n, 216, 1))

    for idx, file_path in enumerate(tqdm(df.path)):
        data, _ = librosa.load(file_path, sr=sampling_rate
                               ,res_type="kaiser_fast"
                               ,duration=2.5
                               ,offset=0.5
                              )

        # Random offset / Padding so that every clip has exactly input_length samples
        n_loaded = len(data)
        if n_loaded > input_length:
            offset = np.random.randint(n_loaded - input_length)
            data = data[offset:offset + input_length]
        elif n_loaded < input_length:
            offset = np.random.randint(input_length - n_loaded)
            data = np.pad(data, (offset, input_length - n_loaded - offset), "constant")

        # Augmentation?
        if aug == 1:
            data = speedNpitch(data)

        # Which feature? `n` sizes both X and the extractor so the two cannot
        # disagree. Audio is passed by keyword (`y=`), which works on both old
        # and new librosa releases.
        if mfcc == 1:
            # MFCC extraction
            feature = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n)
        else:
            # Log-melspectrogram. The mel filter bank needs the true sampling
            # rate, and melspectrogram returns a power spectrogram by default,
            # so power_to_db is the matching dB conversion.
            melspec = librosa.feature.melspectrogram(y=data, sr=sampling_rate, n_mels=n)
            feature = librosa.power_to_db(melspec)

        # Add the trailing channel axis and store the clip.
        X[idx,] = np.expand_dims(feature, axis=-1)

    return X


In [4]:
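# One-off feature extraction, kept for reference: uncomment to rebuild the MFCC
# tensor from the raw audio instead of loading the cached copy in the next cell.
# sampling_rate=44100
# audio_duration=2.5
# n_mfcc = 30
# mfcc = prepare_data(ref, n = n_mfcc, aug = 0, mfcc = 1)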

In [5]:
# Load the pre-computed feature tensor from disk (presumably the output of
# prepare_data saved by an earlier run -- TODO confirm).
MFCC_CACHE_PATH = '../data/processed/MFCC_PREPARE.npy'
mfcc = np.load(MFCC_CACHE_PATH)

In [6]:
# Hold out 25% of the clips for evaluation; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    mfcc,
    ref.labels,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)


In [7]:
# Uncomment to cache freshly extracted features for the np.load cell above.
#np.save('../data/processed/MFCC_PREPARE.npy', mfcc)

In [8]:
# Encode the string labels as integer class indices. This is the target format
# torch.nn.CrossEntropyLoss expects, so no one-hot encoding is needed.
lb = LabelEncoder()
y_train = lb.fit_transform(y_train)
# Reuse the mapping learned on the training labels. Re-fitting on the test
# labels could assign different indices to the same class if the two splits do
# not contain exactly the same label set. transform() raises ValueError on a
# label unseen during fit, which surfaces that situation instead of hiding it.
y_test = lb.transform(y_test)

# Standardise the features with statistics computed on the training split only.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
# A position that is constant across the training set has std == 0; divide by 1
# there instead of producing NaN/inf.
std = np.where(std == 0, 1.0, std)

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

In [9]:
# Randomly initialised ResNet-50 with one output unit per distinct label in the
# dataset, so the head always matches the integer targets derived from
# ref.labels.
# NOTE(review): torchvision's ResNet stem expects 3-channel (N, C, H, W) input;
# confirm the layout of the cached feature tensor matches.
NUM_CLASSES = int(ref.labels.nunique())
model = torchvision.models.resnet50(pretrained=False, progress=True, num_classes=NUM_CLASSES)

In [10]:
# Mini-batch size shared by the training and validation DataLoaders.
BATCH_SIZE = 16

In [11]:
# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Wrap the NumPy splits as tensors. Features keep their NumPy dtype; targets
# are cast to int64 (long).
featuresTrain, featuresTest = (torch.from_numpy(split) for split in (X_train, X_test))
targetsTrain, targetsTest = (torch.from_numpy(labels).long() for labels in (y_train, y_test))

# Pair features with targets. The held-out test split is used as the
# validation set.
trainSet = TensorDataset(featuresTrain, targetsTrain)
validSet = TensorDataset(featuresTest, targetsTest)

# Data loaders: shuffle only the training data.
train_loader = DataLoader(trainSet, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(validSet, batch_size=BATCH_SIZE, shuffle=False)

In [12]:
# Multi-class classification: cross-entropy over the raw model outputs,
# optimised with Adam at a learning rate of 3e-4.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-4)

In [13]:
# Init catalyst components
runner = SupervisedRunner(device=device)
loaders = {'train': train_loader, 'valid': valid_loader}
logdir = '../data/logs/0'

# Take the class count from the model's classification head so the metric
# callbacks always agree with the network. The previous hard-coded value of 2
# was left over from a binary task.
# NOTE(review): AccuracyCallback appears to use num_classes to pick which top-k
# accuracies it reports, so extra accuracyNN columns may show up in the logs --
# confirm against the installed catalyst version.
num_classes = model.fc.out_features
callbacks = [
    AccuracyCallback(num_classes=num_classes),
    # AUCCallback(num_classes=num_classes),
    # VerboseLogger(),
    # PrecisionRecallF1ScoreCallback(num_classes=num_classes),
]

In [14]:
# Train for 20 epochs, writing logs and checkpoints under `logdir`.
# model.double() casts the network to float64 in place and returns it,
# presumably to match float64 feature tensors -- TODO confirm; float32 features
# with a float32 model would train considerably faster.
train_config = dict(
    model=model.double(),
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    callbacks=callbacks,
    logdir=logdir,
    num_epochs=20,
    verbose=2,
)
runner.train(**train_config)

1/20 * Epoch (train): 100% 571/571 [12:06<00:00,  1.27s/it, accuracy01=0.000e+00, loss=2.683]
1/20 * Epoch (valid): 100% 191/191 [00:41<00:00,  4.60it/s, accuracy01=0.000e+00, loss=1.963]
[2020-05-28 01:29:36,549] 
1/20 * Epoch 1 (_base): lr=0.0003 | momentum=0.9000
1/20 * Epoch 1 (train): accuracy01=0.2701 | loss=2.1673
1/20 * Epoch 1 (valid): accuracy01=0.3312 | loss=2.0148
2/20 * Epoch (train): 100% 571/571 [12:16<00:00,  1.29s/it, accuracy01=0.000e+00, loss=3.782]
2/20 * Epoch (valid): 100% 191/191 [00:43<00:00,  4.44it/s, accuracy01=0.000e+00, loss=2.316]
[2020-05-28 01:42:40,349] 
2/20 * Epoch 2 (_base): lr=0.0003 | momentum=0.9000
2/20 * Epoch 2 (train): accuracy01=0.4098 | loss=1.6950
2/20 * Epoch 2 (valid): accuracy01=0.4526 | loss=1.5966
3/20 * Epoch (train): 100% 571/571 [12:00<00:00,  1.26s/it, accuracy01=0.000e+00, loss=4.302]
3/20 * Epoch (valid): 100% 191/191 [00:43<00:00,  4.44it/s, accuracy01=1.000, loss=0.244]
[2020-05-28 01:55:26,435] 
3/20 * Epoch 3 (_base): lr=0.00