# AGENDA
- [ ] Try out other ppretrained models other than effinet
- [ ] Experiment with using less dropout on larger models
- [ ] data augmentation... and loading training images into google drive (after augmenttation). Augment audio(stretch, loudness, noise) and images(vertical, horizontal bars
- [ ] Possibly look into MFCCs again  
- [x] model ensembling -- done by Mandy


In [1]:
pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 8.8 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil as sh
import random as rng
import glob
# import itertools

import librosa as lb
from librosa.display import specshow
import IPython.display as ipd
import torch
from datasets import load_dataset
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

import sklearn as sk
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
def map_to_array(example):
    speech, _ = lb.load(example["file"], sr=16000, mono=True)
    example["speech"] = speech
    return example

# load a demo dataset and read audio files
dataset = load_dataset("anton-l/superb_demo", "er", split="session1")
dataset = dataset.map(map_to_array)

model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")

# compute attention masks and normalize the waveform if needed
inputs = feature_extractor(dataset[:4]["speech"], sampling_rate=16000, padding=True, return_tensors="pt")

logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)
labels = [model.config.id2label[_id] for _id in predicted_ids.tolist()]


In [None]:
#  --  Defining Variables  --  #

max_ms = 4000

batchs = 64
epochs = 20

ind_to_label = {
    0 : 'angry',
    1 : 'fear',
    2 : 'happy',
    3 : 'neutral',
    4 : 'sad'
}

label_to_ind = { 
    lab: ind for ind, lab in ind_to_label.items()
    }

In [None]:
'''
Data insights

'''

class aud_stats:
    @staticmethod
    def average_sr():
        pass


In [None]:
'''

PREPROCESSING UTILS


'''


class aud_util:
    
    @staticmethod
    def loadaud(audio_file_path, sr=None, mono=False):                                 # load audio file, *mono argument (bool) can auto convert to mono, while default sr is converted to 22050*
        return lb.load(audio_file_path, sr=sr, mono=mono)                              # returns (data, sr)       

    @staticmethod
    def pad_trunc(aud, sr, target_ms):                                                 # padding places shorter audio randomly within the time frame of the padded length
        maxlen = (target_ms//1000)*sr
        
        if len(aud) == maxlen:
            return aud, sr

        elif len(aud) > maxlen:
            return aud[:maxlen], sr

        elif len(aud) < maxlen:
            pad = maxlen - len(aud)
            pad = np.zeros((pad))
            return np.concatenate((aud, pad), 0), sr



class aud_img:
    @staticmethod
    def melspec(data, sr):                                                             # returns 3 channels, deplicated from 1
        spec = lb.feature.melspectrogram(data, sr=sr, power=1)                         # power = 1/2 changes amplitude_to_db or power_to_db
        spec = lb.amplitude_to_db(spec, ref=np.min)
        spec = np.expand_dims(spec, axis=2)
        return np.stack((spec,)*3, axis=2).squeeze()

    @staticmethod
    def mfcc(data, sr):                                                                # returns 3 channels, deplicated from 1
        mfcc_ = lb.feature.mfcc(data, sr)
        #mfcc_ = sk.preprocessing.scale(mfcc_, axis=1)
        mfcc_ = np.expand_dims(mfcc_, axis=2)
        return np.stack((mfcc_,)*3, axis=2).squeeze()

    @staticmethod
    def display_audio_img(spec, sr , mfcc=False):
        fig, ax = plt.subplots()
        
        if mfcc:
            specshow(spec, sr=sr, x_axis='time')
        else:
            img = specshow(spec, x_axis='time', y_axis='mel', sr=sr, fmax=8000, ax=ax)
            fig.colorbar(img, ax=ax, format='%+2.0f dB')


# class rav_prep:
#     '''
#     01 = neutral, 
#     02 = calm,  -
#     03 = happy, 
#     04 = sad, 
#     05 = angry, 
#     06 = fearful, 
#     07 = disgust,  -
#     08 = surprised -
#     '''
#     @staticmethod
#     def correct_data_type(path):
#         if (path.split('/')[-1].split('-')[0] == '03') and (path.split('/')[-1].split('-')[1] == '01') and (path.split('/')[-1].split('-')[2] in ['01', '03', '04', '05', '06']):
#           return True
#         else:
#           return False
    
#     @staticmethod
#     def filter(path):
#       counter = 0
#       for i in glob.glob(path):
#         if rav_prep.correct_data_type(i):
#           continue
#         elif rav_prep.correct_data_type(i) != True:
#           sh.move(i, '/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/A_removed_files')
#           counter += 1
#           continue
#       print(f'removed {counter} files')

    # @staticmethod
    # def move_ravdess_colab(path):                                               # colab google drive paths
    #   for i in glob.glob(path):
    #     if i.split('/')[-1].split('-')[2] == '05':
    #       sh.copy(i, '/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/angry')
        
    #     elif i.split('/')[-1].split('-')[2] == '06':
    #       sh.copy(i, '/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/fear')

    #     elif i.split('/')[-1].split('-')[2] == '03':
    #       sh.copy(i, '/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/happy')

    #     elif i.split('/')[-1].split('-')[2] == '01':
    #       sh.copy(i, '/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/neutral')

    #     elif i.split('/')[-1].split('-')[2] == '04':
    #       sh.copy(i, '/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/sad')


class ds_create:
    
    @staticmethod    
    def label_from_bpath(bpath):                                                       # probably will not be used
        return bpath.decode('utf-8').split('\\')[-2]

    @staticmethod
    def slices_for_onelabel(path, label):                                              
        paths = glob.glob(path + label + '/*wav')

        labels = [label_to_ind[label]]*len(paths)

        return paths , labels

    @staticmethod
    def dfpremel(path):
        data, sr = aud_util.loadaud(path, sr=16000, mono=True)
        data, sr = aud_util.pad_trunc(data, sr, max_ms)                                
        mel = aud_img.melspec(data, sr)
        return mel
    
    @staticmethod
    def dfpremfcc(path):
        data, sr = aud_util.loadaud(path, sr=16000, mono=True)
        data, sr = aud_util.pad_trunc(data, sr, max_ms)                                
        mel = aud_img.mfcc(data, sr)
        return mel



In [None]:
# '''
# organising ravdess data 
# Done once only, by the time you see this cell, it probably was already run, so you can ignore it 
# as all the revdess files have already been organised into the sub-emotion folder in the google drive, in
# /content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data

# '''

# rav_prep.filter('/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/Altogether/*.wav')
# rav_prep.move_ravdess_colab('/content/drive/MyDrive/NLP/RAVDESS_altogether/modified_data/Altogether/*.wav')


removed 0 files


In [None]:
'''
BUILDING DATASET PIPELINE (both original data and ravdess)

'_o' means original data, excluding any extra data

 - colab, (btw doing this on a windows machine will break completely because of their stupid backward slash)
'''

angry_o, _0 =   ds_create.slices_for_onelabel(r'Data/NLP Training Dataset/', 'angry')
fear_o, _1 =    ds_create.slices_for_onelabel(r'Data/NLP Training Dataset/', 'fear')
happy_o, _2 =   ds_create.slices_for_onelabel(r'Data/NLP Training Dataset/', 'happy')
neutral_o, _3 = ds_create.slices_for_onelabel(r'Data/NLP Training Dataset/', 'neutral')
sad_o, _4 =     ds_create.slices_for_onelabel(r'Data/NLP Training Dataset/', 'sad')

angry_r, r_0 =   ds_create.slices_for_onelabel(r'Data/RAVDESS Dataset Sorted/', 'angry')
fear_r, r_1 =    ds_create.slices_for_onelabel(r'Data/RAVDESS Dataset Sorted/', 'fear')
happy_r, r_2 =   ds_create.slices_for_onelabel(r'Data/RAVDESS Dataset Sorted/', 'happy')
neutral_r, r_3 = ds_create.slices_for_onelabel(r'Data/RAVDESS Dataset Sorted/', 'neutral')
sad_r, r_4 =     ds_create.slices_for_onelabel(r'Data/RAVDESS Dataset Sorted/', 'sad')


slices = angry_o + fear_o + happy_o + neutral_o + sad_o + angry_r + fear_r + happy_r + neutral_r + sad_r
labels = _0 + _1 + _2 + _3 + _4 + r_0 + r_1 + r_2 + r_3 + r_4



In [None]:
'''
Dataframe style

using tf.stack later lol
'''


df = pd.DataFrame()

df['relative_audio_paths'] = slices
df['int_labels'] = labels
df['1hot_labels'] = list(to_categorical(labels))

df['imgs_3c'] = list(map(ds_create.dfpremel, slices))


df = sk.utils.shuffle(df)
df.reset_index(inplace=True, drop=True)

In [None]:
df.head()

Unnamed: 0,relative_audio_paths,int_labels,1hot_labels,imgs_3c
0,/content/drive/MyDrive/NLP/NLP Training Datase...,4,"[0.0, 0.0, 0.0, 0.0, 1.0]","[[[38.78178342333567, 38.78178342333567, 38.78..."
1,/content/drive/MyDrive/NLP/NLP Training Datase...,3,"[0.0, 0.0, 0.0, 1.0, 0.0]","[[[42.12918, 42.12918, 42.12918], [42.12918, 4..."
2,/content/drive/MyDrive/NLP/NLP Training Datase...,2,"[0.0, 0.0, 1.0, 0.0, 0.0]","[[[72.87353194359729, 72.87353194359729, 72.87..."
3,/content/drive/MyDrive/NLP/NLP Training Datase...,4,"[0.0, 0.0, 0.0, 0.0, 1.0]","[[[57.963776, 57.963776, 57.963776], [57.84632..."
4,/content/drive/MyDrive/NLP/NLP Training Datase...,0,"[1.0, 0.0, 0.0, 0.0, 0.0]","[[[49.65379750655458, 49.65379750655458, 49.65..."


In [None]:
df.iloc[0,3].shape

(128, 126, 3)

#original baseline model

In [None]:
input_shape = df.iloc[0,3].shape
input_shape

(128, 126, 3)

In [None]:
'''
m model
'''
xin = Input(input_shape)

prenet = tf.keras.applications.efficientnet_v2.EfficientNetV2M(weights='imagenet', include_top=False)#, input_shape=input_shape)
x = prenet(xin)

x = Flatten()(x)
x = Dense(256, activation='swish')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='swish')(x)


xout = Dense(5, activation='softmax')(x)

ownm = Model(xin, xout)
ownm.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2), metrics=['acc', tfa.metrics.F1Score(num_classes=5, average='weighted', threshold=0.5)])
ownm.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 126, 3)]     0         
                                                                 
 efficientnetv2-m (Functiona  (None, None, None, 1280)  53150388 
 l)                                                              
                                                                 
 flatten (Flatten)           (None, 20480)             0         
                                                                 
 dense (Dense)               (None, 256)               5243136   
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                             

In [None]:
'''
s model
'''
xin = Input(input_shape)

prenet = tf.keras.applications.efficientnet_v2.EfficientNetV2S(weights='imagenet', include_top=False)#, input_shape=input_shape)
x = prenet(xin)

x = Flatten()(x)
x = Dense(256, activation='swish')(x)
x = Dense(128, activation='swish')(x)


xout = Dense(5, activation='softmax')(x)

own = Model(xin, xout)
own.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2), metrics=['acc', tfa.metrics.F1Score(num_classes=5, average='weighted', threshold=0.5)])
own.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 126, 3)]     0         
                                                                 
 efficientnetv2-s (Functiona  (None, None, None, 1280)  20331360 
 l)                                                              
                                                                 
 flatten (Flatten)           (None, 20480)             0         
                                                                 
 dense (Dense)               (None, 256)               5243136   
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 5)                 645       
                                                             

In [None]:
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.1, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(r'/content/drive/MyDrive/NLP/Zhihaos stuff/effinet_v2m', monitor='val_loss', verbose=0, save_best_only=True)
]

In [None]:
history = ownm.fit(
    x=tf.stack(df['imgs_3c']),
    y=tf.stack(df['1hot_labels']),
    batch_size=batchs,
    epochs=epochs,
    callbacks=callbacks,
    validation_split=0.2,
)
ownm.save(r'/content/drive/MyDrive/NLP/Saved Models/modelM_1')

In [None]:
'''
Ensemble Learning modelm_5 left to be trained
'''

for count in range(5,6): # already ran 1-3 and saved models
    
    xin = Input(input_shape)

    prenet = tf.keras.applications.efficientnet_v2.EfficientNetV2S(weights='imagenet', include_top=False)#, input_shape=input_shape)
    x = prenet(xin)

    x = Flatten()(x)
    x = Dense(256, activation='swish')(x)
    x = Dense(128, activation='swish')(x)


    xout = Dense(5, activation='softmax')(x)

    own = Model(xin, xout)
    own.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2), metrics=['acc', tfa.metrics.F1Score(num_classes=5, average='weighted', threshold=0.5)])
    
    callbacks_en = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=3, factor=0.1, verbose=1),
    # tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(r'/content/drive/MyDrive/NLP/Saved Models/Model Checkpoints', monitor='loss', verbose=0, save_best_only=True)
    ]


    history = own.fit(
        x=tf.stack(df['imgs_3c']),
        y=tf.stack(df['1hot_labels']),
        batch_size=batchs,
        epochs=epochs, 
        #validation_split=0.2,
        callbacks=callbacks_en,
    )   
    own.save(r'/content/drive/MyDrive/NLP/Saved Models/model2_' + str(count))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-s_notop.h5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: /content/drive/MyDrive/NLP/Saved Models/model2_5/assets


In [None]:
'''
Ensemble Learning effinet v2s
'''
model_1 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model_1') # val_f1_score is 0.8869
model_2 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model_2') # 0.8894
model_3 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model_3') # 0.8956
model_4 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model_4') # 0.8736
model_5 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model_5') # 0.8845

In [None]:
'''
Ensemble Learning effinet v2s no val_split
'''
modelm_1 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model2_1')
modelm_2 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model2_2')
modelm_3 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model2_3') 
modelm_4 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model2_4')
modelm_5 = load_model(r'/content/drive/MyDrive/NLP/Saved Models/model2_5')

# Generating the test preds


In [None]:
tst = ds_create.dfpremel(r'/content/drive/MyDrive/NLP/NLP Training Dataset/ASR Training Dataset/fear/00530e07e3.wav')
tst = ds_create.dup_channel(tst)
tst = np.expand_dims(tst, axis=0)                                             # EXPAND DIMS OF FIRST DIMENSION ARGHHHHHH
pred = own.predict(tst)
pred = np.argmax(pred)
pred

In [None]:
own.load_weights(r'/content/drive/MyDrive/NLP/Zhihaos stuff/effinet_v2s_nodrop')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f1eebd177d0>

In [None]:
own.evaluate(
    x=tf.stack(df['imgs_3c']),
    y=tf.stack(df['1hot_labels'])
    )



[0.6958815455436707, 0.9793767333030701, 0.9794116616249084]

In [None]:
'''
Generating the qualifying csv file

'''


class test_gen:
    @staticmethod
    def path_gen(path):                                              #for zhihao's local pc
        paths = os.listdir(path)
        paths = list(map(lambda x : 'DATA_NLP_TIL\\.qualifying_test\\'+x , paths))

        return paths 

    @staticmethod
    def path_gen_colab(path):                                              #for colab, zhihaos
        paths = os.listdir(path)
        paths = list(map(lambda x : '/content/drive/MyDrive/NLP/NLP Interim Dataset/NLP/'+x , paths))

        return paths 

    @staticmethod
    def path_to_mel(path):
        ccc = ds_create.dfpremel(path)
        return ccc
    
    @staticmethod
    def path_to_mfcc(path):
        ccc = ds_create.dfpremfcc(path)
        return ccc

    @staticmethod
    def int_to_label(int):
        return ind_to_label[int]
        

In [None]:
q_df = pd.DataFrame()
paths = sorted(glob.glob(r'/content/drive/MyDrive/NLP/NLP Interim Dataset/NLP/*.wav'))
q_data = list(map(test_gen.path_to_mel, paths))

q_data = tf.stack(q_data)

In [None]:
# preds = own.predict(q_data)
# preds = np.argmax(preds, axis=1)
'''
Ensemble Learning
'''
pred_1 = modelm_1.predict(q_data)
pred_2 = modelm_2.predict(q_data)
pred_3 = modelm_3.predict(q_data)
pred_4 = modelm_4.predict(q_data)
pred_5 = modelm_5.predict(q_data)

pred_comb = pred_1 * 0.2 + pred_2 * 0.2 + pred_3 * 0.2 + pred_4 * 0.2 + pred_5 * 0.2
pred_comb = np.argmax(pred_comb, axis=1)

In [None]:
preds.shape

(600,)

In [None]:
q_df['paths'] = sorted(os.listdir(r'/content/drive/MyDrive/NLP/NLP Interim Dataset/NLP/'))
q_df['labels'] = list(map(
    test_gen.int_to_label,
    list(pred_comb)
))

In [None]:
q_df.head()

Unnamed: 0,paths,labels
0,00ae09ba94.wav,happy
1,00f2a00f1f.wav,angry
2,012822b908.wav,fear
3,0144091c26.wav,sad
4,0145cb0279.wav,sad


In [None]:
q_df.to_csv(r'/content/drive/MyDrive/NLP/Zhihao nlp preds/qualifiers_ensemble3.csv', header=False, index=False)