# Download Dataset

In [1]:
! pip install -q kaggle

In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


import librosa 
import librosa.display

from IPython.display import Audio
plt.style.use('seaborn-white')

In [6]:
! kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess

Downloading toronto-emotional-speech-set-tess.zip to /content
 96% 409M/428M [00:13<00:00, 20.4MB/s]
100% 428M/428M [00:14<00:00, 31.9MB/s]


In [7]:
!unzip toronto-emotional-speech-set-tess.zip -d toronto-emotional-speech-set-tess

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_back_angry.wav  
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_bar_angry.wav  
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_base_angry.wav  
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_bath_angry.wav  
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_bean_angry.wav  
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_beg_angry.wav  
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_bite_angry.wav  
  inflating: toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/OAF_angry/OAF_boat_angry.wav  
 

# Preprocessing

In [8]:
TESS = "../content/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/" #check

In [9]:
path = []
emotion = []
dir_list = os.listdir(TESS)

for i in dir_list:
    fname = os.listdir(TESS + i)   
    for f in fname:
        if i == 'OAF_angry' or i == 'YAF_angry':
            emotion.append('angry')
        elif i == 'OAF_disgust' or i == 'YAF_disgust':
            emotion.append('disgust')
        elif i == 'OAF_Fear' or i == 'YAF_fear':
            emotion.append('fear')
        elif i == 'OAF_happy' or i == 'YAF_happy':
            emotion.append('happy')
        elif i == 'OAF_neutral' or i == 'YAF_neutral':
            emotion.append('neutral')                                
        elif i == 'OAF_Pleasant_surprise' or i == 'YAF_pleasant_surprised':
            emotion.append('surprise')               
        elif i == 'OAF_Sad' or i == 'YAF_sad':
            emotion.append('sad')
        else:
            emotion.append('Unknown')
        path.append(TESS + i + "/" + f)

TESS_df = pd.DataFrame(emotion, columns = ['labels'])
TESS_df = pd.concat([TESS_df,pd.DataFrame(path, columns = ['path'])],axis=1)

In [10]:
labels = []
path = []


for i in range(len(TESS_df)):
  path.append(TESS_df['path'][i])
  labels.append(TESS_df['labels'][i])

d = {'labels': labels, 'path': path}
DF = pd.DataFrame(data=d)

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

all_labels = DF['labels'].values

encoder = OneHotEncoder()

encoded_labels = encoder.fit_transform(np.array(all_labels).reshape(-1,1)).toarray()

onehot = []
for item in encoded_labels:
  tmp = ''
  for i in range(len(item)):
    tmp = tmp + str(int(item[i]))
  onehot.append(tmp)

DF['onehot'] = onehot

In [12]:
DF = DF.sample(frac=1)
DF = DF.reset_index(drop=True)

In [13]:
maxx = 48000

In [14]:
def get_data(path):
  data1, sample_rate = librosa.load(path, duration=3, offset=0.5, res_type='kaiser_fast',sr=16000)

  #padding
  data = []
  for d in data1:
    data.append(float(d))
  for i in range(maxx- len(data1)):
    data.append(0)

  result = np.array(data)
  return result

In [15]:
notfound = []
Y = []
datas = []

i = 0
for path, emotion in zip(DF.path, DF.onehot):
  try:
    data = get_data(path)
    datas.append(data)
    Y.append(emotion)

  except:
    notfound.append([path,i])
  i = i+1   

X_np = np.array(datas)
print(f'Check shapes:\nFemale features: {X_np.shape}, labels: {len(Y)}')

Check shapes:
Female features: (2800, 48000), labels: 2800


In [16]:
len(notfound)

0

In [17]:
onehot = []
for l in Y:
  tmp=[]
  for c in l:
    tmp.append(int(c))
  onehot.append(tmp)
onehot_np = np.array(onehot)

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_np, onehot_np, random_state=0, test_size=0.20, shuffle=True)

In [19]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((2240, 48000), (2240, 7), (560, 48000), (560, 7))

# Feature Extraction and Augmentation

In [20]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 49.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3


In [21]:
from transformers import AutoTokenizer, AutoFeatureExtractor
import torch
from transformers import Wav2Vec2FeatureExtractor

# import feature extractor, tokenizer
#tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

#tested tekrari mide too har satr #feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
#tested sefr mide feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks")
#tested sefr mide feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sd")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-large-superb-er")

Downloading preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [22]:
def extract_features(data,sample_rate = 16000):

  encodings = feature_extractor(data, sampling_rate=16000, padding=True, return_tensors="pt")
  result = np.array(encodings['input_values'][0])
  return result

In [23]:
def noise(data):
    noise_amp = 0.04*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.70):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate = 16000, pitch_factor=0.8):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

def higher_speed(data, speed_factor = 1.25):
    return librosa.effects.time_stretch(data, speed_factor)

def lower_speed(data, speed_factor = 0.75):
    return librosa.effects.time_stretch(data, speed_factor)

In [24]:
empty = np.zeros(48000)
Augmented_train_data = []
Augmented_train_label = []

for i in range(len(x_train)):
  res1 = extract_features(x_train[i])
  #result = np.array(res1)
  Augmented_train_data.append(res1)
  Augmented_train_label.append(y_train[i])
  
  # noise_data = noise(x_train[i])
  # if noise_data.all() != empty.all():
  #   res2 = extract_features(noise_data)
  #   #result = np.array(res2)
  #   Augmented_train_data.append(res2)
  #   Augmented_train_label.append(y_train[i])

  shift_data = shift(x_train[i])
  if shift_data.all() != empty.all():
    res2 = extract_features(shift_data)
    #result = np.array(res2)
    Augmented_train_data.append(res2)
    Augmented_train_label.append(y_train[i])

  # pitch_data = pitch(x_train[i])
  # if pitch_data.all() != empty.all():
  #   res2 = extract_features(pitch_data)
  #   #result = np.array(res2)
  #   Augmented_train_data.append(res2)
  #   Augmented_train_label.append(y_train[i])

In [25]:
np.array(Augmented_train_data).shape,len(Augmented_train_label)

((2240, 48000), 2240)

In [26]:
test_data = []
test_label = []

for i in range(len(x_test)):
  res1 = extract_features(x_test[i])
  result = np.array(res1)
  test_data.append(result)
  test_label.append(y_test[i])

In [27]:
np.array(test_data).shape,len(test_label)

((560, 48000), 560)

In [28]:
df = {'data': Augmented_train_data, 'label': Augmented_train_label}
train_aug = pd.DataFrame(data=df)

In [29]:
train_aug = train_aug.sample(frac=1)
train_aug = train_aug.reset_index(drop=True)

In [30]:
train_features = np.asarray(train_aug['data'])
train_labels__ = np.asarray(train_aug['label'])
train_labels_ = []
for l in train_labels__:
  train_labels_.append(np.asarray(l).astype(np.float32))
train_labels = np.asarray(train_labels_).astype(np.float32)

test_features = np.asarray(test_data).astype(np.float32)
test_labels = np.asarray(test_label).astype(np.float32)

In [31]:
for t in train_features:
  if len(t) != 48000:
    print(len(t))

In [32]:
train_features.shape,train_labels.shape

((2240,), (2240, 7))

In [33]:
train_features_list = []
for t in train_features:
  train_features_list.append(np.asarray(t).astype(np.float32))

In [34]:
train_features_np = np.asarray(train_features_list).astype(np.float32)

In [35]:
train_features_np.shape

(2240, 48000)

# Classification

In [36]:
import tensorflow as tf
from tensorflow import keras

from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization, AveragePooling1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint

In [37]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [38]:
# Create a MirroredStrategy.
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 1


In [39]:
with strategy.scope():
    
    def build_model(in_shape):
        
        model=Sequential()
        model.add(Conv1D(256, kernel_size=6, strides=1, padding='same', activation='relu', input_shape=(in_shape, 1)))
        model.add(AveragePooling1D(pool_size=4, strides = 2, padding = 'same'))
        model.add(Dropout(0.2))

        model.add(Conv1D(128, kernel_size=6, strides=1, padding='same', activation='relu', input_shape=(in_shape, 1)))
        model.add(AveragePooling1D(pool_size=4, strides = 2, padding = 'same'))
        model.add(Dropout(0.2))

        model.add(Conv1D(128, kernel_size=6, strides=1, padding='same', activation='relu'))
        model.add(AveragePooling1D(pool_size=4, strides = 2, padding = 'same'))
        model.add(Dropout(0.2))

        model.add(Conv1D(64, kernel_size=6, strides=1, padding='same', activation='relu'))
        model.add(AveragePooling1D(pool_size=4, strides = 2, padding = 'same'))
        model.add(Dropout(0.2))

        model.add(Conv1D(32, kernel_size=6, strides=1, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=4, strides = 2, padding = 'same'))
        
        model.add(Flatten())
        model.add(Dense(units=32, activation='relu'))
        model.add(Dropout(0.3))

        model.add(Dense(units=7, activation='softmax'))
        model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])
          
        
        return model

In [40]:
def model_build_summary(mod_dim):
    model = build_model(mod_dim)
    model.summary()
    
    return model

In [41]:
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=4, min_lr=0.000001)

batch_size = 32
n_epochs = 30

In [42]:
total_model = model_build_summary(train_features_np.shape[1])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 48000, 256)        1792      
                                                                 
 average_pooling1d (AverageP  (None, 24000, 256)       0         
 ooling1D)                                                       
                                                                 
 dropout (Dropout)           (None, 24000, 256)        0         
                                                                 
 conv1d_1 (Conv1D)           (None, 24000, 128)        196736    
                                                                 
 average_pooling1d_1 (Averag  (None, 12000, 128)       0         
 ePooling1D)                                                     
                                                                 
 dropout_1 (Dropout)         (None, 12000, 128)        0

In [43]:
X_train = tf.convert_to_tensor(train_features_np, dtype=tf.float32)
y_train = tf.convert_to_tensor(train_labels, dtype=tf.float32)
X_test = tf.convert_to_tensor(test_features, dtype=tf.float32)
y_test = tf.convert_to_tensor(test_labels, dtype=tf.float32)

In [44]:
history = total_model.fit(X_train, y_train, batch_size=batch_size, epochs=n_epochs,
                          validation_data=(X_test, y_test),
                          callbacks=[rlrp])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# ٍExample

In [45]:
# def create_waveplot(data, sr):
#     plt.figure(figsize=(10, 3))
#     plt.title(f'Waveplot for audio', size=15)
#     librosa.display.waveplot(data, sr=sr)
#     plt.show()
# path = DF['path'][6]
# data, sampling_rate = librosa.load(path)
# #create_waveplot(data, sampling_rate)
# Audio(path)
# data1 = shift(data)
# from __future__ import print_function
# import scipy.io.wavfile as wavf
# import numpy as np

# if __name__ == "__main__":

#     fs = 16000
#     out_f = 'out.wav'

#     wavf.write(out_f, fs, data1)
# Audio(out_f)