In [2]:
#!pip install kaggle

In [3]:
#!kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess
#!kaggle datasets download -d ejlok1/surrey-audiovisual-expressed-emotion-savee

In [4]:
#!unzip surrey-audiovisual-expressed-emotion-savee.zip
#!unzip toronto-emotional-speech-set-tess.zip

In [5]:
import os
import sys
import time

from IPython.display import Audio

import numpy as np
import pandas as pd
import librosa
import librosa.display

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Dropout

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [6]:
Ravdess = "/Users/nirupamaunnithan/Downloads/Amvi/Trials/audio_speech_actors_01-24/"
Tess = "/Users/nirupamaunnithan/Downloads/Amvi/Trials/TESS Toronto emotional speech set data/TESS Toronto emotional speech set data/"
Savee = "/Users/nirupamaunnithan/Downloads/Amvi/Trials/ALL/"

In [7]:
ravdess_directory_list = os.listdir(Ravdess)

file_emotion = []
file_path = []
for dir in ravdess_directory_list:
    actor = os.listdir(Ravdess + dir)
    for file in actor:
        part = file.split('.')[0]
        part = part.split('-')
        file_emotion.append(int(part[2]))
        file_path.append(Ravdess + dir + '/' + file)
        
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# Change integers to actual emotions.
Ravdess_df.Emotions.replace({1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}, inplace=True)

Ravdess_df.head()

Unnamed: 0,Emotions,Path
0,angry,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
1,fear,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
2,fear,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
3,angry,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
4,disgust,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...


In [8]:
tess_directory_list = os.listdir(Tess)

file_emotion = []
file_path = []

for dir in tess_directory_list:
    directories = os.listdir(Tess + dir)
    for file in directories:
        part = file.split('.')[0]
        part = part.split('_')[2]
        if part=='ps':
            file_emotion.append('surprise')
        else:
            file_emotion.append(part)
        file_path.append(Tess + dir + '/' + file)
        
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Tess_df = pd.concat([emotion_df, path_df], axis=1)

Tess_df.head()

Unnamed: 0,Emotions,Path
0,disgust,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
1,disgust,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
2,disgust,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
3,disgust,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
4,disgust,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...


In [9]:
savee_directory_list = os.listdir(Savee)

file_emotion = []
file_path = []

for file in savee_directory_list:
    file_path.append(Savee + file)
    part = file.split('_')[1]
    ele = part[:-6]
    if ele=='a':
        file_emotion.append('angry')
    elif ele=='d':
        file_emotion.append('disgust')
    elif ele=='f':
        file_emotion.append('fear')
    elif ele=='h':
        file_emotion.append('happy')
    elif ele=='n':
        file_emotion.append('neutral')
    elif ele=='sa':
        file_emotion.append('sad')
    else:
        file_emotion.append('surprise')
        
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Savee_df = pd.concat([emotion_df, path_df], axis=1)

Savee_df.head()

Unnamed: 0,Emotions,Path
0,sad,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
1,sad,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
2,neutral,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
3,surprise,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
4,neutral,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...


In [10]:
aggregated_data = pd.concat([Ravdess_df, Tess_df, Savee_df], axis = 0)

# Shuffle the dataframe using the sample method
aggregated_data = aggregated_data.sample(frac=1).reset_index(drop=True) 

# Drop rows where Emotions is 'fear' or 'disgust'
#aggregated_data = aggregated_data[~aggregated_data['Emotions'].isin(['fear', 'disgust'])]

# Drop rows where Emotions is "sad" and "angry" and replace them with "unpleasant"
#aggregated_data = aggregated_data.drop(aggregated_data[aggregated_data['Emotions'] == 'sad'].sample(frac=0.4).index)
#aggregated_data = aggregated_data.drop(aggregated_data[aggregated_data['Emotions'] == 'angry'].sample(frac=0.4).index)
#aggregated_data['Emotions'] = aggregated_data['Emotions'].replace(['sad', 'angry'], 'unpleasant')

aggregated_data.to_csv("data_path.csv",index=False)
aggregated_data.head()

Unnamed: 0,Emotions,Path
0,happy,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
1,sad,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
2,happy,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
3,neutral,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
4,happy,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...


In [11]:
aggregated_data.Emotions.value_counts()

Emotions
neutral     808
happy       652
sad         652
fear        652
disgust     652
surprise    652
angry       652
Name: count, dtype: int64

In [12]:
labels = {'neutral':0, 'happy':1, 'surprise':2, 'angry': 3, 'disgust':4, 'fear':5,'sad':6}
aggregated_data.replace({'Emotions':labels},inplace=True)
aggregated_data.head()

Unnamed: 0,Emotions,Path
0,1,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
1,6,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
2,1,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
3,0,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...
4,1,/Users/nirupamaunnithan/Downloads/Amvi/Trials/...


In [13]:
aggregated_data.Emotions.value_counts()

Emotions
0    808
1    652
6    652
5    652
4    652
2    652
3    652
Name: count, dtype: int64

In [14]:
def noise(data):
    noise_amp = 0.5*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

In [15]:
NUM_MFCC = 13
N_FFT = 2048
HOP_LENGTH = 512
SAMPLE_RATE = 22050
DOWN_SAMPLE_RATE = 16000
SAMPLE_NUM = aggregated_data.shape[0]

data = {
        "labels": [],
        "features": []
    }

def extract_features(data, sample_rate):
    mfcc = librosa.feature.mfcc(data, sample_rate, n_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH)
    feature = mfcc.T
    return feature

for i in range(SAMPLE_NUM):
    for j in range(2):
        data['labels'].append(aggregated_data.iloc[i,0])
    signal, sample_rate = librosa.load(aggregated_data.iloc[i,1], sr=SAMPLE_RATE)
    
    # Cropping & Resampling
    start_time = 0.4  # Start time in seconds
    end_time = 1.9  # End time in seconds
    start_frame = int(start_time * sample_rate)
    end_frame = int(end_time * sample_rate)
    signal = signal[start_frame:end_frame]
    signal = librosa.resample(signal, sample_rate, DOWN_SAMPLE_RATE)
    
    # Add noise
    signal = noise(signal)
    res1 = extract_features(signal, DOWN_SAMPLE_RATE)
    data["features"].append(np.array(res1))
    
    # Stretch and shift pitch
    new_data = stretch(signal)[:24000]
    data_stretch_pitch = pitch(new_data, DOWN_SAMPLE_RATE)
    res2 = extract_features(data_stretch_pitch, DOWN_SAMPLE_RATE)
    data["features"].append(np.array(res2))
    
    if i % 100 == 0:
        print(f'Processing Data: {i}/{SAMPLE_NUM}')

Processing Data: 0/4720
Processing Data: 100/4720
Processing Data: 200/4720
Processing Data: 300/4720
Processing Data: 400/4720
Processing Data: 500/4720
Processing Data: 600/4720
Processing Data: 700/4720
Processing Data: 800/4720
Processing Data: 900/4720
Processing Data: 1000/4720
Processing Data: 1100/4720
Processing Data: 1200/4720
Processing Data: 1300/4720
Processing Data: 1400/4720
Processing Data: 1500/4720
Processing Data: 1600/4720
Processing Data: 1700/4720
Processing Data: 1800/4720
Processing Data: 1900/4720
Processing Data: 2000/4720
Processing Data: 2100/4720
Processing Data: 2200/4720
Processing Data: 2300/4720
Processing Data: 2400/4720
Processing Data: 2500/4720
Processing Data: 2600/4720
Processing Data: 2700/4720
Processing Data: 2800/4720
Processing Data: 2900/4720
Processing Data: 3000/4720
Processing Data: 3100/4720
Processing Data: 3200/4720
Processing Data: 3300/4720
Processing Data: 3400/4720
Processing Data: 3500/4720
Processing Data: 3600/4720
Processing Da

In [16]:
Features = pd.DataFrame()
Features['features'] = data["features"]
Features['labels'] = data["labels"]
Features.to_csv('Features.csv', index=False)
Features.head()

Unnamed: 0,features,labels
0,"[[72.08987572527357, 5.534708934733683, -1.377...",1
1,"[[67.86030970366848, 13.189012935613295, -10.5...",1
2,"[[-115.75637269257881, 27.472834745597417, 21....",6
3,"[[-121.01478342470571, 37.319812902895094, 13....",6
4,"[[-63.353990799337694, -5.711079993927296, 4.4...",1


In [17]:
Features.labels.value_counts()

labels
0    1616
1    1304
6    1304
5    1304
4    1304
2    1304
3    1304
Name: count, dtype: int64

In [64]:
X = np.asarray(Features['features'])
y = np.asarray(Features["labels"])

# Pad Features to make them of equal length
X = tf.keras.preprocessing.sequence.pad_sequences(X)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2)

print(f'Training Data:{X_train.shape} with label {y_train.shape}')
print(f'Validate Data:{X_validation.shape} with label {y_validation.shape}')
print(f' Testing Data:{X_test.shape} with label {y_test.shape}')

Training Data:(6796, 47, 13) with label (6796,)
Validate Data:(1700, 47, 13) with label (1700,)
 Testing Data:(944, 47, 13) with label (944,)


In [66]:
def build_model(input_shape):
    model = tf.keras.Sequential()

    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(64))
    
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(7, activation='softmax')) # change the no of classes depending upon the no of labels

    return model

In [67]:
input_shape = (47,13)
model = build_model(input_shape)

In [68]:
optimiser = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optimiser,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

model.summary()

In [69]:
EPOCHS = 50
history = model.fit(X_train, y_train, validation_data=(X_validation, y_validation), batch_size=32, epochs=EPOCHS)


Epoch 1/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.2507 - loss: 1.8063 - val_accuracy: 0.4100 - val_loss: 1.4965
Epoch 2/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.4643 - loss: 1.3805 - val_accuracy: 0.5294 - val_loss: 1.2156
Epoch 3/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.5366 - loss: 1.1897 - val_accuracy: 0.5435 - val_loss: 1.1639
Epoch 4/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.5980 - loss: 1.0698 - val_accuracy: 0.6153 - val_loss: 1.0272
Epoch 5/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.6374 - loss: 0.9706 - val_accuracy: 0.6371 - val_loss: 0.9627
Epoch 6/50
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 34ms/step - accuracy: 0.6490 - loss: 0.9196 - val_accuracy: 0.6353 - val_loss: 0.9709
Epoch 7/50
[1m213/213

In [70]:
y_pred = model.predict(X_test)

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step


In [71]:
from sklearn.metrics import accuracy_score

y_pred_class = np.argmax(y_pred, axis=1)  
accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

Accuracy: 0.7108050847457628


In [72]:
pred = np.argmax(y_pred, axis=1)
y_true = np.array(y_test)
print(f"Classification Report:\n{classification_report(y_true, pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_true, pred)}")

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.78      0.76       155
           1       0.69      0.68      0.68       151
           2       0.66      0.65      0.66       122
           3       0.72      0.78      0.75       130
           4       0.74      0.68      0.71       139
           5       0.69      0.70      0.69       120
           6       0.72      0.69      0.70       127

    accuracy                           0.71       944
   macro avg       0.71      0.71      0.71       944
weighted avg       0.71      0.71      0.71       944

Confusion Matrix:
[[121   6   4   2   6   3  13]
 [  2 103  15  14   1  15   1]
 [  2  12  79   9   9   7   4]
 [  2   5   8 102   4   6   3]
 [ 10   8   3   8  95   6   9]
 [  5  11   4   6   7  84   3]
 [ 20   5   6   1   7   1  87]]


In [73]:
model.save('Models/Speech-Emotion-Recognition-Model-1.h5')




In [74]:
model = tf.keras.models.load_model('Models/Speech-Emotion-Recognition-Model-1.h5')




In [75]:
def preprocess_audio(file_path):
    # Load the audio file
    signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
    
    # Cropping & Resampling
    start_time = 0.4  # Start time in seconds
    end_time = 1.9    # End time in seconds
    start_frame = int(start_time * sample_rate)
    end_frame = int(end_time * sample_rate)
    signal = signal[start_frame:end_frame]
    signal = librosa.resample(signal, sample_rate, DOWN_SAMPLE_RATE)
    
    # Add noise
    signal = noise(signal)
    res1 = extract_features(signal, DOWN_SAMPLE_RATE)
    
    # Stretch and shift pitch
    new_data = stretch(signal)[:24000]
    data_stretch_pitch = pitch(new_data, DOWN_SAMPLE_RATE)
    res2 = extract_features(data_stretch_pitch, DOWN_SAMPLE_RATE)
    
    # Prepare input by padding
    X_input = [res1, res2]
    X_input = tf.keras.preprocessing.sequence.pad_sequences(X_input)
    
    return np.array(X_input)

In [76]:
# Path to your new audio file
file_path = 'TESS Toronto emotional speech set data/OAF_Sad/OAF_bean_sad.wav'

# Preprocess the new audio file
X_input = preprocess_audio(file_path)

# Make predictions
predictions = model.predict(X_input)

# Get the predicted label
predicted_label = np.argmax(predictions, axis=1)

# Print the predicted emotion label
#emotion_labels = ["Neutral", "Calm", "Happy", "Sad", "Angry", "Fearful", "Disgust", "Surprised"]
emotion_labels = ["neutral", "happy", "surprise", "angry", "disgust", "fear", "sad"]
print(f"Predicted Emotion: {emotion_labels[predicted_label[0]]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
Predicted Emotion: sad


# TF-lite

In [77]:
weights = model.get_weights()

In [78]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
converter._experimental_lower_tensor_list_ops = False  # Disable lowering of tensor list ops
tflite_model = converter.convert()

with tf.io.gfile.GFile("Models/Speech_Emo_Rec.tflite", 'wb') as f:
    f.write(tflite_model)

converter.optimizations = [tf.lite.Optimize.DEFAULT]
quant_tflite_model = converter.convert()
with tf.io.gfile.GFile("Models/Speech_Emo_Rec_quant.tflite", 'wb') as f:
   f.write(quant_tflite_model)

INFO:tensorflow:Assets written to: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmp7_rr3oq3/assets


INFO:tensorflow:Assets written to: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmp7_rr3oq3/assets


Saved artifact at '/var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmp7_rr3oq3'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 47, 13), dtype=tf.float32, name='input_layer_1')
Output Type:
  TensorSpec(shape=(None, 7), dtype=tf.float32, name=None)
Captures:
  13390839504: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13350674752: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13390836336: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13399947072: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13399947424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13399933872: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400016496: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400015968: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400026704: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400027584: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1726726184.885719  827713 tf_tfl_flatbuffer_helpers.cc:392] Ignored output_format.
W0000 00:00:1726726184.885729  827713 tf_tfl_flatbuffer_helpers.cc:395] Ignored drop_control_dependency.
2024-09-19 11:39:44.885849: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmp7_rr3oq3
2024-09-19 11:39:44.886591: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2024-09-19 11:39:44.886596: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmp7_rr3oq3
2024-09-19 11:39:44.894638: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2024-09-19 11:39:44.929109: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmp7_rr3oq3
2024-09-19 11:39:44.944358: I tensorflow/cc/saved_model/loader.cc:

INFO:tensorflow:Assets written to: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmpck2qnvw7/assets


INFO:tensorflow:Assets written to: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmpck2qnvw7/assets


Saved artifact at '/var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmpck2qnvw7'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 47, 13), dtype=tf.float32, name='input_layer_1')
Output Type:
  TensorSpec(shape=(None, 7), dtype=tf.float32, name=None)
Captures:
  13390839504: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13350674752: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13390836336: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13399947072: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13399947424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13399933872: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400016496: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400015968: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400026704: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13400027584: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1726726185.335481  827713 tf_tfl_flatbuffer_helpers.cc:392] Ignored output_format.
W0000 00:00:1726726185.335494  827713 tf_tfl_flatbuffer_helpers.cc:395] Ignored drop_control_dependency.
2024-09-19 11:39:45.335614: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmpck2qnvw7
2024-09-19 11:39:45.336256: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2024-09-19 11:39:45.336261: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmpck2qnvw7
2024-09-19 11:39:45.344335: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2024-09-19 11:39:45.378084: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /var/folders/j1/1n5_k0ss3w38z0f3qkzrynj80000gn/T/tmpck2qnvw7
2024-09-19 11:39:45.393548: I tensorflow/cc/saved_model/loader.cc:

In [79]:
print("Model Sizes:")
!ls -lh Models | awk '{print $5 "\t" $9}'

Model Sizes:
	
514K	Speech_Emo_Rec.tflite
165K	Speech_Emo_Rec_quant.tflite


In [80]:
def evaluate_tflite(interpreter, test_data, test_label):
    # Get the input and output tensors.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    
    num_correct = 0
    num_total = 0

    # Iterate over the testing data.
    for i in range(test_data.shape[0]):
        # Get the input data for this example.
        input_data = np.array([test_data[i]], dtype=np.float32)

        # Set the input tensor.
        interpreter.set_tensor(input_details[0]['index'], input_data)

        # Run inference.
        interpreter.invoke()

        # Get the output tensor.
        output_data = interpreter.get_tensor(output_details[0]['index'])

        # Compute the predicted label.
        predicted_label = np.argmax(output_data)

        # Update the results.
        if predicted_label == test_label[i]:
            num_correct += 1
        num_total += 1

    # Reset all variables so it will not pollute other inferences.
    interpreter.reset_all_variables()
    
    # Compute the accuracy.
    accuracy = num_correct / num_total
    
    return accuracy

    
# Load tflite model.
interpreter = tf.lite.Interpreter(model_path="Models/Speech_Emo_Rec_quant.tflite")
interpreter.allocate_tensors()

tflite_test_acc = evaluate_tflite(interpreter, X_test, y_test)
print(f"TF Lite quant Model Accuracy: {tflite_test_acc * 100:.2f}%")

# Load Speech_Emo_Rec.tflite model
interpreter2 = tf.lite.Interpreter(model_path="Models/Speech_Emo_Rec.tflite")
interpreter2.allocate_tensors()

tflite_test_acc2 = evaluate_tflite(interpreter2, X_test, y_test)
print(f"TF Lite Model Accuracy: {tflite_test_acc2 * 100:.2f}%")

print("\nOriginal Accuracy  :  ", accuracy)
print(f"Accuracy Difference from Original Model of Speech_Emo_Rec_quant.tflite : {(accuracy-tflite_test_acc) * 100:.2f}%")
print(f"Accuracy Difference from Original Model of Speech_Emo_Rec.tflite : {(accuracy-tflite_test_acc2) * 100:.2f}%")

2024-09-19 11:39:54.467420: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_inter_op_parallelism which is not in the op definition: Op<name=TensorListReserve; signature=element_shape:shape_type, num_elements:int32 -> handle:variant; attr=element_dtype:type; attr=shape_type:type,allowed=[DT_INT32, DT_INT64]> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node TensorListReserve}}


TF Lite quant Model Accuracy: 71.61%
TF Lite Model Accuracy: 71.08%

Original Accuracy  :   0.7108050847457628
Accuracy Difference from Original Model of Speech_Emo_Rec_quant.tflite : -0.53%
Accuracy Difference from Original Model of Speech_Emo_Rec.tflite : 0.00%


In [81]:
interpreter = tf.lite.Interpreter(model_path="Models/Speech_Emo_Rec_quant.tflite")
interpreter.allocate_tensors()

2024-09-19 11:40:00.127405: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_inter_op_parallelism which is not in the op definition: Op<name=TensorListReserve; signature=element_shape:shape_type, num_elements:int32 -> handle:variant; attr=element_dtype:type; attr=shape_type:type,allowed=[DT_INT32, DT_INT64]> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node TensorListReserve}}


In [82]:
# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [83]:
print(input_details)

[{'name': 'serving_default_input_layer_1:0', 'index': 0, 'shape': array([ 1, 47, 13], dtype=int32), 'shape_signature': array([-1, 47, 13], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]


In [84]:
print(output_details)

[{'name': 'StatefulPartitionedCall_1:0', 'index': 48, 'shape': array([1, 7], dtype=int32), 'shape_signature': array([-1,  7], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]


In [85]:
def preprocess_audio(file_path):
    signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
    
    # Cropping & Resampling
    start_time = 0.4  # Start time in seconds
    end_time = 1.9    # End time in seconds
    start_frame = int(start_time * sample_rate)
    end_frame = int(end_time * sample_rate)
    signal = signal[start_frame:end_frame]
    signal = librosa.resample(signal, sample_rate, DOWN_SAMPLE_RATE)
    
    # Add noise and extract features
    signal = noise(signal)
    res1 = extract_features(signal, DOWN_SAMPLE_RATE)
    
    # Stretch and shift pitch
    new_data = stretch(signal)[:24000]
    data_stretch_pitch = pitch(new_data, DOWN_SAMPLE_RATE)
    res2 = extract_features(data_stretch_pitch, DOWN_SAMPLE_RATE)
    
    # Prepare input by padding
    X_input = [res1, res2]
    X_input = tf.keras.preprocessing.sequence.pad_sequences(X_input)
    
    return np.array(X_input)



In [98]:
# Path to your new audio file
file_path = 'TESS Toronto emotional speech set data/OAF_Fear/OAF_chair_fear.wav'

# Preprocess the new audio file
X_input = preprocess_audio(file_path)


In [99]:
X_input.shape

(2, 47, 13)

In [100]:
X_reshaped = np.mean(X_input, axis=0).reshape(1, 47, 13).astype(np.float32)
print(X_reshaped.shape)
print(X_reshaped.dtype)

(1, 47, 13)
float32


In [101]:
interpreter.set_tensor(input_details[0]['index'], X_reshaped)


In [102]:
interpreter.invoke()

In [103]:
# Get the predicted output
output_data = interpreter.get_tensor(output_details[0]['index'])

# Get the predicted label
predicted_label = np.argmax(output_data)

# Print the predicted emotion label
emotion_labels = ["neutral", "happy", "surprise", "angry", "disgust", "fear", "sad"]
print(f"Predicted Emotion: {emotion_labels[predicted_label]}")

Predicted Emotion: fear
