Dataset retrieved from: https://zenodo.org/record/1188976

In [None]:
pip install numpy==1.21.0

In [None]:
pip install opendatasets

In [None]:
pip install tqdm

In [None]:
pip install matplotlib

In [None]:
pip install librosa

In [None]:
pip install fastai

In [None]:
pip install sounddevice

In [1]:
from tqdm import tqdm

import matplotlib.pyplot as plt   # plotting
from matplotlib import image as mpimg

import librosa                    # Python package for music and audio analysis
import librosa.display            # Allows you to display audio files 
import os                         # The OS module in Python provides a way of 
                                  # using operating system dependent functionality.
import scipy.io.wavfile           # Open a WAV files
from scipy.io.wavfile import write

import numpy as np                # Used for working with arrays
import fastai                     
import glob                       # Used to return all file paths that match a specific 
                                  # pattern

import sounddevice as sd


# Import fast AI stuff
# from fastai import *                                 
# from fastai.vision.all import *
# from fastai.vision.data import ImageDataLoaders
# from fastai.tabular.all import *
# from fastai.text.all import *
# from fastai.vision.widgets import *

# DOWNLOAD DATASET

In [None]:
import opendatasets as od

# Kaggle page of the RAVDESS emotional speech audio dataset.
KAGGLE_DATASET_URL = "https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio"

# Fetch the dataset; the later cells look for it under ./ravdess-emotional-speech-audio.
# use kaggle credentials
od.download(KAGGLE_DATASET_URL)

# CREATE NEEDED DIRECTORIES

In [2]:
!rm -r "./ravdess-emotional-speech-audio/audio_speech_actors_01-24"

!mkdir './output'
!mkdir './output/live_images'

# !mkdir './DATASET'
# !mkdir './DATASET/angry'
# !mkdir './DATASET/calm'
# !mkdir './DATASET/disgust'
# !mkdir './DATASET/fearful'
# !mkdir './DATASET/neutral'
# !mkdir './DATASET/sad'
# !mkdir './DATASET/happy'
# !mkdir './DATASET/surprised'

!mkdir './models'



rm: ./ravdess-emotional-speech-audio/audio_speech_actors_01-24: No such file or directory
mkdir: ./output: File exists
mkdir: ./output/live_images: File exists
mkdir: ./models: File exists


In [3]:
!mkdir './Categorical_Dataset'          
!mkdir './Categorical_Dataset/positive' # happy, surprised
!mkdir './Categorical_Dataset/negative' # angry, sad, disgusted, fearful
!mkdir './Categorical_Dataset/neutral'  # neutral, calm

# DEFINING THE PATHS

In [4]:
# Pattern matching the first-level folders of the download
# (presumably one folder per actor -- TODO confirm).
AUDIO_FOLDER = "./ravdess-emotional-speech-audio/*"
# Root folder that receives the generated spectrogram images.
DATASET = "./Categorical_Dataset/"

# Every file found one level below the matched folders.
# glob returns matches in a file-system dependent order, so the list is sorted to
# keep the processing order (and the running numbers in the image names) reproducible.
paths = sorted(audioFile for actor in glob.glob(AUDIO_FOLDER) for audioFile in glob.glob(actor + '/*'))

# Consecutive chunks of at most 10 paths; each chunk gets its own progress bar later on.
result = [paths[i:i+10] for i in range(0, len(paths), 10)]


In [5]:
# Emotion names keyed by their two-digit, 1-based code ('01' .. '08').
dicts = {
    f"{code:02d}": name
    for code, name in enumerate(
        ('neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'),
        start=1,
    )
}

# RUN THIS CODE TO CLEAR THE DATASET FOLDER:

In [14]:
# Empty every sub-folder of the listed root directories.
# The sub-folders themselves stay in place; only the plain files inside them are deleted.
directories = [DATASET]  # root directories to clean

for root_dir in directories:
    candidate_dirs = (os.path.join(root_dir, name) for name in os.listdir(root_dir))
    for class_dir in filter(os.path.isdir, candidate_dirs):
        for entry in os.listdir(class_dir):
            target = os.path.join(class_dir, entry)
            # Nested directories are skipped, not removed.
            if os.path.isfile(target):
                os.remove(target)

# CONVERT ALL AUDIO FILES TO SPECTROGRAM IMAGES (log-frequency STFT, not mel-scaled):

In [7]:
# Runs the shell command `open .` on the working directory.
# NOTE(review): looks like a macOS convenience to reveal the folder in the file browser -- confirm; not needed for the pipeline.
!open .

In [8]:
def convert_audio_to_melspectrogram(audioPath, savePath):
    """Render one audio file as a spectrogram picture and save it.

    Despite the function name, no mel filter bank is applied: the picture shows the
    dB-scaled magnitude of a short-time Fourier transform, drawn with a logarithmic
    frequency axis.

    Parameters
    ----------
    audioPath : path of the audio file, read with `librosa.load` at sr=44100.
    savePath : destination passed to `plt.savefig`.

    Returns
    -------
    None. The drawing goes through pyplot's current figure, which is cleared
    (not closed) once the image has been written.
    """
    # Decode the recording at a fixed 44.1 kHz sample rate.
    samples, sample_rate = librosa.load(audioPath, sr=44100)

    # Strip leading and trailing silence using librosa's default threshold.
    samples, _ = librosa.effects.trim(samples)

    # Magnitude of the short-time Fourier transform, expressed in decibels.
    spectrum_db = librosa.amplitude_to_db(abs(librosa.stft(samples)))

    # Draw onto the current axes: time on x, log-scaled frequency on y,
    # colours clipped to the -60..60 dB range.
    librosa.display.specshow(
        spectrum_db,
        sr=sample_rate,
        vmin=-60,
        vmax=60,
        x_axis='time',
        y_axis='log',
        cmap='magma',
        ax=None,
    )

    # Hide both axes so ticks and labels do not end up in the saved image.
    axes = plt.gca()
    axes.get_xaxis().set_visible(False)
    axes.get_yaxis().set_visible(False)

    # Write the image, then wipe the figure so the next call starts from a blank canvas.
    plt.savefig(savePath)
    plt.clf()

In [None]:
# convert_audio_to_melspectrogram(result[0][0],"./output/sup")
# convert_audio_to_melspectrogram(result[0][1],"./output/sup")
# convert_audio_to_melspectrogram(result[0][2],"./output/sup")

In [15]:
# Convert every audio file into a spectrogram image, filed under its sentiment class.

# Emotion code -> coarse sentiment class (also the name of the output sub-folder).
category_by_code = {
    '01': 'neutral', '02': 'neutral',
    '03': 'positive', '08': 'positive',
    '04': 'negative', '05': 'negative', '06': 'negative', '07': 'negative',
}

counts = {}    # next running number per sentiment class (starts at 1)
fileLoc = 1    # 1-based index of the chunk being processed, shown next to the progress bar
for path in result:
    files = tqdm(path)
    for audio_file_path in files:
        files.set_postfix_str(f"{fileLoc}/{len(result)}")

        # Two-character emotion code, read at a fixed offset from the end of the path
        # (assumes file names shaped like "03-01-06-01-02-01-12.wav" -- TODO confirm).
        em = audio_file_path[-18:-16]

        emotion = category_by_code.get(em)
        if emotion is None:
            # Not a recognised emotion code: skip the file instead of writing an
            # image with an empty class name into the dataset root.
            continue

        count = counts.get(emotion, 1)

        # e.g. ./Categorical_Dataset/positive/positive000001_(happy).jpg
        # `dicts` (the emotion-name map) is used here; the builtin `dict` would put
        # the text "dict['03']" into the file name instead of the emotion.
        p = os.path.join(f"{DATASET}{emotion}", f"{emotion}{str(count).zfill(6)}_({dicts[em]}).jpg")

        counts[emotion] = count + 1

        convert_audio_to_melspectrogram(audio_file_path, p)
    fileLoc += 1

100%|████████████████████████████████████| 10/10 [00:03<00:00,  2.95it/s, 1/144]
100%|████████████████████████████████████| 10/10 [00:03<00:00,  3.25it/s, 2/144]
100%|████████████████████████████████████| 10/10 [00:03<00:00,  3.13it/s, 3/144]
100%|████████████████████████████████████| 10/10 [00:03<00:00,  3.23it/s, 4/144]
100%|████████████████████████████████████| 10/10 [00:02<00:00,  3.58it/s, 5/144]
100%|████████████████████████████████████| 10/10 [00:02<00:00,  3.53it/s, 6/144]
100%|████████████████████████████████████| 10/10 [00:03<00:00,  2.90it/s, 7/144]
100%|████████████████████████████████████| 10/10 [00:03<00:00,  3.03it/s, 8/144]
100%|████████████████████████████████████| 10/10 [00:03<00:00,  2.98it/s, 9/144]
100%|███████████████████████████████████| 10/10 [00:03<00:00,  2.68it/s, 10/144]
100%|███████████████████████████████████| 10/10 [00:03<00:00,  2.65it/s, 11/144]
100%|███████████████████████████████████| 10/10 [00:03<00:00,  2.94it/s, 12/144]
100%|███████████████████████

100%|██████████████████████████████████| 10/10 [00:03<00:00,  3.16it/s, 102/144]
100%|██████████████████████████████████| 10/10 [00:02<00:00,  3.82it/s, 103/144]
100%|██████████████████████████████████| 10/10 [00:03<00:00,  3.20it/s, 104/144]
100%|██████████████████████████████████| 10/10 [00:02<00:00,  3.77it/s, 105/144]
100%|██████████████████████████████████| 10/10 [00:02<00:00,  3.44it/s, 106/144]
100%|██████████████████████████████████| 10/10 [00:02<00:00,  3.67it/s, 107/144]
100%|██████████████████████████████████| 10/10 [00:02<00:00,  3.40it/s, 108/144]
100%|██████████████████████████████████| 10/10 [00:02<00:00,  3.38it/s, 109/144]
100%|██████████████████████████████████| 10/10 [00:03<00:00,  3.07it/s, 110/144]
100%|██████████████████████████████████| 10/10 [00:03<00:00,  3.21it/s, 111/144]
100%|██████████████████████████████████| 10/10 [00:03<00:00,  3.18it/s, 112/144]
100%|██████████████████████████████████| 10/10 [00:03<00:00,  3.20it/s, 113/144]
100%|███████████████████████

<Figure size 640x480 with 0 Axes>

In [None]:
import warnings
# Hide UserWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Settings forwarded to checkEmotion (presumably sample rate in Hz, clip length
# in seconds and sample format of the recording -- TODO confirm).
fs=44100
seconds=3
dtype = 'int16'
# Let the user pick the input device by its index in sounddevice's device list.
mic = int(input(f'Of the following list of devices, select which one to use as your microphone:\n{sd.query_devices()}\n>>>'))
count = 1   # running number of the current recording

# NOTE(review): checkEmotion is not defined in the visible part of this notebook;
# it has to be defined before this cell is run.
while True:   # Python's boolean literal is `True`; the lowercase `true` raised NameError

    print("Recording...")

    checkEmotion(count,mic, fs, seconds, dtype)

    cont = input("\nContinue? (Y/N): ")

    # Anything other than Y/y ends the session.
    if cont not in ('Y', 'y'):
        break
    count+=1

In [None]:
'''loss_functions = [CrossEntropyLossFlat(), LabelSmoothingCrossEntropy(),
                  FocalLossFlat()]
loss_functions1 = ["CrossEntropyLossFlat", "LabelSmoothingCrossEntropy",
                  "FocalLossFlat"]
train_size = [0.2, 0.4, 0.6]

for LF in range(0,3):
    for TS in range(0,3):
        print(f"___________{loss_functions1[LF]} ... {train_size[TS]}___________")
        dls = ImageDataLoaders.from_folder(train_path, valid_pct=train_size[TS], seed=5, num_workers=0)
        print(f"validation: {len(dls.valid_ds.items[:])}  |  training set: {len(dls.train_ds.items[:])}")

        learn = vision_learner(dls, models.resnet34, loss_func=loss_functions[LF], metrics=accuracy)
        lr_min, lr_steep = learn.lr_find(suggest_funcs=(minimum, steep))
        print(f"Minimum/10: {lr_min:.2e}, steepest point: {lr_steep:.2e}")

        learn.fit(10, float(f"{lr_steep:.2e}"))
        #interp = ClassificationInterpretation.from_learner(learn)
        #losses,idxs = interp.top_losses()
        #len(dls.valid_ds)==len(losses)==len(idxs)

        #interp.plot_confusion_matrix(figsize=(8,8), dpi=90)

        #interp.plot_top_losses(2, figsize=(10,11))
        
        print("\n")
'''

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
import pathlib

# Root folder holding one sub-folder of images per class.
# NOTE(review): the conversion cells write to ./Categorical_Dataset while this points
# at ./DATASET -- confirm which folder the classifier is meant to read.
data_dir = pathlib.Path('./DATASET')

In [None]:
# Echo the dataset path as the cell output.
data_dir

In [None]:
# Number of .jpg files located exactly one folder below data_dir.
image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))
print(image_count)

In [None]:
# Preview the first two images of the 'angry' and 'calm' folders.
# A notebook cell only renders its last bare expression, so the pictures are drawn
# onto one shared figure; this way all of them show up and every file is closed again.
anger = list(data_dir.glob('angry/*'))
calm = list(data_dir.glob('calm/*'))

preview_items = [('angry', p) for p in anger[:2]] + [('calm', p) for p in calm[:2]]

fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax in axes:
    ax.axis("off")   # also blanks the panels that stay empty when a folder has fewer than two files
for ax, (label, image_path) in zip(axes, preview_items):
    with PIL.Image.open(str(image_path)) as preview:
        ax.imshow(preview)
    ax.set_title(label)
plt.show()

In [None]:
# Number of images per batch, and the (height, width) every image is resized to on loading.
batch_size = 32
img_height, img_width = 240, 320

In [None]:
# Training part of the images under data_dir: 20 % is set aside for validation.
# The validation dataset has to be built with the same seed and validation_split.
training_split_options = dict(
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
train_ds = tf.keras.utils.image_dataset_from_directory(data_dir, **training_split_options)

In [None]:
# Validation part (20 %) of the images under data_dir.
# Seed and validation_split mirror the values used for the training dataset.
validation_split_options = dict(
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
val_ds = tf.keras.utils.image_dataset_from_directory(data_dir, **validation_split_options)

In [None]:
# Class labels as exposed by the training dataset object; kept in a variable
# because the plots, the model head and the reports below all refer to it.
class_names = train_ds.class_names
print(class_names)

In [None]:
import matplotlib.pyplot as plt

# 3x3 grid with the first nine images of one training batch, titled with their class.
plt.figure(figsize=(10, 10))
first_batch = train_ds.take(1)
for images, labels in first_batch:
    for slot in range(9):
        ax = plt.subplot(3, 3, slot + 1)
        picture = images[slot].numpy().astype("uint8")
        plt.imshow(picture)
        plt.title(class_names[labels[slot]])
        plt.axis("off")

In [None]:
# Print the tensor shapes of a single training batch (images, then labels).
for image_batch, labels_batch in train_ds.take(1):
    print(image_batch.shape)
    print(labels_batch.shape)

In [None]:
# Let tf.data pick the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Training stream: cache, then shuffle with a buffer of 1000 elements, then prefetch.
# Validation stream: cache and prefetch only (no shuffle stage is added here).
# NOTE: both names are rebound to the wrapped datasets, so running this cell again
# stacks another cache/shuffle/prefetch stage on top of the existing one.
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Layer that scales its input by 1/255 (brings 0-255 pixel values into the 0-1 range).
normalization_layer = layers.Rescaling(1./255)

In [None]:
# Sanity check of the rescaling layer: apply it to the training images and print
# the smallest and largest pixel value of the first image of one batch.
def _rescale_images(images, labels):
    """Rescale the images of a batch and pass the labels through unchanged."""
    return normalization_layer(images), labels

normalized_ds = train_ds.map(_rescale_images)
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]
print(np.min(first_image), np.max(first_image))

In [None]:
num_classes = len(class_names)

# Convolutional feature extractor: three Conv2D + MaxPooling2D stages
# with 16, 32 and 64 filters.
feature_layers = []
for n_filters in (16, 32, 64):
    feature_layers.append(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    feature_layers.append(layers.MaxPooling2D())

# Pixel rescaling -> feature extractor -> dense classifier head.
# The last layer has no activation, i.e. it outputs one raw score per class.
model = Sequential([
    layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    *feature_layers,
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),
])

In [None]:
# The model outputs raw scores, hence from_logits=True on the loss.
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 10 epochs, evaluating on the validation set after each one.
# `history` keeps the per-epoch metrics for the plots below.
epochs = 10
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Take the x-axis from the recorded history rather than the global `epochs`:
# `epochs` is reassigned further down, and a mismatch in length makes plt.plot fail.
epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))

# Left panel: accuracy curves.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss curves.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

In [None]:
# Collect predictions and true labels in ONE pass over the validation set, so that
# every prediction is paired with the label of the very same batch. Two separate
# passes are only aligned when the dataset yields the identical order each time.
batch_outputs = []
true_labels = []
for images, labels in val_ds:
    batch_outputs.append(model.predict_on_batch(images))
    true_labels.extend(labels.numpy())

predictions = np.concatenate(batch_outputs)
predicted_classes = np.argmax(predictions, axis=1)

# Create the confusion matrix. Passing the label ids explicitly keeps one row and
# column per class even when a class does not occur in the validation split, so the
# tick labels below always line up.
cm = confusion_matrix(true_labels, predicted_classes, labels=list(range(len(class_names))))

# Plot the confusion matrix (fmt='d' prints the counts as plain integers).
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Fight overfitting by using data augmentation:
# random horizontal flip, random rotation (factor 0.1) and random zoom (factor 0.1).
augmentation_layers = [
    layers.RandomFlip("horizontal", input_shape=(img_height, img_width, 3)),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
]
data_augmentation = keras.Sequential(augmentation_layers)

In [None]:
import warnings
# Suppress warnings of every category from this point on.
warnings.filterwarnings('ignore')

In [None]:
# 3x3 grid: nine independent augmentations of one training batch,
# each panel showing the first image of the augmented batch.
plt.figure(figsize=(10, 10))
for batch_images, _ in train_ds.take(1):
    for panel in range(1, 10):
        augmented_images = data_augmentation(batch_images)
        ax = plt.subplot(3, 3, panel)
        first_augmented = augmented_images[0].numpy().astype("uint8")
        plt.imshow(first_augmented)
        plt.axis("off")
        


In [None]:
# Utilizing Dropout: same convolutional network as before, now preceded by the
# augmentation layers and with a Dropout(0.2) after the last pooling stage.
conv_stages = []
for n_filters in (16, 32, 64):
    conv_stages.append(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    conv_stages.append(layers.MaxPooling2D())

model = Sequential([
    data_augmentation,
    layers.Rescaling(1./255),
    *conv_stages,
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    # No activation: the layer outputs one raw score per class.
    layers.Dense(num_classes, name="outputs"),
])

In [None]:
# The model outputs raw scores, hence from_logits=True on the loss.
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 15 epochs, evaluating on the validation set after each one.
# `history` keeps the per-epoch metrics for the plots below.
epochs = 15
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Take the x-axis from the recorded history rather than the global `epochs`:
# `epochs` is assigned in more than one cell, and a mismatch in length makes plt.plot fail.
epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))

# Left panel: accuracy curves.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss curves.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

In [None]:
# Collect predictions and true labels in ONE pass over the validation set, so that
# every prediction is paired with the label of the very same batch. Two separate
# passes are only aligned when the dataset yields the identical order each time.
batch_outputs = []
true_labels = []
for images, labels in val_ds:
    batch_outputs.append(model.predict_on_batch(images))
    true_labels.extend(labels.numpy())

predictions = np.concatenate(batch_outputs)
predicted_classes = np.argmax(predictions, axis=1)

# Create the confusion matrix. Passing the label ids explicitly keeps one row and
# column per class even when a class does not occur in the validation split, so the
# tick labels below always line up.
cm = confusion_matrix(true_labels, predicted_classes, labels=list(range(len(class_names))))

# Plot the confusion matrix (fmt='d' prints the counts as plain integers).
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Classify one image from disk with the trained model.
# NOTE(review): the path follows the ./DATASET/<emotion>/ layout -- confirm the file exists.
location = './DATASET/angry/angry000001.jpg'

# Load the picture at the size the network was trained on and add a batch dimension.
img = tf.keras.utils.load_img(location, target_size=(img_height, img_width))
img_array = tf.expand_dims(tf.keras.utils.img_to_array(img), 0)

# Raw scores -> softmax probabilities for the single image.
predictions = model.predict(img_array)
score = tf.nn.softmax(predictions[0])

# Class indices ordered from the highest to the lowest probability.
sorted_predictions = np.argsort(score)[::-1]

# Take the top five predictions (fewer when there are fewer classes).
top_predictions = sorted_predictions[:5]

# Print the top five predictions.
for class_index in top_predictions:
    print(
        "The image most likely belongs to {} with a {:.4f} percent confidence."
        .format(class_names[class_index], 100 * score[class_index])
    )