# Calculate log mel-Spectrograms

Logarithmic mel-spectrograms are calculated according to Palanisamy et al. (2020) for three channels with different window sizes and hop lengths:
* Channel 1: window size = 25ms, hop length = 10ms.
* Channel 2: window size = 50ms, hop length = 25ms.
* Channel 3: window size = 100ms, hop length = 50ms. 

# Libraries

In [None]:
!pip install torch torchvision torchaudio
import torch
print ("current pytorch version is: ", torch.__version__)

import PIL
print ("current pillow version is: ", PIL.__version__)

from google.colab import drive 
import librosa
import tensorflow as tf
import numpy as np
import pandas as pd
import torchaudio
from torchvision import transforms, models, datasets
from PIL import Image
import pickle as pkl # save files als pkl
from tqdm import tqdm # Progress Bar
import warnings # ignore librosa warnings
print('\ndone importing.')

# Functions

In [None]:
def extract_spectrogram(audio_path,audiofile_name,target,resize,sampling_rate):
  """
  Compute a 3-channel log mel-spectrogram for a single audio file.

  One channel is computed per (window size, hop length) pair, given in
  milliseconds: 25/10, 50/25 and 100/50.

  # audio_path: path to audio files
  # audiofile_name: name of audio file (e.g. 'test.mp3')
  # target: emotion class of audio
  # resize: size argument handed to torchvision's transforms.Resize; applied
  #         to every channel's spectrogram
  # sampling_rate: sampling rate passed to librosa.load and MelSpectrogram

  # Returns: dict with the keys
  #   "audio":       audiofile_name
  #   "target":      target
  #   "spec_values": np.array built from the three resized channels
  # Raises: any exception raised while loading the audio or computing a
  #         spectrogram is propagated, so a failed channel can never be
  #         silently replaced by the values of the previous channel.

  # Reference: Palanisamy et al. (2020).
  """

  window_sizes = [25, 50, 100] # ms
  hop_sizes = [10, 25, 50] # ms
  # NOTE(review): win_length must not exceed n_fft, so sampling rates above
  # 44100 Hz will presumably fail for the 100 ms window -- confirm before
  # using other rates.
  n_fft = 4410
  eps = 1e-6 # avoids log(0) on empty mel bins

  # Load audio file (librosa warnings are suppressed, errors are not)
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clip, _ = librosa.load("{}/{}".format(audio_path, audiofile_name), sr=sampling_rate)

  # Convert once; the same tensor is reused for all channels
  clip = torch.Tensor(clip)

  # Initialise spectrogram list
  specs = []

  for window_ms, hop_ms in zip(window_sizes, hop_sizes):
    # Convert milliseconds to samples
    window_length = int(round(window_ms*sampling_rate/1000))
    hop_length = int(round(hop_ms*sampling_rate/1000))

    # Calculate Log Mel-Spectrograms
    mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sampling_rate, n_fft=n_fft, win_length=window_length, hop_length=hop_length, n_mels=128)
    spec = mel_transform(clip).numpy()
    spec = np.log(spec + eps)

    # Resize via PIL so every channel is brought to the requested size
    spec = np.asarray(transforms.Resize(resize)(Image.fromarray(spec)))
    specs.append(spec)

  # Create new entry for spectrogram value list
  new_entry = {}
  new_entry["audio"] = audiofile_name
  new_entry["target"] = target # Emotion label
  new_entry["spec_values"] = np.array(specs) # Values for 3-channel Log Mel-Spectrograms
  return new_entry

In [None]:
def calculate_specs(df,audio_path,sampling_rate,resize,
                    out_pkl_file='/path/to/log_melspectrograms.pkl',
                    filename_idx=1,target_idx=6):
  """
  Compute log mel-spectrograms for every row of df and pickle the result.

  # df: DataFrame listing the audio files, one file per row
  # audio_path: path to audio files
  # sampling_rate: sampling rate handed to extract_spectrogram
  # resize: target size handed to extract_spectrogram
  # out_pkl_file: path of the pickle file the entries are written to
  # filename_idx: position of the file name inside the itertuples() row
  # target_idx: position of the numerical emotion label inside the row
  #             (0: sad, 1: delighted, 2: relaxed, 3: angry)

  # Returns: list with one entry per successfully processed file, as
  #          returned by extract_spectrogram. Files whose extraction raised
  #          are reported via print and left out of the list.
  """

  print()
  print('-'*20)
  print('Input size: {}'.format(resize))

  # Initialise list for spectrogram values
  values = []

  with tqdm(total=df.shape[0]) as t: # Progress Bar
    for row in df.itertuples():
      # ATTENTION:
      # Index for filename/label in row might differ depending on the
      # dataset used! Position 0 of an itertuples() row is the DataFrame
      # index, so the first data column is at position 1.
      audiofile_name = str(row[filename_idx])

      # Emotion label numerical
      emotion_num = row[target_idx]

      try:
        new_entry = extract_spectrogram(audio_path,audiofile_name,emotion_num,resize,sampling_rate)
      except Exception as e:
        # Skip this file; appending here would either fail on an undefined
        # name or duplicate the entry of the previous file.
        print(e)
      else:
        values.append(new_entry)

      t.update() # Progress Bar Update

  # Save spectrograms as pkl file
  with open(out_pkl_file,"wb") as handler:
    try:
      pkl.dump(values, handler, protocol=pkl.HIGHEST_PROTOCOL)
    except Exception as e:
      print(e)
    else:
      # Only report success when the dump actually succeeded
      print('\nFile saved to {}'.format(out_pkl_file))
      print()

  return values

# Script

Run the following cell to calculate the log mel-spectrograms.

In [None]:
# Mount Google Drive so the dataset files are reachable
drive.mount('/content/gdrive')

# Table listing the audio files; its first CSV column becomes the index
df = pd.read_csv('/path/to/list_of_audiofiles.csv', index_col=0)

# Directory containing the audio files
audiofiles_path = '/path/to/audiofiles'

sampling_rate = 44100
resize = (224, 224) # ImageNet input image size

# `values` ends up as a one-element list holding either the result of
# calculate_specs or the placeholder 0 when the calculation failed.
values = []

try:
    val = calculate_specs(
        df=df,
        audio_path=audiofiles_path,
        sampling_rate=sampling_rate,
        resize=resize,
    )
except Exception as err:
    print(err)
    val = 0

values.append(val)