In [9]:

# load modules for plotting
import matplotlib
import matplotlib.pyplot as plt

# load modules from the librosa library for acoustic processing and analysis
import librosa
import librosa.display # for displaying acoustic information
import librosa.feature # for extracting and working with audio features

# load modules for writing sound files
import soundfile as sf

# load module for playing sound files within the notebook
from IPython.display import Audio, display


# load module with audio loss functions
#import auraloss

# load modules for doing math
import pandas as pd
import numpy as np

# load module for measuring time (as during training)
# and define two functions for measuring and displaying time
import time

def timeSince(since):
    now = time.time()
    s = now - since
    return s

def asMinutes(s):
    if type(s) is str:
      return s
    else:
      m = np.floor(s / 60)
      s -= m * 60
      return '%dm %ds' % (m, s)
      

In [14]:
# get the duration of a sound file given the file name 'fn'
def get_dur(fn):
  timeseries, sample_rate = librosa.load(fn)
  dur = librosa.get_duration(y=timeseries, # measure the duration of the sound clip,
                             sr=sample_rate) # assuming sample rate "sr"
  return dur

# using liftering, export an audio file to filename 'fn' from the mfcc features as a torch tensor in sequence format
def lifter_mfcc_array(mfcc):
  mel_from_mfcc = librosa.feature.inverse.mfcc_to_mel(mfcc)
  fourier_from_mel = librosa.feature.inverse.mel_to_stft(mel_from_mfcc)
  return fourier_from_mel

# get audio data, including fourier representation and mfcc features,
# from a sound file given the file name 'fn'
def process_audio(fn, n_mfcc_list = [12], duration = 0, offset = 0):
  out_dict = {}
  if duration == 0:
    timeseries, sample_rate = librosa.load(fn, offset = offset)
  else:
    timeseries, sample_rate = librosa.load(fn, duration = duration, offset = offset)
  timeseries = librosa.effects.preemphasis(timeseries) # apply a pre-emphasis filter
  dur = librosa.get_duration(y=timeseries, # measure the duration of the sound clip,
                                sr=sample_rate) # assuming sample rate "sr"
  fourier = np.abs(librosa.stft(timeseries))
  mfcc_dict = {}
  for n_mfcc in n_mfcc_list:
    mfcc = librosa.feature.mfcc(y=timeseries, sr=sample_rate, n_mfcc = n_mfcc)
    mfcc_dict[str(n_mfcc)] = mfcc
  #liftered = lifter_mfcc_array(mfcc)
  #liftered_tensor = convert_to_tensor(liftered)
  out_dict["fn"] = fn
  out_dict["sr"] = sample_rate
  out_dict["start"] = offset
  out_dict["end"] = offset + duration
  out_dict["dur"] = dur
  out_dict["fourier"] = fourier
  out_dict["mfcc"] = mfcc_dict
  #out_dict["liftered"] = liftered_tensor
  return out_dict

def lifter_mfcc(mfcc):
  mel_from_mfcc = librosa.feature.inverse.mfcc_to_mel(mfcc)
  fourier_from_mel = librosa.feature.inverse.mel_to_stft(mel_from_mfcc)
  return fourier_from_mel

def plot_mel_spectrogram(fourier,sr):
  timeseries_from_fourier = librosa.griffinlim(fourier)
  mel = librosa.feature.melspectrogram(timeseries_from_fourier,sr)
  fig, ax = plt.subplots()
  ax.set_aspect(1/1000)
  mel_dB = librosa.power_to_db(mel, ref=np.max)
  img = librosa.display.specshow(mel_dB, x_axis='time',
                         y_axis='mel', sr=sr, cmap='gray_r',
                         fmax=5500, ax=ax)
  title = ax.set(title='Mel-frequency spectrogram')

# export an audio file to filename 'fn' from the fourier representation as a torch tensor in sequence format
def export_audio(fn,fourier,sr):
  timeseries_from_fourier = librosa.griffinlim(fourier)
  timeseries_from_fourier = librosa.effects.deemphasis(timeseries_from_fourier) # undo the preemphasis filter
  sf.write(fn, timeseries_from_fourier, sr, subtype='PCM_24')

In [11]:
folder = "sound"
filenames = ["greatgatsby","greatgatsby2"] # list of filenames without extensions
filenames = [folder+"/"+fn+".wav" for fn in filenames] # add the .wav extension to the end of each filename

In [26]:
clip_data = process_audio(filenames[0],
                          duration=8.9,
                          offset=49.7,
                          n_mfcc_list=[12,20,80])

sr_clip = clip_data["sr"]
dur_clip = clip_data["dur"]
print(f'Duration = {dur_clip}s') # display the duration of the sound clip

print(clip_data.keys())

print(f'Fourier Data Dimensions = {clip_data["fourier"].shape}') # display the dimensions of the fourier data
print(f'MFCC12 Data Dimensions = {clip_data["mfcc"]["12"].shape}') # display the dimensions of the mfcc data
print(f'MFCC20 Data Dimensions = {clip_data["mfcc"]["20"].shape}') # display the dimensions of the mfcc data
print(f'MFCC80 Data Dimensions = {clip_data["mfcc"]["80"].shape}') # display the dimensions of the mfcc data

Duration = 8.9s
dict_keys(['fn', 'sr', 'start', 'end', 'dur', 'fourier', 'mfcc'])
Fourier Data Dimensions = (1025, 384)
MFCC12 Data Dimensions = (12, 384)
MFCC20 Data Dimensions = (20, 384)
MFCC80 Data Dimensions = (80, 384)


In [28]:
# full quality audio recovered directly from the fourier data
plot_mel_spectrogram(clip_data["fourier"],clip_data["sr"])
export_audio('clip_from_fourier.wav', clip_data["fourier"], clip_data["sr"])

Audio('clip_from_fourier.wav')

TypeError: melspectrogram() takes 0 positional arguments but 2 were given