## Use for ESC-50 dataset. To convert from .ogg -> .wav -> STFT(linear/mel scale)/CQT/CWT spectrogram

In [1]:
## kernel: pyaudio27
import os
import numpy as np
import matplotlib.pyplot as plt
# https://github.com/librosa/librosa
import librosa
import librosa.display

from PIL import TiffImagePlugin
from PIL import Image
import tiffspect

# Set some project parameters
K_SR = 22050
K_FFTSIZE = 512 # also used for window length where that parameter is called for
K_HOP = 256
K_DUR = 4.0 # make all files this duration

# location of subdirectories of ogg files organized by category
K_OGGDIR = "C:/Users/Huz/Documents/python_scripts/ESC50/ESC-50-master"
# location to write the wav files (converted from ogg)
K_WAVEDIR = 'ESC-50-wav'
# location to write the spectrogram files (converted from wave files)
K_SPECTDIR = 'ESC-50-spec1'
# location to write the CQT files (converted from wave files)
K_CQTDIR = 'ESC-50-cqt1'
# location to write the wavelet files (converted from wave files)
K_WAVELETDIR = 'ESC-50-wavelet1'
# location to write the mel spec files (converted from wave files)
K_MELDIR = 'ESC-50-mel1'


In [2]:
def get_subdirs(a_dir):
    """ Returns a list of sub directory names in a_dir """ 
    return [name for name in os.listdir(a_dir)
            if (os.path.isdir(os.path.join(a_dir, name)) and not (name.startswith('.')))]

def listDirectory(directory, fileExtList):                                        
    """Returns list of file info objects in directory that extension in the list fileExtList - include the . in your extension string"""
    fnameList = [os.path.normcase(f)
                for f in os.listdir(directory)
                    if (not(f.startswith('.')))]            
    fileList = [os.path.join(directory, f) 
               for f in fnameList
                if os.path.splitext(f)[1] in fileExtList]  
    return fileList , fnameList

def dirs2labelfile(parentdir, labelfile):
    """takes subdirectories of parentdir and writes them, one per line, to labelfile"""
    namelist = get_subdirs(parentdir)
    with open(labelfile, mode='wt', encoding='utf-8') as myfile:
        myfile.write('\n'.join(namelist))

In [None]:
def stereo2mono(data) :
    """ Combine 2D array into a single array, averaging channels """ 
    """ Deprecated, since we use librosa for this now. """ 
    print('converting stereo data of shape ' + str(data.shape))
    outdata=np.ndarray(shape=(data.shape[0]), dtype=np.float32)
    if data.ndim != 2 :
        print('You are calling stero2mono on a non-2D array')
    else : 
        print('    converting stereo to mono, with outdata shape = ' + str(outdata.shape))
        for idx in range(data.shape[0]) :
            outdata[idx] = (data[idx,0]+data[idx,1])/2
    return outdata

In [None]:
# esc50Ogg2Wav('/Volumes/BothWays/ZCODE/GitHubOthers/karoldvi_ESC-50/ESC-50/', 'data', 5.0, K_SR)
def esc50Ogg2Wav (topdir, outdir, dur, srate) :
    """ 
        Creates regularlized wave files for the ogg files in the ESC-50 dataset. 
        Creates class folders for the wav files in outdir with the same structure found in topdir.
        
        Parameters
            topdir - the ESC-50 dir containing class folders. 
            outdir - the top level directory to write wave files to (written in to class subfolders)
            dur - (in seconds) all files will be truncated or zeropadded to have this duration given the srate
            srate - input files will be resampled to srate as they are read in before being saved as wav files
    """ 
    sample_length = int(dur * srate)
    try:
        os.stat(outdir)  # test for existence
    except:
        os.mkdir(outdir) # create if necessary
        
    subdirs = get_subdirs(topdir)
    for subdir in subdirs :
        try:
            os.stat(outdir + '/'  + subdir) # test for existence
        except:
            os.mkdir(outdir + '/' + subdir) # create if necessary
            print('creating ' + outdir + '/'  + subdir)
    
        fullpaths, _ = listDirectory(topdir + '/' + subdir, '.ogg') 
        for idx in range(len(fullpaths)) : 
            fname = os.path.basename(fullpaths[idx])
            # librosa.load resamples to sr, clips to duration, combines channels. 
            audiodata, samplerate = librosa.load(fullpaths[idx], sr=srate, mono=True, duration=dur) # resamples if necessary (some esc-50 files are in 48K)
            # just checking ..... 
            if (samplerate != srate) :
                print('You got a sound file ' + subdir  +  '/' +  fname + ' with sample rate ' + str(samplerate) + '!')
                print(' ********* BAD SAMPLE RATE ******** ')
            if (audiodata.ndim != 1) :
                print('You got a sound file ' + subdir  +  '/' +  fname + ' with ' + str(audiodata.ndim) + ' channels!')
                audiodata = stereo2mono(audiodata)
            if (len(audiodata) > sample_length) :
                print('You got a long sound file ' + subdir  +  '/' +  fname + ' with shape ' + str(audiodata.shape) + '!')
                audiodata = np.resize(audiodata, sample_length)
                # print('  ..... and len(audiodata) = ' + str(len(audiodata)) + ', while sample_length is sposed to be ' + str(sample_length))
                print('trimming data to shape ' + str(audiodata.shape))
            if (len(audiodata) < sample_length) :
                print('You got a short sound file ' + subdir  +  '/' +  fname + ' with shape ' + str(audiodata.shape) + '!')
                audiodata = np.concatenate([audiodata, np.zeros((sample_length-len(audiodata)))])
                print('      zero padding data to shape ' + str(audiodata.shape))
            # write the file out as a wave file
            librosa.output.write_wav(outdir + '/' + subdir + '/' + os.path.splitext(fname)[0] + '.wav', audiodata, samplerate)
    print("COMPLETE")    

### OK - do the .ogg -> .wav conversion (just takes something like 5 minutes to run on the whole set)

In [None]:
esc50Ogg2Wav(K_OGGDIR, K_WAVEDIR, K_DUR, K_SR)

In [3]:
#routines to convert wav to spectrograms using Librosa as backend. For wavelets we are using Pywavelet library.
import scipy
import pywt

def wav2spect(fname, srate, fftSize, fftHop, y_scale='linear', dur=None, showplt=False, dcbin=False) :
    try:
        audiodata, samplerate = librosa.load(fname, sr=srate, mono=True, duration=dur) 
    except:
        print('can not read ' + fname)
        return
    
    S = np.abs(librosa.stft(audiodata, n_fft=fftSize, hop_length=fftHop, win_length=fftSize,  center=False))

    if (dcbin ==  False) :
        S = np.delete(S, (0), axis=0)  # delete freq 0 row
            #note: a pure DC input signal bleeds into bin 1, too.

    D = librosa.amplitude_to_db(S, ref=np.max)
    
    if showplt : # Dangerous for long runs - it opens a new figure for each file!
        librosa.display.specshow(D, y_axis=y_scale, x_axis='time', sr=srate, hop_length=fftHop)
        plt.colorbar(format='%+2.0f dB')
        plt.title(showplt)
        plt.show(block=True)
                
    return D

def wav2CQT(fname, srate, fftSize, fftHop, dur=None, showplt=False) :
    try:
        audiodata, samplerate = librosa.load(fname, sr=srate, mono=True, duration=dur) 
    except:
        print('can not read ' + fname)
        return
    
    S = librosa.cqt(audiodata,hop_length=fftHop,n_bins=256,bins_per_octave=32)

    D = librosa.amplitude_to_db(S, ref=np.max)
    
    if showplt : # Dangerous for long runs - it opens a new figure for each file!
        librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=srate, hop_length=fftHop)
        plt.colorbar(format='%+2.0f dB')
        plt.title(showplt)
        plt.show(block=True)
                
    return D

def wav2wavelet(fname, srate, freq_bins, wavelet='morl', dur=None, showplt=False) :
    try:
        audiodata, samplerate = librosa.load(fname, sr=srate, mono=True, duration=dur) 
    except:
        print('can not read ' + fname)
        return
    widths = np.arange(1, freq_bins+1) #no.of freq bins
    
    S, freqs = pywt.cwt(audiodata, widths, wavelet, sampling_period=1/srate)

    D = librosa.power_to_db(S**2, ref=np.max)
    
    if showplt : # Dangerous for long runs - it opens a new figure for each file!
        librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=srate, hop_length=fftHop)
        plt.colorbar(format='%+2.0f dB')
        plt.title(showplt)
        plt.show(block=True)
                
    return D

def wav2melspect(fname, srate, fftSize, fftHop, y_scale='mel', dur=None, showplt=False, dcbin=False):
    try:
        audiodata, samplerate = librosa.load(fname, sr=srate, mono=True, duration=dur) 
    except:
        print('can not read ' + fname)
        return
    
    S = librosa.feature.melspectrogram(audiodata,sr=srate, n_fft=fftSize, hop_length=fftHop)

    D = librosa.power_to_db(S, ref=np.max)
    
    if showplt : # Dangerous for long runs - it opens a new figure for each file!
        librosa.display.specshow(D, y_axis=y_scale, x_axis='time', sr=srate, hop_length=fftHop)
        plt.colorbar(format='%+2.0f dB')
        plt.title(showplt)
        plt.show(block=True)
                
    return D
    

In [4]:
import scipy
%matplotlib inline

def esc50Wav2Spect(topdir, outdir, dur, srate, fftSize, fftHop, showplt=False, dcbin=False) :
    """ 
        Creates spectrograms for subfolder-labeled wavfiles. 
        Creates class folders for the spectrogram files in outdir with the same structure found in topdir.
        
        Parameters
            topdir - the dir containing class folders containing wav files. 
            outdir - the top level directory to write wave files to (written in to class subfolders)
            dur - (in seconds) all files will be truncated or zeropadded to have this duration given the srate
            srate - input files will be resampled to srate as they are read in before being saved as wav files
    """ 
    
    try:
        os.stat(outdir)  # test for existence
    except:
        os.mkdir(outdir) # create if necessary
    
    subdirs = get_subdirs(topdir)
    count = 0
    for subdir in subdirs :
        try:
            os.stat(outdir + '/'  + subdir) # test for existence
        except:
            os.mkdir(outdir + '/' + subdir) # create if necessary
            print('creating ' + outdir + '/'  + subdir)
    
        fullpaths, _ = listDirectory(topdir + '/' + subdir, '.wav') 
        
        for idx in range(len(fullpaths)) : 
            fname = os.path.basename(fullpaths[idx])

            #D = wav2spect(fullpaths[idx], srate, fftSize, fftHop, y_scale='mel', dur=None, dcbin=False, showplt=False)
            D = wav2CQT(fullpaths[idx], srate, fftSize, fftHop, dur=None, showplt=False)
            #D = wav2wavelet(fullpaths[idx], srate, 256, 'morl', dur=None, showplt=False)
            #D = wav2melspect(fullpaths[idx], srate, fftSize, fftHop, dur=None, dcbin=False, showplt=False)
            
            tiffspect.logSpect2Tiff(D, outdir + '/' + subdir + '/' + os.path.splitext(fname)[0]) #use for everything else
            #tiffspect.wavelet2Tiff(D, outdir + '/' + subdir + '/' + os.path.splitext(fname)[0] + '.tif', 343) #use for wavelets
            
            print(str(count) + ': ' + subdir + '/' + os.path.splitext(fname)[0])
            count +=1
    print("COMPLETE")       
 

### OK - do the conversion (actually runs faster than the ogg to wav conversion). Except wavelets, that takes exponentially longer

In [5]:
#esc50Wav2Spect(K_WAVEDIR, K_SPECTDIR, K_DUR, K_SR, K_FFTSIZE, K_HOP) 
esc50Wav2Spect(K_WAVEDIR, K_CQTDIR, K_DUR, K_SR, K_FFTSIZE, K_HOP) 
#esc50Wav2Spect(K_WAVEDIR, K_WAVELETDIR, K_DUR, K_SR, K_FFTSIZE, K_HOP)
#esc50Wav2Spect(K_WAVEDIR, K_MELDIR, K_DUR, K_SR, K_FFTSIZE, K_HOP) 

creating ESC-50-cqt1/101 - Dog
0: 101 - Dog/1-100032-a
1: 101 - Dog/1-110389-a
2: 101 - Dog/1-30226-a
3: 101 - Dog/1-30344-a
4: 101 - Dog/1-32318-a
5: 101 - Dog/1-59513-a
6: 101 - Dog/1-85362-a
7: 101 - Dog/1-97392-a
8: 101 - Dog/2-114280-a
9: 101 - Dog/2-114587-a
10: 101 - Dog/2-116400-a
11: 101 - Dog/2-117271-a
12: 101 - Dog/2-118072-a
13: 101 - Dog/2-118964-a
14: 101 - Dog/2-122104-a
15: 101 - Dog/2-122104-b
16: 101 - Dog/3-136288-a
17: 101 - Dog/3-144028-a
18: 101 - Dog/3-155312-a
19: 101 - Dog/3-157695-a
20: 101 - Dog/3-163459-a
21: 101 - Dog/3-170015-a
22: 101 - Dog/3-180256-a
23: 101 - Dog/3-180977-a
24: 101 - Dog/4-182395-a
25: 101 - Dog/4-183992-a
26: 101 - Dog/4-184575-a
27: 101 - Dog/4-191687-a
28: 101 - Dog/4-192236-a
29: 101 - Dog/4-194754-a
30: 101 - Dog/4-199261-a
31: 101 - Dog/4-207124-a
32: 101 - Dog/5-203128-a
33: 101 - Dog/5-203128-b
34: 101 - Dog/5-208030-a
35: 101 - Dog/5-212454-a
36: 101 - Dog/5-213855-a
37: 101 - Dog/5-217158-a
38: 101 - Dog/5-231762-a
39: 101 - 

TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'

In [None]:
# example of using librosa spectrogram viewer as well as tiff file writer and reader
import scipy
%matplotlib inline

# Just run this on any wav file in your system.
audiodata, samplerate = librosa.load('esc50Wave/109 - Sheep/4-196671-B.wav', sr=K_SR, mono=True, duration=K_DUR) # resamples if necessary (some esc-50 files are in 48K)
#audiodata, samplerate = librosa.load(K_WAVEDIR + '/1/BeingRural_short.wav', sr=K_SR, mono=True, duration=K_DUR) # resamples if necessary (some esc-50 files are in 48K)

print('audiodata max is ' + str(np.max(audiodata)) + ', and audiodata sum is ' + str(np.sum(audiodata)))

# compute spectrogram and display
S = np.abs(librosa.stft(audiodata, n_fft=K_FFTSIZE, hop_length=K_HOP, win_length=K_FFTSIZE,  center=False))
print('esc50Wav2Spect" magspec max is ' + str(np.max(S)) +  ', and magspec sum is ' + str(np.sum(S)) + ', and magspec min is ' + str(np.min(S)))

Sfoo = np.delete(S, (0), axis=0)  # delete freq 0 row
print('esc50Wav2Spect" Sfoo max is ' + str(np.max(Sfoo)) +  ', and Sfoo sum is ' + str(np.sum(Sfoo)) + ', and Sfoo min is ' + str(np.min(Sfoo)))


#D = librosa.logamplitude(S**2, ref_power=np.max)
# computes amplitude_to_dB (which is in the man, but not in the library for some reason)
D = librosa.amplitude_to_db(S, ref=np.max)
print('spectrogram D shape is ' + str(D.shape))
print('esc50Wav2Spect" ampDB max is ' + str(np.max(D)) +  ', and ampDB min is ' + str(np.min(D)))


plt.subplot(3, 1, 1)
librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=K_SR, hop_length=K_HOP)
plt.colorbar(format='%+2.0f dB')
plt.title('Linear-frequency magnitude spectrogram')

# save spectrogram somewhere
tiffspect.logSpect2Tiff(D, 'foo.tif')

# ====================
# Proof that spectrogram writer/reader is working properly

D1 = tiffspect.Tiff2LogSpect('foo.tif')
plt.subplot(3, 1, 3)
librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=K_SR, hop_length=K_HOP)
plt.colorbar(format='%+2.0f dB')
plt.title('From Tiff: Linear-frequency magnitude spectrogram')
tiffspect.logSpect2Tiff(D, 'foo.tif')