'''
This script pre-processes the audio data for autoencoding.
Several features are calculated for each wav file in a specified directory
(the files having previously been converted from MP3 to wav format).
The outputs are a csv of song-level features and a Mel spectrogram sample
exported to csv for each wav file.
'''
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import glob
import csv
import math
#Define all major scales to be used later for finding key signature
#Arrays all in the format: [C, C#, D, Eb, E, F, F#, G, Ab, A, Bb, B]
majorscales = {'C' : [1,0,1,0,1,1,0,1,0,1,0,1],
'C#': [1,1,0,1,0,1,1,0,1,0,1,0],
'D' : [0,1,1,0,1,0,1,1,0,1,0,1],
'Eb': [1,0,1,1,0,1,0,1,1,0,1,0],
'E' : [0,1,0,1,1,0,1,0,1,1,0,1],
'F' : [1,0,1,0,1,1,0,1,0,1,1,0],
'F#': [0,1,0,1,0,1,1,0,1,0,1,1],
'G' : [1,0,1,0,1,0,1,1,0,1,0,1],
'Ab': [1,1,0,1,0,1,0,1,1,0,1,0],
'A' : [0,1,1,0,1,0,1,0,1,1,0,1],
'Bb': [1,0,1,1,0,1,0,1,0,1,1,0],
'B' : [0,1,0,1,1,0,1,0,1,0,1,1]}
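#The 12 templates above are all rotations of the C major pattern; a minimal
#sketch of how they could be generated programmatically (illustration only;
#the explicit dict above is what the rest of the script uses):
#    notes = ['C', 'C#', 'D', 'Eb', 'E', 'F', 'F#', 'G', 'Ab', 'A', 'Bb', 'B']
#    cmajor = [1,0,1,0,1,1,0,1,0,1,0,1]
#    majorscales = {note: cmajor[-i:] + cmajor[:-i] for i, note in enumerate(notes)}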
class Audio(object):
"""
Song objects are initiated with librosa.load() which produces an array
containing wav data in the first index and the wav's sample frequency
in the second.
Stereo audio will be converted to mono by librosa.load() by averaging
the left and right channels. This halves both the sample frequency and
the number of sample points. Note that the channel averaging method of
conversion gives each channel equal weight, which may not always be
appropriate. Lossless conversion of stereo to mono is impossible.
Instead of converting to mono, file could be imported as stereo and each
channel could be accessed individually by setting mono=False and subsetting:
wav[:,0] and wav[:,1]
wav.dtype will be 1 of 2 types:
1) 16-bit - This means that the sound pressure values are mapped to
integer values ranging from -2^15 to (2^15)-1. If wav.dtype is 16-bit,
it will need to be converted to 32-bit ranging from -1 to 1
2) 32-bit - This means that the sound pressure values are mapped to
floating point values ranging from -1 to 1
"""
def __init__(self, loadedAudio):
self.wav = loadedAudio[0]
self.samplefreq = loadedAudio[1]
#If imported as 16-bit, convert to floating 32-bit ranging from -1 to 1
if (self.wav.dtype == 'int16'):
self.wav = self.wav/(2.0**15)
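        #e.g. an int16 sample of 16384 maps to 16384/(2.0**15) = 0.5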
        self.channels = 1 #Mono after librosa.load(); a stereo load (mono=False) would give self.wav.shape[0] == 2
self.sample_points = self.wav.shape[0]
self.audio_length_seconds = self.sample_points/self.samplefreq
self.time_array_seconds = np.arange(0, self.sample_points, 1)/self.samplefreq
        #Estimate tempo and beat frames in a single beat_track call (it returns both)
        self.tempo_bpm, self.beat_frames = librosa.beat.beat_track(y=self.wav, sr=self.samplefreq)
        #Transform the beat frames into seconds (these are the times when the beats hit)
        self.beat_times = librosa.frames_to_time(self.beat_frames, sr=self.samplefreq)
        #Get the rolloff frequency - the frequency below which 90% (roll_percent) of the spectral energy is contained
self.rolloff_freq = np.mean(librosa.feature.spectral_rolloff(y=self.wav, sr=self.samplefreq, hop_length=512, roll_percent=0.9))
def plotWav(self):
plt.plot(self.time_array_seconds, self.wav, color='k')
plt.xlabel('Time (seconds)')
plt.ylabel('Amplitude')
plt.show()
def getTempo(self):
print('Estimated tempo: {:.2f} beats per minute'.format(self.tempo_bpm))
def getPercussiveTempo(self):
#Separate the harmonics and percussives into 2 waves
wav_harm, wav_perc = librosa.effects.hpss(self.wav)
#Beat track the percussive signal
tempo, beat_frames = librosa.beat.beat_track(y=wav_perc, sr=self.samplefreq)
print('Estimated percussive tempo: {:.2f} beats per minute'.format(tempo))
return tempo
def getZeroCrossingRates(self):
"""
ZCR is the count of times signal crosses 0 in a wave. It is useful
for speech recognition and separating speech from background noise.
ZCR will be smaller when a voice is speaking (0 is crossed less
frequently) and larger when there is a lot of background noise (0 is
crossed more frequently)
ZCR is calculated by frame
"""
zcrs = librosa.feature.zero_crossing_rate(y=self.wav, frame_length=2048, hop_length=512)
return zcrs
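    #A minimal sketch of the per-frame quantity computed above (equivalent up
    #to framing/centering details; illustration only):
    #    frame = self.wav[0:2048]
    #    zcr = np.mean(librosa.zero_crossings(frame))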
def plotChromagram(self):
#Get chromagram of frequencies
chroma = librosa.feature.chroma_stft(y=self.wav, sr=self.samplefreq)
librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
plt.colorbar()
plt.title('Chromagram')
plt.tight_layout()
plt.show()
return chroma
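    #chroma has shape (12, n_frames): one row per pitch class, ordered C
    #through B, matching the note ordering used by majorscales above.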
def plotSpectrogram(self, mels=512, maxfreq=30000):
        #Plot the Mel power-scaled spectrogram with `mels` frequency bins and the default hop length of 512 samples per frame
        #Note: librosa.load() resamples to 22050 Hz by default, so mel bins above samplefreq/2 will be empty
        mel = librosa.feature.melspectrogram(y=self.wav, sr=self.samplefreq, n_mels=mels, fmax=maxfreq)
        librosa.display.specshow(librosa.power_to_db(mel, ref=np.max), y_axis='mel', fmax=maxfreq, x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Power-Scaled Frequency Spectrogram')
plt.tight_layout()
plt.show()
return mel
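    #power_to_db(mel, ref=np.max) above is, up to amin/top_db clipping,
    #10*log10(mel/np.max(mel)); a rough sketch (illustration only):
    #    mel_db = 10.0 * np.log10(np.maximum(mel, 1e-10) / np.max(mel))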
def plotMFCCs(self):
"""
The Mel Frequency Cepstral Coefficient is a measure of timbre
"""
mfccs = librosa.feature.mfcc(y=self.wav, sr=self.samplefreq)
librosa.display.specshow(mfccs, x_axis='time')
plt.colorbar()
plt.title('MFCC')
plt.tight_layout()
plt.show()
return mfccs
def plotTempogram(self):
"""
The tempogram visualizes the rhythm (pattern recurrence), using the
onset envelope, oenv, to determine the start points for the patterns.
"""
oenv = librosa.onset.onset_strength(y=self.wav, sr=self.samplefreq, hop_length=512)
tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=self.samplefreq, hop_length=512)
librosa.display.specshow(tempogram, sr=self.samplefreq, hop_length=512, x_axis='time', y_axis='tempo')
plt.colorbar()
plt.title('Tempogram')
plt.tight_layout()
plt.show()
        plt.plot(oenv, label='Onset strength')
        plt.legend()
        plt.title('Onset Strength Over Time')
        plt.xlabel('Frame (hop of 512 samples)')
        plt.ylabel('Onset Strength')
        plt.show()
return tempogram
def findTonicAndKey(self):
"""
The tonic is the base note in the key signature, e.g. c is the tonic for
the key of c major. The tonic can be found by summing the chromagram
arrays and finding the index of the array with the greatest sum. The
logic is that the tonic is the note with the greatest presence.
If the tonic doesn't match the tonic of bestmatch, the highest
correlated major scale, then the key is a minor scale.
(Minor scales = Major scales but have different tonics)
"""
chromagram = librosa.feature.chroma_stft(y=self.wav, sr=self.samplefreq)
        #Sum each pitch class (chromagram row) across time
        chromasums = chromagram.sum(axis=1)
        tonicval = int(np.argmax(chromasums))
        notes = ['C', 'C#', 'D', 'Eb', 'E', 'F', 'F#', 'G', 'Ab', 'A', 'Bb', 'B']
        tonic = notes[tonicval]
        #In standard units, how far is the tonic's energy above the average pitch class energy?
        z_dist_avg_to_tonic = round((np.max(chromasums)-np.mean(chromasums))/np.std(chromasums), 4)
#Correlate the chromasums array with each of the major scales, find the best match
bestmatch = 0
bestmatchid = 0
for key, scale in majorscales.items():
            #np.corrcoef returns a 2x2 matrix; the off-diagonal [0,1] entry is the correlation
corr = np.corrcoef(scale, chromasums)[0,1]
if (corr > bestmatch):
bestmatch = corr
bestmatchid = key
if (tonic != bestmatchid):
keysig = tonic + ' Minor'
else:
keysig = tonic + ' Major'
return tonic, keysig, z_dist_avg_to_tonic
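    #Toy example of the template correlation above (illustration only):
    #    np.corrcoef(majorscales['C'], majorscales['G'])[0,1] is relatively
    #    high (about 0.66) because the two scales differ by only one note
    #    (F vs F#).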
#Specify a file directory and the types of audio files to get features for
filedir = 'C:/Users/Public/Documents/Python Scripts/Music Recommendation with Deep Learning/Audio Files/'
extension_list = ('*.wav',) #Trailing comma makes this a tuple; a bare ('*.wav') is just a string and would be iterated character by character
#Iterate through the wavs in the directory and compile a list of features
os.chdir(filedir)
featurelist = []
melspecs = []
id_tracker = 1
for extension in extension_list:
for file in glob.glob(extension):
if (os.path.splitext(os.path.basename(file))[1] == '.wav'):
print(file)
song = Audio(librosa.load(file, mono=True))
wavfeatures = dict()
wavmel = dict()
wavfeatures['audio_file_id'] = id_tracker
wavfeatures['samplefreq'] = song.samplefreq
wavfeatures['channels'] = song.channels
wavfeatures['sample_points'] = song.sample_points
wavfeatures['audio_length_seconds'] = round(song.audio_length_seconds, 4)
wavfeatures['tempo_bpm'] = song.tempo_bpm
            beat_gaps = np.diff(song.beat_times) #Seconds between consecutive beats
            wavfeatures['avg_diff_beat_times'] = round(np.mean(beat_gaps), 4)
            wavfeatures['std_diff_beat_times'] = round(np.std(beat_gaps), 4)
wavfeatures['rolloff_freq'] = round(song.rolloff_freq, 0)
            #Compute each expensive representation once and reuse it
            zcrs = song.getZeroCrossingRates()
            wavfeatures['avg_zcr'] = round(np.mean(zcrs), 4)
            wavfeatures['zcr_range'] = np.max(zcrs) - np.min(zcrs)
            mel = song.plotSpectrogram()
            wavfeatures['avg_mel_freq'] = round(np.mean(mel), 4)
            wavfeatures['std_mel_freq'] = round(np.std(mel), 4)
            tempogram = song.plotTempogram()
            wavfeatures['avg_onset_strength'] = round(np.mean(tempogram), 4)
            wavfeatures['std_onset_strength'] = round(np.std(tempogram), 4)
            tonic, keysig, z_dist = song.findTonicAndKey()
            wavfeatures['tonic'] = tonic
            wavfeatures['key_signature'] = keysig
            wavfeatures['z_dist_avg_to_tonic'] = z_dist
wavmel['audio_file_id'] = id_tracker
            #Old approach (kept for reference): ravel the spectrogram to 1D and subset by sample index
            #wavmel['mel_spectrogram_sample'] = (song.plotSpectrogram(mels=512, maxfreq=8192)).ravel()[song.samplefreq*30:song.samplefreq*90]
            #Each spectrogram column spans one 512-sample hop, so the 30-90 second window covers these columns:
            startcol = math.ceil((song.samplefreq*30)/512)
            endcol = math.ceil((song.samplefreq*90)/512)
            wavmel['mel_spectrogram_sample'] = (song.plotSpectrogram(mels=512, maxfreq=8192))[:, startcol:endcol]
featurelist.append(wavfeatures)
melspecs.append(wavmel)
id_tracker = id_tracker + 1
#Write the list of dictionaries with song features to a csv file
with open('Song_Features.csv', 'w', newline='') as f: #newline='' prevents the csv module writing blank rows on Windows
w = csv.DictWriter(f, featurelist[0].keys())
w.writeheader()
w.writerows(featurelist)
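#A minimal sketch of reading the feature rows back (illustration only;
#assumes the csv produced above is in the working directory):
#    with open('Song_Features.csv') as f:
#        rows = list(csv.DictReader(f))
#    print(rows[0]['tempo_bpm'])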
'''
Ideally the entire mel frequency spectrogram for each song would be exported,
but the songs are all different lengths, meaning that the dimensions of the
spectrograms would all differ. To standardize them, I'm using 512 frequency
bins and taking a 60 second sample of each song, starting 30 seconds in to
skip over any song intro and get into the main verse and/or chorus.
The spectrogram is clipped at a maximum of 8192 Hz, as few songs have much
energy above that frequency, so there is mostly black space above 8192 Hz.
Once the mel spectrogram is built, the columns covering the 30-90 second
window are extracted, and each song's spectrogram is exported to its own file.
'''
#Specify a file directory for the spectrograms
specfiledir = 'C:/Users/Public/Documents/Python Scripts/Music Recommendation with Deep Learning/Audio Files/Spectrograms/'
if not os.path.exists(specfiledir):
os.makedirs(specfiledir)
os.chdir(specfiledir)
#Export all spectrograms to csv files
for d in melspecs:
filename = str(d['audio_file_id']) + '.csv'
print(filename)
print(d['mel_spectrogram_sample'].shape)
np.savetxt(filename, d['mel_spectrogram_sample'], delimiter=",")
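#A minimal sketch of loading an exported spectrogram back for modeling
#(illustration only; '1.csv' assumes the first audio_file_id):
#    spec = np.loadtxt('1.csv', delimiter=",")
#    print(spec.shape) #(512 mel bins, number of frames in the 60 second window)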