In [29]:
import hashlib
import sys
import warnings

import numpy as np
import pyaudio
from bokeh.io import show, output_notebook
from bokeh.plotting import figure 
from pydub import AudioSegment
from scipy.signal import spectrogram
from scipy.spatial.distance import hamming
from skimage.feature import peak_local_max
from skimage.measure import compare_ssim

# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')
# Have bokeh's show() render figures inline in the notebook
output_notebook()

In [74]:
class AudioFP():
    """Audio fingerprint derived from the peaks of a signal's spectrogram.

    Typical use: construct the object (which reads an audio file through
    pydub or records through pyaudio), then call ``generate_spectrogram``,
    ``find_peaks`` and ``generate_fingerprint`` in that order.

    Attributes set along the way:
        songname      -- file name or recording label
        channels      -- list holding one sample array per audio channel
        framerate     -- sampling rate in Hz
        f, t, sgram   -- frequency bins, segment times and dB spectrogram
        peaks, fp, tp -- peak amplitudes with their frequencies and times
        fingerprint   -- result of ``generate_fingerprint``
    """

    # Values accepted by generate_fingerprint(fp_type=...)
    _FP_TYPES = ('default', 'sha1', 'lsh', 'image')

    def __init__(self, audio_type=None, filename=None, plot=True):
        """Load or record the audio signal.

        Parameters
        ----------
        audio_type : str, optional
            'r' to read a file, 'a' to record. Prompted for when omitted.
        filename : str, optional
            File to read (with extension) or a label for the recording.
            Prompted for when omitted.
        plot : bool, optional
            Plot the time signal once it is available (default True).

        Raises
        ------
        SystemExit
            If ``audio_type`` is neither 'r' nor 'a'.
        """
        if audio_type is None:
            audio_type = input('Do you wish to read from a file or record? Enter "r" to read and "a" to record: ')
        if audio_type not in ('r', 'a'):
            sys.exit('Error: Incorrect entry. Enter "r" to read an audio file or "a" to record')
        if filename is None:
            if audio_type == 'r':
                filename = input('Enter the filename you want to read including the extension: ')
            else:
                filename = input('Enter a name for the recording:')
        self.songname = filename
        self.channels = []
        self.framerate = []
        if audio_type == 'r':
            self.read_audiofile(plot=plot)
        else:
            self.record_audiofile(plot=plot)

    def _plot_signal(self, stride):
        """Plot every ``stride``-th sample of the first channel against time."""
        samples = np.asarray(self.channels[0])
        idx = np.arange(0, len(samples), stride)
        p1 = figure(plot_width=900, plot_height=500, title='Audio Signal',
                    x_axis_label='Time (s)', y_axis_label='Amplitude (arb. units)')
        # sample index / sampling rate gives the time of each sample in seconds
        p1.line(idx / self.framerate, samples[idx])
        show(p1)

    def _spectrogram_figure(self, title):
        """Return a bokeh figure showing the (2x subsampled) spectrogram image."""
        t_min, t_max = min(self.t), max(self.t)
        f_min, f_max = min(self.f), max(self.f)
        fig = figure(plot_width=900, plot_height=500, title=title,
                     x_axis_label='Time (s)', y_axis_label='Frequency (Hz)',
                     x_range=(t_min, t_max), y_range=(f_min, f_max))
        # dw/dh are extents measured from (x, y), so use max - min rather than
        # max; otherwise the image overshoots the axes and any overlay drifts.
        fig.image([self.sgram[::2, ::2]], x=t_min, y=f_min,
                  dw=t_max - t_min, dh=f_max - f_min, palette='Spectral11')
        return fig

    def read_audiofile(self, plot):
        """Read ``self.songname`` with pydub and split it into channels.

        Sets ``self.channels`` (one array per channel) and ``self.framerate``.
        When ``plot`` is true the first channel is plotted.
        """
        audiofile = AudioSegment.from_file(self.songname)
        # get_array_of_samples() returns interleaved samples typed according
        # to the file's sample width, instead of assuming 16-bit data.
        samples = np.array(audiofile.get_array_of_samples())
        n_chan = audiofile.channels
        # De-interleave: channel c owns samples c, c + n_chan, c + 2*n_chan, ...
        self.channels = [samples[chn::n_chan] for chn in range(n_chan)]
        self.framerate = audiofile.frame_rate
        if plot:
            self._plot_signal(stride=1000)

    def record_audiofile(self, plot):
        """Record audio with pyaudio after asking the user for confirmation.

        Sets ``self.channels`` (one int16 array per channel) and
        ``self.framerate``. When ``plot`` is true the first channel is plotted.

        Raises
        ------
        SystemExit
            If the user does not answer 'y' to the start prompt.
        """
        start_rec = input('Do you want to start recoding? Enter "y" to start:')
        if start_rec != 'y':
            sys.exit('Audio recording did not start. Start over again.')

        chk_size = 8192  # chunk size (frames per read)
        fmt = pyaudio.paInt16  # format of audio
        chan = 2  # Number of channels
        samp_rate = 44100  # sampling rate
        rec_time = 20  # Number of seconds of audio to record
        self.framerate = samp_rate
        n_chunks = int(samp_rate / chk_size * rec_time)

        chunks = []
        p = pyaudio.PyAudio()
        try:
            astream = p.open(format=fmt, channels=chan, rate=samp_rate,
                             input=True, frames_per_buffer=chk_size)
            try:
                for _ in range(n_chunks):
                    raw = astream.read(chk_size)
                    chunks.append(np.frombuffer(raw, dtype=np.int16))
            finally:
                # Release the stream even if a read fails part-way through
                astream.stop_stream()
                astream.close()
        finally:
            p.terminate()

        samples = np.concatenate(chunks)
        # De-interleave the recorded frames into one array per channel
        self.channels = [samples[c::chan] for c in range(chan)]
        if plot:
            self._plot_signal(stride=100)

    def generate_spectrogram(self, plot, nperseg=10 * 256, overlap_ratio=0.5):
        """Compute the dB-scaled spectrogram of the channel-averaged signal.

        Parameters
        ----------
        plot : bool
            Show the spectrogram when true.
        nperseg : int, optional
            Window size in samples (default 2560).
        overlap_ratio : float, optional
            Fraction of the window shared by consecutive segments; a larger
            value gives more overlap and a denser fingerprint (default 0.5).

        Sets ``self.f``, ``self.t`` and ``self.sgram``.
        """
        audiosignal = np.mean(self.channels, axis=0)  # average over all channels
        noverlap = int(overlap_ratio * nperseg)  # number of points to overlap
        # generate spectrogram from consecutive FFTs over the defined window
        self.f, self.t, self.sgram = spectrogram(audiosignal, fs=self.framerate,
                                                 window='hamming', nperseg=nperseg,
                                                 noverlap=noverlap)
        # transform linear output to dB scale; log10(0) is handled just below
        with np.errstate(divide='ignore'):
            self.sgram = 10 * np.log10(self.sgram)
        self.sgram[self.sgram == -np.inf] = 0  # replace infs with zeros
        if plot:
            show(self._spectrogram_figure('Spectrogram'))

    def find_peaks(self, plot, min_peak_sep=20, min_peak_amp=15):
        """Locate local maxima of the spectrogram.

        Parameters
        ----------
        plot : bool
            Show the spectrogram with the peaks overlaid when true.
        min_peak_sep : int, optional
            Minimum separation between peaks, in spectrogram bins. A larger
            value gives fewer peaks: less accurate but faster fingerprinting.
        min_peak_amp : float, optional
            Minimum amplitude for a peak. A larger value gives fewer peaks.

        Sets ``self.peaks``, ``self.tp`` and ``self.fp``.
        """
        # Peak coordinates are the default return value; the old `indices`
        # flag is not passed because newer scikit-image releases reject it.
        coordinates = peak_local_max(self.sgram, min_distance=min_peak_sep,
                                     threshold_abs=min_peak_amp)
        rows, cols = coordinates[:, 0], coordinates[:, 1]
        self.peaks = self.sgram[rows, cols]
        self.tp = self.t[cols]
        self.fp = self.f[rows]
        if plot:
            p3 = self._spectrogram_figure('Spectrogram with Peaks')
            p3.scatter(self.tp, self.fp)
            show(p3)

    def generate_fingerprint(self, plot, fp_type, peak_connectivity=5,
                             peak_time_delta_min=0, peak_time_delta_max=20):
        """Build the fingerprint from pairs of spectrogram peaks.

        Each anchor peak ``i`` is paired with the next
        ``peak_connectivity - 1`` peaks in the stored order. A pair whose time
        difference lies within ``[peak_time_delta_min, peak_time_delta_max]``
        contributes the rounded anchor frequency, target frequency and time
        difference to the fingerprint string.

        Parameters
        ----------
        plot : bool
            Print the resulting fingerprint when true.
        fp_type : str
            'default' or 'lsh' store the raw string, 'sha1' stores its SHA-1
            hex digest, 'image' stores the spectrogram itself.
        peak_connectivity, peak_time_delta_min, peak_time_delta_max
            Pairing parameters described above.

        An unknown ``fp_type`` prints a message and leaves ``self.fingerprint``
        untouched.
        """
        if fp_type not in self._FP_TYPES:
            print('Invalid fingerprint type selected')
            return

        if fp_type == 'image':
            self.fingerprint = self.sgram
        else:
            # NOTE: pairs follow the order in which find_peaks stored the peaks
            n_peaks = len(self.peaks)
            parts = []
            for i in range(n_peaks):
                for j in range(1, peak_connectivity):
                    if i + j >= n_peaks:
                        break
                    t_delta = self.tp[i + j] - self.tp[i]
                    if peak_time_delta_min <= t_delta <= peak_time_delta_max:
                        parts.append(str(np.rint(self.fp[i])))
                        parts.append(str(np.rint(self.fp[i + j])))
                        parts.append(str(np.rint(t_delta)))
            s = ''.join(parts)
            if fp_type == 'sha1':
                self.fingerprint = hashlib.sha1(s.encode('utf-8')).hexdigest()
            else:  # 'default' and 'lsh' both keep the raw string
                self.fingerprint = s
        if plot:
            print('{} audio-fingprint: '.format(self.songname), self.fingerprint)

In [75]:
## Compare fingerprints of two songs and calculate percentage of match
def compare_fingerprints(s1, s2):
    """Print how similar two fingerprints are, as percentages.

    Two measures are reported:

    * Hamming: the share of positions holding the same element. Only
      computed when both fingerprints have the same length.
    * Levenshtein: ``100 * (1 - distance / longest)`` where ``longest`` is the
      length of the longer fingerprint. The edit distance never exceeds that
      length, so the result stays within 0..100 and does not depend on the
      argument order.

    Two empty fingerprints are reported as 100.0 similar.

    Parameters
    ----------
    s1, s2 : sequences (typically str)
        Fingerprints to compare.

    Returns
    -------
    None. The results are printed.
    """
    def levenshtein(seq1, seq2):
        """Edit distance between two sequences using two rolling DP rows."""
        # Keep the shorter sequence on the inner axis so the rows stay small
        if len(seq1) < len(seq2):
            seq1, seq2 = seq2, seq1
        previous = list(range(len(seq2) + 1))
        for x, item1 in enumerate(seq1, start=1):
            current = [x]
            for y, item2 in enumerate(seq2, start=1):
                substitution = previous[y - 1] + (0 if item1 == item2 else 1)
                current.append(min(previous[y] + 1,      # deletion
                                   current[y - 1] + 1,   # insertion
                                   substitution))
            previous = current
        return previous[-1]

    if len(s1) == len(s2):
        matches = sum(ch1 == ch2 for ch1, ch2 in zip(s1, s2))
        # Guard the empty case, which would otherwise divide by zero
        hsim = 100 * (matches / len(s1)) if len(s1) else 100.0
        print('Percentage similarity using Hamming distance= ', hsim)
    else:
        print('Fingerprints are not the same length. Cannot use Hamming Distance to compare')
    print('Running calculation of Levenshtein distance...')
    ldist = levenshtein(s1, s2)
    longest = max(len(s1), len(s2))
    lsim = 100 * (1 - ldist / longest) if longest else 100.0
    print('Percentage similarity using Levenshtein distance= ', lsim)

In [76]:
# Create the first song object. The constructor asks whether to read an audio
# file ("r") or record using the microphone ("a"), then asks for the file name
# (full name including extension) or a name for the recording.
# The audio signal is plotted once it has been loaded.
song1 = AudioFP()

Do you wish to read from a file or record? Enter "r" to read and "a" to record: a
Enter a name for the recording:test1
Do you want to start recoding? Enter "y" to start:y


In [77]:
# Use the audio signal loaded above to generate and plot its spectrogram
song1.generate_spectrogram(plot=True)

In [78]:
# Find local maxima in the spectrogram; these peaks are the input to the
# audio fingerprint. The peaks are drawn on top of the spectrogram.
song1.find_peaks(plot=True)

In [79]:
# Use the peaks in the spectrogram to generate a fingerprint for the audio
# signal. fp_type='default' keeps the raw fingerprint string; plot=True prints it.
song1.generate_fingerprint(plot=True, fp_type='default')

test1 audio-fingprint:  11576.2511111.13281250.011197.26562511111.13281253.97641723356009211197.26562510818.281252.20589569160997811076.679687511042.22656252.176870748299318611076.679687510818.281258.3301587301587311076.679687510697.69531256.12426303854875311042.226562510818.281256.15328798185941211042.226562510697.69531253.94739229024943411042.226562510335.93750.010801.054687510697.69531257.45941043083900310801.054687510335.93753.512018140589568710801.054687510284.25781256.84988662131519210801.054687510249.804687510.33287981859410510697.695312510249.80468752.87346938775510110335.937510284.25781253.33786848072562310335.937510249.80468756.82086167800453510284.257812510249.80468753.48299319727891210284.257812510008.63281254.58594104308390310249.804687510008.63281251.102947845804990810215.351562510163.6718751.073922902494331310215.351562510008.632812511.29070294784580710215.35156259991.406257.31428571428571510215.35156259956.9531259.49115646258503410163.67187510008.632812510.2167800453514

In [81]:
# Build a second fingerprint with the same steps used for song1:
# load or record the audio, compute the spectrogram, find its peaks,
# then generate the fingerprint string.
song2 = AudioFP()
song2.generate_spectrogram(plot=True)
song2.find_peaks(plot=True)
song2.generate_fingerprint(plot=True, fp_type='default')

Do you wish to read from a file or record? Enter "r" to read and "a" to record: a
Enter a name for the recording:test2
Do you want to start recoding? Enter "y" to start:y


test2 audio-fingprint:  11197.26562511093.906253.947392290249432311197.26562510697.69531250.011076.679687511042.22656252.14784580498866111076.679687510697.69531256.12426303854875311076.679687510508.2031258.27210884353741511042.226562510697.69531253.97641723356009211042.226562510508.2031256.12426303854875410697.695312510508.2031252.147845804988662610542.6562510508.2031259.63628117913832210542.6562510422.07031250.841723356009071110542.6562510335.93753.512018140589568710542.656259974.17968759.66530612244897810508.2031259974.17968750.0290249433106559710422.070312510335.93752.670294784580497610422.07031259974.17968758.82358276643990910422.07031259888.04687510.59410430839002310335.93759974.17968756.1532879818594110335.93759888.0468757.92380952380952410335.93759784.68752.90249433106576059974.17968759888.0468751.77052154195011379939.72656259888.04687511.2907029478458059939.72656259784.68756.26938775510204059939.72656259646.8751.10294784580498829939.72656259526.289062510.1877551020408189888.046

In [82]:
# Print the similarity of the two fingerprints (Hamming when the lengths match,
# Levenshtein in every case)
compare_fingerprints(song1.fingerprint, song2.fingerprint)

Fingerprints are not the same length. Cannot use Hamming Distance to compare
Running calculation of Levenshtein distance...
Percentage similarity using Levenshtein distance=  34.88751502661857
