In [2]:
###load libraries and set parameters
import json
import os
import sys

import librosa.display
import matplotlib.pyplot as plt
import numpy
import sklearn
import sklearn.preprocessing

# Check for duplicates
First of all, we check whether any of the selected songs share the same name; if so, we rename them to avoid problems later on.

In [2]:
def duplicates(walk_dir):
    """Print the set of song titles that occur more than once below ``walk_dir``.

    Every file whose name ends in ``.mp3`` is considered, at any depth.
    The title is the part of the file name after the first space, cut at the
    first ``.``; a file name without a space is used from its start.

    Args:
        walk_dir: directory that is walked recursively.

    Returns:
        None. The repeated titles are printed as a set (``set()`` when there
        are none).
    """
    seen = set()
    repeated = set()
    for root, subdirs, files in os.walk(walk_dir):
        for filename in files:
            if not filename.endswith(".mp3"):
                continue
            # [-1] instead of [1]: a name without a space no longer raises IndexError.
            song = filename.split(" ", 1)[-1].split(".")[0]
            # Single pass with two sets instead of list.count() for every element.
            if song in seen:
                repeated.add(song)
            else:
                seen.add(song)
    print(repeated)

In [3]:
# Print every song title that appears more than once below the "Dataset" folder.
duplicates("Dataset")

set()


# Count Songs
Then we count how many songs we have per artist in the dataset.

In [4]:
def count(walk_dir):
    """Print a dict mapping every artist to its number of ``.mp3`` files.

    The artist is the name of the first directory level below ``walk_dir``;
    files in deeper sub-folders are counted for that same artist. Files that
    sit directly in ``walk_dir`` have no artist folder and are skipped.

    Args:
        walk_dir: dataset root, laid out as ``walk_dir/<artist>/.../<file>.mp3``.

    Returns:
        None. The ``{artist: number_of_songs}`` dict is printed.
    """
    songs_per_artist = {}
    for root, subdirs, files in os.walk(walk_dir):
        # Sorting in place fixes the traversal order, which os.walk leaves unspecified.
        subdirs.sort()
        rel_root = os.path.relpath(root, walk_dir)
        if rel_root == os.curdir:
            continue
        # Path relative to walk_dir + os.sep: works on any OS and for any walk_dir depth.
        artist = rel_root.split(os.sep)[0]
        n_songs = sum(1 for filename in files if filename.endswith(".mp3"))
        if n_songs:
            songs_per_artist[artist] = songs_per_artist.get(artist, 0) + n_songs
    print(songs_per_artist)

In [5]:
# Print the number of songs found for each artist below the "Dataset" folder.
count("Dataset")

{'Bruce Springsteen': 55, 'Coldplay': 55, 'Ed Sheeran': 55, "Guns N' Roses": 55, 'Michael Jackson': 55, 'Passenger': 55, 'Pink Floyd': 55, 'Queen': 55, 'Simon & Garfunkel': 55, 'The Beatles': 55}


# Parse song to extract features for visualization
Here we provide the code to extract the MFCCs from the songs in our dataset. We extract both normalized (*song-by-song*) and unnormalized MFCCs. These data will be used for visualization purposes and exploratory analysis.

In [2]:
### Settings used to extract the visualization features

# Sampling rate the audio is loaded at (librosa's default is 22050).
fs = 6000
# Offset, in seconds, at which loading of every song starts.
offset = 30
# Portion of the song to load, in seconds.
duration = 60
# Number of MFCCs to extract (default is 20; 12 is another good value to reduce dimension).
n_mfcc = 12

In [5]:
### Scaler used by the feature-extraction function defined right below.
# The function calls scaler.fit_transform once per song, so the scaler is
# re-fitted every time and the normalization is song-by-song.
scaler = sklearn.preprocessing.StandardScaler()## normalizes the mfcc
def create_mfcc_visual(walk_dir):
    """Extract normalized and raw MFCCs for every ``.mp3`` file below ``walk_dir``.

    Relies on the notebook-level settings ``fs``, ``offset``, ``duration`` and
    ``n_mfcc`` and on the notebook-level ``scaler``, which is re-fitted on
    every song (song-by-song normalization).

    The artist is the name of the first directory level below ``walk_dir``;
    files that sit directly in ``walk_dir`` have no artist folder and are
    skipped. The song title is the part of the file name after the first
    space, cut at the first ``.``.

    Args:
        walk_dir: dataset root, laid out as ``walk_dir/<artist>/.../<file>.mp3``.

    Returns:
        Tuple ``(x, x1)`` of two lists with one
        ``{"artist", "song", "mfcc"}`` dict per song, in the same order:
        ``x`` holds the scaled MFCCs, ``x1`` the unscaled ones.
    """
    normalized = []
    raw = []
    for root, subdirs, files in os.walk(walk_dir):
        # Sorting fixes the traversal order, which os.walk leaves unspecified.
        subdirs.sort()
        rel_root = os.path.relpath(root, walk_dir)
        if rel_root == os.curdir:
            continue
        # Path relative to walk_dir + os.sep: works on any OS and for any walk_dir depth.
        artist = rel_root.split(os.sep)[0]
        for filename in sorted(files):
            if not filename.endswith(".mp3"):
                continue
            file_path = os.path.join(root, filename)
            # [-1] instead of [1]: a name without a space no longer raises IndexError.
            song = filename.split(" ", 1)[-1].split(".")[0]
            print("processing:"+song)
            signal = librosa.load(file_path, duration=duration, offset=offset, sr=fs)[0]
            # The audio must be passed as y=...: it is keyword-only in recent librosa.
            # Transposed so that the scaler standardizes each coefficient
            # (librosa returns the coefficients along the first axis).
            mfcc = librosa.feature.mfcc(y=signal, sr=fs, n_mfcc=n_mfcc).T
            mfcc_s = scaler.fit_transform(mfcc)
            normalized.append({"artist": artist, "song": song, "mfcc": mfcc_s})
            raw.append({"artist": artist, "song": song, "mfcc": mfcc})
    return (normalized, raw)

In [7]:
# Build two Python lists of {artist, song, mfcc} dictionaries with the function above.
# x holds the song-by-song normalized MFCCs; x1 holds the non-normalized ones,
# which are useful for visualization purposes.
# (The function is named create_mfcc_visual; calling create_mfcc raised a NameError.)
x,x1=create_mfcc_visual("Dataset")

In [8]:
# Persist the extracted features to disk.

# numpy arrays cannot be written to JSON, so every mfcc array becomes a nested list.
y = [
    {"artist": entry["artist"], "song": entry["song"], "mfcc": entry["mfcc"].tolist()}
    for entry in x
]
y1 = [
    {"artist": entry["artist"], "song": entry["song"], "mfcc": entry["mfcc"].tolist()}
    for entry in x1
]

# Normalized and non-normalized features go to separate files.
with open('visualization-data-norm.txt', 'w') as out_file:
    json.dump(y, out_file)
with open('visualization-data-unnorm.txt', 'w') as out_file:
    json.dump(y1, out_file)

# Keep the sampling rate used for the extraction next to the features.
with open('visualization-fs.txt', 'w') as out_file:
    json.dump(fs, out_file)

# Create a variety of datasets with different parameters
Here we create several files with features extracted using different parameters, so that we can later see how the performance of our classifiers changes as the features change.

In [11]:
# Sampling rates at which the features are extracted.
SR = [2500, 5000, 7500, 10000]
# Numbers of MFCCs that are extracted for every sampling rate.
MFCC = [4, 8, 12, 16, 20]

# Excerpt of every song that is loaded: start offset and length, in seconds.
offset = 30
duration = 60

In [12]:
# Scaler used by create_multiple_mfcc (defined right below) to normalize the MFCCs.
# It is re-fitted through fit_transform for every song and parameter combination.
scaler = sklearn.preprocessing.StandardScaler()## normalizes the mfcc
def create_multiple_mfcc(walk_dir,SR,MFCC):
    """Extract normalized MFCCs for every song and every (sampling rate, n_mfcc) pair.

    Relies on the notebook-level ``offset`` and ``duration`` settings and on
    the notebook-level ``scaler``, which is re-fitted for every extracted
    MFCC matrix. Each song is loaded once per sampling rate, and that signal
    is reused for every value in ``MFCC``. A running count of processed songs
    is printed every 10 songs.

    The artist is the name of the first directory level below ``walk_dir``;
    files that sit directly in ``walk_dir`` have no artist folder and are
    skipped. The song title is the part of the file name after the first
    space, cut at the first ``.``.

    Args:
        walk_dir: dataset root, laid out as ``walk_dir/<artist>/.../<file>.mp3``.
        SR: iterable of sampling rates to load the audio at.
        MFCC: iterable of numbers of MFCCs to extract.

    Returns:
        Dict keyed by ``(sr, n_mfcc)`` tuples; each value is a list with one
        ``{"artist", "song", "mfcc"}`` dict per song.
    """
    features = {(sr, n_mfcc): [] for sr in SR for n_mfcc in MFCC}
    processed = 0
    for root, subdirs, files in os.walk(walk_dir):
        # Sorting fixes the traversal order, which os.walk leaves unspecified.
        subdirs.sort()
        rel_root = os.path.relpath(root, walk_dir)
        if rel_root == os.curdir:
            continue
        # Path relative to walk_dir + os.sep: works on any OS and for any walk_dir depth.
        artist = rel_root.split(os.sep)[0]
        for filename in sorted(files):
            if not filename.endswith(".mp3"):
                continue
            file_path = os.path.join(root, filename)
            # [-1] instead of [1]: a name without a space no longer raises IndexError.
            song = filename.split(" ", 1)[-1].split(".")[0]
            processed += 1
            if processed % 10 == 0:
                print(processed)
            for sr in SR:
                signal = librosa.load(file_path, duration=duration, offset=offset, sr=sr)[0]
                for n_mfcc in MFCC:
                    # The audio must be passed as y=...: it is keyword-only in recent librosa.
                    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc).T
                    mfcc = scaler.fit_transform(mfcc)
                    features[(sr, n_mfcc)].append({"artist": artist, "song": song, "mfcc": mfcc})
    return features

In [14]:
# Use the function above to create the features for every (sampling rate, n_mfcc) pair.
# The numbers printed while it runs are the count of songs processed so far.
x=create_multiple_mfcc("Dataset",SR,MFCC)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550


In [15]:
# Re-shape x into nested dictionaries that can be written to a JSON file:
#   y[sampling rate][number of mfcc] -> list with one {"artist", "song", "mfcc"} dict per song,
# where every mfcc array has been converted to a nested list.
#
# The skeleton is built with a comprehension on purpose: a top-level
# `for n_mfcc in MFCC:` loop would overwrite the notebook-level n_mfcc setting
# that create_mfcc_visual reads, silently changing its output on a later re-run.
y = {rate: {n_coeff: [] for n_coeff in MFCC} for rate in SR}

# Keys of x are (sampling rate, number of mfcc) tuples.
for (rate, n_coeff), songs in x.items():
    y[rate][n_coeff].extend(
        {"artist": entry["artist"], "song": entry["song"], "mfcc": entry["mfcc"].tolist()}
        for entry in songs
    )

In [16]:
# Write the nested feature dictionary to disk as JSON.
# Note: JSON object keys are always strings, so the integer keys of y are
# stored as strings and come back as strings when the file is loaded again.
with open('x.txt', 'w') as out_file:
    json.dump(y, out_file)

In [17]:
# Store the lists of sampling rates and MFCC counts that were used,
# each one in its own JSON file.
with open('SR.txt', 'w') as out_file:
    json.dump(SR, out_file)
with open('MFCC.txt', 'w') as out_file:
    json.dump(MFCC, out_file)