In [2]:
import numpy as np
import pandas as pd
import scipy
import librosa, librosa.display
import matplotlib.pyplot as plt
import matplotlib.style as ms
#ms.use('seaborn-muted')
import IPython.display as Ipd
import os
import sklearn

In [4]:
def SphericalToCartesian(ele, azi, dist):
    """Convert an (elevation, azimuth, distance) triple to Cartesian x, y, z.

    Both angles are given in degrees. The elevation is turned into a polar
    angle (90 - ele) before the usual spherical formulas are applied:

        x = rho * sin(phi) * cos(theta)
        y = rho * sin(phi) * sin(theta)
        z = rho * cos(phi)

    Returns:
        np.ndarray holding [x, y, z].
    """
    polar = np.deg2rad(90 - ele)
    azimuth = np.deg2rad(azi)

    # Length of the projection onto the x-y plane; shared by x and y.
    planar = dist * np.sin(polar)

    return np.array([
        planar * np.cos(azimuth),
        planar * np.sin(azimuth),
        dist * np.cos(polar),
    ])

def CartesianToSpherical(x, y, z):
    """Convert Cartesian coordinates to (elevation, azimuth, distance).

    Args:
        x, y, z: Cartesian coordinates (scalars, or equally shaped arrays).

    Returns:
        np.ndarray holding [ele, azi, dist, theta] where ``ele`` and ``azi``
        are in degrees and, like ``dist``, rounded to the nearest integer,
        while ``theta`` is the unrounded azimuth in radians.
        At the origin, where the direction is undefined, all four values
        are 0 instead of NaN.
    """
    dist = np.sqrt(x*x + y*y + z*z)
    theta = np.arctan2(y, x)

    # Avoid 0/0 at the origin: dividing by 1 there gives cos(phi) = 0,
    # i.e. an elevation of 0 degrees.
    safe_dist = np.where(dist == 0, 1.0, dist)
    # Rounding in the square root can push the ratio marginally outside
    # [-1, 1], which would make arccos return NaN.
    cos_phi = np.clip(z / safe_dist, -1.0, 1.0)
    phi = np.arccos(cos_phi)

    ele = 90 - np.rad2deg(phi)
    azi = np.rad2deg(theta)
    # np.round (round-half-to-even, like the builtin) also works element-wise.
    return np.array([np.round(ele), np.round(azi), np.round(dist), theta])

def normalize(v):
    """Scale ``v`` to unit norm (as computed by ``np.linalg.norm``).

    A zero-norm input is divided by the machine epsilon of its floating
    dtype instead of by zero, so the result stays all zeros rather than NaN.
    Integer (or other non-floating) inputs use the float64 epsilon, because
    ``np.finfo`` only accepts inexact dtypes.

    Returns:
        ``v / norm``.
    """
    norm = np.linalg.norm(v)
    if norm == 0:
        eps_dtype = np.asarray(v).dtype
        if not np.issubdtype(eps_dtype, np.inexact):
            eps_dtype = np.float64
        norm = np.finfo(eps_dtype).eps
    return v / norm


# Load the metadata of the first `max_it` recordings (development set) and
# add, for every event, the Cartesian position and the spherical coordinates
# recovered from it.

metadata_dir = 'data/dcase_data/metadata_dev'  # Development
filenames_meta = os.listdir(metadata_dir)

max_it = 10  # number of metadata files to read

frames = []
for meta_name in filenames_meta[:max_it]:
    metadata = pd.read_csv(os.path.join(metadata_dir, meta_name))
    coord = np.zeros((len(metadata), 3))
    sphe = np.zeros((len(metadata), 4))
    # Enumerate row positions explicitly rather than relying on the index
    # labels happening to equal array positions.
    for pos, (ele, azi, dist) in enumerate(zip(metadata['ele'], metadata['azi'], metadata['dist'])):
        coord[pos, :] = SphericalToCartesian(ele, azi, dist)
        sphe[pos, :] = CartesianToSpherical(coord[pos, 0], coord[pos, 1], coord[pos, 2])

    metadata['x'] = coord[:, 0]
    metadata['y'] = coord[:, 1]
    metadata['z'] = coord[:, 2]

    metadata['ele_r'] = sphe[:, 0]
    metadata['azi_r'] = sphe[:, 1]
    metadata['dist_r'] = sphe[:, 2]
    metadata['theta'] = sphe[:, 3]

    frames.append(metadata)

# DataFrame.append was removed in pandas 2.0; collecting the frames and
# concatenating once is also linear instead of quadratic.
full_metadata = pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()

print(len(full_metadata))
print(len(full_metadata.loc[full_metadata['sound_event_recording'] == 'phone']))

258
24


In [5]:

def lin_interp_2d(data, out_size):
    """Resample a 2-D array to ``out_size`` using bilinear interpolation.

    Args:
        data: 2-D array-like with at least two rows and two columns.
        out_size: ``(rows, cols)`` of the result.

    Returns:
        float ndarray of shape ``out_size``. Along each axis the samples are
        taken at ``i * (n_in - 1) / n_out`` for ``i = 0 .. n_out - 1``, so
        the first input row/column is reproduced exactly and the last one is
        never sampled.

    Raises:
        ValueError: if ``data`` has fewer than two rows or columns.
    """
    # Imported here because a bare `import scipy` does not guarantee that the
    # `scipy.interpolate` submodule is loaded. RectBivariateSpline with
    # kx=ky=1 replaces `interp2d(kind='linear')`, which SciPy has removed.
    from scipy.interpolate import RectBivariateSpline

    data = np.asarray(data, dtype=float)
    n_rows, n_cols = data.shape
    out_rows, out_cols = out_size
    if n_rows < 2 or n_cols < 2:
        raise ValueError(
            "lin_interp_2d needs at least 2 rows and 2 columns, got shape "
            + str(data.shape)
        )

    spline = RectBivariateSpline(np.arange(n_rows), np.arange(n_cols), data, kx=1, ky=1)

    # index * step always yields exactly the requested number of positions;
    # np.arange with a float step can return one extra element.
    row_pos = np.arange(out_rows) * ((n_rows - 1) / out_rows)
    col_pos = np.arange(out_cols) * ((n_cols - 1) / out_cols)

    return spline(row_pos, col_pos)

def extract_features_target(signal, location):
    """Build the flattened STFT feature vector and target for one event.

    For each of the first four channels of ``signal`` an STFT is computed
    (n_fft=1024, hop_length=512). Its magnitude in dB and its phase are each
    resampled to 128 x 64 with ``lin_interp_2d``. The per-channel maps are
    placed side by side (128 x 256), standardised with
    ``sklearn.preprocessing.scale(..., axis=1)`` and flattened.

    Args:
        signal: audio indexed by channel; channels 0..3 are read.
        location: target value(s) for this event.

    Returns:
        (feature, target): ``feature`` is the flattened amplitude map
        followed by the flattened phase map (2 * 128 * 256 = 65536 values);
        ``target`` is ``np.array(location)``.
    """
    # The empty (128, 0) seed keeps the row count and float64 dtype of the
    # incremental build this replaces.
    amp_parts = [np.zeros((128, 0))]
    phase_parts = [np.zeros((128, 0))]

    for channel in range(4):
        spectrum = librosa.stft(signal[channel], n_fft=1024, hop_length=512)
        magnitude_db = librosa.amplitude_to_db(np.abs(spectrum))
        amp_parts.append(lin_interp_2d(magnitude_db, (128, 64)))
        phase_parts.append(lin_interp_2d(np.angle(spectrum), (128, 64)))

    amp_map = sklearn.preprocessing.scale(np.concatenate(amp_parts, axis=1), axis=1)
    phase_map = sklearn.preprocessing.scale(np.concatenate(phase_parts, axis=1), axis=1)

    feature = np.concatenate((amp_map.flatten(), phase_map.flatten()))
    return feature, np.array(location)

# Extract one feature vector and one (x, y, z) target per 'phone' event found
# in the first `max_it` recordings.

sr = 22050  # sample rate the audio is resampled to on load
filenames_meta = os.listdir('data/dcase_data/metadata_dev') #Development
# Slice instead of range(max_it) so fewer than max_it files cannot raise IndexError.
selected_meta = filenames_meta[:max_it]

# Each example flattens to 2 maps (amplitude, phase) x 128 rows x
# (4 channels x 64 columns) = 65536 values.
features = np.zeros((len(full_metadata), 65536))
# Cartesian (x, y, z) position of the source.
target = np.zeros((len(full_metadata), 3))

example = 0
for i, meta_name in enumerate(selected_meta):

    #Metadata
    metadata = pd.read_csv('data/dcase_data/metadata_dev/' + meta_name)  #Development
    filename = os.path.splitext(meta_name)[0]

    print("processing '" + filename + "' " + str(i + 1) + "/" + str(len(selected_meta)))

    #Audio track
    # `sr` is passed by keyword: newer librosa releases no longer accept it
    # as a positional argument.
    signal, _sr = librosa.load('data/dcase_data/foa_dev/' + filename + '.wav', sr=sr, mono=False)

    phone_events = metadata[metadata['sound_event_recording'] == 'phone']
    for event in phone_events.itertuples(index=False):
        # Event boundaries are given in seconds; convert to sample indices.
        start_time = int(event.start_time * sr)
        end_time = int(event.end_time * sr)
        subsignal = signal[:, start_time:end_time]
        features[example, :], target[example, :] = extract_features_target(
            subsignal, SphericalToCartesian(event.ele, event.azi, event.dist))
        example += 1

print(features.shape)
# Keep only the rows that were filled; copy so the oversized buffers can be freed.
features = features[:example].copy()
target = target[:example].copy()
print('Done!')
print(features.shape)
print(example)

processing 'split1_ir0_ov1_1' 1/10
processing 'split1_ir0_ov1_10' 2/10
processing 'split1_ir0_ov1_2' 3/10
processing 'split1_ir0_ov1_3' 4/10
processing 'split1_ir0_ov1_4' 5/10
processing 'split1_ir0_ov1_5' 6/10
processing 'split1_ir0_ov1_6' 7/10
processing 'split1_ir0_ov1_7' 8/10
processing 'split1_ir0_ov1_8' 9/10
processing 'split1_ir0_ov1_9' 10/10
(258, 65536)
Done!
(24, 65536)
24


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV


# Build the pipeline.
# Parameters that are tuned by the grid search are not set here; anything set
# here stays fixed across the whole search.
# The step names (the quoted strings) matter: they are the prefixes used in
# the parameter grid below.
# NOTE: the final step is a regressor; its step name 'classifier' is kept so
# the parameter keys stay the same.
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', MLPRegressor(max_iter=500, random_state = 1))
        ])


# Repeated k-fold cross-validation (not stratified: the targets are continuous).
# This is optional; passing cv=<int> to GridSearchCV runs a plain k-fold.
# random_state is fixed so the folds, and therefore the scores, are reproducible.
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)


# Parameter ranges to explore.
# Each key is the pipeline step name, two underscores, then the parameter name.
# If unsure about the exact names, list them with:
#   print(pipe.get_params().keys())
grid_param = {
    'classifier__hidden_layer_sizes': [(128, 128, 128), (30,20,10), (500, 400, 300, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__alpha': [0.01, 0.001],
}

gd_sr = GridSearchCV(estimator=pipe,
                     param_grid=grid_param,
                     scoring='r2', # regression metric; e.g. 'neg_mean_squared_error' is an alternative
                     cv=rkf, # set cv=5 for a single, non-repeated 5-fold split
                     n_jobs=-1) # -1 uses as many CPUs as are available

gd_sr.fit(features, target) # run the search

print('best set of parameters', gd_sr.best_params_)
print('associated best score',gd_sr.best_score_)

best set of parameters {'classifier__activation': 'logistic', 'classifier__alpha': 0.01, 'classifier__hidden_layer_sizes': (30, 20, 10)}
associated best score -1.0462702026179753
