# Calculate UMAP embeddings with different parameters

In [1]:
import os
import pandas as pd
import sys
import numpy as np
from pandas.core.common import flatten
import pickle
import umap
from pathlib import Path

In [2]:
# All data locations are resolved relative to the parent directory of the
# current working directory.
wd = os.getcwd()
project_root = Path(wd).parents[0]

# Pickled dataframe that is read as input further below.
DF = str(project_root / "data" / "processed" / "df_focal_reduced.pkl")
# Directory prefix of every UMAP coordinate file written by this notebook.
OUT_COORDS = str(project_root / "data" / "interim" / "parameter_search" / "umap_coords")

In [3]:
# Load the pickled dataframe located at DF.
# NOTE(review): unpickling can execute arbitrary code, so DF must only ever
# point to a file from a trusted source.
spec_df = pd.read_pickle(DF)
# Show (n_rows, n_columns) as the cell output.
spec_df.shape

(6430, 28)

## Functions

In [30]:
def set_defaults():
    """Reset all module-level pipeline parameters to their baseline values.

    The parameters live in the global namespace because get_param_string()
    reads them from there when it assembles the output file name. Calling
    this function before modifying a single parameter guarantees that all
    remaining parameters are at their baseline.
    """
    globals().update(
        input_type='melspecs',
        preprocess_type='zs',
        metric_type='manhattan',
        duration_method='pad',
        min_dist=0,
        spread=1,
        n_neighbors=15,
        n_comps=5,
        denoise="no",
        n_mels=40,
        f_unit="dB",
    )
    
    
def get_param_string():
    """Return the current global parameter values joined by underscores.

    The values appear in this fixed order: preprocess_type, metric_type,
    duration_method, min_dist, spread, n_neighbors, n_comps, input_type,
    denoise, n_mels, f_unit. Each value is converted with str().
    """
    ordered_values = (
        preprocess_type,
        metric_type,
        duration_method,
        min_dist,
        spread,
        n_neighbors,
        n_comps,
        input_type,
        denoise,
        n_mels,
        f_unit,
    )
    return "_".join(map(str, ordered_values))

In [31]:
def calc_umap(data, outname, metric='manhattan', min_dist=0, spread=1, n_neighbors=15, n_comps=5,n=5):
    """Fit UMAP `n` times on `data` and write every embedding to a CSV file.

    A new umap.UMAP instance is created for each run, so the runs are
    independent of each other. Run number i (starting at 0) is saved as
    "<outname>_<i>.csv" with ";" as column delimiter.

    Parameters
    ----------
    data : input passed unchanged to UMAP.fit_transform
    outname : str, output path without run suffix and file extension
    metric, min_dist, spread, n_neighbors : forwarded to umap.UMAP
    n_comps : forwarded to umap.UMAP as n_components
    n : number of embeddings to compute and save

    Returns
    -------
    None
    """
    umap_params = {
        'n_components': n_comps,
        'min_dist': min_dist,
        'spread': spread,
        'n_neighbors': n_neighbors,
        'metric': metric,
    }

    for run in range(n):
        coords = umap.UMAP(**umap_params).fit_transform(data)
        np.savetxt(outname + '_{}.csv'.format(run), coords, delimiter=";")

In [6]:
from preprocessing_functions import calc_zscore, pad_spectro, create_padded_data

## Different input types

In [32]:
# Baseline parameters, except that the 'mfccs' column is used as input.
set_defaults()
input_type = "mfccs"

# calc_zscore is applied to every item before the items are padded.
specs = list(map(calc_zscore, spec_df['mfccs']))
data = create_padded_data(specs)

# The output name encodes the current parameter combination.
outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_mfccs_no_40_dB


## Amplitude vs. dB

In [34]:
# Baseline parameters, except that the 'ampli_spectrograms' column is used
# as input and the run is labelled with f_unit "magnitude".
set_defaults()
f_unit = "magnitude"

# calc_zscore is applied to every spectrogram before padding.
specs = list(map(calc_zscore, spec_df['ampli_spectrograms']))
data = create_padded_data(specs)

# The output name encodes the current parameter combination.
outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_no_40_magnitude


## Mel-transform and number of mels

In [None]:
from spectrogramming_functions import generate_mel_spectrogram

In [48]:
# Baseline parameters, except that the 'freq_spectrograms' column is used as
# input. n_mels = 0 is the label of this run in the output name
# (presumably meaning "no mel transform" -- confirm).
set_defaults()
n_mels = 0

# calc_zscore is applied to every spectrogram before padding.
specs = list(map(calc_zscore, spec_df["freq_spectrograms"]))
data = create_padded_data(specs)

# The output name encodes the current parameter combination.
outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_no_0_dB


In [46]:
# Spectrogramming parameters
FFT_WIN = 0.03 # FFT_WIN*samplerate = length of fft/n_fft (number of audio frames that go in one fft)
FFT_HOP = FFT_WIN/8 # FFT_HOP*samplerate = n of audio frames between successive ffts
WINDOW = 'hann' # each frame of audio is windowed by a window function (its length can also be
# determined and is then padded with zeros to match n_fft. we use window_length = length of fft
FMAX = 4000
N_MFCC = 13

for n_mels in [10,20,30,50]:
    specs = spec_df.apply(lambda row: generate_mel_spectrogram(row['raw_audio'],
                                                              row['samplerate_hz'],
                                                              n_mels,
                                                              WINDOW,
                                                              FFT_WIN,
                                                              FFT_HOP,
                                                              FMAX),
                          axis=1)
    specs = [calc_zscore(s) for s in specs]
    data = create_padded_data(specs)
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname)  

10
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_no_10_dB
20
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_no_20_dB
30
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_no_30_dB
50
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_no_50_dB


## Denoising yes/no

In [47]:
# Baseline parameters, except that the 'denoised_spectrograms' column is
# used as input and the run is labelled with denoise "yes".
set_defaults()
denoise = "yes"

# calc_zscore is applied to every spectrogram before padding.
specs = list(map(calc_zscore, spec_df["denoised_spectrograms"]))
data = create_padded_data(specs)

# The output name encodes the current parameter combination.
outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_yes_40_dB


## Different preprocessing steps

In [16]:
from preprocessing_functions import preprocess_spec_numba,preprocess_spec_numba_fl, pad_transform_spectro

In [17]:
# Number of rows cut from the lower and upper end of each spectrogram
# (rows N_LOWER .. N_MELS-N_UPPER are kept in the 'zs-cu' variant).
N_LOWER = 5
N_UPPER = 5
# NOTE(review): assumes every spectrogram has N_MELS rows -- confirm.
N_MELS = 40

specs = spec_df.spectrograms.copy()

# One entry per preprocessing variant; the key is the label that ends up in
# the output file name. Insertion order defines the processing order.
preprocessed_specs = {'no': specs}
preprocessed_specs['zs'] = list(map(calc_zscore, specs))
preprocessed_specs['zs-cu'] = [calc_zscore(s[N_LOWER:(N_MELS - N_UPPER), :]) for s in specs]
preprocessed_specs['zs-cu-fl'] = [preprocess_spec_numba_fl(s, N_LOWER, N_UPPER) for s in specs]
preprocessed_specs['zs-cu-fl-ce'] = [preprocess_spec_numba(s, N_LOWER, N_UPPER) for s in specs]

preprocess_types = list(preprocessed_specs)

In [18]:
set_defaults()

# Each dict entry holds the spectrograms of one preprocessing variant. The
# loop variable is the global preprocess_type, so get_param_string() puts
# the variant label into the output name.
for preprocess_type, specs in preprocessed_specs.items():
    data = create_padded_data(specs)

    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/no_manhattan_pad_0_1_15_5_melspecs_no
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs_no
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs-cu_manhattan_pad_0_1_15_5_melspecs_no
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs-cu-fl_manhattan_pad_0_1_15_5_melspecs_no
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs-cu-fl-ce_manhattan_pad_0_1_15_5_melspecs_no


## Different ways to deal with variable duration

In [24]:
#from custom_dist_functions import unpack_specs, spec_dist
from custom_dist_functions import calc_pairwise_pad, calc_overlap_only, calc_timeshift, calc_timeshift_pad

In [26]:
set_defaults()
duration_method_types = ['pad', 'stretch', 'pairwise-pad', 'overlap-only', 'timeshift-overlap', 'timeshift-pad']

# This is a tricky one: for 'pad' and 'stretch' the default UMAP metric can
# be used on differently prepared input, but all other duration methods need
# a numba-compatible custom distance function. Hence the three cases below.

# Custom distance function per duration method (built once, not on every
# loop iteration).
metric_dict = {'pairwise-pad': calc_pairwise_pad,
               'overlap-only': calc_overlap_only,
               'timeshift-overlap': calc_timeshift,
               'timeshift-pad': calc_timeshift_pad}

# Input for the custom distance functions. It is the same for all four of
# them, so it is built on first use and then reused.
custom_metric_data = None

# The loop variable is the global duration_method, so get_param_string()
# puts the current method into the output name.
for duration_method in duration_method_types:

    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)

    # baseline pipeline
    if duration_method=='pad':
        specs = spec_df.spectrograms.copy()
        specs = [calc_zscore(x) for x in specs]
        data = create_padded_data(specs)
        calc_umap(data, outname)

    # using stretched specs as input
    elif duration_method=='stretch':
        specs = spec_df.stretched_spectrograms.copy()
        specs = [calc_zscore(x) for x in specs]
        data = create_padded_data(specs)
        calc_umap(data, outname)

    # pairwise-pad, overlap-only, timeshift-overlap and timeshift-pad all require
    # custom distance functions
    else:
        if custom_metric_data is None:
            specs = spec_df.spectrograms.copy()
            specs = [calc_zscore(x) for x in specs]
            n_bins = specs[0].shape[0]
            # Longest spectrogram in number of values, plus 2
            # (presumably two extra slots that pad_transform_spectro fills
            # with shape information -- confirm).
            maxlen = np.max([spec.shape[1] for spec in specs]) * n_bins + 2
            trans_specs = [pad_transform_spectro(spec, maxlen) for spec in specs]
            custom_metric_data = np.asarray(trans_specs)

        data = custom_metric_data
        calc_umap(data, outname, metric=metric_dict[duration_method])
    

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_stretch_0_1_15_5_melspecs


## Different distance metrics

In [27]:
set_defaults()

metric_types = ['euclidean', 'manhattan', 'cosine', 'correlation']

# The input is the same for every metric, so it is prepared once.
specs = list(map(calc_zscore, spec_df['spectrograms']))
data = create_padded_data(specs)

# The loop variable is the global metric_type, so get_param_string() puts
# the current metric into the output name.
for metric_type in metric_types:
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, metric=metric_type)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_euclidean_pad_0_1_15_5_melspecs
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_15_5_melspecs
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_cosine_pad_0_1_15_5_melspecs
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_correlation_pad_0_1_15_5_melspecs


## N_neighbors

In [19]:
set_defaults()

n_neighbors_types = [5,15,30,50,100,150,200]

# The input is the same for every n_neighbors value, so it is prepared once.
specs = list(map(calc_zscore, spec_df['spectrograms']))
data = create_padded_data(specs)

# The loop variable is the global n_neighbors, so get_param_string() puts
# the current value into the output name.
for n_neighbors in n_neighbors_types:
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, n_neighbors=n_neighbors)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_50_5_melspecs_no
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_100_5_melspecs_no
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_150_5_melspecs_no
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1_200_5_melspecs_no


## Spread

In [59]:
set_defaults()

n_spread_types = [0.1,0.5, 0.75, 1.5]

# The input is the same for every spread value, so it is prepared once.
specs = list(map(calc_zscore, spec_df['spectrograms']))
data = create_padded_data(specs)

for n_spread in n_spread_types:
    # Update the global so that get_param_string() puts the current spread
    # into the output name.
    spread = n_spread
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, spread=spread)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_0.1_15_5_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_0.5_15_5_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_0.75_15_5_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0_1.5_15_5_melspecs_no_40_dB


## Min_dist

In [57]:
set_defaults()

min_dist_types = [0.001, 0.01, 0.1, 1]

# The input is the same for every min_dist value, so it is prepared once.
specs = list(map(calc_zscore, spec_df['spectrograms']))
data = create_padded_data(specs)

for min_dist_n in min_dist_types:
    # Update the global so that get_param_string() puts the current min_dist
    # into the output name.
    min_dist = min_dist_n
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, min_dist=min_dist_n)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0.001_1_15_5_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0.01_1_15_5_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_0.1_1_15_5_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords/zs_manhattan_pad_1_1_15_5_melspecs_no_40_dB
