# Calculate UMAP embeddings with different parameters

In [1]:
import os
import pandas as pd
import sys
import numpy as np
from pandas.core.common import flatten
import pickle
import umap
from pathlib import Path

In [2]:
from preprocessing_functions import calc_zscore, pad_spectro, create_padded_data

In [3]:
from preprocessing_functions import preprocess_spec_numba,preprocess_spec_numba_fl, pad_transform_spectro

In [4]:
from spectrogramming_functions import generate_mel_spectrogram

In [5]:
from custom_dist_functions import calc_pairwise_pad, calc_overlap_only, calc_timeshift, calc_timeshift_pad

In [19]:
wd = os.getcwd()

# The project root is the parent of the current working directory.
# os.getcwd() is absolute, so no leading path separator is needed.
_project_root = str(Path(wd).parents[0])

# Input: pickled dataframe with the spectrogram columns used below.
DF = os.path.join(_project_root, "data", "processed", "df_focal_reduced.pkl")
# Output: folder that receives one CSV per UMAP run.
OUT_COORDS = os.path.join(
    _project_root, "data", "interim", "parameter_search", "umap_coords_3D"
)

In [20]:
# Load the pre-computed dataframe from the pickle at DF.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
spec_df = pd.read_pickle(DF)
# Notebook display of the frame dimensions as (rows, columns).
spec_df.shape

(6430, 34)

## Functions

In [31]:
# Baseline configuration. Each experiment cell resets to these values via
# set_defaults() and then varies one parameter.

# Labels that only end up in the output file name (see get_param_string()).
DEF_INPUT_TYPE = 'melspecs'
DEF_PREPROCESS_TYPE = 'zs'
# Default `metric` argument of calc_umap(); also part of the file name.
DEF_METRIC_TYPE = 'euclidean'
# File-name label for how variable-length inputs are handled.
DEF_DURATION_METHOD = 'pad'
# Defaults for the UMAP arguments of calc_umap(): min_dist, spread,
# n_neighbors and n_components.
DEF_MIN_DIST = 0
DEF_SPREAD = 1
DEF_NEIGHBORS = 15
DEF_N_COMPS = 3
# File-name labels describing the input spectrograms.
DEF_DENOISE = 'no'
DEF_N_MELS = 40
DEF_F_UNIT = 'dB'

In [32]:
def set_defaults():
    """Reset every module-level run parameter to its ``DEF_*`` baseline value.

    The parameters are plain globals because ``get_param_string()`` reads
    them to build the output file name.
    """
    global input_type, preprocess_type, metric_type, duration_method
    global min_dist, spread, n_neighbors, n_comps
    global denoise, n_mels, f_unit

    # Input / preprocessing labels.
    input_type, preprocess_type = DEF_INPUT_TYPE, DEF_PREPROCESS_TYPE
    metric_type, duration_method = DEF_METRIC_TYPE, DEF_DURATION_METHOD

    # UMAP hyper-parameters.
    min_dist, spread = DEF_MIN_DIST, DEF_SPREAD
    n_neighbors, n_comps = DEF_NEIGHBORS, DEF_N_COMPS

    # Spectrogram labels.
    denoise, n_mels, f_unit = DEF_DENOISE, DEF_N_MELS, DEF_F_UNIT
    
    
def get_param_string():
    """Return the current run parameters joined by underscores.

    The string is built from the module-level globals set by
    ``set_defaults()`` (and overridden by the experiment cells) and is used
    as the stem of the output file names. The field order is fixed.
    """
    ordered_params = (
        preprocess_type,
        metric_type,
        duration_method,
        min_dist,
        spread,
        n_neighbors,
        n_comps,
        input_type,
        denoise,
        n_mels,
        f_unit,
    )
    return "_".join(map(str, ordered_params))

In [33]:
def calc_umap(data, outname, metric=DEF_METRIC_TYPE, min_dist=DEF_MIN_DIST, spread=DEF_SPREAD, n_neighbors=DEF_NEIGHBORS, n_comps=DEF_N_COMPS,n = 5):
    """Fit UMAP ``n`` times on ``data`` and write each embedding to a CSV file.

    Parameters
    ----------
    data : input passed unchanged to ``umap.UMAP.fit_transform``.
    outname : str
        Output path stem; run ``i`` is written to ``<outname>_<i>.csv``
        with ``;`` as the delimiter.
    metric : str or callable
        Forwarded to ``umap.UMAP(metric=...)``.
    min_dist, spread, n_neighbors : forwarded to ``umap.UMAP``.
    n_comps : int
        Forwarded as ``n_components``.
    n : int
        Number of repeated fits. No ``random_state`` is passed, so the
        repetitions are presumably independent stochastic runs.

    Returns
    -------
    None. The only result is the files written to disk.
    """
    # Create the target folder up front: otherwise a missing directory is
    # only detected by np.savetxt after the first (expensive) fit.
    out_dir = os.path.dirname(outname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for i in range(n):
        reducer = umap.UMAP(n_components=n_comps,
                            min_dist=min_dist,
                            spread=spread,
                            n_neighbors=n_neighbors,
                            metric=metric)

        embedding = reducer.fit_transform(data)
        np.savetxt(outname + '_' + str(i) + '.csv', embedding, delimiter=";")

In [34]:
# ORIGINAL: baseline run with all default parameters.

set_defaults()

# z-score every spectrogram, then pad into one input array.
specs = list(map(calc_zscore, spec_df["spectrograms"].copy()))
data = create_padded_data(specs)

outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
calc_umap(data, outname)
print(outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_40_dB


# Different dimensions in latent space

In [35]:
set_defaults()

# The input does not depend on the number of embedding dimensions, so the
# z-scoring and padding are done once instead of once per loop iteration.
specs = spec_df.spectrograms.copy()
specs = [calc_zscore(s) for s in specs]
data = create_padded_data(specs)

# `n_comps` is the module-level global read by get_param_string(), so the
# loop variable doubles as the file-name label.
for n_comps in [2,3,4,5,6,7,8,9,10,15,20]:
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, n_comps=n_comps, outname=outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_2_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_4_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_5_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_6_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_7_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data

# Different input types

In [36]:
set_defaults()

# Use the 'zmfccs' column as input and label the output files 'mfccs'.
# NOTE(review): the column name suggests the values may already be z-scored;
# calc_zscore is applied again here - confirm this is intended.
input_type = 'mfccs'
specs = list(map(calc_zscore, spec_df["zmfccs"].copy()))
data = create_padded_data(specs)

outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_mfccs_no_40_dB


# Amplitude vs. dB

In [37]:
set_defaults()

# Use the 'ampli_spectrograms' column and label the output files 'magnitude'.
f_unit = "magnitude"
specs = list(map(calc_zscore, spec_df['ampli_spectrograms'].copy()))
data = create_padded_data(specs)

outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_40_magnitude


# Mel-transform and number of mels

In [38]:
set_defaults()

# Use the 'freq_spectrograms' column; n_mels = 0 is the file-name label for
# this input (presumably: no mel transform applied - confirm).
n_mels = 0
specs = list(map(calc_zscore, spec_df["freq_spectrograms"].copy()))
data = create_padded_data(specs)

outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_0_dB


In [39]:
# Spectrogramming parameters
FFT_WIN = 0.03 # FFT_WIN*samplerate = length of fft/n_fft (number of audio frames that go in one fft)
FFT_HOP = FFT_WIN/8 # FFT_HOP*samplerate = n of audio frames between successive ffts
WINDOW = 'hann' # each frame of audio is windowed by a window function (its length can also be
# determined and is then padded with zeros to match n_fft. we use window_length = length of fft
FMAX = 4000
N_MFCC = 13

for n_mels in [10,20,30,50]:
    specs = spec_df.apply(lambda row: generate_mel_spectrogram(row['raw_audio'],
                                                              row['samplerate_hz'],
                                                              n_mels,
                                                              WINDOW,
                                                              FFT_WIN,
                                                              FFT_HOP,
                                                              FMAX),
                          axis=1)
    specs = [calc_zscore(s) for s in specs]
    data = create_padded_data(specs)
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname)  

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_10_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_20_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_30_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_50_dB


# Denoising yes/no

In [40]:
set_defaults()

# Use the 'denoised_spectrograms' column and label the output files 'yes'.
denoise = "yes"
specs = list(map(calc_zscore, spec_df["denoised_spectrograms"].copy()))
data = create_padded_data(specs)

outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
print(outname)
calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_yes_40_dB


## Different preprocessing steps

In [41]:
# Number of rows removed at the lower / upper end of each spectrogram, and
# the number of rows the cut is computed against.
N_LOWER = 5
N_UPPER = 5
N_MELS = 40

specs = spec_df.spectrograms.copy()

# Rows kept by the 'zs-cu' variant.
_kept_rows = slice(N_LOWER, N_MELS - N_UPPER)

# One entry per preprocessing variant; the key is used as the file-name
# label. Plain 'zs' is left out here because it is the default configuration.
# Insertion order determines the order in which the variants are run.
preprocessed_specs = {}
preprocessed_specs['no'] = specs
preprocessed_specs['zs-cu'] = [calc_zscore(s[_kept_rows, :]) for s in specs]
preprocessed_specs['zs-cu-fl'] = [
    preprocess_spec_numba_fl(s, N_LOWER, N_UPPER) for s in specs
]
preprocessed_specs['zs-cu-fl-ce'] = [
    preprocess_spec_numba(s, N_LOWER, N_UPPER) for s in specs
]

preprocess_types = list(preprocessed_specs)

In [42]:
set_defaults()

# `preprocess_type` is the module-level global read by get_param_string(),
# so the loop variable doubles as the file-name label.
for preprocess_type in preprocess_types:
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())

    specs = preprocessed_specs[preprocess_type]
    data = create_padded_data(specs)

    print(outname)
    calc_umap(data, outname)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/no_euclidean_pad_0_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs-cu_euclidean_pad_0_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs-cu-fl_euclidean_pad_0_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs-cu-fl-ce_euclidean_pad_0_1_15_3_melspecs_no_40_dB


## Different ways to deal with variable duration

In [43]:
set_defaults()

# All strategies known to this cell:
#   'pad', 'stretch'                      -> padded input + default UMAP metric
#   'pairwise-pad', 'overlap-only',
#   'timeshift-overlap', 'timeshift-pad'  -> numba-compatible custom distance
# Only the custom-distance strategies are run here.
duration_method_types = ['pairwise-pad', 'overlap-only', 'timeshift-overlap', 'timeshift-pad']

# Strategies that use the default metric, mapped to their dataframe column.
padded_input_columns = {'pad': 'spectrograms',
                        'stretch': 'stretched_spectrograms'}

# Strategies that need a custom distance function.
metric_dict = {'pairwise-pad': calc_pairwise_pad,
               'overlap-only': calc_overlap_only,
               'timeshift-overlap': calc_timeshift,
               'timeshift-pad': calc_timeshift_pad}

# Input shared by all custom-distance strategies. It is identical for each of
# them, so it is built once on first use instead of once per iteration.
custom_metric_data = None

# NOTE: `metric_type` is not changed in this cell, so the file names keep the
# default metric label even when a custom distance function is used.
for duration_method in duration_method_types:

    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)

    # 'pad' / 'stretch': padded input with the default metric.
    if duration_method in padded_input_columns:
        specs = spec_df[padded_input_columns[duration_method]].copy()
        specs = [calc_zscore(x) for x in specs]
        data = create_padded_data(specs)
        calc_umap(data, outname)
        continue

    # Raises KeyError for a strategy that is not known to this cell.
    custom_metric = metric_dict[duration_method]

    if custom_metric_data is None:
        specs = spec_df.spectrograms.copy()
        specs = [calc_zscore(x) for x in specs]
        n_bins = specs[0].shape[0]
        # Longest spectrogram (in columns) times the number of rows, plus two
        # extra slots, as expected by pad_transform_spectro.
        maxlen = np.max([spec.shape[1] for spec in specs]) * n_bins + 2
        trans_specs = [pad_transform_spectro(spec, maxlen) for spec in specs]
        custom_metric_data = np.asarray(trans_specs)

    data = custom_metric_data
    calc_umap(data, outname, metric=custom_metric)
    

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pairwise-pad_0_1_15_3_melspecs_no_40_dB


  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "


/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_overlap-only_0_1_15_3_melspecs_no_40_dB


  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "


/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_timeshift-overlap_0_1_15_3_melspecs_no_40_dB


  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "


/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_timeshift-pad_0_1_15_3_melspecs_no_40_dB


  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "
  "custom distance metric does not return gradient; inverse_transform will be unavailable. "


## Different distance metrics

In [44]:
set_defaults()

metric_types = ['manhattan', 'cosine', 'correlation']

# The input is the same for every metric; prepare it once.
specs = list(map(calc_zscore, spec_df["spectrograms"].copy()))
data = create_padded_data(specs)

# `metric_type` is the module-level global read by get_param_string(), so
# the loop variable doubles as the file-name label.
for metric_type in metric_types:
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, metric=metric_type)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_manhattan_pad_0_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_cosine_pad_0_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_correlation_pad_0_1_15_3_melspecs_no_40_dB


## N_neighbors

In [45]:
set_defaults()

n_neighbors_types = [5,10,30,50,100,150,200]

# The input is the same for every neighbourhood size; prepare it once.
specs = list(map(calc_zscore, spec_df["spectrograms"].copy()))
data = create_padded_data(specs)

# `n_neighbors` is the module-level global read by get_param_string(), so
# the loop variable doubles as the file-name label.
for n_neighbors in n_neighbors_types:
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, n_neighbors=n_neighbors)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_5_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_10_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_30_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_50_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_100_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_150_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/dat

# Spread

In [46]:
set_defaults()

n_spread_types = [0.5, 0.75, 1, 1.5]

# The input is the same for every spread value; prepare it once.
specs = list(map(calc_zscore, spec_df["spectrograms"].copy()))
data = create_padded_data(specs)

for n_spread in n_spread_types:
    # Update the global so that get_param_string() picks up the current value.
    spread = n_spread
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, spread=spread)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_0.5_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_0.75_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0_1.5_15_3_melspecs_no_40_dB


## Min_dist

In [47]:
set_defaults()

min_dist_types = [0.001, 0.01, 0.1, 1]

# The input is the same for every min_dist value; prepare it once.
specs = list(map(calc_zscore, spec_df["spectrograms"].copy()))
data = create_padded_data(specs)

for min_dist_n in min_dist_types:
    # Update the global so that get_param_string() picks up the current value.
    min_dist = min_dist_n
    outname = os.path.join(os.path.sep, OUT_COORDS, get_param_string())
    print(outname)
    calc_umap(data, outname, min_dist=min_dist_n)

/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0.001_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0.01_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_0.1_1_15_3_melspecs_no_40_dB
/home/mthomas/Documents/MPI_work/projects/meerkat/meerkat_umap_pv/data/interim/parameter_search/umap_coords_3D/zs_euclidean_pad_1_1_15_3_melspecs_no_40_dB
