## 0.2 Project dataset


### This notebook does the following:
 - Segments songs into individual syllables
 - Creates spectrograms for each syllable
 - Saves a dataset to be used in further analyses
 

In [1]:
# Reload modules automatically
# Loads IPython's `autoreload` extension and switches it to mode 2, so that
# edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import librosa
import src

from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed

from src.greti.read.paths import DATA_DIR
from src.avgn.utils.paths import most_recent_subdirectory, ensure_dir
from src.avgn.utils.hparams import HParams
from src.avgn.dataset import DataSet
from src.avgn.signalprocessing.create_spectrogram_dataset import *

from src.avgn.visualization.spectrogram import draw_spec_set
from src.avgn.visualization.projections import (
    scatter_spec,
    scatter_projections,
    draw_projection_transitions,
)
from src.avgn.visualization.quickplots import draw_projection_plots
import umap


### Import syllable dataset
> Use output of the previous notebook (save_loc)

In [4]:
# Identifier of the dataset to project (also used as its folder and file name)
DATASET_ID = "GRETI_HQ_2020_segmented"

# Location of the pickled syllable dataframe:
#   DATA_DIR / syllable_dfs / <DATASET_ID> / <DATASET_ID>.pickle
save_loc = (
    DATA_DIR
    / "syllable_dfs"
    / DATASET_ID
    / f"{DATASET_ID}.pickle"
)

# NOTE(review): unpickling executes arbitrary code; only load files you created.
syllable_df = pd.read_pickle(save_loc)

# Prepare the output location for the embeddings of this dataset
ensure_dir(DATA_DIR / "embeddings" / DATASET_ID)

In [4]:
# Save dataframe with embeddings for each bird
#
# For every individual in `syllable_df.indv`, the syllable spectrograms are
# scaled by their own maximum, flattened, embedded with UMAP, and the
# resulting coordinates are stored in a new "umap" column. One pickle per
# individual is written to DATA_DIR / "embeddings" / DATASET_ID.
embeddings_dir = DATA_DIR / "embeddings" / DATASET_ID
ensure_dir(embeddings_dir)

for indv in tqdm(syllable_df.indv.unique()):
    # Work on an explicit copy: assigning a new column to a boolean-mask
    # slice of `syllable_df` raises pandas' SettingWithCopyWarning and is not
    # guaranteed to behave consistently.
    subset_df = syllable_df[syllable_df.indv == indv].copy()

    # Scale each spectrogram by its own maximum value.
    # NOTE(review): an all-zero spectrogram would yield NaNs here — confirm
    # the previous notebook never produces one.
    specs = [spec / np.max(spec) for spec in subset_df.spectrogram.values]
    specs_flattened = flatten_spectrograms(specs)
    print(np.shape(specs_flattened))

    # NOTE(review): UMAP is stochastic; consider passing `random_state=` if
    # the saved embeddings must be reproducible across runs.
    fit = umap.UMAP(min_dist=0.20)
    embedding = fit.fit_transform(specs_flattened)

    # One embedding row per syllable, stored as a list of arrays
    subset_df["umap"] = list(embedding)

    # f-string so that non-string identifiers (e.g. integers) also work
    subset_df.to_pickle(embeddings_dir / f"{indv}.pickle")


loading json:   0%|          | 0/747 [00:00<?, ?it/s][Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
loading json:   3%|▎         | 21/747 [00:02<01:41,  7.16it/s][Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    2.9s
loading json: 100%|██████████| 747/747 [00:03<00:00, 245.10it/s]
[Parallel(n_jobs=-2)]: Done 747 out of 747 | elapsed:    3.1s finished
getting unique individuals:   0%|          | 0/747 [00:00<?, ?it/s]

747