In [None]:
#@markdown <b>Run me to import underscore module</b><br/>   {display-mode: "form"}
#@markdown <small>Method signatures:</small><br/> 
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _(source_path, target_path)</small></small><br/>
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _set_gh_token(token)</small></small><br/>
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _from_gh(user_name, repo_name, release_name) &nbsp; &nbsp; &nbsp; <b>Returns:</B> dictionary of arrays { 'array_name' : np.ndarray }</small></small><br/>
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _to_gh(user_name, repo_name, release_name, split_size=600, **arr_kwargs)</small></small><br/>

!pip install -q githubrelease
import numpy as np
import os, glob, re, time
import github_release


def _compress(source_path, target_path, target_dir=None):
    if target_dir:
        !mkdir -p {target_dir}
    if target_path.endswith('.tar.gz'):
        !tar -czf {target_path} -C {source_path} .
    elif target_path.endswith('.tar'):
        !tar -cf {target_path} -C {source_path} .
    elif target_path.endswith('.zip'):
        !(cd {source_path} && zip -q -r {target_path} .)


def _extract(source_path, target_path):
    !mkdir -p {target_path}
    if source_path.endswith('.tar.gz'):
        !tar -xzf {source_path} -C {target_path}
    elif source_path.endswith('.tar'):
        !tar -xf {source_path} -C {target_path}
    elif source_path.endswith('.zip'):
        !unzip -qq {source_path} -d {target_path}


def _(source_path, target_path):
    """
    Move, compress, extract or re-compress data between local paths and GCS.

    Paths starting with "gs://" are treated as GCS, everything else as local.
    The operation is inferred from the last component of each path: a name
    without a '.' is treated as a directory, and the text after the first '.'
    is its extension ('zip', 'tar' and 'tar.gz' count as archives).

        - same extension on both sides (incl. two directories): plain transfer
        - directory -> archive: compression
        - archive -> directory: extraction
        - archive -> archive of another format: extraction, then compression

    Any other combination performs no operation (apart from a possible GCS
    download into the temp directory, which is deleted again).

    Each of the four operations supports these routes:
        - GCS -> GCS
        - GCS -> LOCAL
        - LOCAL -> GCS
        - LOCAL -> LOCAL

    Notes:
        - GCS -> GCS and LOCAL -> LOCAL transfers use `mv` (source is removed);
          GCS <-> LOCAL transfers use `gsutil cp` (source is kept).
        - Non-transfer operations stage intermediate files in /tmp_ and the
          whole directory is removed with `rm -rf` at the end.
    """
    COMPRESSION_FORMATS = ('zip', 'tar', 'tar.gz')
    TEMP_DIR = "/tmp_"
    LOG_TEMPLATE = "{}    from    {}    to    {}"

    # Source
    # NOTE: the throwaway `_` below is a local that shadows this function's
    # own name inside the body; the function never calls itself, so it is harmless.
    source_dir, _, source_name = source_path.rpartition('/')
    source_isgcs = source_path.startswith("gs://")
    source_islocal = not source_isgcs
    # str.partition returns (head, separator, tail): despite its name,
    # `source_isprefix` is the name up to the first '.', and `source_isfile`
    # is the separator itself ('.' or ''), used only for its truthiness.
    source_isprefix, source_isfile, source_ext = source_name.partition('.')
    source_isdir = not source_isfile
    source_iscompression = source_ext in COMPRESSION_FORMATS

    # Target
    target_dir, _, target_name = target_path.rpartition('/')
    target_isgcs = target_path.startswith("gs://")
    target_islocal = not target_isgcs
    target_prefix, target_isfile, target_ext = target_name.partition('.')
    target_isdir = not target_isfile
    target_iscompression = target_ext in COMPRESSION_FORMATS

    # Flags
    # MOVE_ONLY is also true for directory -> directory (both extensions are '').
    MOVE_ONLY = source_ext == target_ext
    # NOTE(review): GCS_ONLY is computed but never read below.
    GCS_ONLY = source_isgcs and target_isgcs
    RENAME = source_isprefix != target_prefix
    COMPRESSION = source_isdir and target_iscompression
    EXTRACTION = source_iscompression and target_isdir
    EXTRACTION_COMPRESSION = source_iscompression and target_iscompression and source_ext != target_ext

    # Authenticate if writing to GCS
    # NOTE(review): reading from GCS is not preceded by authentication here;
    # presumably the source bucket is readable without it — confirm.
    if target_isgcs:
        from google.colab import auth
        auth.authenticate_user()

    # Assert that subdirectories exist if target is local
    # NOTE(review): target_dir is '' when target_path contains no '/', in
    # which case mkdir is invoked without an operand.
    if target_islocal:
        !mkdir -p {target_dir}

    # Movement commands
    if MOVE_ONLY:
        # GCS -> GCS
        if source_isgcs and target_isgcs:
            print(LOG_TEMPLATE.format("MOVING (1/1)", source_path, target_path))
            !gsutil -m -q mv {source_path} {target_path}
        
        # LOCAL -> LOCAL
        elif source_islocal and target_islocal:
            print(LOG_TEMPLATE.format("MOVING (1/1)", source_path, target_path))
            !mv {source_path} {target_path}
        
        # GCS -> LOCAL
        elif source_isgcs and target_islocal:
            if source_isdir:
                # The directory is copied INTO target_dir under its source
                # name, so it is renamed afterwards when the names differ.
                print(LOG_TEMPLATE.format("DOWNLOADING DIR (1/1)", source_path, target_dir))
                !gsutil -m -q cp -r {source_path} {target_dir}
                if RENAME:
                    print(LOG_TEMPLATE.format("\tRENAMING DIR", source_isprefix, target_prefix))
                    !mv {target_dir}/{source_isprefix} {target_dir}/{target_prefix}
            else:
                print(LOG_TEMPLATE.format("DOWNLOADING FILE (1/1)", source_path, target_path))
                !gsutil -m -q cp {source_path} {target_path}
        
        # LOCAL -> GCS
        # (a plain `if` rather than `elif`; equivalent here because the four
        # source/target combinations are mutually exclusive)
        if source_islocal and target_isgcs:
            if source_isdir:
                print(LOG_TEMPLATE.format("UPLOADING DIR (1/1)", source_path, target_path))
                !gsutil -m -q cp -r {source_path} {target_path}
            else:
                print(LOG_TEMPLATE.format("UPLOADING FILE (1/1)", source_path, target_path))
                !gsutil -m -q cp {source_path} {target_path}
        # Plain transfers never touch TEMP_DIR, so the cleanup below is skipped.
        return


    # Create directory for intermediate storage if required
    if source_isgcs or target_isgcs or EXTRACTION_COMPRESSION:
        !mkdir -p {TEMP_DIR}
    

    # For remaining operations, download GCS source to temp and treat as local
    if source_isgcs:
        if source_isdir:
            print(LOG_TEMPLATE.format("\tDOWNLOADING DIR", source_path, TEMP_DIR))
            !gsutil -m -q cp -r {source_path} {TEMP_DIR}
        else:
            print(LOG_TEMPLATE.format("\tDOWNLOADING FILE", source_path, f"{TEMP_DIR}/{source_name}"))
            !gsutil -m -q cp {source_path} {TEMP_DIR}/{source_name}
        # From here on the staged local copy stands in for the GCS source.
        source_path = f"{TEMP_DIR}/{source_name}"
        source_dir = TEMP_DIR

    # Compression
    if COMPRESSION:
        if target_islocal:
            print(LOG_TEMPLATE.format("COMPRESSING (1/1)", source_path, target_path))
            _compress(source_path, target_path, target_dir=target_dir)
        else:
            # GCS target: build the archive in TEMP_DIR, then upload it.
            print(LOG_TEMPLATE.format("COMPRESSING (1/2)", source_path, f"{TEMP_DIR}/{target_name}"))
            _compress(source_path, f"{TEMP_DIR}/{target_name}")
            print(LOG_TEMPLATE.format("UPLOADING FILE (2/2)", f"{TEMP_DIR}/{target_name}", target_path))
            !gsutil -m -q cp {TEMP_DIR}/{target_name} {target_path}

    # Extraction
    elif EXTRACTION:
        if target_islocal:
            print(LOG_TEMPLATE.format("EXTRACTING (1/1)", source_path, target_path))
            _extract(source_path, target_path)
        else:
            # GCS target: unpack into TEMP_DIR, then upload the directory.
            print(LOG_TEMPLATE.format("EXTRACTING (1/2)", source_path, f"{TEMP_DIR}/{target_name}"))
            _extract(source_path, f"{TEMP_DIR}/{target_name}")
            print(LOG_TEMPLATE.format("UPLOADING DIR (2/2)", f"{TEMP_DIR}/{target_name}", target_path))
            !gsutil -m -q cp -r {TEMP_DIR}/{target_name} {target_path}

    # Extraction & compression
    elif EXTRACTION_COMPRESSION:
        # The archive is unpacked into TEMP_DIR/<target name without extension>
        # and that directory is then re-packed in the target format.
        if target_islocal:
            print(LOG_TEMPLATE.format("EXTRACTING (1/2)", source_path, f"{TEMP_DIR}/{target_prefix}"))
            _extract(source_path, f"{TEMP_DIR}/{target_prefix}")
            print(LOG_TEMPLATE.format("COMPRESSING (2/2)", f"{TEMP_DIR}/{target_prefix}", target_path))
            _compress(f"{TEMP_DIR}/{target_prefix}", target_path, target_dir=target_dir)
        else:
            print(LOG_TEMPLATE.format("EXTRACTING (1/3)", source_path, f"{TEMP_DIR}/{target_prefix}"))
            _extract(source_path, f"{TEMP_DIR}/{target_prefix}")
            print(LOG_TEMPLATE.format("COMPRESSING (2/3)", f"{TEMP_DIR}/{target_prefix}", f"{TEMP_DIR}/{target_name}"))
            _compress(f"{TEMP_DIR}/{target_prefix}", f"{TEMP_DIR}/{target_name}")
            print(LOG_TEMPLATE.format("UPLOADING FILE (3/3)", f"{TEMP_DIR}/{target_name}", target_path))
            !gsutil -m -q cp {TEMP_DIR}/{target_name} {target_path}
    
    # Cleanup intermediate storage
    # (runs unconditionally for non-transfer operations, even if TEMP_DIR was
    # never created above)
    !rm -rf {TEMP_DIR}


def _set_gh_token(token):
    """Store the GitHub auth token in the GITHUB_TOKEN environment variable.

    `_to_gh` checks for this variable before uploading a release.
    """
    os.environ.update({"GITHUB_TOKEN": token})


def _export_array(array, release_name, prefix="", splits=3):
    """Split an array into .npy files under /tmp_/<release_name>.

    The array is divided with ``np.array_split`` into ``splits`` pieces and
    each piece is saved as ``<prefix>__<index>.npy``. The index is zero-padded
    to the width of the largest index so the files sort in split order.

    Args:
        array: Array to export.
        release_name: Name of the sub-directory of /tmp_ to write into.
        prefix: File name prefix identifying the array.
        splits: Number of pieces to split the array into.
    """
    dir_path = f"/tmp_/{release_name}"
    # Create the directory from Python rather than through a shell `mkdir`,
    # so release names containing spaces or shell metacharacters are safe.
    os.makedirs(dir_path, exist_ok=True)
    n_digits = len(str(splits - 1))
    for i, subarray in enumerate(np.array_split(array, splits)):
        filename = f"{prefix}__{str(i).zfill(n_digits)}.npy"
        np.save(f"{dir_path}/{filename}", subarray)


def _concat_arrays(paths):
    """Load the given .npy files and concatenate them along the first axis.

    Files are read in lexicographically sorted path order, so the caller's
    ordering of `paths` does not matter.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code; only use this on trusted files.
    chunks = []
    for npy_path in sorted(paths):
        chunks.append(np.load(npy_path, allow_pickle=True))
    return np.concatenate(chunks)


def _to_gh(user_name, repo_name, release_name, split_size=600, **arr_kwargs):
    # Assert that GitHub Auth token is set
    if "GITHUB_TOKEN" not in os.environ:
        print("GitHub authentication token is not set.")
        print("Set token using the '_set_gh_token(token_string)' method.")
        print("Minimal required auth scope is 'repo/public_repo' for public repositories.")
        print("URL: https://github.com/settings/tokens/new")
        return

    # Split arrays
    for prefix, array in arr_kwargs.items():
        splits = int((array.nbytes/1_000_000) // split_size) + 1
        _export_array(array, release_name, prefix=prefix, splits=splits)

    # Upload arrays
    github_release.gh_release_create(
        f"{user_name}/{repo_name}", 
        release_name, 
        publish=True, 
        name=release_name, 
        asset_pattern=f"/tmp_/{release_name}/*"
    )
    !rm -rf /tmp_/*


def _from_gh(user_name, repo_name, release_name):
    # Download release to temporary directory
    print("Downloading dataset in parallell ... ", end='\t')
    t0 = time.perf_counter()
    assets = github_release.get_assets(f"{user_name}/{repo_name}", tag_name=release_name)
    download_urls = [asset['browser_download_url'] for asset in assets]
    urls_str = " ".join(download_urls)
    !echo {urls_str} | xargs -n 1 -P 8 wget -q -P /tmp_/{release_name}_dl/
    t1 = time.perf_counter()
    print(f"done! ({t1 - t0:.3f} seconds)")

    # Load data into numpy arrays
    paths = glob.glob(f"/tmp_/{release_name}_dl/*.npy")
    groups = {}
    for path in paths:
        match = re.match(r".*/(.*)__[0-9]*\.npy", path)
        if match:
            prefix = match.group(1)
            groups[prefix] = groups.get(prefix, []) + [path]
    arrays_dict = {name: _concat_arrays(paths) for name, paths in groups.items()}
    !rm -rf /tmp_/*
    return arrays_dict

  Building wheel for linkheader (setup.py) ... [?25l[?25hdone


##### Initial setup

In [None]:
# Download git repository
# Set the global git identity (name and e-mail) for this VM.
!git config --global user.email "patrikkja@gmail.com"
!git config --global user.name "Patrik Kjærran"
# Clone quietly, then make the repository the notebook's working directory.
!git clone -q https://github.com/patrikkj/marvin-models.git
%cd marvin-models

/content/marvin-models


In [None]:
# Download the raw dataset archive from GCS and unpack it into the
# repository's data directory.
DATASET = "dataset_full"
_(
    "gs://marvin-voice/data/raw/" + DATASET + ".tar.gz",
    "/content/marvin-models/data",
)

	DOWNLOADING FILE    from    gs://marvin-voice/data/raw/dataset_full.tar.gz    to    /tmp_/dataset_full.tar.gz
EXTRACTING (1/1)    from    /tmp_/dataset_full.tar.gz    to    /content/marvin-models/data


In [None]:
# Internal modules
import os, sys, glob
from importlib import reload

# External modules
!pip install -q tensorflow-io
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_io as tfio
from tensorflow_io import experimental as tfex

# Colab modules
from google.colab import auth
from IPython import display
    
# Scripts
from scripts import preprocessing

[K     |████████████████████████████████| 22.4MB 1.3MB/s 
[?25h

##### Create and export numpy arrays for each label

In [None]:
def generate_label_paths(data_dir):
    """Yield ``(dir_path, label)`` for each label directory in `data_dir`.

    Only the immediate sub-directories of `data_dir` are considered; the
    label is the directory's name. Plain files are skipped, as are
    directories whose name starts with '_' (e.g. ``_background_noise_``).

    Args:
        data_dir: Directory containing one sub-directory per label.

    Yields:
        Tuples of (path to the label directory, label name).
    """
    # Search the directory that was passed in rather than a module-level
    # global, so the generator works for any dataset location.
    for dir_path in glob.iglob(data_dir + "/*"):
        # Traverse directories only
        if not os.path.isdir(dir_path):
            continue

        # Fetch label
        label = os.path.basename(dir_path)

        # Ignore _background_noise_ directory
        if label.startswith("_"):
            continue
        yield (dir_path, label)

# Signal settings handed to the feature layers together with `hparams`.
# NOTE(review): values look like Hz (8 000 is half of the 16 000 sample
# rate) — confirm against scripts.layers.
params = dict(sample_rate=16_000, min_freq=0, max_freq=8_000)

# Feature-extraction hyperparameters handed to the feature layers.
# NOTE(review): frame/FFT sizes are presumably in samples — confirm.
hparams = dict(
    frame_size=512,
    frame_step=256,
    fft_size=512,
    mel_bins=64,
    num_mfccs=26,
)

In [None]:
# Compute three representations for every label directory and store each one
# as /tmp/arrays/<representation>/<label>.npy (np.save appends the extension).
from scripts import preprocessing, layers

# One layer per stage; each stage consumes the output of the previous one.
spectrogram_layer = layers.Spectrogram(params, hparams)
mel_spectrogram_layer = layers.MelSpectrogram(params, hparams)
log_mel_spectrogram_layer = layers.LogMelSpectrogram(params, hparams)
mfccs_layer = layers.MFCC(params, hparams)

DATA_DIR = "/content/marvin-models/data"
!mkdir -p /tmp/arrays/tensors /tmp/arrays/log_mel_specs /tmp/arrays/mfccs

# list(...) materialises all (dir_path, label) pairs before processing starts.
for dir_path, label in list(generate_label_paths(DATA_DIR)):
    print(f"Processing: {label}\t{dir_path}")
    paths = glob.glob(dir_path + "/*.wav")

    print("Creating tensors ...")
    # NOTE(review): tf.stack needs equally shaped inputs, so to_tensor
    # presumably pads/trims every clip to a fixed length — confirm.
    tensors = tf.stack([preprocessing.to_tensor(path) for path in paths])
    np.save(f'/tmp/arrays/tensors/{label}', tensors)

    print("Creating log_mel_specs ...")
    # Pipeline: waveform -> spectrogram -> mel spectrogram -> log-mel spectrogram
    spectrograms = spectrogram_layer(tensors)
    mel_spectrograms = mel_spectrogram_layer(spectrograms)
    log_mel_specs = log_mel_spectrogram_layer(mel_spectrograms)
    np.save(f'/tmp/arrays/log_mel_specs/{label}', log_mel_specs)

    print("Creating mfccs ...")
    # MFCCs are derived from the log-mel spectrograms computed above.
    mfccs = mfccs_layer(log_mel_specs)
    np.save(f'/tmp/arrays/mfccs/{label}', mfccs)
    print()

Processing: marvin	/content/marvin-models/data/marvin
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: zero	/content/marvin-models/data/zero
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: six	/content/marvin-models/data/six
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: seven	/content/marvin-models/data/seven
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: two	/content/marvin-models/data/two
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: one	/content/marvin-models/data/one
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: off	/content/marvin-models/data/off
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: three	/content/marvin-models/data/three
Creating tensors ...
Creating log_mel_specs ...
Creating mfccs ...

Processing: four	/content/marvin-models/data/four
Creati

In [None]:
# Re-import in case the VM ran out of memory and the runtime was restarted
import os
import numpy as np
import glob

# SECURITY: never hard-code the GitHub token in the notebook, since notebooks
# are committed and shared. Use GITHUB_TOKEN from the environment if present,
# otherwise prompt for it without echoing. Any token that was previously
# committed in this cell must be revoked.
if "GITHUB_TOKEN" not in os.environ:
    from getpass import getpass
    _set_gh_token(getpass("GitHub token: "))

# Build one dataset per representation and upload it as a GitHub release.
for name in ('tensors', 'log_mel_specs', 'mfccs'):
    print(f"Currently compressing: {name}")
    # Free up memory from previous iteration
    data, labels = None, None

    # Load the per-label arrays; samples are labelled 1 for "marvin", else 0.
    arrs, labels_arrs = [], []
    for npy_file in glob.glob(f"/tmp/arrays/{name}/*.npy"):
        arr = np.load(npy_file)

        # The file name without extension is the label.
        label = npy_file.split('/')[-1].split('.')[0]
        label_arr = np.full((arr.shape[0],), 1 if label == "marvin" else 0).astype(int)

        arrs.append(arr)
        labels_arrs.append(label_arr)

    data = np.vstack(arrs).squeeze()
    del arrs

    labels = np.concatenate(labels_arrs).squeeze()
    del labels_arrs

    # Shuffle samples and labels with the same permutation.
    m = data.shape[0]
    p = np.random.permutation(m)
    data, labels = data[p], labels[p]
    
    pos_data = data[labels==1, ...]
    pos_labels = labels[labels==1, ...]

    # Keep only every second negative sample.
    neg_data = data[labels==0, ...][::2]
    neg_labels = labels[labels==0, ...][::2]

    arr_kwargs = {
        "pos_data": pos_data,
        "pos_labels": pos_labels,
        "neg_data": neg_data,
        "neg_labels": neg_labels
    }

    _to_gh("patrikkj", "marvin-models", f"dataset_half_notrim_{name}", split_size=200, **arr_kwargs)

    #np.savez(f'/tmp/{name}', data=data, labels=labels)
    #NEW_DATASET_NAME = "dataset_full"
    #_(f"/tmp/{name}.npz", f"gs://marvin-voice/data/arrays/{NEW_DATASET_NAME}/{name}.npz")

Currently compressing: tensors
created 'dataset_half_notrim_tensors' release
  Tag name      : dataset_half_notrim_tensors
  Name          : dataset_half_notrim_tensors
  ID            : 33010663
  Created       : 2020-10-23T17:07:49Z
  URL           : https://github.com/patrikkj/marvin-models/releases/tag/dataset_half_notrim_tensors
  Author        : patrikkj
  Is published  : True
  Is prerelease : False

uploading 'dataset_half_notrim_tensors' release asset(s) (found 14):
  uploading /tmp_/dataset_half_notrim_tensors/neg_data__08.npy
  download_url: https://github.com/patrikkj/marvin-models/releases/download/dataset_half_notrim_tensors/neg_data__08.npy

  uploading /tmp_/dataset_half_notrim_tensors/neg_data__09.npy
  download_url: https://github.com/patrikkj/marvin-models/releases/download/dataset_half_notrim_tensors/neg_data__09.npy

  uploading /tmp_/dataset_half_notrim_tensors/neg_data__00.npy
  download_url: https://github.com/patrikkj/marvin-models/releases/download/dataset_hal