In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys, os
sys.path.append('..')

import numpy as np
import librosa as lr
import torch
import IPython.display as ipd
import matplotlib.pyplot as plt
import pytorch_lightning as pl

from scipy.signal.windows import hann
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor, EarlyStopping

from datasets.nsynth_datamodule import NsynthDataModule
from models.cvae_resnet import CvaeResnet
from models.cvae_inception import CvaeInception
from models.vae_inception import VaeInception
from models.vae_inception_custom import VaeInceptionCustom

pl.seed_everything(42)

Global seed set to 42


42

In [10]:
### CONFIGS

train_configs = {
    'type': 'vae_cstm',
    'descr': 'decay',
    'num_workers': 16,
    'batch_size': 64,
    'max_epochs': 10000,
    'patience': 300,
    'trainer_kwargs': {
        'gpus': '1',
        'accelerator': None,
        'num_nodes': 1,
        'precision': 32,
        'accumulate_grad_batches': 1
    }
}

ds_configs = {
    'dataset_path': '/data/riccardo_datasets',
    'feature': 'spec',
    'feature_params': {
        'win_length': 256,
        'hop_length': 64,
        'window': hann(256).tolist()
    },
    'n_fft': 510,
    'ds_kwargs': {
        'pitches': [60],
        #'instrument_families': [0],
        'sr': 16000,
        'duration': 1.02
    }
}

m_configs_incept = {
    'optim': 'yogi',
    'optim_kwargs': {
        'lr': 1e-3,
    },
    'lr_scheduler': {
        'factor': 0.1, 
        'patience': 100,
        'cooldown': 30,
        'min_lr': 1e-5
    },
    'db_kwargs': {
        'amin': 1e-5,
        'top_db': 90
    },
#    'c_labels': ['pitch'],
    'kl_coeff': 5e-5,
    'kl_decay': 1.,
    'db_coeff': 1e-4,
    'db_decay': 1.,
    'latent_size': 64,
    'channel_size': 2,
    'channel_max': 128,
    'use_inception': True,
    'repeat_per_block': 1,
}

configs = {
    'train': train_configs,
    'dataset': ds_configs,
    'model': m_configs_incept
}

In [11]:
import json
with open('../configs/vae_small.json', 'w') as fp:
    json.dump(configs, fp, indent=2)

In [33]:
m_configs_incept = {
    'optim': 'yogi',
    'optim_kwargs': {
        'lr': 0.0001,
    },
    'lr_scheduler': {
        'factor': 0.1, 
        'patience': 100,
        'cooldown': 30,
        'min_lr': 1e-5
    },
    'db_kwargs': {
        'amin': 1e-5,
        'top_db': 90
    },
#    'c_labels': ['pitch'],
    'kl_coeff': 5e-5,
    'kl_decay': 1.,
    'db_coeff': 1e-3,
    'db_decay': 1.,
    'latent_size': 128,
    'channel_size': 2,
    'channel_max': 128,
    'use_inception': True,
    'repeat_per_block': 1,
}

m = VaeInceptionCustom(m_configs_incept)
x = torch.randn(1, 2, 256, 256)
m._shared_eval(x)

x_true torch.Size([1, 2, 256, 256])
hidden_enc torch.Size([1, 128])
mean torch.Size([1, 128]) torch.Size([1, 128])
z torch.Size([1, 128])
hidden_dec torch.Size([1, 128])
x_rec torch.Size([1, 2, 256, 256])


(tensor([[[[ 4.7255,  7.0502,  7.7187,  ...,  7.1188,  5.3107,  4.1573],
           [ 6.5614,  3.4549,  4.7172,  ...,  2.9339,  3.5087,  4.7234],
           [ 6.1976,  4.0059,  1.4937,  ...,  1.6084, -0.2501,  2.8330],
           ...,
           [ 7.2969,  6.4551,  5.9911,  ...,  3.6885,  3.3300,  2.3644],
           [ 7.4315,  3.6635,  3.1415,  ...,  2.3514, -0.0492,  3.9194],
           [ 5.0750,  4.8093,  3.3677,  ...,  3.1145,  3.5428, -1.3836]],
 
          [[ 0.7958,  0.6023,  1.6776,  ...,  0.3987,  0.8204,  1.3168],
           [-1.9202, -0.6654,  0.1256,  ..., -0.6959, -0.8270,  0.3657],
           [ 0.7794, -0.2615,  0.3933,  ...,  0.2971, -0.4553, -0.4939],
           ...,
           [-1.1908, -2.2876, -1.3261,  ..., -1.2160, -2.0508, -0.9143],
           [-0.2340, -0.1883,  0.6244,  ...,  0.7341, -1.4440,  1.2535],
           [-2.2437, -2.0847, -1.6963,  ...,  0.7179, -1.5545, -3.0081]]]],
        grad_fn=<SlowConvTranspose2DBackward>),
 tensor([[ 3.0922, -0.5216, -0.9163, -

## urban sounds

In [12]:
### CONFIGS

ds_configs_usnds = {
    'dataset_path': '/data/riccardo_datasets/urban_sounds',
    'feature': 'spec',
    'feature_params': {
        'win_length': 256,
        'hop_length': 64,
        'window': hann(256).tolist()
    },
    'n_fft': 510,
    'ds_kwargs': {
        'sr': 16000,
        'duration': 1.02
    }
}

configs_usnds = {
    'train': train_configs,
    'dataset': ds_configs_usnds,
    'model': m_configs_incept
}
configs_usnds['train']['trainer_kwargs']['gpus'] = '1'
configs_usnds['train']['num_workes'] = 16

In [13]:
import json
with open('../configs/test2_usnds.json', 'w') as fp:
    json.dump(configs_usnds, fp, indent=2)

## proper training

In [6]:
# logger
log_name = '{}_{}'.format(CvaeInception.model_name, train_configs['descr'])
logger = TensorBoardLogger('logs', name=log_name)

In [7]:
# init data loader
dm = NsynthDataModule(ds_configs, num_workers=train_configs['num_workers'], batch_size=train_configs['batch_size'])
dm.setup()

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore


In [None]:
# callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=train_configs['patience'])
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# train!
trainer = pl.Trainer(
#    weights_summary='full',
#    overfit_batches=1,
#    terminate_on_nan=False,
#    gradient_clip_val=0.5,
    max_epochs=train_configs['max_epochs'],
    callbacks=[early_stop, lr_monitor],
    logger=logger,
    **configs['train']['trainer_kwargs'])

## quick test

In [11]:
dm = NsynthDataModule(ds_configs, num_workers=16, batch_size=8)
trainer = pl.Trainer(fast_dev_run=5, gpus='1')
trainer = pl.Trainer(max_epochs=10, overfit_batches=1, gpus='1')
trainer.fit(model=model, datamodule=dm)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Running in fast_dev_run mode: will run a full train, val and test loop using 5 batch(es).
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type    | Params
--------------------------------------
0 | encoder   | Encoder | 3.8 M 
1 | fc_mu     | Linear  | 8.3 K 
2 | fc_logvar | Linear  | 8.3 K 
3 | fc_rep    | Linear  | 8.7 K 
4 | decoder   | Decoder | 2.2 M 
--------------------------------------
5.9 M     Trainable params
0         Non-trainable params
5.9 M     Total params


Validation sanity check: 0it [00:00, ?it/s]

0.9




Training: 0it [00:00, ?it/s]

0.81


Validating: 0it [00:00, ?it/s]

0.81
0.7290000000000001


Validating: 0it [00:00, ?it/s]

0.7290000000000001
0.6561000000000001


Validating: 0it [00:00, ?it/s]

0.6561000000000001
0.5904900000000002


Validating: 0it [00:00, ?it/s]

0.5904900000000002
0.5314410000000002


Validating: 0it [00:00, ?it/s]

0.5314410000000002
0.47829690000000014


Validating: 0it [00:00, ?it/s]

0.47829690000000014
0.43046721000000016


Validating: 0it [00:00, ?it/s]

0.43046721000000016
0.38742048900000015


Validating: 0it [00:00, ?it/s]

0.38742048900000015
0.34867844010000015


Validating: 0it [00:00, ?it/s]

0.34867844010000015
0.31381059609000017


Validating: 0it [00:00, ?it/s]

0.31381059609000017


1