In [1]:
import numpy as np 
import sklearn as sk
# from skopt import gp_minimize, forest_minimize
from src.layers import padding as pad_utils
from src.spatial_attn_lightning import BinauralAttentionModule
import yaml 



## Randomly generate architectures

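### Required conditions (as enforced by the search loop):
* 10M ≤ N parameters ≤ 200M for the analytic estimate, and 10M ≤ N parameters ≤ 150M for the instantiated model
* 4 < N layers < 11 (i.e. 5 to 10 layers)
* 2 ≤ output height ≤ 8
* 2 ≤ output width ≤ 8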

In [3]:
## write configs 

from pathlib import Path
import yaml 
from copy import deepcopy


## import default config 
# Directory that receives the generated architecture-search configs.
outdir = Path("config/arch_search")
outdir.mkdir(exist_ok=True, parents=True)

# Load the base experiment config. The context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open("config/binaural_attn/word_task_half_co_loc_v09_gender_bal_4M_w_no_cue_learned_higher_lr_less_dropout.yaml", 'r') as f:
    base_config = yaml.load(f, Loader=yaml.FullLoader)


In [4]:

def get_pool_padding(kernel_size):
    """
    Return the padding paired with a pooling window of size `kernel_size`.

    Even sizes give `kernel_size // 2`; odd sizes give `(kernel_size - 1) // 2`.
    """
    is_even = kernel_size % 2 == 0
    return kernel_size // 2 if is_even else (kernel_size - 1) // 2


def compute_layers(output_height, output_len, kernel_h, kernel_w, pool_stride_h, pool_stride_w):
    """
    Propagate a feature-map shape through one conv layer (stride 1) followed by one pooling layer.
    Args:
        output_height: height of the feature map entering the layer
        output_len: length (width) of the feature map entering the layer
        kernel_h: height of the conv kernel
        kernel_w: length of the conv kernel
        pool_stride_h: pooling stride in height
        pool_stride_w: pooling stride in length
    Returns:
        output_height: height after conv + pooling
        output_len: length after conv + pooling
        conv_pad: first value returned by pad_utils.get_padding_value for 'valid_time'
        pool_size: [pool_h, pool_w], 4x the stride on each axis, or 1 when that stride is 1
        pool_padding: [pad_h, pad_w] obtained from get_pool_padding for each pooling size
    """
    def _out_size(size, pad, window, stride):
        # shared conv/pool size rule: floor((size + 2*pad - window) / stride) + 1
        return int(np.floor((size + (2 * pad) - window) / stride) + 1)

    # convolution stage (stride 1 on both axes)
    conv_pad, _ = pad_utils.get_padding_value('valid_time', [kernel_h, kernel_w], stride=[1, 1])
    conv_height = _out_size(output_height, conv_pad[0], kernel_h, 1)
    conv_len = _out_size(output_len, conv_pad[1], kernel_w, 1)

    # pooling stage: a stride of 1 means no pooling window on that axis
    pool_size = [stride * 4 if stride > 1 else 1 for stride in (pool_stride_h, pool_stride_w)]
    pool_padding = [get_pool_padding(size) for size in pool_size]

    pooled_height = _out_size(conv_height, pool_padding[0], pool_size[0], pool_stride_h)
    pooled_len = _out_size(conv_len, pool_padding[1], pool_size[1], pool_stride_w)

    return pooled_height, pooled_len, conv_pad, pool_size, pool_padding



# --- Random architecture search ---------------------------------------------
# Repeatedly sample a random CNN architecture (layer count, kernel sizes,
# pooling strides, filter counts), propagate the input shape through it with
# compute_layers, and keep the sample when its final feature map and parameter
# count fall inside the accepted ranges. Accepted samples are stored in `archs`
# under the keys "arch_0", "arch_1", ...
np.random.seed(10)  # fixed seed so the same architectures are drawn on every run
archs = {}

n_good_archs = 0
# Attempt budget. Every sampled architecture counts against it, including
# attempts that raise, so the loop always terminates.
n_tol = int(1e10)

n_in_channels = 2 # 2 for binaural audio, 1 for mono 
fc_size = 512

# Failed attempts are tallied per exception type as (count, first message)
# instead of being discarded silently.
attempt_errors = {}

while n_good_archs < 10 and n_tol > 0:
    n_tol -= 1
    n_layers = np.random.randint(5, 11)
    # Shape fed to the first layer.
    # NOTE(review): presumably 40 cochlear channels x 20000 samples (2 s at 10 kHz) -- confirm.
    output_height = 40
    output_len = 20000
    params = {}
    params['n_layers'] = n_layers
    params['kernel'] = []
    params['conv_pad'] = [] 
    params['pool_stride'] = []
    params['pool_size'] = []
    params['pool_pad'] = []
    params['n_filts'] = []

    try:
        total_params = 0
        for layer in range(n_layers):
            # The order of the np.random calls below determines which
            # architectures the seed produces; do not reorder them.
            if layer < 2:
                # first two layers: long kernels in time, short in height
                kernel_w = np.random.randint(10, 81)
                kernel_h = np.random.randint(1, 4)
                pool_stride_h = np.random.randint(1, 3)
                pool_stride_w = np.random.randint(1, 7)
            else:
                kernel_h = np.random.randint(3, 7)
                kernel_w = np.random.randint(3, 7)
                pool_stride_h = np.random.randint(1, 2)  # always 1: no height pooling here
                pool_stride_w = np.random.randint(1, 4)

            # compute output shape
            output_height, output_len, conv_pad, pool_size, pool_padding = compute_layers(output_height, output_len, kernel_h, kernel_w, pool_stride_h, pool_stride_w)

            # update params dict 
            params['kernel'].append([kernel_h, kernel_w])
            params['conv_pad'].append(conv_pad)
            params['pool_stride'].append([pool_stride_h, pool_stride_w])
            params['pool_size'].append(pool_size)
            params['pool_pad'].append(pool_padding)

            if layer == 0:
                n_filts = 2**np.random.randint(5,7)  # 32 or 64 filters in the first layer
                params['n_filts'].append(n_filts)
                n_layer_params = (n_in_channels * kernel_h * kernel_w) * n_filts # no bias in these models 
            else:
                # double the filter count per layer, capped at 512
                n_filts = np.min([2 * n_filts, 512])
                params['n_filts'].append(n_filts)
                prev_layer_filts = params['n_filts'][layer-1]
                n_layer_params = (prev_layer_filts * kernel_h * kernel_w) * n_filts # no bias in these models 

            total_params += n_layer_params

        ## get fully connected size for good architectures 
        final_output_size = (n_filts * output_height * output_len)
        n_fc_params = fc_size * final_output_size
        if n_fc_params <= 0:
            # the feature map collapsed to zero/negative size: not a usable architecture
            continue
        total_params += n_fc_params

        # Cheap filter on the analytic estimate before instantiating a model.
        shape_ok = 2 <= output_height <= 8 and 2 <= output_len <= 8
        if not (shape_ok and 1e7 <= total_params <= 2e8):
            continue

        config = deepcopy(base_config)
        config['model']['out_channels'] = [int(i) for i in params['n_filts']]
        # conv layers
        config['model']['kernel'] = params['kernel']
        config['model']['stride'] = [[1,1] for _ in  range(params['n_layers'])]
        config['model']['padding'] = ['valid_time' for _ in range(params['n_layers'])]
        # pooling layers
        config['model']['pool_stride'] = params['pool_stride']
        config['model']['pool_size'] = params['pool_size']
        config['model']['pool_padding'] = params['pool_pad']
        # add attn 
        config['model']['attn'] = [1 for _ in range(params['n_layers'])]
        config['model']['ln_affine'] = True
        config['model']['norm_first'] = True

        # Instantiate the model to get the true parameter count; the estimate
        # above only covers the conv kernels and one fully connected layer.
        model = BinauralAttentionModule(config).model
        n_params = sum(p.numel() for p in model.parameters())
        if 1e7 <= n_params <= 1.5e8:
            print(f'output_height: {output_height}, output_len: {output_len}, n layers: {n_layers}, our n_params: {round(total_params/1e6, 2)}M, actual n_params: {n_params/1e6:.2f}M')
            # save dict of params 
            archs[f"arch_{n_good_archs}"] = params
            n_good_archs += 1

    except Exception as e:
        # Best-effort search: a failing sample is skipped, but the failure is
        # recorded so systematic errors (e.g. a broken config key) stay visible.
        err_name = type(e).__name__
        n_seen, first_msg = attempt_errors.get(err_name, (0, str(e)))
        attempt_errors[err_name] = (n_seen + 1, first_msg)

# Report anything that went wrong; nothing is printed on a clean run.
for err_name, (n_seen, first_msg) in attempt_errors.items():
    print(f"{n_seen} attempt(s) raised {err_name}; first message: {first_msg}")
if n_good_archs < 10:
    print(f"Attempt budget exhausted after finding {n_good_archs} of 10 architectures")
    

Using explicit dim specification for demeaning in audio transforms
Using BinauralAuditoryAttentionCNN
v08 True
num_classes={'num_words': 800}
Model performing word task
Conv block order: LN -> Conv -> ReLU
fc_attn: True
coch_affine: True
center_crop=True
binaural=True
Binaural cochleagram
using FIR cochleagram
output_height: 7, output_len: 4, n layers: 10, our n_params: 37.53M, actual n_params: 144.71M
Using explicit dim specification for demeaning in audio transforms
Using BinauralAuditoryAttentionCNN
v08 True
num_classes={'num_words': 800}
Model performing word task
Conv block order: LN -> Conv -> ReLU
fc_attn: True
coch_affine: True
center_crop=True
binaural=True
Binaural cochleagram
using FIR cochleagram
output_height: 6, output_len: 4, n layers: 10, our n_params: 33.49M, actual n_params: 80.97M
Using explicit dim specification for demeaning in audio transforms
Using BinauralAuditoryAttentionCNN
v08 True
num_classes={'num_words': 800}
Model performing word task
Conv block order: LN

### Update arch of base config 

In [5]:
# Show the `model` section of the base config; as the cell's last expression it is rendered as the cell output.
base_config["model"]

{'input_sr': 10000,
 'out_channels': [32, 64, 256, 512, 512, 512, 512],
 'kernel': [[2, 34], [2, 14], [5, 5], [5, 5], [6, 6], [5, 5], [6, 6]],
 'stride': [[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]],
 'padding': ['valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time'],
 'pool_stride': [[2, 4], [2, 4], [1, 5], [1, 4], [1, 1], [1, 1], [2, 4]],
 'pool_size': [[9, 13], [9, 13], [1, 13], [1, 13], [1, 1], [1, 1], [6, 13]],
 'pool_padding': [[4, 6], [4, 6], [0, 6], [0, 6], [0, 0], [0, 0], [3, 6]],
 'attn': [1, 1, 1, 1, 1, 1, 1],
 'num_classes': {'num_words': 800},
 'fc_size': 512,
 'global_avg_cue': False,
 'dropout': 0.5,
 'attn_constraints': {'slope': True},
 'v08': True,
 'norm_first': True,
 'ln_affine': True}

In [6]:
# Convert architectures in archs to configs
from copy import deepcopy

## import default config 
outdir = Path("config/arch_search")
outdir.mkdir(exist_ok=True, parents=True)

# Context manager so the file handle is closed right after parsing.
with open("config/binaural_attn/word_task_half_co_loc_v09_gender_bal_4M_w_no_cue_learned_higher_lr_less_dropout.yaml", 'r') as f:
    base_config = yaml.load(f, Loader=yaml.FullLoader)

# Write one YAML config per sampled architecture: the base config with the
# conv/pool/attention layer lists replaced by the sampled values.
for arch, params in archs.items():
    config = deepcopy(base_config)
    config['model']['out_channels'] = [int(i) for i in params['n_filts']]
    # conv layers
    config['model']['kernel'] = params['kernel']
    config['model']['stride'] = [[1,1] for _ in  range(params['n_layers'])]
    # padding is given by mode name for every layer (params['conv_pad'] is not written out)
    config['model']['padding'] = ['valid_time' for _ in range(params['n_layers'])]
    # pooling layers
    config['model']['pool_stride'] = params['pool_stride']
    config['model']['pool_size'] = params['pool_size']
    config['model']['pool_padding'] = params['pool_pad']
    # add attn 
    config['model']['attn'] = [1 for _ in range(params['n_layers'])]

    # override valid_step
    # NOTE(review): presumably the validation interval in training steps -- confirm.
    config['hparas']['valid_step'] = 4000

    # write config to file
    config_name = outdir / f"word_task_v09_4MGB_ln_first_{arch}.yaml"
    print(config_name)
    with open(config_name, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

config/arch_search/word_task_v09_4MGB_ln_first_arch_0.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_1.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_2.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_3.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_4.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_5.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_6.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_7.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_8.yaml
config/arch_search/word_task_v09_4MGB_ln_first_arch_9.yaml


{'input_sr': 10000,
 'out_channels': [32, 64, 128, 256, 512, 512, 512, 512, 512, 512],
 'kernel': [[2, 11],
  [3, 60],
  [4, 5],
  [4, 4],
  [3, 4],
  [4, 3],
  [5, 3],
  [4, 6],
  [3, 3],
  [5, 6]],
 'stride': [[1, 1],
  [1, 1],
  [1, 1],
  [1, 1],
  [1, 1],
  [1, 1],
  [1, 1],
  [1, 1],
  [1, 1],
  [1, 1]],
 'padding': ['valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time',
  'valid_time'],
 'pool_stride': [[2, 2],
  [2, 6],
  [1, 1],
  [1, 3],
  [1, 3],
  [1, 1],
  [1, 3],
  [1, 3],
  [1, 2],
  [1, 1]],
 'pool_size': [[8, 8],
  [8, 24],
  [1, 1],
  [1, 12],
  [1, 12],
  [1, 1],
  [1, 12],
  [1, 12],
  [1, 8],
  [1, 1]],
 'pool_padding': [[4, 4],
  [4, 12],
  [0, 0],
  [0, 6],
  [0, 6],
  [0, 0],
  [0, 6],
  [0, 6],
  [0, 4],
  [0, 0]],
 'attn': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'num_classes': {'num_words': 800},
 'fc_size': 512,
 'global_avg_cue': False,
 'dropout': 0.5,
 'attn_constraints': {'slop

## Make sure the configs are compatible with the model

In [93]:
# Shell escape: print the hostname of the machine this kernel is running on.
!hostname

node111


In [94]:
from src.spatial_attn_lightning import BinauralAttentionModule
import yaml 

In [95]:
# Display the current `config` dict.
# NOTE(review): `config` is not assigned in this cell; it is whatever an earlier cell last left in the kernel -- confirm which config is intended.
config

{'corpus': {'name': 'spatialized_commonvoice_audioset_scenes',
  'cue_type': 'mixed',
  'task': 'word',
  'root': '/om/scratch/Sun/imgriff/datasets/spatial_audio_pipeline/assets/dataset_binaural_attn/v08',
  'mixture_percentages': {'voice_only': 0.5, 'voice_and_location': 0.5},
  'gender_balanced_4M': True,
  'cue_free_percentage': 0.1,
  'v06': True},
 'audio': {'rep_type': 'cochlea_filt',
  'v2_demean': True,
  'rep_kwargs': {'sr': 44100,
   'env_sr': 10000,
   'n_channels': 40,
   'low_lim': 40,
   'use_pad': True,
   'binaural': True,
   'rep_on_gpu': True,
   'center_crop': True,
   'out_dur': 2,
   'impulse_len': 0.25,
   'env_extraction_type': 'Half-wave Rectification',
   'downsampling_type': 'TorchTransformsResample',
   'downsampling_kwargs': {'lowpass_filter_width': 64,
    'rolloff': 0.9475937167399596,
    'resampling_method': 'kaiser_window',
    'beta': 14.769656459379492}},
  'compression_type': 'coch_p3',
  'compression_kwargs': {'scale': 1,
   'offset': 1e-07,
   'cli

In [96]:
# Total number of parameters of the model built from `config`, reported in millions.
model_params = BinauralAttentionModule(config).model.parameters()
n_params = sum(p.numel() for p in model_params)
print(f"{n_params/1e6:.2f} M")

Using explicit dim specification for demeaning in audio transforms
Using BinauralAuditoryAttentionCNN
v08 True
num_classes={'num_words': 800}
Model performing word task
Conv block order: LN -> Conv -> ReLU
coch_affine: True
center_crop=True
binaural=True
Binaural cochleagram
using FIR cochleagram
98.29 M


In [3]:
# Smoke test: load each of the ten arch-search configs and build a module from
# it, so that a config the module cannot consume raises here.
# NOTE(review): ModMelModule is not imported in any visible cell -- add the import before running on a fresh kernel.
for i in range(10):
    config_name = f"config/arch_search/nsynth_clean_coarse_interval_diff_bins_arch_search_arch_{i}.yaml"
    # context manager closes the file handle once the YAML is parsed
    with open(config_name, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    module = ModMelModule(config)
    del module  # drop the module before building the next one


N pitch classes = 8
height = 50, width = 100000
height = 49, width = 99996
height = 25, width = 33329
height = 21, width = 33305
height = 21, width = 16649
height = 21, width = 16624
height = 11, width = 4153
height = 10, width = 4127
height = 10, width = 4127
height = 7, width = 4082
height = 4, width = 2038
height = 2, width = 2018
height = 2, width = 400
center_crop=True
binaural=False
using FIR cochleagram
N pitch classes = 8
height = 50, width = 100000
height = 48, width = 99934
height = 48, width = 99934
height = 45, width = 99865
height = 23, width = 24963
height = 20, width = 24933
height = 20, width = 12463
height = 19, width = 12426
height = 10, width = 2068
height = 9, width = 2056
height = 9, width = 2056
height = 7, width = 2017
height = 4, width = 400
center_crop=True
binaural=False
using FIR cochleagram
N pitch classes = 8
height = 50, width = 100000
height = 49, width = 99956
height = 25, width = 24986
height = 22, width = 24925
height = 22, width = 4982
height = 19, wi

In [83]:
# Reload the config at `config_name`; the context manager closes the file handle after parsing.
# NOTE(review): `config_name` is not assigned in this cell -- it must already exist in the kernel.
with open(config_name, 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

In [84]:
# Display the `config` dict currently held in the kernel (rendered as the cell output).
config

{'data': {'audio': {'compression_kwargs': {'clip_value': 5,
    'offset': 1e-07,
    'power': 0.3,
    'scale': 1},
   'compression_type': 'coch_p3',
   'rep_kwargs': {'center_crop': True,
    'downsampling_kwargs': {'beta': 14.769656459379492,
     'lowpass_filter_width': 64,
     'resampling_method': 'kaiser_window',
     'rolloff': 0.9475937167399596},
    'downsampling_type': 'TorchTransformsResample',
    'env_extraction_type': 'Half-wave Rectification',
    'env_sr': 10000,
    'impulse_len': 0.25,
    'low_lim': 40,
    'n_channels': 50,
    'out_dur': 10,
    'rep_on_gpu': True,
    'sr': 20000,
    'use_pad': True},
   'rep_type': 'cochlea_filt'},
  'bin_kwargs': {'bin_max': 4, 'bin_min': 1, 'n_intervals': 3},
  'corpus': {'root': '/om/scratch/Mon/imgriff/datasets/modmel/nsynth_training/',
   'with_noise': False},
  'loader': {'batch_size': 96, 'num_workers': 10},
  'noise_kwargs': {'high_snr': 0, 'low_snr': 0},
  'num_class_bins': 8,
  'pitch_task': 'coarse_interval'},
 'hpar