In [2]:
import numpy as np 
import sklearn as sk
# from skopt import gp_minimize, forest_minimize
from src.layers import padding as pad_utils
from src.spatial_attn_lightning import BinauralAttentionModule
import yaml 



# Second pass architecture search using v10 dataset (final version)

This will use the 7 good architectures idetified in the first pass architecture search, and include 2 more to have 10 total models

In [3]:
### Track good architecutres 
good_arch_config_names = ['word_task_v09_4MGB_ln_first_arch_1',
                        'word_task_v09_4MGB_ln_first_arch_2',
                        'word_task_v09_4MGB_ln_first_arch_4',
                        'word_task_v09_4MGB_ln_first_arch_6',
                        'word_task_v09_4MGB_ln_first_arch_7',
                        'word_task_v09_4MGB_ln_first_arch_8',
                        'word_task_v09_4MGB_ln_first_arch_9']
len(good_arch_config_names)

7

## Randomly generate architectures

### Required conditions:
* N parameters < 200M
* 4 < N layers < 11
* 2 < output height < 8
* 2 < output width < 8

In [4]:
## write configs 

from pathlib import Path
import yaml 
from copy import deepcopy


## import default config 
outdir = Path("config/arch_search")
outdir.mkdir(exist_ok=True, parents=True)

base_config = yaml.load(open("config/binaural_attn/word_task_v10_main_feature_gain_config.yaml", 'r'), Loader=yaml.FullLoader)


In [65]:

def get_pool_padding(kernel_size):
    if kernel_size % 2 == 0:
        pool_pad = kernel_size // 2
    else:
        pool_pad = (kernel_size - 1) // 2
    return pool_pad


def compute_layers(output_height, output_len, kernel_h, kernel_w, pool_stride_h, pool_stride_w):
    """
    Compute the output shape of a CNN given lists of network parameters.
    Args:
        n_layers: number of layers
        kernel_h: height of kernel
        kernel_w: length of kernel
        pool_stride_h: pooling stride in height
        pool_stride_w: pooling stride in length 
    Returns:
        output_height: height of output
        output_len: length of output
    """
    # Compute output shapes using conv formula [(Height - (Filter-1) + 2Pad)/ Stride]+1
    conv_pad, _  = pad_utils.get_padding_value('valid_time', [kernel_h, kernel_w], stride=[1,1])

    output_height = int(np.floor((output_height + (2 * conv_pad[0]) - (kernel_h - 1) - 1) / 1) + 1)
    output_len = int(np.floor((output_len + (2 * conv_pad[1]) - (kernel_w - 1) - 1) / 1) + 1)

    # pooling layers
    pool_h = pool_stride_h * 4 if pool_stride_h > 1 else 1
    pool_w = pool_stride_w * 4 if pool_stride_w > 1 else 1
    # print(f'pool_h: {pool_h}, pool_w: {pool_w}')
    # print(f'pool_stride_h: {pool_stride_h}, pool_stride_w: {pool_stride_w}')
    # pool_pad, _  = pad_utils.get_padding_value("same", [pool_h, pool_w], stride=[pool_stride_h, pool_stride_w])
    # print(f'pool_pad: {pool_pad}')
    pool_padding_h = get_pool_padding(pool_h)
    pool_padding_w =  get_pool_padding(pool_w)
    
    output_height = int(np.floor((output_height + (2 * pool_padding_h) - pool_h) / pool_stride_h) + 1)
    output_len = int(np.floor((output_len + (2 * pool_padding_w) - pool_w) / pool_stride_w) + 1)

    return output_height, output_len, conv_pad, [pool_h, pool_w], [pool_padding_h, pool_padding_w]



np.random.seed(0)
archs = {}

n_good_archs = 0
n_tol = 1e10

n_in_channels = 2 # 2 for binaural audio, 1 for mono 
fc_size = 512

while n_good_archs < 4:
    n_layers = np.random.randint(5, 11)
    output_height = 40
    output_len = 20000
    params = {}
    params['n_layers'] = n_layers
    params['kernel'] = []
    params['conv_pad'] = [] 
    params['pool_stride'] = []
    params['pool_size'] = []
    params['pool_pad'] = []
    params['n_filts'] = []


    try:
        total_params = 0
        # add input norm to params 
        total_params += (n_in_channels * output_height * output_len) * 2  
        for layer in range(n_layers):
            if layer == 0:
                n_filts = 2  
            # add layer norm (that occurs before conv)
            total_params += (n_filts * output_height * output_len) * 2 

            if layer < 2:
                kernel_w = np.random.randint(10, 81)
                kernel_h = np.random.randint(1, 4)
                pool_stride_h = np.random.randint(1, 3)
                pool_stride_w = np.random.randint(1, 7)

            else:
                kernel_h = np.random.randint(3, 7)
                kernel_w = np.random.randint(3, 7)
                pool_stride_h = np.random.randint(1, 2)
                pool_stride_w = np.random.randint(1, 4)

            # pool_stride_h = np.random.randint(1, 3)
            # pool_stride_w = np.random.randint(1, 6)
            # compute output shape
            output_height, output_len, conv_pad, pool_size, pool_padding = compute_layers(output_height, output_len, kernel_h, kernel_w, pool_stride_h, pool_stride_w)

            # update params dict 
            params['kernel'].append([kernel_h, kernel_w])
            params['conv_pad'].append(conv_pad)
            params['pool_stride'].append([pool_stride_h, pool_stride_w])
            params['pool_size'].append(pool_size)
            params['pool_pad'].append(pool_padding)

            if layer == 0:
                n_filts = 2**np.random.randint(5,7)
                params['n_filts'].append(n_filts)
                n_layer_params = (n_in_channels * kernel_h * kernel_w) * n_filts # no bias in these models 
            else:
                n_filts = np.min([2 * n_filts, 512])
                params['n_filts'].append(n_filts)
                prev_layer_filts = params['n_filts'][layer-1]
                n_layer_params = (prev_layer_filts * kernel_h * kernel_w) * n_filts # no bias in these models 
            
            total_params += n_layer_params
            
            # add layer norm parameters 

         
        ## get fully connected size for good architectures 
        final_output_size = (n_filts * output_height * output_len)
        n_fc_params = fc_size * final_output_size
        assert n_fc_params > 0, f'n_fc_params: {n_fc_params}, final_output_size: {final_output_size}, fc_size: {fc_size}'
        # print(f'final_output_size: {final_output_size}, n_fc_params: {n_fc_params}')
        total_params += n_fc_params
        
        # Compute fully connected to word task size 
        n_classifier_size = fc_size * 800 
        total_params += n_classifier_size

        config = deepcopy(base_config)
        config['model']['out_channels'] = [int(i) for i in params['n_filts']]
        # conv layers
        config['model']['kernel'] = params['kernel']
        config['model']['stride'] = [[1,1] for _ in  range(params['n_layers'])]
        config['model']['padding'] = params['conv_pad']
        config['model']['padding'] = ['valid_time' for _ in range(params['n_layers'])]
        # pooling layers
        config['model']['pool_stride'] = params['pool_stride']
        config['model']['pool_size'] = params['pool_size']
        config['model']['pool_padding'] = params['pool_pad']
        # add attn 
        config['model']['attn'] = [1 for _ in range(params['n_layers'])]
        # config['model']['block_order'] = "Conv -> LN -> ReLU"
        config['model']['ln_affine'] = True
        config['model']['norm_first'] = True

        # if (output_height >= 2 and output_height <= 8) and (output_len >= 2 and output_len <= 8) and (total_params <= 2e8 and total_params >= 1e7):

            # model = BinauralAttentionModule(config).model
            # n_params = sum([p.numel() for p in model.parameters()])
        if (output_height >= 2 and output_height <= 8) and (output_len >= 2 and output_len <= 8) and (total_params <= 1e8 and total_params >= 1e7):
            print(f'output_height: {output_height}, output_len: {output_len}, n layers: {n_layers}, our total_params: {round(total_params/1e6, 2)}M')
            # save dict of params 
            archs[f"arch_{n_good_archs}"] = params
            n_good_archs += 1

    except Exception as e:
        # print(f"Error: {e}")
        continue
    
    # else:
    n_tol -= 1
    if n_tol == 0:
        break 
    

output_height: 8, output_len: 5, n layers: 9, our total_params: 58.18M
output_height: 8, output_len: 3, n layers: 10, our total_params: 76.67M
output_height: 8, output_len: 2, n layers: 10, our total_params: 68.92M
output_height: 8, output_len: 7, n layers: 8, our total_params: 71.96M


# Write configs out

### Update existing configs with name and new dataset path

In [68]:
# Convert architectures in archs to configs
from copy import deepcopy

## import default config 
outdir = Path("config/arch_search")
outdir.mkdir(exist_ok=True, parents=True)

base_config = yaml.load(open("config/binaural_attn/word_task_v10_main_feature_gain_config.yaml", 'r'), Loader=yaml.FullLoader)

### Track good architecutres 
good_arch_config_names = ['word_task_v09_4MGB_ln_first_arch_1',
                        'word_task_v09_4MGB_ln_first_arch_2',
                        'word_task_v09_4MGB_ln_first_arch_4',
                        'word_task_v09_4MGB_ln_first_arch_6',
                        'word_task_v09_4MGB_ln_first_arch_7',
                        'word_task_v09_4MGB_ln_first_arch_8',
                        'word_task_v09_4MGB_ln_first_arch_9']

for arch_name in good_arch_config_names:
    arch_config = yaml.load(open(outdir / f"{arch_name}.yaml", 'r'), Loader=yaml.FullLoader)
    # update path 
    arch_config['corpus']['root'] = base_config['corpus']['root']
    model_name = f"{arch_name.replace('09', '10')}"
    arch_config['model_name'] = model_name
    new_config_path = outdir / f"{model_name}.yaml"
    print(new_config_path)
        # # break
    # with open(new_config_path, 'w') as f:
    #     yaml.dump(arch_config, f, default_flow_style=False)

config/arch_search/word_task_v10_4MGB_ln_first_arch_1.yaml
config/arch_search/word_task_v10_4MGB_ln_first_arch_2.yaml
config/arch_search/word_task_v10_4MGB_ln_first_arch_4.yaml
config/arch_search/word_task_v10_4MGB_ln_first_arch_6.yaml
config/arch_search/word_task_v10_4MGB_ln_first_arch_7.yaml
config/arch_search/word_task_v10_4MGB_ln_first_arch_8.yaml
config/arch_search/word_task_v10_4MGB_ln_first_arch_9.yaml


### Write the 2 mew architectures to include with the current 7

### Update arch of base config 

In [69]:
# Convert architectures in archs to configs
from copy import deepcopy

## import default config 
outdir = Path("config/arch_search")
outdir.mkdir(exist_ok=True, parents=True)

base_config = yaml.load(open("config/binaural_attn/word_task_v10_main_feature_gain_config.yaml", 'r'), Loader=yaml.FullLoader)

### We will index thest architectures by the number of the architecture satrting at 10, so they will be sequential additions to the existing 10 tried (numbered 0-9)

for i, (arch_name, params) in enumerate(archs.items()):
    config = deepcopy(base_config)
    config['model']['out_channels'] = [int(i) for i in params['n_filts']]
    # conv layers
    config['model']['kernel'] = params['kernel']
    config['model']['stride'] = [[1,1] for _ in  range(params['n_layers'])]
    config['model']['padding'] = params['conv_pad']
    config['model']['padding'] = ['valid_time' for _ in range(params['n_layers'])]
    # pooling layers
    config['model']['pool_stride'] = params['pool_stride']
    config['model']['pool_size'] = params['pool_size']
    config['model']['pool_padding'] = params['pool_pad']
    # add attn 
    config['model']['attn'] = [1 for _ in range(params['n_layers'])]

    # update learning rate 
    config['hparas']['valid_step'] = 4000

    # config['model']
     # write config to file
    model_name = f"word_task_v10_4MGB_ln_first_arch_{10 + i:2}"
    config['model_name'] = model_name
    config_name = outdir / f"{model_name}.yaml"
    print(config_name)
    print(config)
    # # break
    with open(config_name, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

config/arch_search/word_task_v10_4MGB_ln_first_arch_10.yaml
{'corpus': {'name': 'spatialized_commonvoice_audioset_scenes', 'cue_type': 'mixed', 'task': 'word', 'root': '/om/scratch/Fri/imgriff/datasets/spatial_audio_pipeline/assets/dataset_binaural_attn/v10', 'mixture_percentages': {'voice_only': 0.5, 'voice_and_location': 0.5}, 'gender_balanced_4M': True, 'cue_free_percentage': 0.1, 'v06': True}, 'audio': {'rep_type': 'cochlea_filt', 'v2_demean': True, 'rep_kwargs': {'sr': 44100, 'env_sr': 10000, 'n_channels': 40, 'low_lim': 40, 'use_pad': True, 'binaural': True, 'rep_on_gpu': True, 'center_crop': True, 'out_dur': 2, 'impulse_len': 0.25, 'env_extraction_type': 'Half-wave Rectification', 'downsampling_type': 'TorchTransformsResample', 'downsampling_kwargs': {'lowpass_filter_width': 64, 'rolloff': 0.9475937167399596, 'resampling_method': 'kaiser_window', 'beta': 14.769656459379492}}, 'compression_type': 'coch_p3', 'compression_kwargs': {'scale': 1, 'offset': 1e-07, 'clip_value': 5, 'pow

## Make sure configs are compat with model 

In [55]:
!hostname

node063


In [56]:
from src.spatial_attn_lightning import BinauralAttentionModule
import yaml 

In [64]:
wanted_configs = list(outdir.glob('*v10*.yaml'))
len(wanted_configs)

9

In [63]:
for config_path in wanted_configs:
    config = yaml.load(open(config_path, 'r'), Loader=yaml.FullLoader)
    n_params = 0
    for param in BinauralAttentionModule(config).model.parameters():
        n_params += param.numel()
    print(f"{n_params/1e6:.2f} M")

Using explicit dim specification for demeaning in audio transforms
Using BinauralAuditoryAttentionCNN
v08 True
num_classes={'num_words': 800}
Model performing word task
Using singe gain function per layer
Conv block order: LN -> Conv -> ReLU
fc_attn: True
coch_affine: True
center_crop=True
binaural=True
Binaural cochleagram
using FIR cochleagram
80.97 M
Using explicit dim specification for demeaning in audio transforms
Using BinauralAuditoryAttentionCNN
v08 True
num_classes={'num_words': 800}
Model performing word task
Using singe gain function per layer
Conv block order: LN -> Conv -> ReLU
fc_attn: True
coch_affine: True
center_crop=True
binaural=True
Binaural cochleagram
using FIR cochleagram
58.18 M
Using explicit dim specification for demeaning in audio transforms
Using BinauralAuditoryAttentionCNN
v08 True
num_classes={'num_words': 800}
Model performing word task
Using singe gain function per layer
Conv block order: LN -> Conv -> ReLU
fc_attn: True
coch_affine: True
center_crop=Tr