In [None]:
import subprocess
import wandb
import os
import yaml
import numpy as np

This notebook runs a WandB hyperparameter sweep by writing a `dummy.yml` file for use with the `pretrain_gpt2.py` script.

In [None]:
def sweep_to_yml(run_config,dummy_yml):
  """Overwrite the model-size and learning-rate settings of a YAML config.

  The file at `dummy_yml` is read, updated in memory and written back to
  the same path.

  Args:
    run_config: mapping with the keys 'num-layers', 'hidden-size',
      'num-attention-heads' and 'lr'.
    dummy_yml: path of an existing YAML config file; it must already
      contain an `optimizer: params:` section.

  Raises:
    KeyError: if `run_config` lacks one of the keys above or the config
      file has no `optimizer`/`params` section.
  """
  # NOTE(review): yaml.Loader can construct arbitrary Python objects. That is
  # only acceptable while the config file is trusted and locally authored;
  # consider yaml.safe_load if the file can come from anywhere else.
  with open(dummy_yml, 'r') as f:
    conf = yaml.load(f,Loader=yaml.Loader)
  for k in ['num-layers', 'hidden-size', 'num-attention-heads']:
    # These settings are counts/sizes. Callers may hand them over as floats
    # (e.g. parsed with float()), which would be dumped as "12.0"; store
    # them as real integers instead.
    conf[k] = int(round(run_config[k]))
  # Plain float so that numpy scalars are not dumped as tagged Python objects.
  conf['optimizer']['params']['lr'] = float(run_config['lr'])
  with open(dummy_yml, 'w') as f:
    yaml.dump(conf, f)

In [None]:
sweep_config = {
    'name': 'Scaling Laws for Neural Language Models sweep',
    'method': 'grid',
    'metric': {
        'name': 'loss',
        'goal': 'minimize'
    },
    'parameters': {
        'valid_set': {
          # Filled in below. Each value is one comma-separated string:
          # "exponent,n_layer,d_model,n_head,lr"
          'values': []
        }
    }
}

param_dict = {
  # these are the ranges to sweep over
  # ballpark numbers from Figure 5
  'exponent': list(range(10,22)),
  # aspect ratio d_model / n_layer (ascending)
  'ar': [round(10**x) for x in np.linspace(1,2.5,3)],
  # per-head dimension d_model / n_head (ascending)
  'attn_dim': [round(10**x) for x in np.linspace(1.5,2.5,3)],
}

valid_sets = sweep_config['parameters']['valid_set']['values']

for exponent in param_dict['exponent']:
  # parameter count N = e^exponent
  N = np.exp(exponent)
  # learning rate according to equation D.1 from Kaplan et al.,
  # "Scaling Laws for Neural Language Models"
  lr = 0.003239 + (-0.0001395)*np.log(N)
  for ar in param_dict['ar']:
    # N = 12 * n_layer * d_model^2 with n_layer = d_model / ar,
    # solved for d_model
    d_model = (N*ar/12)**(1/3)
    # calculate n_layer
    n_layer = N/12/(d_model**2)
    if n_layer < 1:
      # don't clip n_layer; 'ar' is ascending and n_layer only shrinks as
      # it grows, so the remaining aspect ratios can be skipped as well
      break
    for attn_dim in param_dict['attn_dim']:
      # add n_head per attn_dim
      n_head = d_model/attn_dim
      if n_head < 1:
        # don't clip n_head; larger attn_dim values only make it smaller
        break
      n_head_int = int(round(n_head))
      # The hidden size has to be a multiple of the number of attention
      # heads. Rounding d_model and n_head independently breaks that
      # (e.g. d_model=100, n_head=3), so snap d_model to the nearest
      # multiple of the rounded head count.
      hidden_size = n_head_int * int(round(d_model/n_head_int))
      # add this combination as a string to sweep_config
      fields = [exponent, int(round(n_layer)), hidden_size, n_head_int]
      valid_sets.append(
          ','.join([str(x) for x in fields] + [str(float(lr))])
      )

In [None]:
# The sweep is registered once, right before the agent is launched at the
# end of this cell. Creating it here as well would leave an unused duplicate
# sweep on the WandB server every time the cell is executed.

def train():
  """Run a single sweep trial.

  Reads the comma-separated `valid_set` string of the current WandB run
  ("exponent,n_layer,d_model,n_head,lr"), writes the decoded values into
  'configs/dummy.yml' and launches the pretraining script as a subprocess.
  The WandB run is always finished, with a non-zero exit code when the
  trial raised or the subprocess failed.
  """
  run = wandb.init()
  exit_code = 1  # reported if anything below raises
  try:
    # these are from neox_arguments.md
    names = [
        'exponent',
        'num-layers', # "n_layers" (GPT)
        'hidden-size', # "d_model" (GPT)
        'num-attention-heads', # "n_heads" (GPT)
        'lr', # "learning_rate" (GPT)
    ]
    variables = {}
    for name, field in zip(names, run.config.valid_set.split(',')):
      # Only the learning rate is fractional; everything else is a count or
      # size and must reach the config as an integer. int(float(...)) also
      # accepts fields serialized like "12.0".
      variables[name] = float(field) if name == 'lr' else int(float(field))
    print(variables)
    # write the run config to the dummy yaml
    sweep_to_yml(variables, 'configs/dummy.yml')
    # execute the actual pretrain process
    result = subprocess.run(
        ['python', 'deepy.py', 'pretrain_gpt2.py',
         '-d', 'configs', 'dummy.yml', 'local_setup.yml'],
        capture_output=True, text=True)
    exit_code = result.returncode
    if exit_code != 0:
      # Output is captured, so a failure would otherwise be invisible.
      # Show the tail of stderr, where the traceback normally ends up.
      print('pretraining exited with code %d' % exit_code)
      print(result.stderr[-2000:])
  finally:
    run.finish(exit_code=exit_code)

sweep_id = wandb.sweep(sweep_config)
# wandb.agent() starts the agent itself and blocks until the sweep's runs
# are done. It does not hand back an agent object, so there is nothing to
# call .run() on afterwards.
wandb.agent(sweep_id=sweep_id, function=train)