In [1]:
from u import *
from ut import *
from model import *
from data import *

%load_ext autoreload
%autoreload 2

# Load the cached vocabulary array; its length is the vocabulary size used by
# the model configurations below.
decoder = (Cache / 'vocab.npy').load()
n_vocab = len(decoder)

# Build the encoder from the decoder (get_encoder comes from the star-imported
# project helpers; presumably the inverse lookup -- confirm there).
encoder = get_encoder(decoder)

# Model Configurations

Each cell below builds and saves a run configuration, then prints the shell command that launches training for it.

In [2]:
# Base hyperparameters for the transformer. The run configurations below pass
# this dict to Config together with per-run keyword settings.
transformer = dict(
    # model source file / class, vocabulary size, checkpoint interval
    model=Proj / 'model.py',
    model_class='Transformer',
    n_vocab=n_vocab,
    step_save=5000,
    # training batch / chunk sizes
    train_batch=17,
    train_chunk=1088,
    # evaluation interval and evaluation batch / chunk sizes
    step_eval=500,
    eval_batch=1,
    eval_chunk=4096,
    # vocabulary cutoffs, adaptive ratio and positional embedding type
    cutoffs=[5000, 25000, 50000],
    adaptive_ratio=4,
    pos_emb='trained',
    # network dimensions and dropout
    n_seq=64,
    n_layers=16,
    n_embed=256,
    n_head=8,
    n_k=32,
    n_v=32,
    n_inner=1024,
    dropout=0.1,
    # learning rate, warmup steps and scheduler
    lr=0.0005,
    step_warmup=100,
    scheduler='cosine',
)

In [3]:
# Build the 'hebbian,large' run configuration from the base transformer dict
# plus the keyword settings listed here.
c = Config(
    Wiki / 'hebbian,large',  # first argument: path of the folder to create for the run
    transformer,
    hebbian=True,
    hebbian_gamma=0.002,
    hebbian_T=2500,
    train_batch=8,
    train_chunk=1152,
    n_embed=512,
    n_seq=128,
    mask_pad=True,
).save(True)  # write the configuration into the run folder

# Print the shell command that starts training for this run on the GPUs
# given by lrange(4).
print(c.train(env_gpu=lrange(4), steps=200000, opt='O1'))

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian,large
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch --nproc_per_node=4 --use_env ../../model.py . steps=200000 opt_level=O1


In [4]:
# 'hebbian,large2' run: base transformer dict plus the keyword settings
# listed here; saved to its run folder.
c = Config(
    Wiki / 'hebbian,large2',
    transformer,
    hebbian=True,
    hebbian_gamma=0.002,
    hebbian_T=2500,
    train_batch=7,
    train_chunk=1152,
    n_embed=512,
    n_k=64,
    n_v=64,
    n_seq=96,
    n_inner=1536,
    mask_pad=True,
).save(True)

# Print the training launch command for the GPUs given by lrange(6).
print(c.train(env_gpu=lrange(6), steps=200000, opt='O1'))

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian,large2
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 -m torch.distributed.launch --nproc_per_node=6 --use_env ../../model.py . steps=200000 opt_level=O1


In [5]:
# 'hebbian' run: the base transformer dict with the Hebbian settings below;
# saved to its run folder.
c = Config(
    Wiki / 'hebbian',
    transformer,
    hebbian=True,
    hebbian_gamma=0.01,
    hebbian_T=500,
).save(True)

# Print the training launch command for GPU 2.
print(c.train(env_gpu=2, steps=200000, opt='O1'))

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


# Cache Parameter Search

In [27]:
from model import evaluate

# Restore the model (network and step) for evaluation on cuda:1, using the
# configuration currently bound to `c`.
net, step = c.var(device='cuda:1').load_model('max')
print('Model at step', step)

# Validation-split iterator, plus an empty dict that the grid search below
# fills with perplexities keyed by (theta, lambda).
data_val = SequentialIterator(c, c.eval_batch, split='valid')
perplexity = {}

Model at step 195727


In [30]:
# Grid of cache hyperparameters to evaluate on the validation split.
# Only these final grids are kept: the earlier, coarser assignments were
# overwritten before use and had no effect. Results already stored in
# `perplexity` are reused, so re-running this cell with a changed grid only
# evaluates the (theta, lambda) pairs that are still missing.
thetas = [1e-2, 9e-3, 8e-3, 7e-3, 6e-3, 5e-3, 4e-3, 3e-3, 2e-3, 1e-3]
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# search over cache parameters
for theta in thetas:
    for lam in lambdas:
        if (theta, lam) in perplexity:
            # already evaluated in an earlier run of this cell
            continue
        # Clear the cache on the loss module before each evaluation so a run
        # does not start with keys/values left over from the previous one.
        net.loss.cache_keys = net.loss.cache_values = None
        perplexity[theta, lam] = evaluate(
            c.var(use_cache=True, n_cache=500, cache_theta=theta, cache_lambda=lam),
            data_val,
            net,
        )['perplexity']

In [31]:
# Validation perplexity table: one row per lambda, one column per theta.
pd.DataFrame(
    [[perplexity[theta, lam] for theta in thetas] for lam in lambdas],
    index=lambdas,
    columns=thetas,
)

Unnamed: 0,0.01,0.009,0.008,0.007,0.006,0.005,0.004,0.003,0.002,0.001
0.01,36.787824,36.745686,36.712506,36.693736,36.697288,36.734211,36.820141,36.978187,37.237089,37.60289
0.02,36.283192,36.228803,36.186418,36.163114,36.168977,36.217775,36.329173,36.531,36.857135,37.313109
0.03,36.004096,35.941256,35.892839,35.86721,35.876138,35.935763,36.068431,36.305578,36.683904,37.207555
0.04,35.832858,35.763247,35.7102,35.683136,35.695405,35.765106,35.917125,36.185494,36.609384,37.191023
0.05,35.727658,35.652199,35.595217,35.567214,35.582981,35.66229,35.832414,36.129728,36.595386,37.229641
0.06,35.668199,35.587486,35.527033,35.498398,35.517649,35.606319,35.793726,36.11841,36.623474,37.307016
0.07,35.643037,35.557387,35.493777,35.464687,35.487473,35.585323,35.789442,36.140436,36.683318,37.413896
0.08,35.644898,35.55464,35.488066,35.458701,35.484969,35.591771,35.812198,36.188816,36.768392,37.544563
0.09,35.668999,35.574278,35.504924,35.475326,35.50517,35.620864,35.857283,36.258944,36.874512,37.695265
0.1,35.711979,35.612933,35.540902,35.511129,35.544503,35.668956,35.921227,36.347592,36.998593,37.863268


In [32]:
# Final evaluation on the test split, using the cache settings with the lowest
# validation perplexity in the grid search (theta=0.007, lambda=0.08).
data_test = SequentialIterator(c, c.eval_batch, split='test')

# Clear the cache first, exactly as the grid search does before every
# evaluation. Without this, keys/values left from the last validation pass
# would presumably still be in the cache when test evaluation starts
# (NOTE(review): confirm against `evaluate`, which is not shown here).
net.loss.cache_keys = net.loss.cache_values = None

evaluate(
    c.var(use_cache=True, n_cache=500, cache_theta=0.007, cache_lambda=0.08),
    data_test,
    net,
)

{'loss': 3.594371795654297, 'perplexity': 36.392830670949074, 'time': 3.0}