In [9]:
from u import *
from ut import *
from model import *
from data import *

%load_ext autoreload
%autoreload 2

# Vocabulary objects used by the config cells below.
# NOTE(review): `decoder` is presumably the id -> token table and `encoder`
# its inverse -- confirm against `get_encoder`.
vocab_file = Cache / 'vocab.npy'
decoder = vocab_file.load()
encoder = get_encoder(decoder)
n_vocab = len(decoder)  # passed to the configs as `n_vocab`

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Model Configurations

Each cell below builds a `Config`, saves it to its run folder, and prints the shell command that launches training for that run.

In [10]:
# Base hyperparameters for the transformer runs. The cells below pass this
# dict to Config (directly or via a copy) together with their own settings.
transformer = {
    'model': Proj / 'model.py',
    'model_class': 'Transformer',
    'n_vocab': n_vocab,
    'step_save': 5000,
    'train_batch': 17,
    'train_chunk': 1088,
    'step_eval': 500,
    'eval_batch': 1,
    'eval_chunk': 4096,
    'cutoffs': [5000, 25000, 50000],
    'adaptive_ratio': 4,
    'pos_emb': 'trained',
    'n_seq': 64,
    'n_layers': 16,
    'n_embed': 256,
    'n_head': 8,
    'n_k': 32,
    'n_v': 32,
    'n_inner': 1024,
    'dropout': 0.1,
    'lr': 0.0005,
    'step_warmup': 100,
    'scheduler': 'cosine',
}

In [11]:
# Build the config object. Config's first argument is the path of the folder
# to create for the run; the base dict and the extra keyword settings follow.
hebbian_large_settings = dict(
    hebbian=True, hebbian_gamma=0.002, hebbian_T=2500,
    train_batch=8, train_chunk=1152,
    n_embed=512, n_seq=128,
)
c = Config(Wiki / 'hebbian,large', transformer, **hebbian_large_settings)
c = c.save(True)  # save to the run folder

# Print the command that runs the training (GPUs 0-3, 200k steps, opt level O1).
train_cmd = c.train(env_gpu=lrange(4), steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian,large
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch --nproc_per_node=4 --use_env ../../model.py . steps=200000 opt_level=O1


In [12]:
# Second large Hebbian run: same base dict, with its own batch size and
# model-size settings, launched on six GPUs.
hebbian_large2_settings = dict(
    hebbian=True, hebbian_gamma=0.002, hebbian_T=2500,
    train_batch=7, train_chunk=1152,
    n_embed=512, n_k=64, n_v=64, n_seq=96, n_inner=1536,
)
c = Config(Wiki / 'hebbian,large2', transformer, **hebbian_large2_settings)
c = c.save(True)
train_cmd = c.train(env_gpu=lrange(6), steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian,large2
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 -m torch.distributed.launch --nproc_per_node=6 --use_env ../../model.py . steps=200000 opt_level=O1


In [13]:
# Base transformer with the Hebbian settings switched on; trains on GPU 2.
hebbian_settings = dict(hebbian=True, hebbian_gamma=0.01, hebbian_T=500)
c = Config(Wiki / 'hebbian', transformer, **hebbian_settings)
c = c.save(True)
train_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


In [14]:
# One settings dict per layer. The kernel size grows with depth:
# 3 for layer indices 0-4, 7 for 5-9, and 15 from index 10 onwards.
layers = [
    dict(lc_kernel_size=3 if i < 5 else 7 if i < 10 else 15)
    for i in range(transformer['n_layers'])
]

lightconv_settings = dict(
    train_batch=10,
    light_conv=True, layers=layers,
    hebbian=True, hebbian_gamma=0.01, hebbian_T=500,
)
c = Config(Wiki / 'transformer,lightconv', transformer, **lightconv_settings)
c = c.save(True)
train_cmd = c.train(env_gpu=1, steps=200000, opt='O0')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/transformer,lightconv
CUDA_VISIBLE_DEVICES=1 python3 ../../model.py . steps=200000 opt_level=O0


In [15]:
# GRU baseline. It does not start from the `transformer` dict, so every
# setting is spelled out here.
gru_settings = dict(
    model=Proj / 'model.py', model_class='RNN', net='GRU',
    n_vocab=n_vocab, step_save=5000,
    train_batch=11, n_seq=2048,
    step_eval=1000, eval_batch=1, eval_chunk=8192,
    cutoffs=[5000, 25000, 50000], adaptive_ratio=4,
    lr=0.01, step_warmup=100, scheduler='rsqrt',
    num_layers=1, n_embed=512, n_hidden=2048, dropout=0.1,
    hebbian=True, hebbian_gamma=0.01, hebbian_T=500,
)
c = Config(Wiki / 'gru', **gru_settings)
c = c.save(True)
# No `opt` is passed here; the printed command shows opt_level=O0.
train_cmd = c.train(env_gpu=1, steps=200000)
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/gru
CUDA_VISIBLE_DEVICES=1 python3 ../../model.py . steps=200000 opt_level=O0


In [16]:
# Copy of the base dict with the Hebbian and sorted-vocabulary settings.
# `cutoffs` replaces the base value; the other keys are new. Several later
# cells derive their configs from this dict.
sorted_hebbian = dict(
    transformer,
    hebbian=True, hebbian_gamma=0.01, hebbian_T=500,
    vocab_sorted=True, cutoffs=[3500, 25000], n_embeds=[256, 64, 4],
)
c = Config(Wiki / 'sorted,hebbian', sorted_hebbian)
c = c.save(True)
train_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


In [17]:
sorted_hebbian_compound = sorted_hebbian.copy()

# Per-layer settings by depth band, as (exclusive upper layer index, settings).
# More heads when shallow, fewer heads when deep; smaller inner size when shallow.
# Layers 8-11 share one band: the original `i < 10` and `i < 12` branches
# were identical.
compound_layer_bands = [
    (4, dict(n_inner=64, n_head=8, n_k=16, n_v=16)),
    (8, dict(n_inner=256, n_head=4, n_k=32, n_v=32)),
    (12, dict(n_inner=1024, n_head=4, n_k=64, n_v=64)),
    (16, dict(n_inner=1536, n_head=2, n_k=128, n_v=128)),
]

n_compound_layers = sorted_hebbian_compound['n_layers']
max_banded_layers = compound_layer_bands[-1][0]
if n_compound_layers > max_banded_layers:
    # Without this check a layer index past the last band would silently
    # reuse the previous layer's settings instead of failing.
    raise ValueError(
        'n_layers=%d but per-layer settings are only defined for %d layers'
        % (n_compound_layers, max_banded_layers)
    )

layers = []
for i in range(n_compound_layers):
    band_settings = next(settings for bound, settings in compound_layer_bands if i < bound)
    layers.append(dict(band_settings))  # fresh dict per layer

c = Config(Wiki / 'sorted,hebbian,compound', sorted_hebbian_compound, layers=layers, train_batch=18).save(True)
print(c.train(env_gpu=0, steps=200000, opt='O1'))

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian,compound
CUDA_VISIBLE_DEVICES=0 python3 ../../model.py . steps=200000 opt_level=O1


In [18]:
# `sorted_hebbian` with 12 layers instead of the base 16, and train_batch 18.
sorted_hebbian_layer12 = dict(sorted_hebbian, n_layers=12, train_batch=18)
c = Config(Wiki / 'sorted,hebbian,layer12', sorted_hebbian_layer12)
c = c.save(True)
train_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian,layer12
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


In [19]:
# Larger variant of `sorted_hebbian`, launched on eight GPUs.
sorted_hebbian_large = dict(
    sorted_hebbian,
    train_batch=7, train_chunk=1152,
    n_embeds=[512, 256, 16],
    n_embed=512, n_k=64, n_v=64, n_seq=96, n_inner=1536,
)
c = Config(Wiki / 'sorted,hebbian,large', sorted_hebbian_large)
c = c.save(True)

# Only the second line of the generated command (the launch line, without the
# leading `cd`) is printed, prefixed with the S=... environment assignment.
launch_line = c.train(env_gpu=lrange(8), steps=200000, opt='O1').split('\n')[1]
print('S=$HOME/Research/exercises', launch_line)

S=$HOME/Research/exercises CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch --nproc_per_node=8 --use_env ../../model.py . steps=200000 opt_level=O1


In [20]:
# `sorted_hebbian` with `fix_softmax` enabled and train_batch 16.
sorted_hebbian_softmax = dict(sorted_hebbian, fix_softmax=True, train_batch=16)
c = Config(Wiki / 'sorted,hebbian,softmax', sorted_hebbian_softmax)
c = c.save(True)
train_cmd = c.train(env_gpu=0, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian,softmax
CUDA_VISIBLE_DEVICES=0 python3 ../../model.py . steps=200000 opt_level=O1


In [23]:
# Same settings as the 'sorted,hebbian,softmax' run, saved under the
# 'test' folder instead.
tes = dict(sorted_hebbian, fix_softmax=True, train_batch=16)
c = Config(Wiki / 'test', tes)
c = c.save(True)
train_cmd = c.train(env_gpu=0, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/test
CUDA_VISIBLE_DEVICES=0 python3 ../../model.py . steps=200000 opt_level=O1


In [21]:
# `sorted_hebbian` with `mask_pad` and `fix_softmax` enabled. The tied-layer
# and universal configs further down start from this dict.
sorted_hebbian_mask = dict(sorted_hebbian, mask_pad=True, fix_softmax=True, train_batch=16)
c = Config(Wiki / 'sorted,hebbian,mask', sorted_hebbian_mask)
c = c.save(True)
train_cmd = c.train(env_gpu=3, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian,mask
CUDA_VISIBLE_DEVICES=3 python3 ../../model.py . steps=200000 opt_level=O1


In [22]:
# Larger variant of `sorted_hebbian_softmax`, launched on eight GPUs.
large = dict(
    sorted_hebbian_softmax,
    train_batch=7, train_chunk=1152,
    n_embeds=[512, 256, 16],
    n_embed=512, n_k=64, n_v=64, n_seq=96, n_inner=1536,
)
c = Config(Wiki / 'large', large)
c = c.save(True)

# Print only the launch line (second line of the generated command),
# prefixed with the S=... environment assignment.
launch_line = c.train(env_gpu=lrange(8), steps=200000, opt='O1').split('\n')[1]
print('S=$HOME/Research/exercises', launch_line)

S=$HOME/Research/exercises CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch --nproc_per_node=8 --use_env ../../model.py . steps=200000 opt_level=O1


In [60]:
# `sorted_hebbian_mask` with `tie_layers=True` and its own n_k / n_v /
# n_inner values.
tie_layers = dict(
    sorted_hebbian_mask,
    tie_layers=True,
    train_batch=10,
    n_k=64, n_v=64, n_inner=2048,
)
c = Config(Wiki / 'tie_layers', tie_layers)
c = c.save(True)
train_cmd = c.train(env_gpu=1, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/tie_layers
CUDA_VISIBLE_DEVICES=1 python3 ../../model.py . steps=200000 opt_level=O1


In [64]:
# `tie_layers` given as a list of four 4s.
# NOTE(review): presumably four groups of four tied layers -- confirm in model.py.
tie_layers_4x4 = dict(sorted_hebbian_mask, tie_layers=[4, 4, 4, 4], train_batch=16)
c = Config(Wiki / 'tie_layers,4x4', tie_layers_4x4)
c = c.save(True)
train_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/tie_layers,4x4
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


In [66]:
# `tie_layers` given as a list of eight 2s.
# NOTE(review): presumably eight groups of two tied layers -- confirm in model.py.
tie_layers_8x2 = dict(sorted_hebbian_mask, tie_layers=[2] * 8, train_batch=16)
c = Config(Wiki / 'tie_layers,8x2', tie_layers_8x2)
c = c.save(True)
train_cmd = c.train(env_gpu=1, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/tie_layers,8x2
CUDA_VISIBLE_DEVICES=1 python3 ../../model.py . steps=200000 opt_level=O1


In [50]:
# `sorted_hebbian_mask` with the model class switched to
# 'UniversalTransformer', plus the `threshold` and `time_penalty` settings.
universal = dict(
    sorted_hebbian_mask,
    model_class='UniversalTransformer',
    train_batch=10,
    n_k=64, n_v=64, n_inner=2048,
    threshold=0.99, time_penalty=0.1,
)
c = Config(Wiki / 'universal', universal)
c = c.save(True)
train_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(train_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/universal
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


# Cache Parameter Search

In [2]:
# Load the saved config of the 'sorted,hebbian,softmax' run, then the model
# (`net`) and its training step for evaluation on cuda:0.
# NOTE(review): 'max' presumably selects the latest checkpoint -- confirm in load_model.
c = Config(Wiki / 'sorted,hebbian,softmax').load()
c_eval = c.var(device='cuda:0')
net, step = c_eval.load_model('max')
from model import evaluate

# Validation iterator, plus the result store filled by the cache search
# below; keys are (theta, lambda) pairs.
data_val = SequentialIterator(c, c.eval_batch, split='valid')
perplexity = {}
print('Model at step', step)

Model at step 200000


In [5]:
# Grids of cache hyperparameters to evaluate on the validation split.
# Only one assignment per grid is kept. The earlier assignments this cell
# used to make were immediately overwritten; they and the commented-out
# alternatives are recorded here:
#   thetas:  [1e-2, 1e-3, 1e-4]; [1e-1, 5e-2, 2e-2, 1e-2]; [1e-2, 5e-3, 2e-3, 1e-3];
#            [6e-3, 5.5e-3, 5e-3, 4.5e-3, 4e-3]; np.arange(1e-3, 0, -1e-4)
#   lambdas: [0.05, 0.1, 0.2, 0.3, 0.4]; [0.05, 0.055, 0.06, 0.065, 0.07];
#            np.arange(0.03, 0, -0.01)
thetas = [1e-2, 9e-3, 8e-3, 7e-3, 6e-3, 5e-3, 4e-3, 3e-3, 2e-3, 1e-3]
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# Search over cache parameters. Results accumulate in `perplexity`, keyed by
# (theta, lambda), so re-running the cell only evaluates missing pairs.
for theta in thetas:
    for lam in lambdas:
        if (theta, lam) in perplexity:
            continue
        # Clear the cache attributes on the loss module before each run so
        # one setting does not start from the previous setting's cache.
        net.loss.cache_keys = net.loss.cache_values = None
        c_cache = c.var(use_cache=True, n_cache=500, cache_theta=theta, cache_lambda=lam)
        perplexity[theta, lam] = evaluate(c_cache, data_val, net)['perplexity']

In [6]:
# Tabulate validation perplexity: one row per lambda, one column per theta.
perplexity_rows = [[perplexity[theta, lam] for theta in thetas] for lam in lambdas]
df = pd.DataFrame(
    perplexity_rows,
    index=pd.Index(lambdas, name='lambda'),
    columns=pd.Index(thetas, name='theta'),
)
df

theta,0.01,0.009,0.008,0.007,0.006,0.005,0.004,0.003,0.002,0.001
lambda,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.01,34.502601,34.41892,34.332323,34.247945,34.175684,34.13273,34.144239,34.24014,34.451669,34.800362
0.02,34.150915,34.044402,33.93454,33.82792,33.737501,33.685009,33.701814,33.825081,34.09239,34.527584
0.03,33.977148,33.854255,33.727681,33.605426,33.502683,33.444655,33.467668,33.614601,33.927381,34.429655
0.04,33.888657,33.752103,33.61178,33.476709,33.364081,33.302101,33.331586,33.500407,33.854101,34.415145
0.05,33.8526,33.704008,33.551483,33.405176,33.283955,33.21894,33.254964,33.44452,33.836551,34.451916
0.06,33.853173,33.693515,33.529828,33.373262,33.244325,33.176769,33.219264,33.428903,33.857629,34.524513
0.07,33.881475,33.711393,33.537232,33.371018,33.234902,33.165127,33.21414,33.443331,33.907641,34.624281
0.08,33.931894,33.751813,33.567662,33.39234,33.249406,33.177647,33.233112,33.48153,33.980615,34.745901
0.09,34.000672,33.810987,33.61719,33.432999,33.283527,33.209952,33.271903,33.539295,34.072709,34.885835
0.1,34.085238,33.886144,33.682921,33.490153,33.334312,33.258991,33.327454,33.613671,34.181079,35.041601


In [5]:
# Evaluate on the test split with cache_theta=0.005 and cache_lambda=0.07,
# the pair with the lowest validation perplexity in the table above.
data_test = SequentialIterator(c, c.eval_batch, split='test')
# Reset the cache attributes exactly as the validation search does before each
# run, so the test evaluation does not start from a cache left by a previous
# evaluate() call.
net.loss.cache_keys = net.loss.cache_values = None
c_test = c.var(use_cache=True, n_cache=500, cache_theta=0.005, cache_lambda=0.07)
evaluate(c_test, data_test, net)

{'loss': 3.522698402404785, 'perplexity': 33.87571555612406, 'time': 2.0}