In [14]:
from u import *
from ut import *
from model import *
from data import *

%load_ext autoreload
%autoreload 2

# Load the cached vocabulary from 'vocab.npy' under the Cache directory.
# NOTE(review): presumably an array of tokens indexed by token id; confirm in data.py.
decoder = (Cache / 'vocab.npy').load()
# Derive the encoder from the decoder (presumably the inverse token -> id lookup; verify in get_encoder).
encoder = get_encoder(decoder)
# Number of vocabulary entries; passed as `n_vocab` to the model configs below.
n_vocab = len(decoder)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Model Configurations

Each cell below defines one run configuration, saves it, and prints the shell command that launches training for it.

In [15]:
# Shared Transformer baseline. The run-specific configs below start from this
# dict and override individual keys.
transformer = {
    # model source and checkpointing
    'model': Proj / 'model.py',
    'model_class': 'Transformer',
    'n_vocab': n_vocab,
    'step_save': 5000,
    # training batch layout
    'train_batch': 17,
    'train_chunk': 1088,
    # evaluation schedule and batch layout
    'step_eval': 500,
    'eval_batch': 1,
    'eval_chunk': 4096,
    # vocabulary cutoffs and positional embedding
    'cutoffs': [5000, 25000, 50000],
    'adaptive_ratio': 4,
    'pos_emb': 'trained',
    # network size
    'n_seq': 64,
    'n_layers': 16,
    'n_embed': 256,
    'n_head': 8,
    'n_k': 32,
    'n_v': 32,
    'n_inner': 1024,
    'dropout': 0.1,
    # optimisation
    'lr': 0.0005,
    'step_warmup': 100,
    'scheduler': 'cosine',
}

In [14]:
# Settings for this run that differ from (or extend) the shared `transformer` baseline.
large_overrides = dict(
    hebbian=True, hebbian_gamma=0.002, hebbian_T=2500,
    train_batch=8, train_chunk=1152,
    n_embed=512, n_seq=128, mask_pad=True,
)

# Config takes the run folder to create first, then the base dictionary and
# keyword overrides; save(True) writes the config to that folder.
c = Config(Wiki / 'hebbian,large', transformer, **large_overrides).save(True)

# Print the shell command that launches training (GPUs 0-3 here).
launch_cmd = c.train(env_gpu=lrange(4), steps=200000, opt='O1')
print(launch_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian,large
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch --nproc_per_node=4 --use_env ../../model.py . steps=200000 opt_level=O1


In [7]:
# Second large variant: overrides on top of the shared `transformer` baseline.
large2_overrides = dict(
    hebbian=True, hebbian_gamma=0.002, hebbian_T=2500,
    train_batch=7, train_chunk=1152,
    n_embed=512, n_k=64, n_v=64, n_seq=96, n_inner=1536, mask_pad=True,
)
c = Config(Wiki / 'hebbian,large2', transformer, **large2_overrides).save(True)

# Print the shell command that launches training (GPUs 0-5 here).
launch_cmd = c.train(env_gpu=lrange(6), steps=200000, opt='O1')
print(launch_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian,large2
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 -m torch.distributed.launch --nproc_per_node=6 --use_env ../../model.py . steps=200000 opt_level=O1


In [5]:
# Baseline transformer with only the hebbian settings added.
hebbian_settings = {'hebbian': True, 'hebbian_gamma': 0.01, 'hebbian_T': 500}
c = Config(Wiki / 'hebbian', transformer, **hebbian_settings).save(True)

# Print the shell command that launches training on GPU 2.
launch_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(launch_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/hebbian
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


In [22]:
# Per-layer kernel sizes: 3 for layers 0-4, 7 for layers 5-9, 15 for the rest.
layers = []
for i in range(transformer['n_layers']):
    kernel_size = 3 if i < 5 else (7 if i < 10 else 15)
    layers.append({'lc_kernel_size': kernel_size})

lightconv_overrides = dict(
    train_batch=10,
    light_conv=True, layers=layers,
    hebbian=True, hebbian_gamma=0.01, hebbian_T=500,
)
c = Config(Wiki / 'transformer,lightconv', transformer, **lightconv_overrides).save(True)

# Print the shell command that launches training on GPU 1 (this run uses opt 'O0').
launch_cmd = c.train(env_gpu=1, steps=200000, opt='O0')
print(launch_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/transformer,lightconv
CUDA_VISIBLE_DEVICES=1 python3 ../../model.py . steps=200000 opt_level=O0


In [23]:
# GRU run: defined from scratch rather than derived from the `transformer` baseline.
gru_params = {
    # model source and checkpointing
    'model': Proj / 'model.py',
    'model_class': 'RNN',
    'net': 'GRU',
    'n_vocab': n_vocab,
    'step_save': 5000,
    # batch layout and evaluation schedule
    'train_batch': 11,
    'n_seq': 2048,
    'step_eval': 1000,
    'eval_batch': 1,
    'eval_chunk': 8192,
    # vocabulary cutoffs
    'cutoffs': [5000, 25000, 50000],
    'adaptive_ratio': 4,
    # optimisation
    'lr': 0.01,
    'step_warmup': 100,
    'scheduler': 'rsqrt',
    # network size
    'num_layers': 1,
    'n_embed': 512,
    'n_hidden': 2048,
    'dropout': 0.1,
    # hebbian settings
    'hebbian': True,
    'hebbian_gamma': 0.01,
    'hebbian_T': 500,
}
c = Config(Wiki / 'gru', **gru_params).save(True)

# Print the shell command that launches training on GPU 1; no `opt` is passed for this run.
launch_cmd = c.train(env_gpu=1, steps=200000)
print(launch_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/gru
CUDA_VISIBLE_DEVICES=1 python3 ../../model.py . steps=200000 opt_level=O0


In [24]:
# Copy of the transformer baseline with hebbian settings, a sorted vocabulary,
# and its own cutoffs / per-cluster embedding sizes. `transformer` itself is
# left untouched; later cells derive further variants from `sorted_hebbian`.
sorted_hebbian = dict(
    transformer,
    hebbian=True, hebbian_gamma=0.01, hebbian_T=500,
    vocab_sorted=True, cutoffs=[3500, 25000], n_embeds=[256, 64, 4],
)
c = Config(Wiki / 'sorted,hebbian', sorted_hebbian).save(True)

# Print the shell command that launches training on GPU 2.
launch_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(launch_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


In [31]:
sorted_hebbian_layer2 = sorted_hebbian.copy()

# Per-layer schedule as (exclusive upper layer index, settings for those layers).
# More heads when shallow, fewer heads when deep; smaller inner size when shallow.
layer_schedule = [
    (4, dict(n_inner=64, n_head=8, n_k=16, n_v=16)),
    (8, dict(n_inner=256, n_head=4, n_k=32, n_v=32)),
    (12, dict(n_inner=1024, n_head=4, n_k=64, n_v=64)),
    (16, dict(n_inner=1536, n_head=2, n_k=128, n_v=128)),
]

layers = []
for i in range(sorted_hebbian_layer2['n_layers']):
    for upper, settings in layer_schedule:
        if i < upper:
            # Copy so that every layer gets its own dict object.
            layers.append(dict(settings))
            break
    else:
        # Fail loudly instead of silently reusing the previous layer's settings
        # when n_layers grows beyond what the schedule covers.
        raise ValueError('no layer settings defined for layer %d; extend layer_schedule' % i)

c = Config(Wiki / 'sorted,hebbian,compound', sorted_hebbian_layer2, layers=layers, train_batch=18).save(True)
# Print the shell command that launches training on GPU 0.
print(c.train(env_gpu=0, steps=200000, opt='O1'))

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian,compound
CUDA_VISIBLE_DEVICES=0 python3 ../../model.py . steps=200000 opt_level=O1


In [33]:
# 12-layer variant of `sorted_hebbian` with a training batch of 18;
# `sorted_hebbian` itself is not modified.
sorted_hebbian_layer12 = dict(sorted_hebbian, n_layers=12, train_batch=18)
c = Config(Wiki / 'sorted,hebbian,layer12', sorted_hebbian_layer12).save(True)

# Print the shell command that launches training on GPU 2.
launch_cmd = c.train(env_gpu=2, steps=200000, opt='O1')
print(launch_cmd)

cd /data/scratch/zxyan/micronet/wikitext-103/sorted,hebbian,layer12
CUDA_VISIBLE_DEVICES=2 python3 ../../model.py . steps=200000 opt_level=O1


# Cache Parameter Search

In [8]:
# Reload the saved 'hebbian,large2' config and restore its network and step
# counter on the first GPU for evaluation.
c = Config(Wiki / 'hebbian,large2').load()
eval_config = c.var(device='cuda:0')
# NOTE(review): 'max' presumably selects the latest saved checkpoint; confirm in load_model.
net, step = eval_config.load_model('max')
from model import evaluate

# Results of the cache search, keyed by (theta, lambda); starts out empty.
perplexity = {}
# Iterator over the validation split, batched with the config's eval batch size.
data_val = SequentialIterator(c, c.eval_batch, split='valid')
print('Model at step', step)

Model at step 200000


In [11]:
# Final (finest) search grids. Earlier, coarser sweeps covered theta between
# 1e-4 and 1e-1 and lambda between 0.01 and 0.4 before narrowing to this range.
#
# The grids are built from integers and divided once, so each value is the
# float nearest to the intended decimal and the grid length is fixed. A
# float-step np.arange accumulates rounding error (e.g. 0.0007999999999999999)
# and its length depends on how the endpoint happens to round.
thetas = np.arange(10, 0, -1) / 10000   # 1e-3, 9e-4, ..., 1e-4
lambdas = np.arange(3, 0, -1) / 100     # 0.03, 0.02, 0.01

# search over cache parameters
for theta in thetas:
    for lam in lambdas:
        # Skip pairs already evaluated, so re-running the cell only fills gaps.
        if (theta, lam) in perplexity:
            continue
        # Clear the stored cache keys/values so no state carries over from the
        # previous evaluation.
        net.loss.cache_keys = net.loss.cache_values = None
        cache_config = c.var(use_cache=True, n_cache=500, cache_theta=theta, cache_lambda=lam)
        perplexity[theta, lam] = evaluate(cache_config, data_val, net)['perplexity']

In [12]:
# Validation perplexity as a table: one row per lambda, one column per theta.
perplexity_rows = []
for lam in lambdas:
    perplexity_rows.append([perplexity[theta, lam] for theta in thetas])

df = pd.DataFrame(
    perplexity_rows,
    index=pd.Index(lambdas, name='lambda'),
    columns=pd.Index(thetas, name='theta'),
)
df

theta,0.001,0.0009,0.0007999999999999999,0.0006999999999999999,0.0005999999999999998,0.0004999999999999998,0.00039999999999999975,0.0002999999999999997,0.00019999999999999966,9.999999999999961e-05
lambda,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.03,25.787173,25.788649,25.79224,25.798046,25.80627,25.817021,25.83069,25.847989,25.869721,25.897238
0.02,25.75044,25.751717,25.754799,25.759718,25.766647,25.775642,25.786952,25.801232,25.819139,25.84187
0.01,25.784382,25.785464,25.787948,25.791742,25.796988,25.803736,25.812116,25.82263,25.835765,25.852599


In [13]:
# Final evaluation on the test split, using the (theta, lambda) pair with the
# lowest validation perplexity in the search table (theta=0.001, lambda=0.02).
data_test = SequentialIterator(c, c.eval_batch, split='test')

# Clear the stored cache keys/values, exactly as the validation search does
# before each run, so the test evaluation does not start from entries left
# behind by the last validation pass.
# NOTE(review): assumes evaluate() does not reset the cache itself; confirm in model.py.
net.loss.cache_keys = net.loss.cache_values = None

best_cache_config = c.var(use_cache=True, n_cache=500, cache_theta=0.001, cache_lambda=0.02)
evaluate(best_cache_config, data_test, net)

{'loss': 3.2895140647888184, 'perplexity': 26.829822932009282, 'time': 5.0}