In [1]:
from u import *
from ut import *
from model import *
from data import *

%load_ext autoreload
%autoreload 2

decoder = (Cache / 'vocab.npy').load()
encoder = get_encoder(decoder)
n_vocab = len(decoder)

# Model Configurations

Also prints out the command to run training

In [9]:
# base hyperparameters for transformer
transformer = dict(
    model=Proj / 'main.py', model_class='Transformer', n_vocab=n_vocab, step_save=5000,
    train_batch=17, train_chunk=1088,
    step_eval=500, eval_batch=1, eval_chunk=4096,
    cutoffs=[5000, 25000, 50000], adaptive_ratio=4, pos_emb='trained',
    n_seq=64, n_layers=16, n_embed=256, n_head=8, n_k=32, n_v=32, n_inner=1024, dropout=0.1,
    lr=0.0005, step_warmup=100, scheduler='cosine'
)

# create config object from dictionary
c = Config(Wiki / 'hebbian,large', # first argument to config is the path of the folder to create for the run
    transformer,
    hebbian=True, hebbian_gamma=0.002, hebbian_T=2500,
    train_batch=8, train_chunk=1152,
    n_embed=512, n_seq=128
) # save to the folder

# # print out command to run the training
# print(c.train(env_gpu=lrange(4), steps=200000, opt='O1'))

c = Config(Wiki / 'hebbian,large2', transformer,
    hebbian=True, hebbian_gamma=0.002, hebbian_T=2500,
    train_batch=7, train_chunk=1152,
    n_embed=512, n_k=64, n_v=64, n_seq=96, n_inner=1536
)
# print(c.train(env_gpu=lrange(6), steps=200000, opt='O1'))

c = Config(Wiki / 'hebbian', transformer,
           hebbian=True, hebbian_gamma=0.01, hebbian_T=500)
# print(c.train(env_gpu=2, steps=200000, opt='O1'))

layers = []
for i in range(transformer['n_layers']):
    if i < 5:
        layers.append(dict(lc_kernel_size=3))
    elif i < 10:
        layers.append(dict(lc_kernel_size=7))
    else:
        layers.append(dict(lc_kernel_size=15))
c = Config(Wiki / 'transformer,lightconv', transformer, train_batch=10,
           light_conv=True, layers=layers,
           hebbian=True, hebbian_gamma=0.01, hebbian_T=500)
# print(c.train(env_gpu=1, steps=200000, opt='O0'))

c = Config(Wiki / 'gru',
    model=Proj / 'model.py', model_class='RNN', net='GRU', n_vocab=n_vocab, step_save=5000,
    train_batch=11, n_seq=2048, step_eval=1000, eval_batch=1, eval_chunk=8192,
    cutoffs=[5000, 25000, 50000], adaptive_ratio=4,
    lr=0.01, step_warmup=100, scheduler='rsqrt',
    num_layers=1, n_embed=512, n_hidden=2048, dropout=0.1,
    hebbian=True, hebbian_gamma=0.01, hebbian_T=500
)
# print(c.train(env_gpu=1, steps=200000))

sorted_hebbian = transformer.copy()
sorted_hebbian.update(dict(
    hebbian=True, hebbian_gamma=0.01, hebbian_T=500,
    vocab_sorted=True, cutoffs=[3500, 25000], n_embeds=[256, 64, 4]
))
c = Config(Wiki / 'sorted,hebbian', sorted_hebbian)
# print(c.train(env_gpu=2, steps=200000, opt='O1'))

sorted_hebbian_compound = sorted_hebbian.copy()
layers = []
for i in range(sorted_hebbian_compound['n_layers']):
    # more heads when shallow, less heads when deep
    # smaller inner when shallow
    if i < 4:
        layer = dict(n_inner=64, n_head=8, n_k=16, n_v=16)
    elif i < 8:
        layer = dict(n_inner=256, n_head=4, n_k=32, n_v=32)
    elif i < 10:
        layer = dict(n_inner=1024, n_head=4, n_k=64, n_v=64)
    elif i < 12:
        layer = dict(n_inner=1024, n_head=4, n_k=64, n_v=64)
    elif i < 16:
        layer = dict(n_inner=1536, n_head=2, n_k=128, n_v=128)
    layers.append(layer)

c = Config(Wiki / 'sorted,hebbian,compound', sorted_hebbian_compound, layers=layers, train_batch=18)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

sorted_hebbian_layer12 = sorted_hebbian.copy()
sorted_hebbian_layer12.update(dict(n_layers=12, train_batch=18))
c = Config(Wiki / 'sorted,hebbian,layer12', sorted_hebbian_layer12)

sorted_hebbian_large = sorted_hebbian.copy()
sorted_hebbian_large.update(dict(
    train_batch=7, train_chunk=1152,
    n_embeds=[512, 256, 16],
    n_embed=512, n_k=64, n_v=64, n_seq=96, n_inner=1536
))
c = Config(Wiki / 'sorted,hebbian,large', sorted_hebbian_large)
# print('S=$HOME/Research/exercises', c.train(env_gpu=lrange(8), steps=200000, opt='O1').split('\n')[1])

sorted_hebbian_softmax = sorted_hebbian.copy()
sorted_hebbian_softmax.update(dict(fix_softmax=True, train_batch=16))
c = Config(Wiki / 'sorted,hebbian,softmax', sorted_hebbian_softmax)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

sorted_hebbian_mask = sorted_hebbian.copy()
sorted_hebbian_mask.update(dict(mask_pad=True, fix_softmax=True, train_batch=16))
c = Config(Wiki / 'sorted,hebbian,mask', sorted_hebbian_mask)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

large = sorted_hebbian_softmax.copy()
large.update(dict(
    train_batch=7, train_chunk=1152,
    n_embeds=[512, 256, 16],
    n_embed=512, n_k=64, n_v=64, n_seq=96, n_inner=1536
))
c = Config(Wiki / 'large', large)
# print('S=$HOME/Research/exercises', c.train(env_gpu=lrange(8), steps=200000, opt='O1').split('\n')[1])

tie_layers = sorted_hebbian_mask.copy()
tie_layers.update(dict(
    tie_layers=True,
    train_batch=10,
    n_k=64, n_v=64, n_inner=2048,
))
c = Config(Wiki / 'tie_layers', tie_layers)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

tie_layers_4x4 = sorted_hebbian_mask.copy()
tie_layers_4x4.update(dict(
    tie_layers=[4, 4, 4, 4],
    train_batch=16
))
c = Config(Wiki / 'tie_layers,4x4', tie_layers_4x4)
# print(c.train(env_gpu=2, steps=200000, opt='O1'))

tie_layers_8x2 = sorted_hebbian_mask.copy()
tie_layers_8x2.update(dict(
    tie_layers=[2] * 8,
    train_batch=16
))
c = Config(Wiki / 'tie_layers,8x2', tie_layers_8x2)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

tie_layers_8x2_drop_02 = tie_layers_8x2.copy()
tie_layers_8x2_drop_02.update(dropout=0.2)
del tie_layers_8x2_drop_02['mask_pad']
c = Config(Wiki / '8x2,drop_02', tie_layers_8x2_drop_02)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

tie_layers_8x2_drop_005 = tie_layers_8x2_drop_02.copy()
tie_layers_8x2_drop_005.update(dropout=0.05)
c = Config(Wiki / '8x2,drop_005', tie_layers_8x2_drop_005)
# print(c.train(env_gpu=2, steps=200000, opt='O1'))

tie_layers_8x2_drop_0 = tie_layers_8x2_drop_02.copy()
tie_layers_8x2_drop_0.update(dropout=0.0)
c = Config(Wiki / '8x2,drop_0', tie_layers_8x2_drop_0)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

tie_layers_8x2_drop_0_k_24 = tie_layers_8x2_drop_0.copy()
tie_layers_8x2_drop_0_k_24.update(n_k=24, n_v=40)
c = Config(Wiki / '8x2,drop_0,k_24', tie_layers_8x2_drop_0_k_24)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

tie_layers_8x2_drop_0_k_40 = tie_layers_8x2_drop_0.copy()
tie_layers_8x2_drop_0_k_40.update(n_k=24, n_v=40)
c = Config(Wiki / '8x2,drop_0,k_40', tie_layers_8x2_drop_0_k_40)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

tie_layers_8x2_drop_0_attn_32 = tie_layers_8x2_drop_0.copy()
tie_layers_8x2_drop_0_attn_32.update(n_seq=32, train_batch=18)
c = Config(Wiki / '8x2,drop_0,attn_32', tie_layers_8x2_drop_0_attn_32)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

tie_layers_8x2_warmup_10000 = tie_layers_8x2.copy()
tie_layers_8x2_warmup_10000.update(step_warmup=10000, lr=0.005)
del tie_layers_8x2_warmup_10000['mask_pad']
c = Config(Wiki / '8x2,warmup_10000', tie_layers_8x2_warmup_10000)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

tie_layers_8x2_warmup_1000 = tie_layers_8x2_warmup_10000.copy()
tie_layers_8x2_warmup_1000.update(step_warmup=1000, lr=0.001)
c = Config(Wiki / '8x2,warmup_1000', tie_layers_8x2_warmup_1000)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

tie_layers_8x2_warmup_1000_drop_0 = tie_layers_8x2_warmup_1000.copy()
tie_layers_8x2_warmup_1000_drop_0.update(dropout=0)
c = Config(Wiki / '8x2,warmup_1000,drop_0', tie_layers_8x2_warmup_1000_drop_0)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

tie_layers_8x2_warmup_1000_drop_0_k_24 = tie_layers_8x2_warmup_1000.copy()
tie_layers_8x2_warmup_1000_drop_0_k_24.update(dropout=0, n_k=24, n_v=40)
c = Config(Wiki / '8x2,warmup_1000,drop_0,k_24', tie_layers_8x2_warmup_1000_drop_0_k_24)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

layers = []
for i in range(tie_layers_8x2['n_layers']):
    # more heads when shallow, less heads when deep
    # smaller inner when shallow
    if i < 2:
        layer = dict(n_inner=256, n_head=16, n_k=16, n_v=16)
    elif i < 4:
        layer = dict(n_inner=512, n_head=8, n_k=32, n_v=32)
    elif i < 6:
        layer = dict(n_inner=1024, n_head=4, n_k=64, n_v=64)
    else:
        layer = dict(n_inner=2048, n_head=2, n_k=128, n_v=128)
    layers.append(layer)
tie_layers_8x2_compound = tie_layers_8x2.copy()
tie_layers_8x2_compound.update(layers=layers)
del tie_layers_8x2_compound['mask_pad']
c = Config(Wiki / '8x2,compound', tie_layers_8x2_compound)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

layers = []
for i in range(tie_layers_8x2_compound['n_layers']):
    # smaller inner when shallow
    if i < 2:
        layer = dict(n_inner=256, n_head=8, n_k=8, n_v=8)
    elif i < 4:
        layer = dict(n_inner=512, n_head=8, n_k=16, n_v=16)
    elif i < 6:
        layer = dict(n_inner=1024, n_head=8, n_k=32, n_v=32)
    else:
        layer = dict(n_inner=2048, n_head=8, n_k=64, n_v=64)
    layers.append(layer)
tie_layers_8x2_compound2 = tie_layers_8x2_compound.copy()
tie_layers_8x2_compound2.update(layers=layers)
c = Config(Wiki / '8x2,compound2', tie_layers_8x2_compound2)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

tie_layers_8x2_attn_128 = tie_layers_8x2.copy()
tie_layers_8x2_attn_128.update(n_seq=128, train_batch=14)
del tie_layers_8x2_attn_128['mask_pad']
c = Config(Wiki / '8x2,attn_128', tie_layers_8x2_attn_128)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

tie_layers_8x2_attn_128_2 = tie_layers_8x2_attn_128.copy()
tie_layers_8x2_attn_128_2.update(train_chunk=1024, dropout=0.2, train_batch=15)
c = Config(Wiki / '8x2,attn_128_2', tie_layers_8x2_attn_128_2)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

tie_layers_8x2_attn_96 = tie_layers_8x2_attn_128.copy()
tie_layers_8x2_attn_96.update(n_seq=96, train_batch=15)
c = Config(Wiki / '8x2,attn_96', tie_layers_8x2_attn_96)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

universal = sorted_hebbian_mask.copy()
universal.update(dict(
    model_class='UniversalTransformer',
    train_batch=10,
    n_k=64, n_v=64, n_inner=2048,
    threshold=0.99, time_penalty=0.1
))
c = Config(Wiki / 'universal', universal)
# print(c.train(env_gpu=2, steps=200000, opt='O1'))

In [17]:
shallow = sorted_hebbian_softmax.copy()
shallow.update(
    n_layers=4, train_batch=20,
    dropout=0,
    step_warmup=1000, lr=0.001,
    train_chunk=1152, n_seq=96,
    n_inner=1536, n_k=32, n_v=64)
c = Config(Wiki / 'shallow', shallow).save(True)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_attn_128 = shallow.copy()
shallow_attn_128.update(n_seq=128)
c = Config(Wiki / 'shallow,attn_128', shallow_attn_128)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

shallow_attn_64 = shallow.copy()
shallow_attn_64.update(n_seq=64)
c = Config(Wiki / 'shallow,attn_64', shallow_attn_64)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_attn_72 = shallow.copy()
shallow_attn_72.update(n_seq=72)
c = Config(Wiki / 'shallow,attn_72', shallow_attn_72)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

layers = []
for i in range(shallow['n_layers']):
    if i < 2:
        layers.append(dict(lc_kernel_size=7))
    else:
        layers.append(dict(lc_kernel_size=15))
c = Config(Wiki / 'shallow,lightconv', shallow,
           light_conv=True, layers=layers, train_batch=16)
# print(c.train(env_gpu=3, steps=200000, opt='O0'))

shallow_lr_005 = shallow.copy()
shallow_lr_005.update(lr=0.005)
c = Config(Wiki / 'shallow,lr_005', shallow_lr_005)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

shallow_lr_01 = shallow.copy()
shallow_lr_01.update(lr=0.01)
c = Config(Wiki / 'shallow,lr_01', shallow_lr_01)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_lr_002 = shallow.copy()
shallow_lr_002.update(lr=0.002)
c = Config(Wiki / 'shallow,lr_002', shallow_lr_002)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

shallow_layers_5 = shallow.copy()
shallow_layers_5.update(n_layers=5, n_inner=1024, n_v=48)
c = Config(Wiki / 'shallow,layers_5', shallow_layers_5).save(True)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_layers_3 = shallow.copy()
shallow_layers_3.update(n_layers=3, n_inner=2048, n_k=48)
c = Config(Wiki / 'shallow,layers_3', shallow_layers_3).save(True)
# print(c.train(env_gpu=3, steps=200000, opt='O1'))

shallow_layers_6 = shallow.copy()
shallow_layers_6.update(n_layers=6, n_inner=1024, n_k=32, n_v=32)
c = Config(Wiki / 'shallow,layers_6', shallow_layers_6).save(True)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_layers_8 = shallow.copy()
shallow_layers_8.update(n_layers=8, n_inner=768, n_k=24, n_v=24)
c = Config(Wiki / 'shallow,layers_8', shallow_layers_8).save(True)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_layers_12 = shallow.copy()
shallow_layers_12.update(n_layers=12, n_inner=512, n_k=16, n_v=16)
c = Config(Wiki / 'shallow,layers_12', shallow_layers_12).save(True)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_layers_16 = shallow.copy()
shallow_layers_16.update(n_layers=16, n_inner=384, n_k=12, n_v=12, train_batch=19)
c = Config(Wiki / 'shallow,layers_16', shallow_layers_16)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

shallow_layers_8_attn_64 = shallow_layers_8.copy()
shallow_layers_8_attn_64.update(n_seq=64)
c = Config(Wiki / 'shallow,layers_8,attn_64', shallow_layers_8_attn_64)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

shallow_hebbian_1000 = shallow.copy()
shallow_hebbian_1000.update(hebbian_T=1000)
c = Config(Wiki / 'shallow,hebbian_1000', shallow_hebbian_1000)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_layers_8_gpu_8 = shallow_layers_8.copy()
shallow_layers_8_gpu_8.update(hebbian_T=4000)
c = Config(Wiki / 'shallow,layers_8,gpu_8', shallow_layers_8_gpu_8).save(True)
# print(c.train(env_gpu=lrange(8), steps=200000, opt='O1'))

shallow_layers_8_embed_128 = shallow_layers_8.copy()
shallow_layers_8_embed_128.update(n_embed=128, n_embeds=[128, 32, 2])
c = Config(Wiki / 'shallow,layers_8,embed_128', shallow_layers_8_embed_128).save(True)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_layers_8_cutoff_1000_10000 = shallow_layers_8.copy()
shallow_layers_8_cutoff_1000_10000.update(cutoffs=[1000, 10000], train_batch=17)
c = Config(Wiki / 'shallow,layers_8,cutoff_1000_10000', shallow_layers_8_cutoff_1000_10000).save(True)
# print(c.train(env_gpu=1, steps=200000, opt='O1'))

shallow_layers_8_inner_512 = shallow_layers_8.copy()
shallow_layers_8_inner_512.update(n_inner=512, n_k=16)
c = Config(Wiki / 'shallow,layers_8,inner_512', shallow_layers_8_inner_512).save(True)
# print(c.train(env_gpu=0, steps=200000, opt='O1'))

shallow_layers_8_gpu_2 = shallow_layers_8.copy()
shallow_layers_8_gpu_2.update(hebbian_T=1000)
c = Config(Wiki / 'shallow,layers_8,gpu_2', shallow_layers_8_gpu_2)
# print(c.save(True).train(env_gpu=[0, 1], steps=200000, opt='O1'))

shallow_layers_8_tt = shallow_layers_8.copy()
shallow_layers_8_tt.update(
    tensor_train=True, train_batch=16,
    modes_embed=[4, 16, 4], modes_inner=[8, 12, 8],
    ranks_e2i=[4, 16, 16, 4], ranks_i2e=[4, 16, 16, 4]
)
c = Config(Wiki / 'shallow,layers_8,tt', shallow_layers_8_tt)
# print(c.save(True).train(env_gpu=4, steps=200000, opt='O1'))

shallow_layers_8_gpu_4 = shallow_layers_8.copy()
shallow_layers_8_gpu_4.update(hebbian_T=2000)
c = Config(Wiki / 'shallow,layers_8,gpu_4', shallow_layers_8_gpu_4)
# print(c.save(True).train(env_gpu=lrange(4), steps=200000, opt='O1'))

shallow_layers_8_gpu_3 = shallow_layers_8.copy()
shallow_layers_8_gpu_3.update(hebbian_T=1500)
c = Config(Wiki / 'shallow,layers_8,gpu_3', shallow_layers_8_gpu_3)
# print(c.save(True).train(env_gpu=lrange(3), steps=200000, opt='O1'))

shallow_layers_8_cache = shallow_layers_8.copy()
shallow_layers_8_cache.update(train_batch=14, use_cache=True, cache_theta_init=0.016, cache_lambda_init=0.07, n_cache=2000)
c = Config(Wiki / 'shallow,layers_8,cache', shallow_layers_8_cache)
print(c.save(True).train(env_gpu=3, steps=200000, opt='O1'))

shallow_layers_8_cache_1500 = shallow_layers_8.copy()
shallow_layers_8_cache_1500.update(train_batch=15, use_cache=True, cache_theta_init=0.016, cache_lambda_init=0.07, n_cache=1500)
c = Config(Wiki / 'shallow,layers_8,cache_1500', shallow_layers_8_cache_1500)
print(c.save(True).train(env_gpu=1, steps=200000, opt='O1'))

shallow_layers_8_cache_1000 = shallow_layers_8.copy()
shallow_layers_8_cache_1000.update(train_batch=16, use_cache=True, cache_theta_init=0.016, cache_lambda_init=0.07, n_cache=1000)
c = Config(Wiki / 'shallow,layers_8,cache_1000', shallow_layers_8_cache_1000)
print(c.save(True).train(env_gpu=0, steps=200000, opt='O1'))

cd /data/scratch/zxyan/micronet/wikitext-103/shallow,layers_8,cache
CUDA_VISIBLE_DEVICES=3 python3 ../../main.py . steps=200000 opt_level=O1
cd /data/scratch/zxyan/micronet/wikitext-103/shallow,layers_8,cache_1500
CUDA_VISIBLE_DEVICES=1 python3 ../../main.py . steps=200000 opt_level=O1
cd /data/scratch/zxyan/micronet/wikitext-103/shallow,layers_8,cache_1000
CUDA_VISIBLE_DEVICES=0 python3 ../../main.py . steps=200000 opt_level=O1


# Cache Parameter Search

In [None]:
# loads the model (net and step) for evaluation
c = Config(Wiki / 'shallow,layers_8').load()
net, step = c.var(device='cuda:3').load_model('max')
from model import evaluate
data_val = SequentialIterator(c, c.eval_batch, split='valid')
perplexity = {}
print('Model at step', step)

In [5]:
thetas = [1e-2, 1e-3, 1e-4]
thetas = [1e-1, 5e-2, 2e-2, 1e-2]
thetas = [1e-2, 5e-3, 2e-3, 1e-3]
thetas = [1e-2, 9e-3, 8e-3, 7e-3, 6e-3, 5e-3, 4e-3, 3e-3, 2e-3, 1e-3]
thetas = np.arange(3e-2, 1e-2, -2e-3)
# thetas = [0.02, 0.018, 0.016, 0.015, 0.014, 0.012, 0.01]

lambdas = [0.05, 0.1, 0.2, 0.3, 0.4]
lambdas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
lambdas = [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.15, 0.2]
lambdas = [0.06, 0.07, 0.08, 0.09]
# lambdas = [0.05, 0.055, 0.06, 0.065, 0.07]
# lambdas = np.arange(0.03, 0, -0.01)

# search over cache parameters
for theta in thetas:
    for lam in lambdas:
        if (theta, lam) in perplexity:
            continue
        net.loss.cache_keys = net.loss.cache_values = None
        perplexity[theta, lam] = evaluate(c.var(use_cache=True, n_cache=1500, cache_theta=theta, cache_lambda=lam), data_val, net)['perplexity']

In [6]:
df = pd.DataFrame([[perplexity[theta, lam] for theta in thetas] for lam in lambdas], index=lambdas, columns=thetas)
df.index.name = 'lambda'
df.columns.name = 'theta'
df

theta,0.03,0.027999999999999997,0.025999999999999995,0.023999999999999994,0.021999999999999992,0.01999999999999999,0.017999999999999988,0.015999999999999986,0.013999999999999985,0.011999999999999983
lambda,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.06,33.295599,33.216659,33.144749,33.084193,33.040807,33.021898,33.036002,33.093076,33.204457,33.382024
0.07,33.248082,33.163625,33.086639,33.021961,32.975755,32.95576,32.971164,33.032639,33.152288,33.342913
0.08,33.227692,33.137938,33.056195,32.987542,32.938667,32.917706,32.934395,33.000136,33.127739,33.330975
0.09,33.229554,33.134659,33.048307,32.975912,32.924448,32.90257,32.920555,32.990413,33.12582,33.341219


In [7]:
data_test = SequentialIterator(c, c.eval_batch, split='test')
evaluate(c.var(use_cache=True, n_cache=1500, cache_theta=0.02, cache_lambda=0.09), data_test, net)

{'loss': 3.5233027935028076, 'perplexity': 33.89619592549835, 'time': 1.0}

# Cache from Training

In [2]:
# loads the model (net and step) for evaluation
c = Config(Wiki / 'quant_aware', device='cuda:1').load()
from model import get_net
from main import evaluate
# from quantized_model import get_net, evaluate
net = get_net(c)
net, step = c.init_model(net, step='max', train=False)
data_val = SequentialIterator(c, c.eval_batch, split='valid')
data_test = SequentialIterator(c, c.eval_batch, split='test')
print('Model at step', step)

Model at step 20000


In [3]:
net.loss.cache_keys = net.loss.cache_values = None
evaluate(c, data_val, net)

{'loss': 3.5198779106140137, 'perplexity': 33.780303995504525, 'time': 1.0}

In [4]:
net.loss.cache_keys = net.loss.cache_values = None
evaluate(c, data_test, net)

{'loss': 3.5439910888671875, 'perplexity': 34.60475460205126, 'time': 2.0}