In [None]:
!pip install --user tensorboardX
!pip install --user --force https://github.com/chengs/tqdm/archive/colab.zip

In [31]:
import math
import os
from itertools import islice

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

import model_process
from math_dataset import (
    LazyFileMathDataset,
    MathDatasetManager
)
from math_dataset import (
    random_split_dataset, MAX_ANSWER_SZ, MAX_QUESTION_SZ, VOCAB_SZ,
    question_answer_to_position_batch_collate_fn, question_to_position_batch_collate_fn
)
from tensorboard import Tensorboard
from transformer.Models import Transformer

# Record the PyTorch build this notebook was executed with.
print("Torch Version", torch.__version__)

# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so changes to the local .py files are picked up live.
%reload_ext autoreload
%autoreload 2

Torch Version 1.0.0.dev20190402


In [8]:
# Root directory of the mathematics_dataset v1.0 files. The default is the path this
# notebook was originally run with; set the MATH_DATASET_ROOT environment variable
# to run against a copy stored elsewhere.
MATH_DATASET_ROOT = os.environ.get(
    "MATH_DATASET_ROOT", "/home/mandubian/datasets/mathematics_dataset/v1.0"
)

# Open a single module file of the 'train-easy' set.
my_ds = LazyFileMathDataset(
    os.path.join(MATH_DATASET_ROOT, "train-easy", "algebra__polynomial_roots_composed.txt"),
    lazy_load=True
)
# Per the recorded output, each item is a dict with the raw question/answer text
# ('q', 'a') and their encoded uint8 arrays ('q_enc', 'a_enc').
print(my_ds[0])
print(my_ds[500000])
# presumably caps how many elements the dataset exposes -- TODO confirm in math_dataset
my_ds.set_max_elements(50)
print(len(my_ds))
print(my_ds[49])

{'q': 'Let d = -25019/90 - -278. Let v(j) be the third derivative of 0 + 1/27*j**3 - d*j**5 + 1/54*j**4 + 3*j**2 + 0*j. Suppose v(o) = 0. What is o?', 'q_enc': array([ 2, 45, 70, 85,  1, 69,  1, 30,  1, 14, 19, 22, 17, 18, 26, 16, 26,
       17,  1, 14,  1, 14, 19, 24, 25, 15,  1, 45, 70, 85,  1, 87,  9, 75,
       10,  1, 67, 70,  1, 85, 73, 70,  1, 85, 73, 74, 83, 69,  1, 69, 70,
       83, 74, 87, 66, 85, 74, 87, 70,  1, 80, 71,  1, 17,  1, 12,  1, 18,
       16, 19, 24, 11, 75, 11, 11, 20,  1, 14,  1, 69, 11, 75, 11, 11, 22,
        1, 12,  1, 18, 16, 22, 21, 11, 75, 11, 11, 21,  1, 12,  1, 20, 11,
       75, 11, 11, 19,  1, 12,  1, 17, 11, 75, 15,  1, 52, 86, 81, 81, 80,
       84, 70,  1, 87,  9, 80, 10,  1, 30,  1, 17, 15,  1, 56, 73, 66, 85,
        1, 74, 84,  1, 80, 32,  3], dtype=uint8), 'a': '-1/3, 1', 'a_enc': array([ 2, 14, 18, 16, 20, 13,  1, 18,  3], dtype=uint8)}
{'q': 'Let t(v) be the second derivative of 1/16*v**4 + 3/8*v**2 - 3*v + 1/4*v**3 + 0. Factor t(s).', 'q_en

In [9]:
# Root directory of the mathematics_dataset v1.0 files. The default is the path this
# notebook was originally run with; set the MATH_DATASET_ROOT environment variable
# to run against a copy stored elsewhere.
MATH_DATASET_ROOT = os.environ.get(
    "MATH_DATASET_ROOT", "/home/mandubian/datasets/mathematics_dataset/v1.0"
)

# Per the recorded output, constructing the manager reports the categories and
# types it found under the root directory.
mdsmgr = MathDatasetManager(MATH_DATASET_ROOT)

initialized MultiFilesMathDataset with categories ['algebra', 'numbers', 'polynomials', 'arithmetic', 'measurement', 'comparison', 'probability', 'calculus'] and types ['train-easy', 'train-medium', 'train-hard', 'interpolate', 'extrapolate']


In [10]:
# Inspect what the manager knows about: the categories, the dataset types
# (train-easy ... extrapolate in the recorded output) and the modules of the
# 'arithmetic' category. Each getter returns a dict_keys view in the recorded output.
print("categories", mdsmgr.get_categories())
print("types", mdsmgr.get_types())
print("modules of arithmetic", mdsmgr.get_modules_for_category('arithmetic'))

categories dict_keys(['algebra', 'numbers', 'polynomials', 'arithmetic', 'measurement', 'comparison', 'probability', 'calculus'])
types dict_keys(['train-easy', 'train-medium', 'train-hard', 'interpolate', 'extrapolate'])
modules of arithmetic dict_keys(['mul', 'add_or_sub_in_base', 'simplify_surd', 'mul_div_multiple', 'mixed', 'nearest_integer_root', 'div', 'add_or_sub', 'add_sub_multiple', 'add_sub_multiple_longer', 'mul_div_multiple_longer', 'div_big', 'mul_big', 'mixed_longer', 'add_or_sub_big'])


In [11]:
# Build one dataset from the 'algebra' category at the 'train-easy' type;
# the recorded output lists eight algebra modules being added.
ds = mdsmgr.build_dataset_from_category('algebra', 'train-easy')
print("size", len(ds))


adding category algebra/../train-easy
added module algebra/polynomial_roots_composed/train-easy
added module algebra/polynomial_roots/train-easy
added module algebra/linear_1d_composed/train-easy
added module algebra/linear_2d_composed/train-easy
added module algebra/linear_2d/train-easy
added module algebra/sequence_nth_term/train-easy
added module algebra/linear_1d/train-easy
added module algebra/sequence_next_term/train-easy
size 5333328


In [8]:
# Dataset for the single module arithmetic/add_or_sub at the 'train-easy' type
# (666666 elements in the recorded output).
ds_arithmetic_add_or_sub_easy = mdsmgr.build_dataset_from_module('arithmetic', 'add_or_sub', 'train-easy')
print("size", len(ds_arithmetic_add_or_sub_easy))

size 666666


In [13]:
# Dataset for the single module arithmetic/add_or_sub at the 'train-hard' type
# (666666 elements in the recorded output). This is the dataset later split into
# training and validation sets.
ds_arithmetic_add_or_sub_hard = mdsmgr.build_dataset_from_module('arithmetic', 'add_or_sub', 'train-hard')
print("size", len(ds_arithmetic_add_or_sub_hard))

size 666666


In [14]:
# Dataset for the single module arithmetic/add_or_sub at the 'interpolate' type
# (10000 elements in the recorded output); used later for the interpolate loader.
ds_arithmetic_add_or_sub_interpolate = mdsmgr.build_dataset_from_module('arithmetic', 'add_or_sub', 'interpolate')
print("size", len(ds_arithmetic_add_or_sub_interpolate))

size 10000


In [15]:
# Seed the global RNGs of both PyTorch and NumPy so reruns start from the same state.
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)  # NumPy keeps its own global RNG, separate from PyTorch's

# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines
# without CUDA instead of failing when tensors are first moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device", device)

device cuda


In [16]:
# Identifiers of this run; they are handed to the TensorBoard writer and to the
# training call further down.
exp_name, unique_id = "math_ds_arithmetic_add_or_sub_hard", "20190423_1800"

# Source and target share the same vocabulary; one extra slot is reserved for PAD.
# Encoder/decoder sequence limits follow the dataset's question/answer maxima.
model = Transformer(
    len_max_seq_encoder=MAX_QUESTION_SZ,
    len_max_seq_decoder=MAX_ANSWER_SZ,
    n_src_vocab=1 + VOCAB_SZ,
    n_tgt_vocab=1 + VOCAB_SZ,
)


In [17]:
# Adam over every parameter of the model.
# An earlier variant of this cell (previously left as commented-out code) only
# passed parameters with requires_grad set and used lr=6e-4.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=6e-6,
    betas=(0.9, 0.995),
    eps=1e-9,
)


In [19]:
# For a quick experiment on a small subset, split
# list(islice(ds_arithmetic_add_or_sub_easy, 128*100)) instead of the full dataset.

# 90/10 train/validation split of the 'train-hard' add_or_sub dataset.
# random_split_dataset is imported by name; the module name `math_dataset` is never
# bound in this notebook, so a module-qualified call would raise NameError.
train_ds, val_ds = random_split_dataset(ds_arithmetic_add_or_sub_hard, split_rate=0.9)

# Only the training loader shuffles; validation and interpolate keep a fixed order.
train_loader = data.DataLoader(
    train_ds, batch_size=128, shuffle=True, num_workers=4,
    collate_fn=question_answer_to_position_batch_collate_fn)

val_loader = data.DataLoader(
    val_ds, batch_size=128, shuffle=False, num_workers=4,
    collate_fn=question_answer_to_position_batch_collate_fn)

# The 'interpolate' dataset is evaluated as-is, without splitting.
interpolate_loader = data.DataLoader(
    ds_arithmetic_add_or_sub_interpolate, batch_size=128, shuffle=False, num_workers=4,
    collate_fn=question_answer_to_position_batch_collate_fn)


In [71]:
from checkpoints import restore_checkpoint

# Reload the best-validation checkpoint of the 'easy' run and rebind the notebook's
# run identifiers, model, optimizer and best-so-far metrics from the returned state.
state = restore_checkpoint(
    model,
    optimizer,
    "checkpoints/math_ds_arithmetic_add_or_sub_easy_2019-04-22T13:32:24_validation_best.pth",
)
exp_name, unique_id = state["exp_name"], state["unique_id"]
model, optimizer = state["model"], state["optimizer"]
epoch, best_acc, best_loss = state["epoch"], state["acc"], state["loss"]

# Echo what was restored so the run being resumed is visible in the output.
print("exp_name", exp_name)
print("unique_id", unique_id)
print("epoch", epoch)
print("best_acc", best_acc)
print("best_loss", best_loss)

Extracting state from checkpoints/math_ds_arithmetic_add_or_sub_easy_2019-04-22T13:32:24_validation_best.pth
acc 0.9608616416082527
loss 0.171765097012516
exp_name math_ds_arithmetic_add_or_sub_easy
unique_id 2019-04-22T13:32:24
epoch 13
best_acc 0.9608616416082527
best_loss 0.171765097012516


In [72]:
# Move the model onto the device selected earlier in the notebook.
model = model.to(device)


In [21]:
# TensorBoard writer for this run; per the recorded output, events are written
# locally under runs/<exp_name>_<unique_id>.
tb = Tensorboard(exp_name, unique_name=unique_id)

Writing TensorBoard events locally to runs/math_ds_arithmetic_add_or_sub_hard_20190423_1800


In [None]:
# Start training with epochs=100. Per the recorded output, every epoch reports
# training, validation and interpolate loss/perplexity/accuracy and writes a
# checkpoint after the validation and interpolate passes.
model_process.train(
    exp_name, unique_id,
    model, 
    train_loader, val_loader, interpolate_loader,
    optimizer, device,
    epochs=100, tb=tb, log_interval=100,
    # Uncomment to continue from the values restored from a checkpoint
    # (epoch, best_acc, best_loss) instead of starting fresh:
    #start_epoch=epoch+1, best_valid_accu=best_acc, best_valid_loss=best_loss
)

  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 0 ]
Adding group train to writers (dict_keys([]))


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 2.6261098524396793, ppl:  13.81990, accuracy: 26.954 %, elapse: 1326301.921ms




Adding group eval to writers (dict_keys(['train']))
[Validation]  loss: 2.009183946017596,  ppl:  7.45723, accuracy: 32.786 %, elapse: 59049.417ms
Checkpointing Validation Model...




Adding group interpolate to writers (dict_keys(['train', 'eval']))
[Interpolate]  loss: 1.9652058213868955,  ppl:  7.13638, accuracy: 34.904 %, elapse: 8521.743ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 1 ]


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 2.4193844069497454, ppl:  11.23894, accuracy: 34.804 %, elapse: 1330191.011ms




[Validation]  loss: 1.6934560433494328,  ppl:  5.43824, accuracy: 42.877 %, elapse: 59087.733ms
Checkpointing Validation Model...




[Interpolate]  loss: 1.629926490597652,  ppl:  5.10350, accuracy: 45.579 %, elapse: 8520.825ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 2 ]


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 2.2095352993943713, ppl:  9.11148, accuracy: 43.102 %, elapse: 1329812.416ms




[Validation]  loss: 1.4383366662863577,  ppl:  4.21368, accuracy: 51.893 %, elapse: 59110.307ms
Checkpointing Validation Model...




[Interpolate]  loss: 1.3729854025081536,  ppl:  3.94712, accuracy: 54.870 %, elapse: 8493.141ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 3 ]


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 2.0523151830481385, ppl:  7.78591, accuracy: 49.178 %, elapse: 1329342.120ms




[Validation]  loss: 1.2768513429233892,  ppl:  3.58533, accuracy: 57.191 %, elapse: 59105.786ms
Checkpointing Validation Model...




[Interpolate]  loss: 1.2048870047649358,  ppl:  3.33638, accuracy: 60.082 %, elapse: 8516.227ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 4 ]


 92%|█████████▏| 4295/4688 [20:16<01:51,  3.52it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 1.642367266533072, ppl:  5.16739, accuracy: 64.373 %, elapse: 1328794.714ms




[Validation]  loss: 0.8363425628310155,  ppl:  2.30791, accuracy: 71.465 %, elapse: 59025.042ms
Checkpointing Validation Model...




[Interpolate]  loss: 0.7751030592908736,  ppl:  2.17082, accuracy: 73.799 %, elapse: 8507.756ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 9 ]


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 1.5750303601423803, ppl:  4.83089, accuracy: 67.280 %, elapse: 1328182.366ms




[Validation]  loss: 0.7438201703372517,  ppl:  2.10396, accuracy: 75.002 %, elapse: 58996.402ms
Checkpointing Validation Model...




[Interpolate]  loss: 0.6848689156538703,  ppl:  1.98351, accuracy: 77.031 %, elapse: 8509.625ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 10 ]


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 1.5016073457229309, ppl:  4.48890, accuracy: 70.429 %, elapse: 1328137.709ms




[Validation]  loss: 0.681095479055235,  ppl:  1.97604, accuracy: 77.133 %, elapse: 58999.226ms
Checkpointing Validation Model...




[Interpolate]  loss: 0.6350420471742464,  ppl:  1.88710, accuracy: 78.705 %, elapse: 8518.597ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 11 ]


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 1.4369096315441388, ppl:  4.20767, accuracy: 73.165 %, elapse: 1328068.570ms




[Validation]  loss: 0.612573622797567,  ppl:  1.84517, accuracy: 79.411 %, elapse: 59014.692ms
Checkpointing Validation Model...




[Interpolate]  loss: 0.5709221237510936,  ppl:  1.76990, accuracy: 80.800 %, elapse: 8514.115ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 12 ]


  0%|          | 0/521 [00:00<?, ?it/s]

[Training]  loss: 1.37111557850934, ppl:  3.93974, accuracy: 75.920 %, elapse: 1327493.639ms




[Validation]  loss: 0.5606734197226444,  ppl:  1.75185, accuracy: 81.278 %, elapse: 59029.054ms
Checkpointing Validation Model...




[Interpolate]  loss: 0.5205480797488986,  ppl:  1.68295, accuracy: 82.621 %, elapse: 8505.996ms
Checkpointing Interpolate Model...


  0%|          | 0/4688 [00:00<?, ?it/s]

[ Epoch 13 ]


 74%|███████▎  | 3454/4688 [16:19<05:50,  3.52it/s]

In [81]:
from transformer.Generator import Generator
from math_dataset import np_encode_string

# Beam-search generator around the trained model: beam width 5, best hypothesis only.
generator = Generator(
    model,
    device,
    beam_size=5,
    n_best=1,
    max_token_seq_len=MAX_QUESTION_SZ,
)

# Encode each free-text question before handing it to the loader.
pred = [np_encode_string(question) for question in ["what is 10 + 32?"]]
pred_loader = data.DataLoader(
    pred,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    collate_fn=question_to_position_batch_collate_fn,
)

# Run the prediction; the recorded output shows the encoded batch and the decoded tokens.
model_process.predict(generator, pred_loader, device)


batch_qs tensor([[ 2, 88, 73, 66, 85,  1, 74, 84,  1, 18, 17,  1, 12,  1, 20, 19, 32,  3]])
batch_qs_pos tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18]])
4       2       "       
3       2       "       


In [13]:
'''A wrapper class for optimizer '''
import numpy as np

class ScheduledOptim():
    '''Optimizer wrapper that recomputes the learning rate on every step.

    The learning rate applied at step ``s`` (counted from 1) is::

        d_model ** -0.5 * min(s ** -0.5, s * n_warmup_steps ** -1.5)

    and is written into every parameter group of the wrapped optimizer.
    '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        '''Store the wrapped optimizer and the schedule parameters.

        optimizer:      object exposing ``step()``, ``zero_grad()`` and ``param_groups``.
        d_model:        value whose inverse square root is the base learning rate.
        n_warmup_steps: warm-up length used by the schedule.
        '''
        self.init_lr = np.power(d_model, -0.5)
        self.n_current_steps = 0
        self.n_warmup_steps = n_warmup_steps
        self._optimizer = optimizer

    def step_and_update_lr(self):
        "Refresh the learning rate, then step the wrapped optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Clear the gradients through the wrapped optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        "Return the schedule multiplier for the current step count"
        step = self.n_current_steps
        decay_term = np.power(step, -0.5)
        warmup_term = np.power(self.n_warmup_steps, -1.5) * step
        # The smaller of the two terms wins.
        return np.min([decay_term, warmup_term])

    def _update_learning_rate(self):
        ''' Advance the step counter and push the new rate to all param groups '''

        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr


In [70]:
# Hand-assembled checkpoint: the run identifiers, accuracy, loss and epoch are literal
# values typed into this cell, combined with the current model/optimizer state dicts.
state = {
    "exp_name": "math_ds_arithmetic_add_or_sub_easy",
    "unique_id": "2019-04-22T13:32:24",
    "type": "train",
    "model": model.state_dict(),
    "optimizer": optimizer.state_dict(),
    "acc": 0.9319619128828253,
    "loss": 0.2518192112851183,
    "epoch": 10,
}
# NOTE(review): this writes to the same path the restore cell reads from, so running
# it replaces that checkpoint file.
torch.save(state, "checkpoints/math_ds_arithmetic_add_or_sub_easy_2019-04-22T13:32:24_validation_best.pth")


In [None]:
from rotating_checkpoint import restore_checkpoint

# Restore from a checkpoint file using the restore_checkpoint imported from
# rotating_checkpoint in this cell.
# NOTE(review): an earlier cell imports restore_checkpoint from `checkpoints`; this
# import rebinds the same name -- confirm which implementation is intended.
state = restore_checkpoint(model, optimizer, "math_ds_arithmetic_add_or_sub_easy_2019-04-22T13:32:24.pth")
print(state)