In [1]:
%load_ext autoreload
%autoreload 2

from datasets import DataCocoFeat
from models import LSTMCaption
from optimers import OptimerAdam
from utils import check_gradient, show_training_info

In [2]:
# Build the notebook-wide data loader for the COCO captioning features
# (requested with pca_features=True) and print its summary.
dataloader = DataCocoFeat(
    './datasets/coco_captioning/',
    pca_features=True,
)
dataloader.show_info()

Training data shape:  (400135, 512)
Training labels shape:  (400135, 17)
Validation data shape:  (195954, 512)
Validation labels shape:  (195954, 17)
Number of training images:  82783
Number of validation images:  40504
Number of words:  1004


In [26]:
def get_init_model(hyperparams, device='', seed=101, data=None):
    """Build a freshly constructed LSTMCaption model.

    Args:
        hyperparams: dict of model hyper-parameters, forwarded unchanged to
            LSTMCaption as ``hyperparams``.
        device: device selector, forwarded unchanged to LSTMCaption.
        seed: seed forwarded to LSTMCaption. Defaults to 101, the value that
            used to be hard-coded here, so existing calls behave as before.
        data: loader whose ``x_train.shape[1]`` is used as the model's first
            constructor argument. Defaults to the notebook-level
            ``dataloader``, which is what this function always used before.

    Returns:
        The newly constructed LSTMCaption instance.
    """
    # Fall back to the global loader only when the caller does not supply one;
    # this keeps old call sites working while allowing explicit data lineage.
    source = dataloader if data is None else data
    input_dim = source.x_train.shape[1]

    # The second positional argument is passed as None, exactly as before.
    return LSTMCaption(input_dim, None, hyperparams=hyperparams, seed=seed, device=device)


def test_model(model, data, idx_train, idx_val):
    caps_train = model.predict(data.x_train[idx_train])
    caps_val = model.predict(data.x_val[idx_val])

    # the predicted captions
    for i in range(len(caps_train)):
        print('Predicted training data:', data.decode_captions(caps_train[i]))
        data.show_by_index(idx_train[i], data_type='train')

    for i in range(len(caps_val)):
        print('Predicted validation data:', data.decode_captions(caps_val[i]))
        data.show_by_index(idx_val[i], data_type='val')

# Sanity check

### Init loss

In [27]:
# Build a small model for the sanity checks (reg=0, num_hidden=32,
# num_vector=32, init_scale=None).
model = get_init_model(
    {
        'word_to_idx': dataloader.data['word_to_idx'],
        'reg': 0.,
        'num_hidden': 32,
        'num_vector': 32,
        'init_scale': None,
    },
    device='',
)

# Run backward() once on the first 100 training samples and report the value
# it returns as the initial loss.
loss = model.backward(
    dataloader.x_train[0: 100, :],
    dataloader.y_train[0: 100],
)
print('Sanity check result:')
print('Init loss is', loss)

Sanity check result:
Init loss is 
[75.08026]
<NDArray 1 @cpu(0)>


### Gradient check

In [33]:
# Run check_gradient on the first 100 training samples; its report lists the
# numerical gradient, the calculated gradient and their relative error.
# A single round is executed; widen the range to repeat the check.
for i in range(1):
    print('\nGradient check result - ', i + 1, ':')
    check_gradient(
        model,
        dataloader.x_train[0: 100, :],
        dataloader.y_train[0: 100],
        h=0.1,  # presumably the finite-difference step size - TODO confirm
    )


Gradient check result -  1 :
Layer | Key | Numerical gradient | Calculated gradient | Relative error
<class 'models.layers.layers.Linear'> W 0.0078582763671875 0.007861453 0.0004041725154276654
<class 'models.layers.layers.Linear'> b 0.01445770263671875 0.01446102 0.00022942721582602075
<class 'models.layers.rnn_layers.WordEmbedding'> W_embed 0.0 0.0 0.0
<class 'models.layers.rnn_layers.LSTM'> Wx 3.814697265625e-05 3.839045e-05 0.006362351745829909
<class 'models.layers.rnn_layers.LSTM'> Wh 0.0006103515625 0.0006280303 0.02855131397354991
<class 'models.layers.rnn_layers.LSTM'> b -0.0014495849609375 -0.0014397334 0.006819276285125429
<class 'models.layers.rnn_layers.LinearForRNN'> W -0.00026702880859375 -0.0002672776 0.0009312242723149029
<class 'models.layers.rnn_layers.LinearForRNN'> b 0.010833740234375 0.01081694 0.00155192609738076


### Overfit small dataset

In [None]:
# load small dataset (64 training and 64 validation samples, order_by='random')
smallloader = DataCocoFeat('./datasets/coco_captioning/', pca_features=True,
                           num_train=64, num_val=64, order_by='random')

# init model
# The vocabulary comes from `smallloader`, the loader this model is trained on
# and whose decode_captions() is later used on its predictions, instead of
# from the full-size `dataloader`.
# NOTE(review): get_init_model still reads the input size from `dataloader`;
# both loaders are created with pca_features=True - confirm the sizes match.
model = get_init_model({'word_to_idx': smallloader.data['word_to_idx'],
                        'reg': 0., 'num_hidden': 512, 'num_vector': 256})

# train model: 200 iterations with batch size 32, progress printed every 10
# iterations, accuracy checks disabled
optimer = OptimerAdam({'learn_rate': 5e-3, 'learn_rate_decay': 0.995, 'num_iters': 200, 'batch_size': 32},
                      print_every=10, check_val_acc=False, check_train_acc=False)

optimer.train(model, smallloader)

# save model so the evaluation cell below can reload it
model.save('./saves/LSTMCaption/model_small.pkl')

In [None]:
# plot loss curve
# Uses the `optimer` object created in the training cell above, so that cell
# must have been run in the current kernel session first.
show_training_info(optimer)

In [None]:
# Reload the model written by the small-dataset training cell.
# NOTE(review): the .pkl extension suggests pickle - only load trusted files.
model = LSTMCaption.load('./saves/LSTMCaption/model_small.pkl')

# Caption a few samples from each split of the small dataset.
test_model(
    model,
    smallloader,
    idx_train=[0, 1, 3],
    idx_val=[0, 1, 2],
)

# Run time test

In [45]:
# Hyper-parameters shared by both timing runs.
hyperparam = {
    'word_to_idx': dataloader.data['word_to_idx'],
    'reg': 0.,
    'num_hidden': 1024,
    'num_vector': 512,
    'init_scale': None,
}

# Two models built from the same hyper-parameters, differing only in `device`.
# NOTE(review): device='' is labelled "GPU" below, but the sanity-check cell
# that also used device='' printed a loss living on cpu(0) - confirm which
# device the empty string actually selects.
model1 = get_init_model(hyperparam, device='cpu')
model2 = get_init_model(hyperparam, device='')

# One backward() call per model on the first 128 training samples, with
# print_time=True so the per-stage timings are printed.
print('\nRun time for CPU model:')
loss1 = model1.backward(
    dataloader.x_train[0: 128, :],
    dataloader.y_train[0: 128],
    print_time=True,
)
print('\nRun time for GPU model:')
loss2 = model2.backward(
    dataloader.x_train[0: 128, :],
    dataloader.y_train[0: 128],
    print_time=True,
)


Run time for CPU model:
Forward time: 1.3654310703277588
    Input linear forward time: 0.00575709342956543
    Word embedding forward time: 0.0016629695892333984
    RNN forward time: 1.2377729415893555
    Output linear forward time: 0.12023806571960449

Backward time: 2.060302972793579
    Loss calculate time: 0.06282210350036621
    Output linear backward time: 0.17243194580078125
    RNN backward time: 1.7108380794525146
    Word embedding backward time: 0.10556697845458984
    Input linear backward time: 0.008643865585327148

Reg time: 0.015360116958618164

Run time for GPU model:
Forward time: 0.5149998664855957
    Input linear forward time: 0.005795955657958984
    Word embedding forward time: 0.00115203857421875
    RNN forward time: 0.4602680206298828
    Output linear forward time: 0.047783851623535156

Backward time: 2.6620171070098877
    Loss calculate time: 0.07166004180908203
    Output linear backward time: 0.14072704315185547
    RNN backward time: 2.317404985427856

# Train LSTM-Caption model

### Train Model

In [None]:
# init model
# Set to True to continue from the saved checkpoint instead of starting from a
# freshly initialised model. This replaces a literal `if True:` that left the
# resume branch unreachable without editing the code.
RESUME_FROM_CHECKPOINT = False

if RESUME_FROM_CHECKPOINT:
    # NOTE(review): the .pkl extension suggests pickle - only load trusted files.
    model = LSTMCaption.load('./saves/LSTMCaption/model.pkl')
else:
    model = get_init_model({'word_to_idx': dataloader.data['word_to_idx'],
                            'reg': 0.01, 'num_hidden': 1024, 'num_vector': 512})

# train model: 2000 iterations with batch size 256, progress printed every 100
# iterations, accuracy checks disabled
optimer = OptimerAdam({'learn_rate': 2e-3, 'learn_rate_decay': 0.995, 'num_iters': 2000, 'batch_size': 256},
                      print_every=100, check_val_acc=False, check_train_acc=False)

optimer.train(model, dataloader)

# save model (same path the resume branch and the evaluation cell load from)
model.save('./saves/LSTMCaption/model.pkl')

In [None]:
# plot loss curve
# Uses the `optimer` object created in the training cell above, so that cell
# must have been run in the current kernel session first.
show_training_info(optimer)

In [None]:
# Reload the model written by the full training cell.
# NOTE(review): the .pkl extension suggests pickle - only load trusted files.
model = LSTMCaption.load('./saves/LSTMCaption/model.pkl')

# Caption three training samples and validation samples 10 through 15.
test_model(
    model,
    dataloader,
    idx_train=[0, 1, 4],
    idx_val=list(range(10, 16)),
)