In [1]:
import torch
from torch import optim,nn
from torch.autograd import Variable
import numpy as np
import time
from models.data_loader import DataLoader
from models.retain_bidirectional import RETAIN

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=3


In [3]:
# hyperparameters
epochs = 30
batch_size = 50
max_seq_length = 100
min_seq_length = 5
num_classes = 268
emb_size = 128
hid_size = 128
lr = 0.001
cuda_flag = True

# data loader
D = DataLoader(batch_size=batch_size,
   data_dir='data/batches/',
    mode='train', max_seq_length=max_seq_length, min_seq_length=min_seq_length)

# import model and optimization settings
model = RETAIN(emb_size,hid_size,num_classes,cuda_flag)
model.release = True

criterion = nn.CrossEntropyLoss()
cnt = 0
if cuda_flag:
    model.cuda()
    criterion.cuda()

In [4]:
# train model
lr_list = [0.001, 0.0003, 0.0001, 0.00003, 0.00001, 0.000003, 0.000001]
lr_counter = 0
lr = lr_list[lr_counter]
opt = optim.Adam(model.parameters(), lr=lr)
loss_list = []
loss_mean = 0.0
file_cnt = 0
cnt = 0
loss_dict = dict()

In [None]:
len_train = len(D.train_list)
while file_cnt<(epochs*len(D.train_list)):
    idx = file_cnt%(len_train)
    file = D.train_list[idx]
    if file not in loss_dict:
        loss_dict[file] = []
    print("Epoch %d [%d, %d/%d] - opening file %s" %(((file_cnt+1)/len_train), file_cnt, idx, len_train, D.train_list[idx]))
    file_num = int(file.split('_')[1].split('.')[0])
    D.batch_size = int(40000/file_num)
    D.load_batch_file(file)
    loss_list = []
    for i in range(D.batch_count):
        cnt+=1
        input_list, targets = D.get_batch()
        start = time.time()
        inputs = model.list_to_tensor(input_list)
        outputs = model(inputs)
        targets = Variable(torch.LongTensor(targets)[:,-1]) # to only use last of each sequence
#             targets = Variable(torch.LongTensor(targets)).view(len(inputs),-1)[:,-1] # to only use last of each sequence
        if cuda_flag:
            targets = targets.cuda()
        loss = criterion(outputs.view(-1,num_classes),targets)
        loss_list.append(loss.data[0])
        if cnt%10==0:
            print('[%d] %1.3f' %(cnt,loss.data[0]))
        if cnt%500==0:
            print("Saving model at %dth step" %cnt)
            torch.save(model,'data/saved_weights/retain_bi_%d.pth'%(cnt))
            # create CPU version
            model2 = RETAIN(emb_size,hid_size,num_classes,False)
            if cuda_flag:
                model.cpu()
            model2.load_state_dict(model.state_dict())
            torch.save(model2,'data/saved_weights/retain_bi_%d_cpu.pth'%(cnt))
            if cuda_flag:
                model.cuda()
            print("Saving at %dth step"%cnt)
        # manual loss changes
        if cnt==100:
            lr_counter+=1
            lr = lr_list[lr_counter]
            opt = optim.Adam(model.parameters(),lr=lr)
        if cnt==500:
            lr_counter+=1
            lr = lr_list[lr_counter]
            opt = optim.Adam(model.parameters(),lr=lr)        
        if loss.data[0]>10:
            import sys
            sys.exit()
#             print(loss.data[0])
        loss.backward()
        opt.step()
    print("Loss: %1.3f" %np.mean(loss_list))
    loss_dict[file].append(loss.data[0])
    file_cnt+=1

Epoch 0 [0, 0/206] - opening file 2014_94.pckl
Loss: 5.557
Epoch 0 [1, 1/206] - opening file 2014_216.pckl
Loss: 5.331
Epoch 0 [2, 2/206] - opening file 2014_213.pckl
Loss: 5.187
Epoch 0 [3, 3/206] - opening file 2014_254.pckl
Loss: 4.792
Epoch 0 [4, 4/206] - opening file 2014_192.pckl
Loss: 4.995
Epoch 0 [5, 5/206] - opening file 2014_287.pckl
Loss: 4.864
Epoch 0 [6, 6/206] - opening file 2014_34.pckl
[10] 5.191
Loss: 5.137
Epoch 0 [7, 7/206] - opening file 2014_138.pckl
Loss: 3.568
Epoch 0 [8, 8/206] - opening file 2014_172.pckl
Loss: 3.799
Epoch 0 [9, 9/206] - opening file 2014_194.pckl
Loss: 3.460
Epoch 0 [10, 10/206] - opening file 2014_252.pckl
Loss: 2.906
Epoch 0 [11, 11/206] - opening file 2014_261.pckl
Loss: 3.133
Epoch 0 [12, 12/206] - opening file 2014_54.pckl
[20] 4.363
Loss: 4.319
Epoch 0 [13, 13/206] - opening file 2014_132.pckl
Loss: 3.574
Epoch 0 [14, 14/206] - opening file 2014_116.pckl
Loss: 3.944
Epoch 0 [15, 15/206] - opening file 2014_285.pckl
Loss: 3.823
Epoch 0 [

In [None]:
# lower learning rate
lr_counter+=1
lr = lr_list[lr_counter]
opt = optim.Adam(model.parameters(), lr=lr)