In [1]:
import argparse
# Run configuration. Every option is described by one row of
# (flag, type, default, help) and registered in a single loop so the whole
# table can be scanned at a glance.
parser = argparse.ArgumentParser(description='Argparser')

_option_table = (
    # Data and logging
    ('--data_path', str, '../data/', 'data path'),
    ('--data_type', str, '*.h5', 'data type'),
    ('--print_freq', int, 100, 'loss print frequency'),
    # Data loader
    ('--batch_size', int, 8, 'batch size'),
    ('--num_workers', int, 4, 'number of cpu works'),
    # Model
    ('--num_epochs', int, 10, 'number of epochs'),
    ('--pad_idx', int, 0, 'padding index'),
    ('--bos_idx', int, 1, 'start token index'),
    ('--eos_idx', int, 2, 'end token index'),
    ('--max_len', int, 300, 'max length of input'),
    ('--d_model', int, 768, 'dimension of model'),
    ('--d_embedding', int, 256, 'dimension of embedding'),
    ('--n_head', int, 8, "multihead-attention's head count"),
    ('--dim_feedforward', int, 2048, 'feedforward dimension'),
    ('--n_layers', int, 6, 'layers number'),
    ('--dropout', float, 0.2, 'dropout ratio'),
    # Optimisation
    ('--learning_rate', float, 1e-3, 'learning rate'),
    ('--lr_decay_step', int, 5, 'learning rate decay step'),
    ('--lr_decay_gamma', float, 0.1, 'learning rate decay rate'),
    ('--grad_clip', int, 5, 'gradient clipping norm'),
)

for _flag, _type, _default, _help in _option_table:
    parser.add_argument(_flag, type=_type, default=_default, help=_help)

# Parse an explicit empty argument list (rather than sys.argv) so that every
# option takes its default value.
args = parser.parse_args([])

In [2]:
# Import Module
import os
import time
import datetime
import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm

# Import PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils as torch_utils

# Import Custom
from model.bert import littleBert
from dataset import CustomDataset, getDataLoader

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Data
print('Data Loading...')
start_time = time.time()

# NOTE(review): `data_path` is not used by the glob below, which reads
# `args.data_path` instead; it is kept unchanged in case other cells rely on it.
data_path = './data/'

# Files are taken in sorted order: the first match is used for training and
# the second for validation.
data_list = sorted(glob(os.path.join(args.data_path, args.data_type)))
if len(data_list) < 2:
    # Fail early with a readable message instead of an IndexError below.
    raise FileNotFoundError(
        f'Expected at least 2 files matching '
        f'{os.path.join(args.data_path, args.data_type)!r} '
        f'(train, valid) but found {len(data_list)}: {data_list}'
    )

dataset_dict = {
    'train': CustomDataset(data_list[0]),
    'valid': CustomDataset(data_list[1])
}

dataloader_dict = {
    'train':  getDataLoader(dataset_dict['train'],
                            shuffle=True,
                            drop_last=True,
                            pin_memory=True,
                            batch_size=args.batch_size,
                            num_workers=args.num_workers),
    # Validation is not shuffled: combined with drop_last=True, shuffling
    # would drop a different set of samples every epoch and make the
    # per-epoch validation losses not directly comparable.
    'valid':  getDataLoader(dataset_dict['valid'],
                            shuffle=False,
                            drop_last=True,
                            pin_memory=True,
                            batch_size=args.batch_size,
                            num_workers=args.num_workers)
}

# Model setting: gather the constructor arguments from the parsed options,
# build the network and move it onto the selected device in one step.
model_kwargs = dict(
    pad_idx=args.pad_idx,
    bos_idx=args.bos_idx,
    eos_idx=args.eos_idx,
    max_len=args.max_len,
    d_model=args.d_model,
    d_embedding=args.d_embedding,
    n_head=args.n_head,
    dim_feedforward=args.dim_feedforward,
    n_layers=args.n_layers,
    dropout=args.dropout,
    device=device,
)
model = littleBert(**model_kwargs).to(device)

# Optimizer Setting
criterion = nn.MSELoss()
# Only parameters that require gradients are handed to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=args.learning_rate)
# Multiply the learning rate by `gamma` every `step_size` scheduler steps.
lr_step_scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=args.lr_decay_step,
    gamma=args.lr_decay_gamma,
)

# Preparing
# Lowest validation loss seen so far; stays None until the first validation pass.
best_val_loss = None
now = datetime.datetime.now()
nowDatetime = now.strftime('%Y-%m-%d %H:%M:%S')

# One output directory per run, named after the start timestamp.
# NOTE(review): the timestamp contains a space and ':' characters, which are
# not valid in paths on every platform -- confirm the target platforms.
save_dir = f'./save/save_{nowDatetime}'
# makedirs creates './save/' as well and does not fail if the directory exists.
os.makedirs(save_dir, exist_ok=True)

# Record the architecture hyper-parameters next to the checkpoints so a saved
# model can be re-instantiated later.
hyper_parameter_setting = {
    'n_layers': args.n_layers,
    'd_model': args.d_model,
    'n_head': args.n_head,
    'd_embedding': args.d_embedding,
    'dim_feedforward': args.dim_feedforward,
}
with open(f'{save_dir}/hyper_parameter_setting.txt', 'w') as f:
    for key, value in hyper_parameter_setting.items():
        f.write(f'{key}: {value}\n')

# `start_time` was taken before data loading began; report the setup time in minutes.
spend_time = round((time.time() - start_time) / 60, 4)
print(f'Setting done...! / {spend_time}min spend...!')

# Train / validate for `args.num_epochs` epochs. After each validation pass
# the model is checkpointed if the mean validation loss improved, and the
# learning-rate scheduler is stepped once per epoch.
for epoch in range(args.num_epochs):
    epoch_num = epoch + 1  # 1-based epoch number used in logs and checkpoint names
    print('Epoch {}/{}'.format(epoch_num, args.num_epochs))
    start_time_e = time.time()
    for phase in ['train', 'valid']:
        is_train = phase == 'train'
        # Accumulator for the validation phase; reset at the start of every
        # phase so it never carries over from a previous epoch.
        val_loss = 0.0
        if is_train:
            model.train()
        else:
            model.eval()

        # Iterate over data
        for i, input_ in enumerate(tqdm(dataloader_dict[phase])):

            # Input to Device(CUDA) and split
            # (elements 0, 2, 3 and 4 of the batch are used; element 1 is not)
            src = input_[0].to(device)
            src_hour = input_[2].to(device)
            src_weekday = input_[3].to(device)
            trg = input_[4].to(device)

            # Optimizer Setting
            optimizer.zero_grad()

            # Model Training & Validation
            # Gradients are tracked only during the training phase.
            with torch.set_grad_enabled(is_train):
                outputs = model(src, src_hour, src_weekday)

                # Backpropagate Loss
                loss = criterion(outputs, trg.to(torch.float))
                if is_train:
                    loss.backward()
                    torch_utils.clip_grad_norm_(model.parameters(), args.grad_clip)
                    optimizer.step()

                    # Print the current batch loss on the first batch and
                    # then every `print_freq` batches.
                    if args.print_freq > 0 and i % args.print_freq == 0:
                        total_loss = loss.item()
                        print("[loss:%5.2f]" % (total_loss))
                else:
                    val_loss += loss.item()

        # Save model and view total loss
        if not is_train:
            print('='*45)
            # Mean of the per-batch losses; max(..., 1) avoids dividing by
            # zero when the validation loader yields no batches.
            val_loss /= max(len(dataloader_dict['valid']), 1)
            print("[Epoch:%d] val_loss:%5.3f | spend_time:%5.2fmin"
                    % (epoch_num, val_loss, (time.time() - start_time_e) / 60))
            # `is None` rather than truthiness, so a best loss of exactly 0.0
            # is not mistaken for "no best loss yet".
            if best_val_loss is None or val_loss < best_val_loss:
                print("[!] saving model...")
                val_loss_save = round(val_loss, 2)
                torch.save(model.state_dict(), f'./save/save_{nowDatetime}/model_{epoch_num}_{val_loss_save}.pt')
                best_val_loss = val_loss

    # Learning-rate scheduler step (once per epoch)
    lr_step_scheduler.step()

Data Loading...


  0%|          | 0/150002 [00:00<?, ?it/s]

Setting done...! / 0.0602min spend...!
Epoch 1/10


  0%|          | 5/150002 [00:00<7:38:33,  5.45it/s] 

[loss:277.11]


  0%|          | 108/150002 [00:03<1:09:27, 35.97it/s]

[loss:4540.31]


  0%|          | 205/150002 [00:06<1:16:04, 32.82it/s]

[loss:2572.46]


  0%|          | 305/150002 [00:08<1:05:49, 37.90it/s]

[loss:1351.26]


  0%|          | 405/150002 [00:11<1:05:49, 37.88it/s]

[loss:792.19]


  0%|          | 508/150002 [00:13<1:03:33, 39.20it/s]

[loss:485.50]


  0%|          | 606/150002 [00:16<1:07:48, 36.72it/s]

[loss:148.91]


  0%|          | 708/150002 [00:19<1:03:51, 38.97it/s]

[loss:3263.90]


  1%|          | 805/150002 [00:21<1:07:50, 36.65it/s]

[loss:5328.60]


  1%|          | 906/150002 [00:24<1:04:45, 38.37it/s]

[loss:1945.81]


  1%|          | 1006/150002 [00:27<1:07:00, 37.06it/s]

[loss:195.36]


  1%|          | 1106/150002 [00:29<1:05:40, 37.78it/s]

[loss:334.23]


  1%|          | 1208/150002 [00:32<1:12:21, 34.27it/s]

[loss:7711.01]


  1%|          | 1307/150002 [00:35<1:05:56, 37.58it/s]

[loss:7921.32]


  1%|          | 1406/150002 [00:37<1:11:16, 34.75it/s]

[loss:3486.46]


  1%|          | 1507/150002 [00:40<1:04:22, 38.45it/s]

[loss:4846.92]


  1%|          | 1607/150002 [00:43<1:08:05, 36.33it/s]

[loss:259.00]


  1%|          | 1657/150002 [00:44<1:11:27, 34.60it/s]

KeyboardInterrupt: 

In [3]:
# Debug inspection: display the last batch bound to `input_` by the training
# loop above (the run was stopped with a KeyboardInterrupt).
input_

[tensor([[0., 1., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 3., 1., 0., 1., 1., 1., 2., 0., 0., 1.],
         [2., 0., 2., 2., 3., 0., 1., 4., 2., 4., 3., 5.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.]]),
 tensor([[0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 1., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [1., 0., 0., 2., 1., 1., 1., 0., 1., 3., 0., 0.],
         [5., 3., 4., 2., 4., 1., 0., 3., 2., 2., 0., 2.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.]]),
 tensor([[5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6],
     