In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes (useful while editing
# the project package alongside this notebook).
%load_ext autoreload
%autoreload 2

In [2]:
import os
import string
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

from typing import * 


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, ConcatDataset, Subset, DataLoader

import torchvision.transforms as VT


from ocrnune.data import dataset
from ocrnune.models import crnn

import ocrnune.transforms as NT
from ocrnune.data.dataset import LMDBDataset, BalanceDatasetConcatenator
from ocrnune.utils import AttnLabelConverter

In [3]:
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.metrics import Accuracy

class OCRTaks(pl.LightningModule):
    """LightningModule that trains and validates a text-recognition model.

    Each step encodes the ground-truth strings with ``converter``, feeds the
    model the images together with the encoded text minus its last position,
    and scores the predictions against the encoded text minus its first
    position using ``criterion``.

    Args:
        model: Network called as ``model(images, encoded_text)``.
        optimizer: Already-constructed optimizer, returned unchanged from
            ``configure_optimizers``.
        criterion: Loss called as ``criterion(flat_predictions, flat_targets)``.
        converter: Object whose ``encode(texts)`` returns a pair
            ``(encoded_tensor, lengths)``.
        grad_clip: Maximum gradient norm applied after every backward pass.
    """

    def __init__(self, model, optimizer, criterion, converter, grad_clip=5.0):
        super().__init__()
        # The model is deliberately NOT moved with ``.to(self.device)`` here:
        # at construction time the module's device is not yet the training
        # device, and the Trainer places the whole module itself.
        self.model = model

        self.optimizer = optimizer
        self.criterion = criterion
        self.converter = converter
        self.grad_clip = grad_clip

    def forward(self, imgs, text):
        """Run the wrapped model on ``imgs`` conditioned on ``text``."""
        # Use the ``text`` argument; the previous body referenced the
        # undefined name ``texts`` and only worked if a notebook global of
        # that name happened to exist.
        return self.model(imgs, text)

    def backward(self, trainer, loss, optimizer, optimizer_idx):
        """Back-propagate ``loss`` and clip the model's gradient norm."""
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)

    def shared_step(self, batch, batch_idx):
        """Compute the loss for one ``(images, texts)`` batch.

        Shared by the training and validation steps.

        Returns:
            The scalar loss produced by ``self.criterion``.
        """
        images, texts = batch
        images = images.to(self.device)

        # The lengths returned by the converter are not needed for the loss,
        # so they are discarded rather than copied to the device.
        texts_encoded, _ = self.converter.encode(texts)
        texts_encoded = texts_encoded.to(self.device)

        # Input is the sequence without its last position; the target is the
        # same sequence shifted by one (without its first position).
        preds = self.model(images, texts_encoded[:, :-1])
        targets = texts_encoded[:, 1:]

        # Flatten to (N, num_classes) vs (N,). ``reshape`` also copes with
        # non-contiguous tensors, which ``view`` would reject.
        flat_preds = preds.reshape(-1, preds.shape[-1])
        flat_targets = targets.reshape(-1)

        return self.criterion(flat_preds, flat_targets)

    def training_step(self, batch, batch_idx):
        """Return a ``TrainResult`` minimising the batch loss, logged as ``trn_loss``."""
        loss = self.shared_step(batch, batch_idx)
        result = pl.TrainResult(loss)
        result.log_dict({'trn_loss': loss})

        return result

    def validation_step(self, batch, batch_idx):
        """Return an ``EvalResult`` that checkpoints on the loss, logged as ``val_loss``."""
        loss = self.shared_step(batch, batch_idx)
        result = pl.EvalResult(checkpoint_on=loss)
        result.log_dict({'val_loss': loss})

        return result

    def configure_optimizers(self):
        """Hand the externally built optimizer to Lightning."""
        return self.optimizer


# Correctly spelled alias; the original (misspelled) name stays available.
OCRTask = OCRTaks

In [4]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [5]:
# --- Data loading ---------------------------------------------------------
BATCH_SIZE = 2
NUM_WORKERS = 4

# --- Text and image settings ----------------------------------------------
BATCH_MAX_LENGTH = 25
# Digits, ASCII letters and punctuation, i.e. string.printable without its
# six trailing whitespace characters.
CHARACTER = string.digits + string.ascii_letters + string.punctuation
# presumably (height, width) -- confirm against the ocrnune transforms
IMG_SIZE = (32, 100)

# --- Optimisation -----------------------------------------------------------
BETA1, BETA2 = 0.9, 0.999
# NOTE(review): this rate is later passed to optim.Adam; 1.0 is far above
# Adam's usual 1e-3 scale -- confirm it is intended.
LRATE = 1.0
GRAD_CLIP = 5.0

In [6]:
def _build_transform(size):
    """Build the image pipeline shared by the training and validation sets.

    Steps: ``NT.ResizeRatioWithRightPad`` to ``size``, conversion to a tensor,
    then normalisation with mean 0.5 and std 0.5.

    ``mean`` and ``std`` are one-element tuples: ``(0.5)`` is just the float
    0.5, whereas ``Normalize`` documents per-channel sequences.
    """
    return VT.Compose([
        NT.ResizeRatioWithRightPad(size=size),
        VT.ToTensor(),
        VT.Normalize(mean=(0.5,), std=(0.5,)),
    ])


# Separate pipeline instances for each split, as before.
trn_transform = _build_transform(IMG_SIZE)
val_transform = _build_transform(IMG_SIZE)


# Root directories of the LMDB training and validation data.
trn_path = '/data/lmdb/data_lmdb_release/training'
val_path = '/data/lmdb/data_lmdb_release/validation'


# Training set: built from the 'ST' and 'MJ' sub-directories with a 0.5 / 0.5
# usage ratio.
train_bdc = BalanceDatasetConcatenator(trn_path, dataset_class=LMDBDataset,
                                       transform=trn_transform,
                                       subdir=('ST', 'MJ'), usage_ratio=(0.5, 0.5),
                                       im_size=IMG_SIZE, is_sensitive=True)
trainset = train_bdc.get_dataset()


# Validation set: relies on the concatenator's default subdir / usage_ratio.
valid_bdc = BalanceDatasetConcatenator(val_path, dataset_class=LMDBDataset,
                                       transform=val_transform,
                                       im_size=IMG_SIZE, is_sensitive=True)
validset = valid_bdc.get_dataset()

In [7]:
# Number of samples in the (training, validation) datasets.
tuple(len(split) for split in (trainset, validset))

(7221024, 6992)

In [8]:
from ocrnune.utils import AttnLabelConverter
# Label converter built over the CHARACTER alphabet.
converter = AttnLabelConverter(CHARACTER)
# Length of the converter's character table.
# NOTE(review): `num_class` is not referenced by any later visible cell; the
# model is constructed from NUM_CLASS instead.
num_class = len(converter.character)

In [9]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
# Validation data is read in a fixed order: shuffling adds nothing to an
# evaluation pass and makes per-batch validation logs incomparable across epochs.
valid_loader = DataLoader(validset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

In [10]:
# Pull one (images, texts) batch from the training loader, e.g. for inspection.
# The iterator is not kept, so it can be released right after the batch is read.
imgs, texts =  next(iter(train_loader))

In [11]:
# NOTE(review): repeats the converter construction from the earlier cell with
# the same argument; kept so this cell defines everything it uses.
converter = AttnLabelConverter(CHARACTER)
# Number of classes handed to the model: the length of the converter's
# character table.
NUM_CLASS = len(converter.character)

In [12]:
# Cross-entropy loss; targets equal to 0 are excluded from the loss
# (presumably index 0 is the converter's padding / start token -- confirm).
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Recognition network sized for the configured alphabet and image size.
model = crnn.OCR(im_size=IMG_SIZE, num_class=NUM_CLASS)

# Adam over all model parameters, configured from the constants above.
optimizer = optim.Adam(params=model.parameters(), betas=(BETA1, BETA2), lr=LRATE)

In [13]:
checkpoint_path = '../saved_model'

# Keep only the single best checkpoint, judged by the smallest value of
# 'checkpoint_on' (the quantity the validation step passes to EvalResult).
# NOTE(review): how `filepath` is split into directory and file name depends
# on the installed Lightning version -- confirm where the files end up.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    prefix='ocr_net_',
    monitor='checkpoint_on',
    mode='min',
    save_top_k=1,
    verbose=True,
)



In [14]:
# TensorBoard event files are written under ../logs/ocr_net.
tb_logger = pl_loggers.TensorBoardLogger('../logs/ocr_net')

# Pass the configured clipping threshold explicitly, so that changing
# GRAD_CLIP in the configuration cell actually affects training (previously
# the task silently used its own default).
task = OCRTaks(model, optimizer, criterion, converter, grad_clip=GRAD_CLIP)

# Train on one GPU when CUDA is available, otherwise on the CPU.
# NOTE(review): the recorded run of this cell aborted with "CUDA out of
# memory" on a 7.79 GiB GPU while the model summary reports 856 M parameters.
# The batch size is already 2, so the model size likely needs revisiting.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else None,
    logger=tb_logger,
    checkpoint_callback=checkpoint_callback,
)
trainer.fit(task, train_loader, valid_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | OCR              | 856 M 
1 | criterion | CrossEntropyLoss | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

RuntimeError: CUDA out of memory. Tried to allocate 1.52 GiB (GPU 0; 7.79 GiB total capacity; 4.77 GiB already allocated; 606.19 MiB free; 6.29 GiB reserved in total by PyTorch) (malloc at /opt/conda/conda-bld/pytorch_1587428091666/work/c10/cuda/CUDACachingAllocator.cpp:289)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x4e (0x7fc35a0d4b5e in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1f39d (0x7fc359e9639d in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x2058e (0x7fc359e9758e in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x291 (0x7fc35ce2d461 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xddcb6b (0x7fc35b0ddb6b in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0xe26457 (0x7fc35b127457 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xdd3999 (0x7fc382089999 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xdd3cd7 (0x7fc382089cd7 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0xd9345e (0x7fc35b09445e in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #9: <unknown function> + 0xd9a4df (0x7fc35b09b4df in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #10: at::native::_cudnn_rnn_backward(at::Tensor const&, c10::ArrayRef<at::Tensor>, long, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, bool, double, bool, bool, c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, std::array<bool, 4ul>) + 0x2c8 (0x7fc35b0a2938 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #11: <unknown function> + 0xe245ad (0x7fc35b1255ad in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #12: <unknown function> + 0xe25d43 (0x7fc35b126d43 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #13: <unknown function> + 0x2866a60 (0x7fc383b1ca60 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x28d9eb3 (0x7fc383b8feb3 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #15: torch::autograd::generated::CudnnRnnBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x708 (0x7fc3838d1338 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x2ae8215 (0x7fc383d9e215 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x16f3 (0x7fc383d9b513 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7fc383d9c2f2 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::Engine::thread_init(int) + 0x39 (0x7fc383d94969 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #20: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7fc3870dac38 in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #21: <unknown function> + 0xc819d (0x7fc3e9ace19d in /opt/anaconda3/envs/dlearn/lib/python3.6/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6)
frame #22: <unknown function> + 0x76db (0x7fc3ecaaa6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #23: clone + 0x3f (0x7fc3ec7d3a3f in /lib/x86_64-linux-gnu/libc.so.6)


In [None]:
# Shell escape: print the NVIDIA driver's current GPU status report.
!nvidia-smi