In [0]:
# !wget https://www.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz

--2019-08-21 13:44:08--  https://www.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz
Resolving www.robots.ox.ac.uk (www.robots.ox.ac.uk)... 129.67.94.2
Connecting to www.robots.ox.ac.uk (www.robots.ox.ac.uk)|129.67.94.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10678411583 (9.9G) [application/x-gzip]
Saving to: ‘mjsynth.tar.gz’


2019-08-21 13:46:43 (65.9 MB/s) - ‘mjsynth.tar.gz’ saved [10678411583/10678411583]



In [0]:
# !tar zxf mjsynth.tar.gz

In [0]:
# Implement CRNN
import torch
import torch.nn as nn
import torch.nn.functional as F



class CRNN(nn.Module):

    def __init__(self,c_img):
        super(CRNN, self).__init__()
        self.c_img = c_img
        self.conv1 = nn.Conv2d(c_img, 64 ,3, stride=1,padding=1)
        self.conv2 = nn.Conv2d(64, 128,3, stride=1,padding=1)
        self.conv3_1 = nn.Conv2d(128, 256,3, stride=1,padding=1)
        self.conv3_2 = nn.Conv2d(256, 256,3, stride=1,padding=1)
        self.conv4 = nn.Conv2d(256, 512,3, stride=1,padding=1)
        self.conv5 = nn.Conv2d(512, 512,3, stride=1,padding=1)
        self.conv6 = nn.Conv2d(512, 512,2, stride=1,padding=0)
        self.relu = nn.ReLU(True)
        self.batchnorm1 = nn.BatchNorm2d(64)
        self.batchnorm2 = nn.BatchNorm2d(512)
        self.pool22s2 = nn.MaxPool2d(2, stride=2)
        self.pool12s2 = nn.MaxPool2d((1,2), stride=2)
        self.rnn = nn.LSTM(512, 256, num_layers=2,bidirectional=True)
        # with blank
        self.maplin = nn.Linear(512,11)

      

    def forward(self, x):

        x = self.relu(self.conv1(x))
        x = self.pool22s2(x)
        x = self.relu(self.conv2(x))
        x = self.pool22s2(x)
        x = self.relu(self.conv3_1(x))
        x = self.relu(self.conv3_2(x))
        x = self.pool12s2(x)
        x = self.relu(self.batchnorm2(self.conv4(x)))
        x = self.relu(self.batchnorm2(self.conv5(x)))
        x = self.pool12s2(x)
        x = self.relu(self.conv6(x))
        # x (batch, 512, 1, seq_len)
        # need to resize it to (seq len, batch, 512)
        x, _ = self.rnn(x.permute(3,0,1,2).squeeze(3))
        
        x = self.maplin(x) 
        # x (seqlen, 1, class_size)

        return x.log_softmax(2)


net = CRNN(1).cuda()
print(net)

CRNN(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3_1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3_2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(512, 512, kernel_size=(2, 2), stride=(1, 1))
  (relu): ReLU(inplace)
  (batchnorm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool22s2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool12s2): MaxPool2d(kernel_size=(1, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (rnn): LSTM(512, 256, num_layers=2, bidirectional=True)

In [0]:
inp = torch.randn(32,1,32,64).cuda()
out = net(inp)
print(out.shape)

torch.Size([3, 32, 11])


In [0]:
inp.permute(3,0,1,2).shape

torch.Size([64, 32, 1, 32])

In [0]:
# Example training with CTC Loss
T = 5      # Input sequence length
C = 11      # Number of classes (including blank)
N = 1      # Batch size
S = 2      # Target sequence length of longest target in batch
S_min = 1  # Minimum target length, for demonstration purposes
# Initialize random batch of input vectors, for *size = (T,N,C)
input = torch.randn(T, N, C, requires_grad=True)
z = torch.ones(1, requires_grad=True)
# Initialize random batch of targets (0 = blank, 1:C = classes)
target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)
if target_lengths == 2: target_lengths -=1
ctc_loss = nn.CTCLoss()
loss = 999
while loss >= 0.005 :
  log_inp = F.log_softmax(input*z, dim=2)
  loss = ctc_loss(log_inp, target, input_lengths, target_lengths)
  loss.backward()
  #print('loss:', loss)
  with torch.no_grad():
    input  -= 0.1*input.grad
    input.grad.zero_()

  

In [0]:
# Try training our model with CTC Loss with random input

target = torch.randint(low=1, high=11, size=(32, 1), dtype=torch.long)
print('target',target)
learning_rate = 0.0005
ctc_loss = nn.CTCLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
loss = 999
while loss >= 0.005 :
  pred = net(inp)
  print(pred)
  input_lengths = torch.full(size=(32,), fill_value=pred.shape[0], dtype=torch.long)
  target_lengths = torch.full(size=(32,), fill_value=1, dtype=torch.long)
  loss = ctc_loss(pred, target.cuda(), input_lengths, target_lengths)
  print(loss)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  

In [0]:
target_lengths.shape

torch.Size([32, 1])

In [0]:
input_lengths

tensor([5.8743e-34])

In [0]:
target_lengths

tensor([[1.]])

In [0]:
pred.exp().max(2)

torch.return_types.max(values=tensor([[0.9924],
        [0.9992],
        [0.9996],
        [0.9994],
        [0.9987],
        [0.9893],
        [0.9425],
        [0.9877],
        [0.9159],
        [0.9992],
        [0.9977]], grad_fn=<MaxBackward0>), indices=tensor([[10],
        [10],
        [10],
        [10],
        [10],
        [10],
        [ 2],
        [ 2],
        [ 9],
        [ 9],
        [ 9]]))

In [0]:
from torchvision import datasets, transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Train with mnist
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.Resize(size=(32,64)),                      
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.Resize(size=(32,64)), 
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=1, shuffle=True)

In [0]:

def train(model, device, train_loader, optimizer, crit, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        input_lengths = torch.full(size=(32,), fill_value=output.shape[0], dtype=torch.long)
        target_lengths = torch.full(size=(32,), fill_value=1, dtype=torch.long)
        loss = crit(output, target[:,None], input_lengths,target_lengths )
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [0]:
device = torch.device("cuda")

In [0]:
model = CRNN(1).to(device)
ctc_loss = nn.CTCLoss()
model

CRNN(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3_1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3_2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(512, 512, kernel_size=(2, 2), stride=(1, 1))
  (relu): ReLU(inplace)
  (batchnorm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool22s2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool12s2): MaxPool2d(kernel_size=(1, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (rnn): LSTM(512, 256, num_layers=2, bidirectional=True)

In [0]:

optimizer = optim.SGD(model.parameters(), lr=0.0005, momentum=0.9)

for epoch in range(1, 10):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(data)
        
        input_lengths = torch.full(size=(data.shape[0],), fill_value=output.shape[0], dtype=torch.long)
        target_lengths = torch.full(size=(data.shape[0],), fill_value=1, dtype=torch.long)
        loss = ctc_loss(output, target+1, input_lengths,target_lengths )
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            # print(output.exp().max(2),target)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))



In [0]:
target_lengths

tensor([66815232,        0,        0,        2,        1,        2,        2,
               2,        3,        2,        4,        2,        5,        2,
               6,        2,        6,        2,        7,        2,        8,
               2,        9,        2,       10,        2,       11,        2,
              12,        2,       13,        2,       13,        2,       14,
               2,       15,        2,       16,        2,       17,        2,
              18,        2,       19,        2,       20,        2,       20,
               2,       21,        2,       22,        2,       23,        2,
              24,        2,       25,        2,       26,        2,       27,
               1], dtype=torch.int32)

In [0]:
input_lengths

tensor([9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9], dtype=torch.int32)

In [0]:
output.size(0)

9

In [0]:
acc = 0
for img,label in test_loader:
  img,label = img.cuda(), label.cuda()
  pred = model(img).exp().max(2)[1]-1
  if pred.max() == label:
    acc += 1

print('acc:', acc / len(test_loader))

acc: 0.9255


In [0]:
output.exp().max(2)[1]-1==test_label.cuda()

tensor([[1]], device='cuda:0', dtype=torch.uint8)

In [0]:
test_label == 1

tensor([1], dtype=torch.uint8)

In [0]:
len(test_loader)

10000

In [0]:
img.shape

torch.Size([1, 1, 32, 64])

In [0]:
pred.max()

tensor(4, device='cuda:0')