## Library import

For usage of CER need to install fastwer (yeah I am too lazy to write easy dp)

In [148]:
pip install fastwer

Collecting fastwer
  Downloading fastwer-0.1.3.tar.gz (4.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: fastwer
  Building wheel for fastwer (setup.py) ... [?25ldone
[?25h  Created wheel for fastwer: filename=fastwer-0.1.3-cp37-cp37m-linux_x86_64.whl size=916310 sha256=9b028c5a3ad1a7b66b9b6e6c7d69ac9ba934c8701b230634a5e8211a33eccc1e
  Stored in directory: /root/.cache/pip/wheels/c8/b6/93/419e758f0c0176d311602763520bcfdec18107be1f15186fe6
Successfully built fastwer
Installing collected packages: fastwer
Successfully installed fastwer-0.1.3
[0mNote: you may need to restart the kernel to use updated packages.


In [151]:
from PIL import Image
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import os
from matplotlib import pyplot as plt
from torchvision import transforms
from torch.nn import functional
import numpy as np
import pandas as pd
import fastwer

Defining some hyperparameters

In [19]:
data_path  = "/kaggle/input/captcha-version-2-images/samples"
label_len = 5
split_frac = 0.20
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
img_shape = ()

## Dataset class
 - loading data from the directory
 - Calculating the image shape and symbols occuring in the dataset

In [20]:

class ImageData(Dataset):
    def load_image(self, path):
        img = Image.open(path).convert('RGB')
        trans = transforms.ToPILImage()
        trans1 = transforms.ToTensor()
        res = trans1(img)
        return  res

    def get_label(self, filename):
            label = filename.split('.')[0]
            self.symbols.update(set(list(label)))
            return label 

    def __init__(self, data_path, label_len):
        self.samples = []
        self.symbols = set()
        self.img_shape = None
        super(self.__class__, self).__init__()
        for filename in os.listdir(data_path):
            if filename != 'samples':
                file_path = data_path + "/" + filename
                print(f"processing {file_path}")
                image = self.load_image(file_path)
                self.img_shape = tuple(image.shape)
                image_label = self.get_label(filename)
                self.samples.append((image, image_label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [29]:
dataset = ImageData(data_path, label_len)
img_shape = dataset.img_shape
classes = dataset.symbols
classes_num = len(classes)

processing /kaggle/input/captcha-version-2-images/samples/6n6gg.png
processing /kaggle/input/captcha-version-2-images/samples/fncnb.png
processing /kaggle/input/captcha-version-2-images/samples/nxf2c.jpg
processing /kaggle/input/captcha-version-2-images/samples/nbwnn.png
processing /kaggle/input/captcha-version-2-images/samples/ebcbx.png
processing /kaggle/input/captcha-version-2-images/samples/nxn4f.png
processing /kaggle/input/captcha-version-2-images/samples/2en7g.png
processing /kaggle/input/captcha-version-2-images/samples/y866y.png
processing /kaggle/input/captcha-version-2-images/samples/xcmbp.png
processing /kaggle/input/captcha-version-2-images/samples/g247w.png
processing /kaggle/input/captcha-version-2-images/samples/56ncx.png
processing /kaggle/input/captcha-version-2-images/samples/n8wxm.jpg
processing /kaggle/input/captcha-version-2-images/samples/6ng6n.png
processing /kaggle/input/captcha-version-2-images/samples/x362g.png
processing /kaggle/input/captcha-version-2-image

Some statistics

In [30]:
print(f"total number of samples = {len(dataset)}")
print(f"shape of each image is {img_shape}")
print(f"symbols to recognize are {classes}, \nwhich is in total {len(classes)} elements")

total number of samples = 1070
shape of each image is (3, 50, 200)
symbols to recognize are {'w', '7', 'y', '3', '6', 'd', 'x', 'c', '2', '4', 'f', 'g', 'p', 'e', '8', 'n', 'b', 'm', '5'}, 
which is in total 19 elements


## Train test split
Here we split the dataset in train and test in the fraction defined before

In [31]:
trainset, valset = random_split(dataset, [856, 214])
train_loader = DataLoader(trainset, batch_size=16, shuffle=True, num_workers=2)
val_loader = DataLoader(valset, batch_size=16, shuffle=True, num_workers=2)

In [32]:
print(f"length of trainset is {len(trainset)}, length of validation set is {len(valset)}")
print(f"length of training set is {len(train_loader)}, length of validation loader {len(val_loader)}")

length of trainset is 856, length of validation set is 214
length of training set is 54, length of validation loader 14


In [25]:
print(trainset[0])

(tensor([[[0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         [0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         [0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         ...,
         [0.7647, 0.7647, 0.7647,  ..., 0.9961, 0.9961, 0.9961],
         [0.7647, 0.7647, 0.7647,  ..., 0.9961, 0.9961, 0.9961],
         [0.7647, 0.7647, 0.7647,  ..., 0.9961, 0.9961, 0.9961]],

        [[0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         [0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         [0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         ...,
         [0.7647, 0.7647, 0.7647,  ..., 0.9961, 0.9961, 0.9961],
         [0.7647, 0.7647, 0.7647,  ..., 0.9961, 0.9961, 0.9961],
         [0.7647, 0.7647, 0.7647,  ..., 0.9961, 0.9961, 0.9961]],

        [[0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         [0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0.9843],
         [0.7529, 0.7529, 0.7529,  ..., 0.9843, 0.9843, 0

This nasty line of code below crashed my whole jupiter kernel I have no idea why :( It just made an infinity loop even though in collab it worked fine 

In [33]:
image_batch, text_batch = iter(train_loader).next()
print(image_batch.size(), text_batch)

torch.Size([16, 3, 50, 200]) ('3xng6', 'd6fcn', 'g6n7x', 'g7gnf', '8gmnx', '64m82', 'e8e5e', '2pfpn', '3bnyf', 'ncw4g', 'n4b4m', '64b3p', 'en32e', 'cdf77', '77387', 'mdyp7')


## Model class

In [62]:
class MyNeuralNetwork(nn.Module):

    def __init__(self, vocab_size, dropout=0.5):
        super(MyNeuralNetwork, self).__init__()

        self.dropout = nn.Dropout(dropout)
        # convilutional part
        self.convlayer = nn.Sequential(
            nn.Conv2d(3, 32, (3,3), stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2,2), 2),

            nn.Conv2d(32, 64, (3,3), stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2,2), 2),

            nn.Conv2d(64, 128, (3,3), stride=1, padding=1),
            nn.ReLU(),

            nn.Conv2d(128, 256, (3,3), stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((1,2), 2),

            nn.Conv2d(256, 512, (3,3), stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),

            nn.Conv2d(512, 512, (3,3), stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d((1,2), 2),

            nn.Conv2d(512, 512, (2,2), stride=1, padding=0),
            self.dropout
        )

        self.mapSeq = nn.Sequential(
            nn.Linear(1024, 256),
            self.dropout
        )
        #Bi Lstm (gru)
        
        self.lstm_0 = nn.GRU(256, 256, bidirectional=True)  
        self.lstm_1 = nn.GRU(512, 256, bidirectional=True)

        self.out = nn.Sequential(
            nn.Linear(512, vocab_size),
        )
        
        
    def forward(self, x): 
        x = self.convlayer(x)

        x = x.permute(0, 3, 1, 2)
        x = x.view(x.size(0), x.size(1), -1)
     
        x = self.mapSeq(x)

        x, _ = self.lstm_0(x)
        x, _ = self.lstm_1(x)

        x = self.out(x)   
        return x.permute(1, 0, 2)

In [110]:
class custom_loss:
    def conform_classes(self,bare_classes):
        i = 0
        self.num_classes = {}
        for bare_class in bare_classes:
            self.num_classes[bare_class] = i
            i+=1
    
    def encode_target(self, pred):
        res = torch.IntTensor([self.num_classes[val] for val in  "".join(pred)])
        res_shape = torch.IntTensor([len(val) for val in pred])
        return res, res_shape
        
        
    def __init__(self, bare_classes, loss_func):
        self.conform_classes(bare_classes)
        self.loss_func  = loss_func
        
    def calc_loss(self, pred, y):
            print(pred)
            print(y.shape)
            y_ = functional.log_softmax(text_batch_logits, 2) 
            y_shape = torch.full(size=(y_.size(1),), 
                                       fill_value=y_.size(0), 
                                       dtype=torch.int32).to(device)  
            print(y_.shape)
            pred_, pred_shape = self.encode_target(pred)
            print(pred_)
            print(y_shape)
            print(pred_shape)
            loss = self.loss_func(y_, pred_, y_shape, pred_shape)
            return loss
        

In [158]:
criterion = nn.CTCLoss(blank=0)
idx2char = [ '2', '3', '4', '5', '6', '7', '8', 'b', 'c', 'd', 'e', 'f', 'g', 'm', 'n', 'p', 'w', 'x', 'y']
char2idx = {'2': 0, '3': 1, '4': 2, '5': 3, '6': 4, '7': 5, '8': 6, 'b': 7, 'c': 8, 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'm': 13, 'n': 14, 'p': 15, 'w': 16, 'x': 17, 'y': 18}
def encode_text_batch(text_batch):
    
    text_batch_targets_lens = [len(text) for text in text_batch]
    text_batch_targets_lens = torch.IntTensor(text_batch_targets_lens)
    
    text_batch_concat = "".join(text_batch)
    text_batch_targets = [char2idx[c] for c in text_batch_concat]
    text_batch_targets = torch.IntTensor(text_batch_targets)
    
    return text_batch_targets, text_batch_targets_lens
def compute_loss(text_batch, text_batch_logits):
    """
    text_batch: list of strings of length equal to batch size
    text_batch_logits: Tensor of size([T, batch_size, num_classes])
    """
   # print(text_batch)
   # print(text_batch_logits.shape)
    text_batch_logps = functional.log_softmax(text_batch_logits, 2) # [T, batch_size, num_classes]  
    text_batch_logps_lens = torch.full(size=(text_batch_logps.size(1),), 
                                       fill_value=text_batch_logps.size(0), 
                                       dtype=torch.int32).to(device) # [batch_size] 
   # print(text_batch_logps.shape)
   # print(text_batch_logps_lens) 
    text_batch_targets, text_batch_targets_lens = encode_text_batch(text_batch)
  #  print(text_batch_targets)
  #  print(text_batch_targets_lens)
    loss = criterion(text_batch_logps, text_batch_targets, text_batch_logps_lens, text_batch_targets_lens)

    return loss

In [132]:
loss_func = custom_loss(classes, nn.CTCLoss(blank=0))
loss_func.calc_loss(text_batch, text_batch_logits)

('3xng6', 'd6fcn', 'g6n7x', 'g7gnf', '8gmnx', '64m82', 'e8e5e', '2pfpn', '3bnyf', 'ncw4g', 'n4b4m', '64b3p', 'en32e', 'cdf77', '77387', 'mdyp7')
torch.Size([11, 16, 19])
torch.Size([11, 16, 19])
tensor([ 3,  6, 15, 11,  4,  5,  4, 10,  7, 15, 11,  4, 15,  1,  6, 11,  1, 11,
        15, 10, 14, 11, 17, 15,  6,  4,  9, 17, 14,  8, 13, 14, 13, 18, 13,  8,
        12, 10, 12, 15,  3, 16, 15,  2, 10, 15,  7,  0,  9, 11, 15,  9, 16,  9,
        17,  4,  9, 16,  3, 12, 13, 15,  3,  8, 13,  7,  5, 10,  1,  1,  1,  1,
         3, 14,  1, 17,  5,  2, 12,  1], dtype=torch.int32)
tensor([11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
       dtype=torch.int32)
tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5], dtype=torch.int32)


tensor(4.7190, grad_fn=<MeanBackward0>)

In [170]:
model = MyNeuralNetwork(classes_num).to(device)
loss_func = custom_loss(classes, nn.CrossEntropyLoss())
opt = torch.optim.SGD(
    model.parameters(), lr=0.02, nesterov=True,
    weight_decay=1e-5, momentum=0.7
)

In [171]:
def decode_predictions(text_batch_logits):
    text_batch_tokens = functional.softmax(text_batch_logits, 2).argmax(2) # [T, batch_size]
    text_batch_tokens = text_batch_tokens.numpy().T # [batch_size, T]

    text_batch_tokens_new = []
    for text_tokens in text_batch_tokens:
        text = [idx2char[idx] for idx in text_tokens]
        text = "".join(text)
        text_batch_tokens_new.append(text)

    return text_batch_tokens_new

In [172]:
def train_model(model, train_loader, val_loader, loss_fn, opt, n_epochs: int):
    train_loss = []
    val_loss = []
    
    for epoch in range(n_epochs):
        print(f"processing {epoch} epoch")
        ep_train_loss = []
        ep_val_loss = []
        ep_val_accuracy = []

        model.train(True) 
        for X_batch, y_batch in train_loader:
            X_batch.to(device)

            predictions = model.forward(X_batch)
            loss = compute_loss(y_batch, predictions) # in this sequence this is important
            loss.backward()
            opt.step()
            opt.zero_grad()


            ep_train_loss.append(loss.item())

        model.eval() 
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch.to(device)
                predictions = model(X_batch)
                loss = compute_loss(y_batch, predictions)
                ep_val_loss.append(loss)
        # print the results for this epoch:

        train_loss.append(np.mean(ep_train_loss))
        val_loss.append(np.mean(ep_val_loss))
        
        print(f"\t  training loss: {train_loss[-1]:.6f}")
        print(f"\tvalidation loss: {val_loss[-1]:.6f}")

    return train_loss , val_loss

In [173]:
train_loss, val_loss = train_model(model, train_loader, val_loader, loss_func,opt, n_epochs=20 )

processing 0 epoch
	  training loss: 3.085205
	validation loss: 3.449675
processing 1 epoch
	  training loss: 3.005863
	validation loss: 2.999969
processing 2 epoch
	  training loss: 2.989966
	validation loss: 3.058426
processing 3 epoch
	  training loss: 2.970368
	validation loss: 3.142894
processing 4 epoch
	  training loss: 2.940849
	validation loss: 3.004417
processing 5 epoch
	  training loss: 2.850863
	validation loss: 2.907131
processing 6 epoch
	  training loss: 2.634125
	validation loss: 2.612948
processing 7 epoch
	  training loss: 2.215319
	validation loss: 2.585659
processing 8 epoch
	  training loss: 1.809047
	validation loss: 1.710643
processing 9 epoch
	  training loss: 1.335380
	validation loss: 1.707028
processing 10 epoch
	  training loss: 0.862891
	validation loss: 0.667168
processing 11 epoch
	  training loss: 0.599973
	validation loss: 0.604128
processing 12 epoch
	  training loss: 0.396029
	validation loss: 0.470824
processing 13 epoch
	  training loss: 0.280273
	

In [174]:
def plot_train_process(train_loss, val_loss):

    axes.set_title('Loss')
    axes.plot(train_loss, label='train')
    axes.plot(val_loss, label='validation')
    axes.legend()

In [175]:
plot_train_process(train_loss, val_loss)

NameError: name 'axes' is not defined

In [176]:
results_train = pd.DataFrame(columns=['actual', 'prediction'])
train_loader = DataLoader(trainset, batch_size=16, num_workers=1, shuffle=False)
with torch.no_grad():
    for image_batch, text_batch in train_loader:
        text_batch_logits = model(image_batch.to(device)) # [T, batch_size, num_classes==num_features]
        text_batch_pred = decode_predictions(text_batch_logits.cpu())
        #print(text_batch, text_batch_pred)
        df = pd.DataFrame(columns=['actual', 'prediction'])
        df['actual'] = text_batch
        df['prediction'] = text_batch_pred
        results_train = pd.concat([results_train, df])
results_train = results_train.reset_index(drop=True)

In [178]:
results_train

Unnamed: 0,actual,prediction
0,fp382,f2p38222222
1,edg3p,e2dg32p2222
2,c4bny,c24bn2y2222
3,ennmm,e2n2n2m2222
4,b4y5x,b24y25x2222
...,...,...
851,6bdn5,62bd2n52222
852,2nbc5,22nbc552222
853,5np4m,52np42m2222
854,w8bnx,w28b2nx2222


In [None]:
fastwer.score_sent(output, ref, char_level=True)