# Get Data

In [1]:
!kaggle competitions download -c kuzushiji-recognition -p raw

!unzip -n -q raw/train.csv.zip -d data
!chmod 444 data/train.csv 
!unzip -n -q raw/train_images.zip -d data/train_images
!unzip -n -q raw/test_images.zip -d data/test_images

!cp -f raw/unicode_translation.csv data/

unicode_translation.csv: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test_images.zip: Skipping, found more recently modified local copy (use --force to force download)
train_images.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
!pip install wget
!wget -nd -q --show-progress https://noto-website-2.storage.googleapis.com/pkgs/NotoSansCJKjp-hinted.zip -P raw
!unzip -n -q raw/NotoSansCJKjp-hinted.zip -d assets

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Data Manipulation

In [6]:
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm_notebook
import csv

In [7]:
input_dir = 'data'

# font_path = 'assets/NotoSansCJKjp-Regular.otf'
# prop = font_manager.FontProperties(fname=font_path)

with open('data/unicode_translation.csv') as f:
    reader = csv.reader(f)
    next(reader) # skip head  er
    unicode = dict(reader)

In [44]:
def id_to_path(input_id, dir_type='train'):
    """
    Takes an id for a page as an input, and returns the filepath to the image
    """
    path = input_dir + '/' + dir_type + '_images/' + input_id
    if '.jpg' not in path:
        path = path + '.jpg'
        
    return path

def page_to_bw(*args, **kwargs):
    """
    Takes an id as an input and returns a black and white version of the iamge
    """
    path = id_to_path(*args, **kwargs)    
    
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    (thresh, img_bw) = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    img_bw = cv2.bitwise_not(img_bw)
    
    return img_bw

def label_explode(l):
    """
    Takes a unsplit string (tuple of unicode character and position) and reutnrs a true tuple
    """
    try:
        codename, x, y, w, h = str.split(l)
    except TypeError:
        print('skipping {}'.format(l))
        return None
    
    try:
        char = unicode[codename]
    except KeyError:
        try:
            char = unicode['U' + codename]
        except KeyError:
            print('{} not found in unicode lookup, skipping'.format(codename))
            return None
    
    x, y, w, h = int(x), int(y), int(w), int(h)
    
    return char, codename, x, y, w, h

def make_square(img, dim=(28,28)):
    """
    Converts the image into a square and adds padding
    """
    aspect = img.shape
    max_aspect = max(aspect)
    
    if max_aspect % 2 != 0:
        max_aspect += 11
    else:
        max_aspect += 10
        
    y_growth = (max_aspect - aspect[0])/2
    x_growth = max_aspect - aspect[1]
    
    t, b, l, r = int(np.floor(y_growth/2)), int(np.ceil(y_growth/2)), int(np.floor(x_growth/2)), int(np.ceil(x_growth/2))
    
    square_img = cv2.copyMakeBorder(img, t, b, l, r, cv2.BORDER_CONSTANT, value=(0,0,0))
    square_img = cv2.resize(square_img, dim)
    
    return square_img

def labels_and_images(data_type='train', save=True, output_fp=None):
    """
    returns a tuple of labels and images. if save=true, also returns the filepath of the saved npz file
    """
    df = pd.read_csv(input_dir + '/' + data_type + '.csv')
    df['chars'] = df.labels.str.findall(r"[\w\+]{6}[\s\d]*\d")
    
    df1 = df[['image_id','chars']].explode('chars').reset_index()
    df2 = pd.DataFrame(df1['chars'].str.split(expand=True).values, columns=('unicode', 'x_min', 'y_min', 'width', 'height'))

    dfo = pd.concat([df1,df2], axis=1)
    nums = ['x_min','y_min','width','height']
    for n in nums:
        dfo[n] = pd.to_numeric(dfo[n], errors='coerce', downcast='integer')

    dfo['x_max'] = dfo.x_min + dfo.width
    dfo['y_max'] = dfo.y_min + dfo.height

    dfo = dfo.drop(['index'], axis=1).dropna()
    
    labels = dfo.unicode.to_numpy()
    
    images = []
    for g in tqdm_notebook(dfo.groupby('image_id')):
        img = page_to_bw(g[0])
        dfg = g[1]

        for _, row in dfg.iterrows():
            y_min, y_max, x_min, x_max = int(row['y_min']), int(row['y_max']), int(row['x_min']), int(row['x_max'])
            images.append(make_square(img[y_min:y_max, x_min:x_max]))

    images = np.array(images)
    
    if save:
        if output_fp is None:
            output_fp = input_dir + '/' + data_type + '_img_data.npz'
        np.savez_compressed(output_fp, labels=labels, images=images)
    
    return labels, images

In [45]:
train_labels, train_images = labels_and_images('train')
train_dir = input_dir + '/train_img_data.npz'

HBox(children=(IntProgress(value=0, max=3605), HTML(value='')))




In [46]:
print(train_labels.shape, train_images.shape)

(683464,) (683464, 28, 28)


# Create KujuMNIST Dataset class for torch

In [34]:
import os
import numpy as np
from torchvision.datasets.utils import makedir_exist_ok, download_url
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

class KujuMNIST_DS(Dataset):
    
    def __init__(self, data_fp, num_classes=10, tfms=None, generate_data=False, *args, **kwargs):
        if generate_data == True:
            self.target, self.data = generate_data(*args, **kwargs)
            
        else:
            npz = np.load(data_fp, allow_pickle=True)
            self.data = npz['images']
            self.target = npz['labels']
            
        le = LabelEncoder()
        self.target = le.fit_transform(self.target)
            
        self.c = num_classes
        self.tfms = tfms
    
    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        cur_data = np.expand_dims(self.data[index], axis=-1)

        if self.tfms:
            cur_data = self.tfms(cur_data)
        
        target = int(self.target[index])
        img, target = cur_data, target
        
        return img, target

    def __len__(self):
        return len(self.data)
    
    @staticmethod
    def generate_data(*args, **kwargs):
        return labels_and_images(*args, **kwargs)

# Build Model

Extensive credits to https://github.com/ranihorev/Kuzushiji_MNIST

In [35]:
from fastai.vision import *
from torchvision.models.resnet import resnet18, ResNet, BasicBlock
from torchvision.datasets.mnist import MNIST
import torchvision.transforms as transforms
from torchvision.datasets.utils import makedir_exist_ok, download_url
import torch.utils.data as data
from torch.utils.data import BatchSampler, DataLoader, random_split
from torch.autograd import Variable

In [4]:
input_dir = 'data'
train_dir = input_dir + '/train_img_data.npz'

In [6]:
train_images = np.load(train_dir)['images']

In [36]:
data_mean = train_images.mean()
data_std = train_images.std()

default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

transform_train = transforms.Compose(
    [transforms.ToPILImage(), 
     transforms.ToTensor(),
     transforms.Normalize((data_mean,), (data_std,)),
    ])

transform_valid = transforms.Compose(
    [transforms.ToPILImage(), 
     transforms.ToTensor(),
     transforms.Normalize((data_mean,), (data_std,)),
    ])

total_ds = KujuMNIST_DS(train_dir, tfms=transform_train)

In [38]:
train_size = int(0.8 * len(total_ds))
test_size = len(total_ds) - train_size

train_ds, test_ds = random_split(total_ds, [train_size, test_size])

In [39]:
trn_dl = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=1, pin_memory=True)
val_dl = DataLoader(test_ds, batch_size=128, shuffle=True, num_workers=1, pin_memory=True)

In [40]:
databunch = DataBunch(train_dl=trn_dl, valid_dl=val_dl, device=default_device)

In [41]:
len(set(total_ds.target))

4212

In [42]:
class VGG(nn.Module):  
    """
    Based on - https://github.com/kkweon/mnist-competition
    """
    def two_conv_pool(self, in_channels, f1, f2):
        s = nn.Sequential(
            nn.Conv2d(in_channels, f1, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(f1),
            nn.ReLU(inplace=True),
            nn.Conv2d(f1, f2, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(f2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        for m in s.children():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        return s
    
    def three_conv_pool(self,in_channels, f1, f2, f3):
        s = nn.Sequential(
            nn.Conv2d(in_channels, f1, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(f1),
            nn.ReLU(inplace=True),
            nn.Conv2d(f1, f2, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(f2),
            nn.ReLU(inplace=True),
            nn.Conv2d(f2, f3, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(f3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        for m in s.children():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        return s
        
    
    def __init__(self, num_classes=10):
        super(VGG, self).__init__()
        self.l1 = self.two_conv_pool(1, 64, 64)
        self.l2 = self.two_conv_pool(64, 128, 128)
        self.l3 = self.three_conv_pool(128, 256, 256, 256)
        self.l4 = self.three_conv_pool(256, 256, 256, 256)
        
        self.classifier = nn.Sequential(
            nn.Dropout(p = 0.5),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(p = 0.5),
            nn.Linear(512, num_classes),
        )
    
    def forward(self, x):
        x = self.l1(x)
        x = self.l2(x)
        x = self.l3(x)
        x = self.l4(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return F.log_softmax(x, dim=1) 
              

In [48]:
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

    
class MyResNet(nn.Module):
    # Based on PyTorch ResNet-18
    
    def __init__(self, block, layers, num_classes=len(set(total_ds.target)), zero_init_residual=False):
        super(MyResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
                    
        self.classifier = nn.Sequential(
            nn.Dropout(p = 0.5),
            nn.Linear(512 * block.expansion, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(p = 0.5),
            nn.Linear(256, num_classes),
        )

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
#         import pdb; pdb.set_trace()
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)

        return F.log_softmax(x, dim=1)

class VGG_ResNet(nn.Module):
    def __init__(self):
        super(VGG_ResNet, self).__init__()
        self.vgg = VGG(len(set(total_ds.target)))
        self.resnet = MyResNet(BasicBlock, [2, 2, 2, 2])
    
    def forward(self, x):
        vgg_out = self.vgg(x)
        resnet_out = self.resnet(x)
        out = (vgg_out + resnet_out) / 2
        return out
    
def vgg_resnet_load_model(learner, vgg_name, resnet_name):
        device = learner.data.device
        vgg_state = torch.load(learner.path/learner.model_dir/f'{vgg_name}.pth', map_location=device)
        learner.model.vgg.load_state_dict(vgg_state['model'], strict=True)
        
        resnet_state = torch.load(learner.path/learner.model_dir/f'{resnet_name}.pth', map_location=device)
        learner.model.resnet.load_state_dict(resnet_state['model'], strict=True)

In [49]:
learn = Learner(databunch, VGG_ResNet(), metrics=accuracy)
learn.fit(1)

epoch,train_loss,valid_loss,accuracy,time
0,1.910229,673.799683,0.004645,7:43:52
