In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as T

import os
import random
import numpy as np

import string
import glob 
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import cv2
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import IPython.display as ipd

# 1 Setting

## 1.1 Argument

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
best_acc = 0 # best test acc
start_epoch = 0 # start from epoch 0 OR checkpoint epoch
learning_rate = 1e-4

## 1.2 Data

In [4]:
PATH_SAVE_DATASET = 'Large_Captcha_5000'

In [5]:
all_letters = string.ascii_lowercase + string.digits + string.ascii_uppercase

mapping = {}      # key - num    & value - letter
mapping_inv = {}  # key - letter & value - num

for i, x in enumerate(all_letters):
    mapping[x] = i+1
    mapping_inv[i+1] = x

num_class = len(mapping)
num_class

62

In [6]:
class Loader_Captcha(Dataset):
    def __init__(self, is_train=True, transform=None):
        self.is_train = is_train
        self.transform = transform
        self.img_path = glob.glob('/data/jh/datasets/Large_Captcha_Dataset/*.png')

    def __len__(self):
        return len(self.img_path)

    def __getitem__(self, idx):
        #img = cv2.imread(self.img_path[idx])
        #img = Image.fromarray(img)
        img = Image.open(self.img_path[idx]).convert('L')
        if self.transform is not None:
            img = self.transform(img)
        
        img_name = self.img_path[idx].split('/')[-1]
        label = torch.IntTensor([mapping[i] for i in img_name.split('.')[0]]) #int(self.img_path[idx].split('/')[-1])
        
        return img, label

In [7]:
transform = T.Compose([
    T.ToTensor()
])

## Save a testset for active learning

## if have a pkl file, load the file.

In [8]:
import pickle
trainset = []
with open('./trainset.pkl', 'rb') as f:
    trainset = pickle.load(f)
len(trainset)

65862

In [9]:
trainloader = torch.utils.data.DataLoader(trainset[1000:6000], batch_size=16, shuffle=True)
testloader  = torch.utils.data.DataLoader(trainset[:1000],  batch_size=8,  shuffle=False) # > trainset으로 변경해야 함. (active에 영향을 안주기위해서)
len(trainloader.dataset), len(testloader.dataset)

(5000, 1000)

## 1.2 Model

In [10]:
class Bidirectional(nn.Module):
    def __init__(self, inp, hidden, out, lstm=True):
        super(Bidirectional, self).__init__()
        if lstm:
            self.rnn = nn.LSTM(inp, hidden, bidirectional=True)
        else:
            self.rnn = nn.GRU(inp, hidden, bidirectional=True)
        self.embedding = nn.Linear(hidden*2, out)
    def forward(self, X):
        recurrent, _ = self.rnn(X)
        out = self.embedding(recurrent)     
        return out

In [11]:
class CRNN(nn.Module):
    def __init__(self, in_channels, output):
        super(CRNN, self).__init__()

        self.cnn = nn.Sequential(
                nn.Conv2d(in_channels, 256, 9, stride=1, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(256),
                nn.MaxPool2d(3, 3),
                nn.Conv2d(256, 256, (4, 3), stride=1, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(256))
        
        self.linear = nn.Linear(20992, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.rnn = Bidirectional(256, 1024, output+1)

    def forward(self, X, y=None, criterion = None): # y is target.
        out = self.cnn(X)
        N, C, w, h = out.size()
        out = out.view(N, -1, h)
        out = out.permute(0, 2, 1)
        out = self.linear(out)

        out = out.permute(1, 0, 2)
        out = self.rnn(out)
            
        if y is not None:
            T = out.size(0)
            N = out.size(1)
        
            input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
            target_lengths = torch.full(size=(N,), fill_value=5, dtype=torch.int32)
        
            loss = criterion(out, y, input_lengths, target_lengths)
            
            return out, loss
        
        return out, None
    
    def _ConvLayer(self, inp, out, kernel, stride, padding, bn=False):
        if bn:
            conv = [
                nn.Conv2d(inp, out, kernel, stride=stride, padding=padding),
                nn.ReLU(),
                nn.BatchNorm2d(out)
            ]
        else:
            conv = [
                nn.Conv2d(inp, out, kernel, stride=stride, padding=padding),
                nn.ReLU()
            ]
        return nn.Sequential(*conv)

In [12]:
model = CRNN(in_channels=1, output=num_class)
model = model.to(device)

In [13]:
if device == 'cuda':
    cudnn.benchmark = True

In [14]:
criterion = nn.CTCLoss() #nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate) # optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 90])

# 2 Training

In [19]:
def predict(outputs):
    result = []
    for i in range(len(outputs)):
        pred = []
        then = 0
        for x in outputs[i]:
            if then != x and x > 0 :
                pred.append(x)
                if len(pred) == 5:
                    break
            then = x
        if len(pred) < 5:
            for i in range(5-len(pred)):
                pred.append(0)
        result.append(pred)
    result = torch.LongTensor(result).cuda()
    return result

In [20]:
def train(epoch):
    print('\nEpoch: %d' % epoch)
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    
    tk = tqdm(trainloader, total=len(trainloader))
    for inputs, targets in tk:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        
        out, loss = model(inputs, targets, criterion=criterion)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        predicted = predict(out.permute(1, 2, 0).max(1)[1])
        total += targets.size(0)
        for i in range(len(predicted)):
            correct += torch.equal(predicted[i], targets[i]) 
        
        tk.set_postfix({'Train - Loss' : loss.item(), '& ACC':100.*correct/total})
    
    return 100.*correct/total, train_loss/total

# 3 Test

In [21]:
def test(epoch):
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        tk = tqdm(testloader, total=len(testloader))
        for inputs, targets in tk:
            inputs, targets = inputs.to(device), targets.to(device)
            out, loss = model(inputs, targets, criterion=criterion)
            
            test_loss += loss.item()
            predicted = predict(out.permute(1, 2, 0).max(1)[1])
            total += targets.size(0)
            for i in range(len(predicted)):
                correct += torch.equal(predicted[i], targets[i]) 

            tk.set_postfix({'Test - Loss' : loss.item(), '& ACC':100.*correct/total})

    # Save checkpoint.
    acc = 100.*correct/total
    with open('./0_backbone_model_best_test_'+PATH_SAVE_DATASET+'.txt','a') as f:
        f.write(str(acc)+'/'+str(test_loss)+':'+str(epoch)+'\n')
    if acc > best_acc:
        print('Saving..')
        state = {
            'model': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        # save rotation weights
        torch.save(state, './checkpoint/0_backbone_model_'+PATH_SAVE_DATASET+'.pth')
        best_acc = acc
        
    return 100.*correct/total, test_loss/total

# 4 Run

In [22]:
def save_acc_and_loss(epoch, acc, loss, is_train = True):
    task_name = 'epoch - '+str(epoch)+'\ntrain > ' if is_train else 'test > '
    file_path = './0_backbone_model_result_'+PATH_SAVE_DATASET+'.txt'
    with open(file_path,'a') as f:
        f.write(task_name + 'acc_' + str(acc) + ' & loss_' + str(loss) +'\n')

In [23]:
history_train = {'acc':[], 'loss':[]} 
history_test  = {'acc':[], 'loss':[]}

for epoch in range(100):
    print('===== HISTORY =====')
    print('train > ', str(history_train))
    print('test > ', str(history_test))
    
    acc, loss = train(epoch)
    history_train['acc'].append(acc)
    history_train['loss'].append(loss)
    save_acc_and_loss(epoch, acc, loss, is_train = True)
    
    acc, loss = test(epoch)
    history_test['acc'].append(acc)
    history_test['loss'].append(loss)
    save_acc_and_loss(epoch, acc, loss, is_train = False)
       
    ipd.clear_output(wait=True)
    scheduler.step()

===== HISTORY =====
train >  {'acc': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.34, 12.4, 47.02, 66.72, 77.2, 83.24, 88.16, 90.84, 93.4, 95.5, 96.04, 97.12, 97.9, 98.5, 98.48, 99.56, 99.94, 99.96, 99.96, 99.96, 99.96, 99.96, 99.96, 99.96, 99.96, 99.96, 99.98, 99.98, 99.98, 99.98, 99.98], 'loss': [0.22594706927835942, 0.25982749943733213, 0.2616117265224457, 0.2616815471172333, 0.2616121827602386, 0.2617728785514832, 0.2618544746875763, 0.2616652231693268, 0.2617204129219055, 0.2607871565818787, 0.26032606320381163, 0.25816235246658326, 0.2560550127506256, 0.2532932361125946, 0.2485858654975891, 0.20055346059799195, 0.08604959109425545, 0.038264464914798735, 0.023760358621552586, 0.01703615467827767, 0.013071761648729443, 0.01020415249299258, 0.008088611452095211, 0.006741082617640495, 0.005523762807808816, 0.004794246220961213, 0.004029096635617316, 0.0036001272475346922, 0.0031026557456701994, 0.0029781145986169577, 0.0017045613896567375, 0.001274680

 23%|██       | 72/313 [00:16<00:54,  4.41it/s, Train - Loss=-.00534, & ACC=100]


KeyboardInterrupt: 

# 5 Make batches >> batch size 1로 하고 진행 & trainset을 쪼개야 함

In [24]:
batchloader  = torch.utils.data.DataLoader(trainset[6000:],  batch_size=1,  shuffle=False)

In [25]:
def make_batch():
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    
    # 같은 이름 파일 초기화
    with open('./1_make_batches_'+PATH_SAVE_DATASET+'.txt','w') as f:
        f.write('')

    with torch.no_grad():
        tk = tqdm(batchloader, total=len(batchloader))
        for inputs, targets in tk:
            inputs, targets = inputs.to(device), targets.to(device)
            out, loss = model(inputs, targets, criterion=criterion)
            
            test_loss += loss.item()
            predicted = predict(out.permute(1, 2, 0).max(1)[1])
            total += targets.size(0) * 5
            correct += predicted.eq(targets).sum().item()
            
            label = ''
            for p in targets[0]:
                label += mapping_inv[p.item()]
            s = str(float(loss))+'_/data/jh/datasets/Large_Captcha_Dataset/'+label+'.png\n'
            with open('./1_make_batches_'+PATH_SAVE_DATASET+'.txt','a') as f:
                f.write(s)

In [None]:
make_batch()

 20%|███████▎                             | 11905/59862 [02:36<10:30, 76.04it/s]

In [33]:
with open('./1_make_batches_'+PATH_SAVE_DATASET+'.txt', 'r') as f:
    losses = f.readlines()
len(losses)

59862

In [34]:
loss_1 = []
name_2 = []
for j in losses:
    loss_1.append(j[:-1].split('_/')[0])
    name_2.append('/'+j[:-1].split('_/')[1])
loss_1[0], name_2[0]

('0.3057868480682373', '/data/jh/datasets/Large_Captcha_Dataset/s4JfO.png')

In [35]:
s = np.array(loss_1)
sort_index = np.argsort(s)
x = sort_index.tolist()
x.reverse()
sort_index = np.array(x) # convert to high loss first

In [36]:
if not os.path.isdir('loss_'+PATH_SAVE_DATASET):
    os.mkdir('loss_'+PATH_SAVE_DATASET)

In [37]:
batch_file_size = 5000

In [None]:
# loss txt 기반으로 10등분
for i in range(10):
    # sample minibatch from unlabeled pool 
    sample = sort_index[i*batch_file_size:(i+1)*batch_file_size]
    b = np.zeros(1)
    for jj in sample:
        b[0] +=1
    print(f'{i} Class Distribution: {b}')
    s = './loss_'+PATH_SAVE_DATASET+'/batch_' + str(i) + '.txt'
    for k in sample:
        with open(s, 'a') as f:
            f.write(name_2[k]+'\n')