In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as T

import os
import random

# from models import *
# from loader import Loader, RotationLoader
# from utils import progress_bar
import numpy as np

import string
import glob 
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import cv2
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 1 Setting

## 1.1 Argument

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [5]:
best_acc = 0 # best test acc
start_epoch = 0 # start from epoch 0 OR checkpoint epoch

## 1.2 Data

In [6]:
DATASET = 'Large_Captcha'

In [7]:
all_letters = string.ascii_lowercase + string.digits + string.ascii_uppercase

mapping = {}      # key - num    & value - letter
mapping_inv = {}  # key - letter & value - num

for i, x in enumerate(all_letters):
    mapping[x] = i+1
    mapping_inv[i+1] = x

num_class = len(mapping)
num_class

62

In [6]:
class Loader_Captcha(Dataset):
    def __init__(self, is_train=True, transform=None):
        self.is_train = is_train
        self.transform = transform
        self.img_path = glob.glob('/data/jh/datasets/Large_Captcha_Dataset/*.png')

    def __len__(self):
        return len(self.img_path)

    def __getitem__(self, idx):
        #img = cv2.imread(self.img_path[idx])
        #img = Image.fromarray(img)
        img = Image.open(self.img_path[idx]).convert('L')
        if self.transform is not None:
            img = self.transform(img)
        
        img_name = self.img_path[idx].split('/')[-1]
        label = torch.IntTensor([mapping[i] for i in img_name.split('.')[0]]) #int(self.img_path[idx].split('/')[-1])
        
        return img, label

In [7]:
transform = T.Compose([
    T.ToTensor()
])

## Save a testset for active learning

## if have a pkl file, load the file.

In [8]:
import pickle
trainset = []
with open('./trainset.pkl', 'rb') as f:
    trainset = pickle.load(f)
len(trainset)

65862

In [9]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16, shuffle=True)
testloader  = torch.utils.data.DataLoader(trainset,  batch_size=8,  shuffle=False) # > trainset으로 변경해야 함. (active에 영향을 안주기위해서)
len(testloader.dataset)

65862

## 1.2 Model

In [10]:
class Bidirectional(nn.Module):
    def __init__(self, inp, hidden, out, lstm=True):
        super(Bidirectional, self).__init__()
        if lstm:
            self.rnn = nn.LSTM(inp, hidden, bidirectional=True)
        else:
            self.rnn = nn.GRU(inp, hidden, bidirectional=True)
        self.embedding = nn.Linear(hidden*2, out)
    def forward(self, X):
        recurrent, _ = self.rnn(X)
        out = self.embedding(recurrent)     
        return out

In [11]:
class CRNN(nn.Module):
    def __init__(self, in_channels, output):
        super(CRNN, self).__init__()

        self.cnn = nn.Sequential(
                nn.Conv2d(in_channels, 256, 9, stride=1, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(256),
                nn.MaxPool2d(3, 3),
                nn.Conv2d(256, 256, (4, 3), stride=1, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(256))
        
        self.linear = nn.Linear(20992, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.rnn = Bidirectional(256, 1024, output+1)

    def forward(self, X, y=None, criterion = None): # y is target.
        out = self.cnn(X)
        N, C, w, h = out.size()
        out = out.view(N, -1, h)
        out = out.permute(0, 2, 1)
        out = self.linear(out)

        out = out.permute(1, 0, 2)
        out = self.rnn(out)
            
        if y is not None:
            T = out.size(0)
            N = out.size(1)
        
            input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
            target_lengths = torch.full(size=(N,), fill_value=5, dtype=torch.int32)
        
            loss = criterion(out, y, input_lengths, target_lengths)
            
            return out, loss
        
        return out, None
    
    def _ConvLayer(self, inp, out, kernel, stride, padding, bn=False):
        if bn:
            conv = [
                nn.Conv2d(inp, out, kernel, stride=stride, padding=padding),
                nn.ReLU(),
                nn.BatchNorm2d(out)
            ]
        else:
            conv = [
                nn.Conv2d(inp, out, kernel, stride=stride, padding=padding),
                nn.ReLU()
            ]
        return nn.Sequential(*conv)

In [12]:
model = CRNN(in_channels=1, output=num_class)
model = model.to(device)
# print(net)

In [13]:
if device == 'cuda':
    #net = torch.nn.DataParallel(net) # 문현안쓸때만쓰기... 나는 0
    cudnn.benchmark = True

In [14]:
criterion = nn.CTCLoss() #nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4) # optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 90])

# 2 Training

In [15]:
def predict(outputs):
    result = []
    for i in range(len(outputs)):
        pred = []
        then = 0
        for x in outputs[i]:
            if then != x and x > 0 :
                pred.append(x)
                if len(pred) == 5:
                    break
            then = x
        if len(pred) < 5:
            for i in range(5-len(pred)):
                pred.append(0)
        result.append(pred)
    result = torch.LongTensor(result).cuda()
    return result

In [16]:
def train(epoch):
    print('\nEpoch: %d' % epoch)
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    
    tk = tqdm(trainloader, total=len(trainloader))
    #for batch_idx, (inputs, targets) in enumerate(trainloader):
    for inputs, targets in tk:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        
        out, loss = model(inputs, targets, criterion=criterion)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        predicted = predict(out.permute(1, 2, 0).max(1)[1])
        total += targets.size(0) * 5
        correct += predicted.eq(targets).sum().item()
        
        tk.set_postfix({'Train - Loss' : loss.item(), '& ACC':100.*correct/total})

# 3 Test

In [17]:
def test(epoch):
    global best_acc, DATASET
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        tk = tqdm(testloader, total=len(testloader))
#         for batch_idx, (inputs, targets) in enumerate(testloader):
        for inputs, targets in tk:
            inputs, targets = inputs.to(device), targets.to(device)
            out, loss = model(inputs, targets, criterion=criterion)
            
            test_loss += loss.item()
            predicted = predict(out.permute(1, 2, 0).max(1)[1])
            total += targets.size(0) * 5
            correct += predicted.eq(targets).sum().item()

            tk.set_postfix({'Test - Loss' : loss.item(), '& ACC':100.*correct/total})

    # Save checkpoint.
    acc = 100.*correct/total
    with open('./best_test_'+DATASET+'.txt','a') as f:
        f.write(str(acc)+'/'+str(test_loss)+':'+str(epoch)+'\n')
    if acc > best_acc:
        print('Saving..')
        state = {
            'model': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        # save rotation weights
        torch.save(state, './checkpoint/test_'+DATASET+'.pth')
        best_acc = acc

# 4 Run

In [18]:
for epoch in range(start_epoch, start_epoch+10): # default : 120
    train(epoch)
    test(epoch)
    scheduler.step()


Epoch: 0


100%|████████| 4117/4117 [16:07<00:00,  4.26it/s, Train - Loss=1.12, & ACC=1.68]
100%|███████████| 8233/8233 [07:10<00:00, 19.13it/s, Test - Loss=2.07, & ACC=30]


Saving..

Epoch: 1


100%|██████| 4117/4117 [15:50<00:00,  4.33it/s, Train - Loss=-.0819, & ACC=87.2]
100%|████████| 8233/8233 [06:42<00:00, 20.47it/s, Test - Loss=0.382, & ACC=96.2]


Saving..

Epoch: 2


100%|███████| 4117/4117 [15:45<00:00,  4.35it/s, Train - Loss=0.192, & ACC=96.9]
100%|████████| 8233/8233 [06:40<00:00, 20.54it/s, Test - Loss=0.378, & ACC=98.2]


Saving..

Epoch: 3


100%|██████| 4117/4117 [15:45<00:00,  4.35it/s, Train - Loss=0.0264, & ACC=98.2]
100%|████████| 8233/8233 [06:41<00:00, 20.49it/s, Test - Loss=0.134, & ACC=98.4]


Saving..

Epoch: 4


100%|███████| 4117/4117 [15:45<00:00,  4.35it/s, Train - Loss=0.179, & ACC=98.7]
100%|███████| 8233/8233 [06:41<00:00, 20.51it/s, Test - Loss=-.0634, & ACC=97.1]



Epoch: 5


100%|█████████| 4117/4117 [15:47<00:00,  4.35it/s, Train - Loss=-.146, & ACC=99]
100%|██████████| 8233/8233 [06:41<00:00, 20.52it/s, Test - Loss=0.329, & ACC=99]


Saving..

Epoch: 6


100%|██████| 4117/4117 [15:45<00:00,  4.35it/s, Train - Loss=0.0348, & ACC=99.2]
100%|████████| 8233/8233 [06:41<00:00, 20.53it/s, Test - Loss=0.224, & ACC=99.2]


Saving..

Epoch: 7


100%|███████| 4117/4117 [15:45<00:00,  4.35it/s, Train - Loss=0.096, & ACC=99.3]
100%|███████| 8233/8233 [06:41<00:00, 20.52it/s, Test - Loss=-.0895, & ACC=99.4]


Saving..

Epoch: 8


100%|██████| 4117/4117 [15:47<00:00,  4.35it/s, Train - Loss=-.0694, & ACC=99.4]
100%|████████| 8233/8233 [06:41<00:00, 20.50it/s, Test - Loss=0.223, & ACC=99.3]



Epoch: 9


100%|██████| 4117/4117 [15:46<00:00,  4.35it/s, Train - Loss=-.0307, & ACC=99.5]
100%|███████| 8233/8233 [06:42<00:00, 20.46it/s, Test - Loss=-.0195, & ACC=99.7]


Saving..


# 5 Make batches >> batch size 1로 하고 진행 & trainset을 쪼개야 함

In [19]:
batchloader  = torch.utils.data.DataLoader(trainset,  batch_size=1,  shuffle=False)

In [20]:
def make_batch():
    global best_acc, DATASET
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    
    # 같은 이름 파일 초기화
    with open('./make_batches_'+DATASET+'.txt','w') as f:
        f.write('')

    with torch.no_grad():
        tk = tqdm(batchloader, total=len(batchloader))
        for inputs, targets in tk:
            inputs, targets = inputs.to(device), targets.to(device)
            out, loss = model(inputs, targets, criterion=criterion)
            
            test_loss += loss.item()
            predicted = predict(out.permute(1, 2, 0).max(1)[1])
            total += targets.size(0) * 5
            correct += predicted.eq(targets).sum().item()
            
            label = ''
            for p in targets[0]:
                label += mapping_inv[p.item()]
            s = str(float(loss))+'_/data/jh/datasets/Large_Captcha_Dataset/'+label+'.png\n'
            with open('./make_batches_'+DATASET+'.txt','a') as f:
                f.write(s)

In [21]:
make_batch()

100%|█████████████████████████████████████| 65862/65862 [15:10<00:00, 72.37it/s]


In [8]:
with open('./make_batches_'+DATASET+'.txt', 'r') as f:
    losses = f.readlines()
len(losses)

65862

In [9]:
loss_1 = []
name_2 = []
for j in losses:
    loss_1.append(j[:-1].split('_/')[0])
    name_2.append('/'+j[:-1].split('_/')[1])
loss_1[0], name_2[0]

('0.032308224588632584', '/data/jh/datasets/Large_Captcha_Dataset/b9c23.png')

In [10]:
s = np.array(loss_1)
sort_index = np.argsort(s)
x = sort_index.tolist()
x.reverse()
sort_index = np.array(x) # convert to high loss first

In [15]:
if not os.path.isdir('loss_'+DATASET):
    os.mkdir('loss_'+DATASET)

In [16]:
batch_file_size = 5000

In [17]:
# loss txt 기반으로 10등분
for i in range(10):
    # sample minibatch from unlabeled pool 
    sample = sort_index[i*batch_file_size:(i+1)*batch_file_size]
    b = np.zeros(1)
    for jj in sample:
        b[0] +=1
    print(f'{i} Class Distribution: {b}')
    s = './loss_'+DATASET+'/batch_' + str(i) + '.txt'
    for k in sample:
        with open(s, 'a') as f:
            f.write(name_2[k]+'\n')

0 Class Distribution: [5000.]
1 Class Distribution: [5000.]
2 Class Distribution: [5000.]
3 Class Distribution: [5000.]
4 Class Distribution: [5000.]
5 Class Distribution: [5000.]
6 Class Distribution: [5000.]
7 Class Distribution: [5000.]
8 Class Distribution: [5000.]
9 Class Distribution: [5000.]
