## Import

In [None]:
# !pip install lmdb pillow nltk natsort

In [None]:
import os
import random
import sys

sys.path.append("./textocr")

import albumentations as A
import cv2
import numpy as np
import optuna
import pandas as pd
import torch

from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import StratifiedKFold, train_test_split
from textocr.train import train

In [None]:
import warnings

# Install a blanket "ignore" filter so no warning is shown from here on.
warnings.filterwarnings('ignore')

In [None]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Hyperparameter Setting

In [None]:
# Hyperparameter settings, stored as a pandas Series so that each entry can
# be read and updated with attribute syntax (e.g. ``opt.batch_size``).
opt = pd.Series(dict(
    # --- experiment, data locations and schedule ---
    exp_name='None',
    train_data='./result/',
    valid_data='./result/valid',
    manualSeed=41,
    workers=0,
    batch_size=96,
    num_iter=30000,
    valInterval=3000,
    saved_model='',

    # --- optimisation and data-selection options ---
    FT=False,
    adam=False,
    lr=1,
    beta1=0.9,
    rho=0.95,
    eps=1e-8,
    grad_clip=5,
    baiduCTC=False,
    select_data='train',
    batch_ratio='1',
    total_data_usage_ratio='1',
    batch_max_length=6,

    # --- input, character set and model architecture options ---
    imgH=64,
    imgW=100,
    rgb=False,
    character='',
    sensitive=False,
    PAD=False,
    data_filtering_off=False,
    Transformation='TPS',  # None|TPS
    FeatureExtraction='ResNet',  # VGG|ResNet|RCNN
    SequenceModeling='BiLSTM',  # None|BiLSTM
    Prediction='Attn',  # CTC|Attn
    num_fiducial=20,
    input_channel=1,
    output_channel=512,
    hidden_size=256,
    n_splits=5,
    verbose=0,
))

## Fix the Random Seed

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here for reproducible runs.

    Seeds Python's ``random`` module, sets ``PYTHONHASHSEED``, seeds NumPy
    and PyTorch (CPU and every CUDA device), and configures cuDNN so that
    it does not introduce run-to-run variation.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between
    # runs, which defeats the purpose of fixing the seed; keep it disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(opt.manualSeed)  # fix the random seed

## Data Load & Train/Validation Split

In [None]:
# Load the training table and store each label's character count in 'len'.
df = pd.read_csv('./train.csv')
df = df.assign(len=df['label'].str.len())
print(df['len'].max())

# The vocabulary of the single-character samples in the provided training
# data contains every character found in the train/test data, so those
# samples are set aside first and always used for training.
train_v1 = df.loc[df['len'].eq(1)]
df = df.loc[df['len'].gt(1)]

In [None]:
# Build the training label set: concatenate the unique single-character
# labels into one string and register it as the recognised characters.
str_dict = pd.Series(pd.unique(train_v1['label'])).str.cat()
opt['character'] = str_dict
len(str_dict)

## Train

In [None]:
def objective(trial):
    """Return the mean fold accuracy for one set of augmentation probabilities.

    Samples nine probabilities ``p0`` .. ``p8`` (0.0 to 1.0 in steps of 0.1)
    from ``trial``, uses them as the ``p`` of the augmentations stored in
    ``opt.transforms``, then calls ``train(opt)`` once per fold.

    Args:
        trial: Object exposing ``suggest_float`` (an Optuna trial).

    Returns:
        The mean, over ``opt.n_splits`` folds, of the first value returned
        by ``train(opt)``.

    Note:
        Mutates the global ``opt`` (``transforms``, ``exp_name``,
        ``valid_data``, ``select_data``, ``batch_ratio``) and creates
        ``./saved_models/<exp_name>`` for every fold.
    """
    (
        p_clahe, p_jitter, p_emboss, p_sharpen,
        p_adv_blur, p_gauss_blur,
        p_dropout,
        p_elastic, p_rotate,
    ) = [trial.suggest_float(f'p{idx}', 0.0, 1.0, step=0.1) for idx in range(9)]

    # Pixel-level transforms
    pixel_ops = [
        A.CLAHE(p=p_clahe),
        A.ColorJitter(p=p_jitter),
        A.Emboss(alpha=(0.2, 0.9), strength=(0.3, 1.0), p=p_emboss),
        A.Sharpen(alpha=(0.2, 0.6), lightness=(0.6, 0.9), p=p_sharpen),
    ]
    # Blur
    blur_ops = [
        A.AdvancedBlur(p=p_adv_blur),
        A.GaussianBlur(p=p_gauss_blur),
    ]
    # Dropout
    dropout_ops = [
        A.CoarseDropout(max_holes=20, min_holes=5, max_height=8, max_width=8, p=p_dropout),
    ]
    # Geometric
    geometric_ops = [
        A.ElasticTransform(alpha=1.0, sigma=10, alpha_affine=10,
                           interpolation=cv2.INTER_CUBIC, p=p_elastic),
        A.Rotate(limit=(-30, 30), interpolation=cv2.INTER_CUBIC, p=p_rotate),
    ]
    opt.transforms = A.Compose(pixel_ops + blur_ops + dropout_ops + geometric_ops)

    fold_scores = []
    for fold in range(opt.n_splits):
        name_parts = (
            opt.Transformation,
            opt.FeatureExtraction,
            opt.SequenceModeling,
            opt.Prediction,
            f'Seed{opt.manualSeed}',
            f'fold{fold}',
        )
        opt.exp_name = '-'.join(str(part) for part in name_parts)
        os.makedirs('./saved_models/' + opt.exp_name, exist_ok=True)

        # Point this run at the fold's own training and validation data.
        opt.valid_data = f'./result/valid_{fold}'
        opt.select_data = f'train_{fold}'
        opt.batch_ratio = '1'

        # train() returns a pair; only its first element is averaged.
        accuracy, _ = train(opt)
        fold_scores.append(accuracy)

    return np.mean(fold_scores)

In [None]:
# # Create the folder that will hold the gt files
# os.makedirs(f'./gt_file', exist_ok=True)

# kf = StratifiedKFold(n_splits=opt.n_splits, random_state=opt.manualSeed, shuffle=True)  # run K-Fold while keeping the data distribution balanced
# for i, (t_idx, v_idx) in enumerate(kf.split(df, df['len'])):
#     train_v2 = df.iloc[t_idx]
#     val_df = df.iloc[v_idx]
    
#     # Concatenate the single-character samples set aside for training with the split samples of two or more characters to form the final training data
#     train_df = pd.concat([train_v1, train_v2])
    
#     # Create the gt files
#     train_df.drop(['id', 'len'], axis=1).to_csv(f'./gt_file/train_{i}.txt', sep='\t', header=False, index=False)
#     val_df.drop(['id', 'len'], axis=1).to_csv(f'./gt_file/valid_{i}.txt', sep='\t', header=False, index=False)
    
#     # Create the lmdb datasets
#     os.system(f'python ./textocr/create_lmdb_dataset.py --inputPath \'\' --gtFile ./gt_file/train_{i}.txt --outputPath ./result/train_{i} --file_size 1')
#     os.system(f'python ./textocr/create_lmdb_dataset.py --inputPath \'\' --gtFile ./gt_file/valid_{i}.txt --outputPath ./result/valid_{i} --file_size 1')

In [None]:
""" Seed and GPU setting """
# Record how many CUDA devices torch reports.
opt.num_gpu = torch.cuda.device_count()

# With more than one GPU, scale the worker count and the batch size by the
# number of devices.
if opt.num_gpu > 1:
    print(
        '------ Use multi-GPU setting ------',
        'if you stuck too long time with multi-GPU setting, try to set --workers 0',
        sep='\n',
    )
    # check multi-GPU issue https://github.com/clovaai/deep-text-recognition-benchmark/issues/1
    opt.workers *= opt.num_gpu
    opt.batch_size *= opt.num_gpu


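# NOTE: objective() returns a mean accuracy, so the study has to maximize it;
# optuna.create_study() minimizes by default.
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)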

## Train 2

In [None]:
# Export every training sample (all columns except 'id') as a tab-separated,
# header-less gt file for the final training run.
df = pd.read_csv('./train.csv')
# to_csv does not create missing parent directories, and the only other
# makedirs call for ./gt_file in this notebook is commented out.
os.makedirs('./gt_file', exist_ok=True)
df.drop('id', axis=1).to_csv('./gt_file/final_train.txt', sep='\t', header=False, index=False)

In [None]:
# !python ./textocr/create_lmdb_dataset.py --inputPath '' --gtFile ./gt_file/final_train.txt --outputPath ./result/final_train --file_size 1

In [None]:
# Name the final experiment after the model configuration and seed, and make
# sure its checkpoint directory exists.
opt.exp_name = '-'.join(
    str(part)
    for part in (
        opt.Transformation,
        opt.FeatureExtraction,
        opt.SequenceModeling,
        opt.Prediction,
        f'Seed{opt.manualSeed}',
        'Final',
    )
)
os.makedirs('./saved_models/' + opt.exp_name, exist_ok=True)
print(opt.exp_name)

In [None]:
# Schedule and logging for the final run.
opt.num_iter = 50000
opt.valInterval = 1000
opt.verbose = 1

# Data selection for the final run.
# NOTE(review): valid_data points at the 'final_train' dataset as well, so
# validation presumably runs on data seen during training - confirm intent.
opt.batch_ratio = '1'
opt.select_data = 'final_train'
opt.valid_data = './result/final_train'

# Augmentation candidates that are currently disabled, kept for reference:
#
# Transform
# A.SomeOf(transforms=[A.CLAHE(p=0.7),
#                      A.ColorJitter(p=0.7),
#                      A.Emboss(alpha=(0.2, 0.9), strength=(0.3, 1.0), p=0.7),
#                      A.Sharpen(alpha=(0.2, 0.6), lightness=(0.6, 0.9), p=0.7)],
#          n=3),
#
# Noise
# A.GaussNoise(p=0.2),
#
# Blur
# A.OneOf([A.AdvancedBlur(p=0.4),
#          A.GaussianBlur(p=0.4)]),
#
# Dropout
# A.CoarseDropout(max_holes=20, min_holes=5,
#                 max_height=8, max_width=8,
#                 p=0.4),

# Active pipeline: a single geometric step choosing between an elastic
# distortion and a rotation of up to 15 degrees.
opt.transforms = A.Compose(
    transforms=[
        A.OneOf(
            transforms=[
                A.ElasticTransform(alpha=1.0, sigma=10, alpha_affine=10,
                                   interpolation=cv2.INTER_CUBIC, p=0.4),
                A.Rotate(limit=(-15, 15), interpolation=cv2.INTER_CUBIC, p=0.4),
            ],
        ),
    ],
)

train(opt)

## Inference

In [None]:
# Inference inputs: the test data location and the best_accuracy.pth
# checkpoint saved under the current experiment name.
opt.test_data = './test'
opt.saved_model = './saved_models/' + str(opt.exp_name) + '/best_accuracy.pth'

In [None]:
# Imported here, in the inference cell, rather than with the other imports.
from textocr.test import test

# Run inference with the current options; the return value is later written
# into the submission's 'label' column.
result = test(opt)

## Submission

In [None]:
# Load the sample submission and set its 'label' column to the predictions.
submit = pd.read_csv('./sample_submission.csv').assign(label=result)

In [None]:
# Write the submission as UTF-8 CSV without the DataFrame index column.
submit.to_csv('./submission.csv', index=False, encoding='utf8')

In [None]:
# Display the submission table as the cell output.
submit