## Import

In [None]:
# !pip install lmdb pillow nltk natsort

In [None]:
import os
import random
import sys
sys.path.append("./textocr")

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split

In [None]:
import warnings
# Install a global "ignore" filter so that every warning is suppressed from here on.
warnings.filterwarnings(action='ignore')

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Hyperparameter Setting

In [None]:
# Hyperparameter settings, stored in a pandas Series so that every option can be
# read with attribute access (e.g. ``opt.batch_size``).
opt = pd.Series(dict(
    # Experiment bookkeeping and data locations
    exp_name='None',
    train_data='./result/',
    valid_data='./result/valid',
    manualSeed=41,
    workers=0,
    batch_size=96,
    num_iter=10000,
    valInterval=100,
    saved_model='',

    # Optimisation and data-selection options
    FT=False,
    adam=False,
    lr=1,
    beta1=0.9,
    rho=0.95,
    eps=1e-8,
    grad_clip=5,
    baiduCTC=False,
    select_data='train',
    batch_ratio='1',
    total_data_usage_ratio='1',
    batch_max_length=6,

    # Input and model-architecture options
    imgH=32,
    imgW=100,
    rgb=False,
    character='',
    sensitive=False,
    PAD=False,
    data_filtering_off=False,
    Transformation='TPS',  # None|TPS
    FeatureExtraction='ResNet',  # VGG|ResNet|RCNN
    SequenceModeling='BiLSTM',  # None|BiLSTM
    Prediction='Attn',  # CTC|Attn
    num_fiducial=20,
    input_channel=1,
    output_channel=512,
    hidden_size=256,
))

## Fix Random Seed

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU and CUDA). It is also exported as ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # The hash seed of the running interpreter is fixed at startup, so this only
    # takes effect in interpreters launched afterwards (e.g. subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it has
    # to be disabled for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(opt.manualSeed)  # Fix the random seed

## Data Load & Train/Validation Split

In [None]:
# Load the training annotations and record the character length of each label.
df = pd.read_csv('./train.csv')
label_lengths = df['label'].str.len()
df['len'] = label_lengths
print(label_lengths.max())


# The vocabulary of the single-character samples covers every character that
# appears in the train/test data, so those samples always go to the training set.
train_v1 = df.loc[df['len'].eq(1)]

In [None]:
# Split the samples with two or more characters into Train (80%) / Validation (20%),
# stratified by label length so both parts have the same length distribution.
df = df[df['len'] > 1]
# Only the frame itself is split: passing df['len'] as a second array merely
# produced two extra outputs that were thrown away. The selected rows depend only
# on `stratify` and `random_state`, so the resulting split is unchanged.
train_v2, val = train_test_split(
    df,
    test_size=0.2,
    random_state=opt.manualSeed,
    shuffle=True,
    stratify=df['len'],
)

In [None]:
# Final training set: the single-character samples set aside earlier plus the
# training part of the split over the longer samples.
train = pd.concat(objs=[train_v1, train_v2])
print(f'{len(train)} {len(val)}')

In [None]:
# Build the character set: the distinct single-character labels, in order of first
# appearance, joined into one string.
str_dict = train_v1['label'].drop_duplicates().str.cat()
opt.character = str_dict
len(str_dict)

In [None]:
# Write the ground-truth files: tab-separated, no header, no index, with the
# 'id' and 'len' columns removed.
os.makedirs('./gt_file', exist_ok=True)

gt_excluded_columns = ['id', 'len']
train.drop(columns=gt_excluded_columns).to_csv('./gt_file/train.txt', sep='\t', header=False, index=False)
val.drop(columns=gt_excluded_columns).to_csv('./gt_file/valid.txt', sep='\t', header=False, index=False)

In [None]:
# Run the project's create_lmdb_dataset.py script once per split: it is given the
# ground-truth file of the split and writes to ./result/train and ./result/valid.
!python ./textocr/create_lmdb_dataset.py --inputPath '' --gtFile ./gt_file/train.txt --outputPath ./result/train --file_size 1
!python ./textocr/create_lmdb_dataset.py --inputPath '' --gtFile ./gt_file/valid.txt --outputPath ./result/valid --file_size 1

## Train

In [None]:
# When no experiment name was given, derive one from the model configuration and seed.
if opt.exp_name == 'None':
    opt.exp_name = f'{opt.Transformation}-{opt.FeatureExtraction}-{opt.SequenceModeling}-{opt.Prediction}-Seed{opt.manualSeed}'
    print(opt.exp_name)

# Make sure the output directory for this experiment exists.
os.makedirs(f'./saved_models/{opt.exp_name}', exist_ok=True)

# GPU setting
# `num_gpu` is not yet a key of the Series. Attribute assignment would only attach
# a plain Python attribute that is not part of the Series data (and is lost on
# copy or iteration), so the entry is created with item assignment instead.
# Reading it back as `opt.num_gpu` keeps working.
opt['num_gpu'] = torch.cuda.device_count()

if opt.num_gpu > 1:
    print('------ Use multi-GPU setting ------')
    print('if you stuck too long time with multi-GPU setting, try to set --workers 0')
    # check multi-GPU issue https://github.com/clovaai/deep-text-recognition-benchmark/issues/1
    # Scale the worker count and the batch size with the number of GPUs.
    opt.workers = opt.workers * opt.num_gpu
    opt.batch_size = opt.batch_size * opt.num_gpu

In [None]:
# NOTE: this import rebinds the name `train`. The DataFrame that was assigned to
# `train` in the data-split section is no longer reachable under that name.
from textocr.train import train

# Call the project's `train` entry point with the options collected in `opt`.
train(opt)

## Train 2: Final Training on the Full Dataset

In [None]:
# Reload the complete training annotations and write them, without the 'id'
# column, as the ground-truth file for the final run.
df = pd.read_csv('./train.csv')
final_gt = df.drop(columns='id')
final_gt.to_csv('./gt_file/final_train.txt', sep='\t', header=False, index=False)

In [None]:
# Build the LMDB dataset from the full ground-truth file. The output path is
# ./result/train, the same directory the first-stage training set was written to.
!python ./textocr/create_lmdb_dataset.py --inputPath '' --gtFile ./gt_file/final_train.txt --outputPath ./result/train --file_size 1

In [None]:
# Name the final run after the model configuration and seed, with a "-Final" suffix,
# and make sure its output directory exists.
final_run_name = f'{opt.Transformation}-{opt.FeatureExtraction}-{opt.SequenceModeling}-{opt.Prediction}-Seed{opt.manualSeed}-Final'
opt.exp_name = final_run_name
os.makedirs(f'./saved_models/{final_run_name}', exist_ok=True)
print(final_run_name)

In [None]:
# Options for the final run. `select_data` and `batch_ratio` are set back to their
# initial string values (presumably because the earlier train(opt) call may have
# changed them; confirm against textocr.train).
opt['select_data'] = 'train'
opt['batch_ratio'] = '1'
# Validation now reads from the training LMDB directory.
opt['valid_data'] = './result/train'
opt['num_iter'] = 15000
opt['valInterval'] = 500
train(opt)

## Inference

In [None]:
# Point inference at the best_accuracy.pth checkpoint of the current experiment.
opt.saved_model = f'./saved_models/{opt.exp_name}/best_accuracy.pth'
# `test_data` is not yet a key of the Series. Attribute assignment would only attach
# a plain Python attribute that is not part of the Series data, so the entry is
# created with item assignment. Reading it back as `opt.test_data` keeps working.
opt['test_data'] = './test'

In [None]:
from textocr.test import test

# Call the project's `test` entry point with `opt`; its return value is used as
# the 'label' column of the submission.
result = test(opt)

## Submission

In [None]:
# Start from the sample submission and fill its 'label' column with the results.
submit = pd.read_csv('./sample_submission.csv').assign(label=result)

In [None]:
# Write the submission file without the index column, encoded as UTF-8.
submit.to_csv('./submission.csv', index=False, encoding='utf8')

In [None]:
submit