## OCR 모델을 위한 패키지 설치

In [None]:
# # install deep-text-recognition-benchmark dependency
# !pip install lmdb pillow torchvision nltk natsort fire

In [None]:
# # install TextRecognitionDataGenerator
# !pip install trdg seaborn tqdm tensorflow beautifulsoup4

In [None]:
# # Clone the OCR repository.
# # NOTE(review): later cells call scripts under ./hallymocr/ while this URL clones into
# # ./customocr — confirm the directory name/location before running.
# !git clone https://github.com/mhseo10/customocr.git

## OCR 모델을 위한 데이터 준비

### TextRecognitionDataGenerator 데이터 준비

In [1]:
import pandas as pd
import numpy as np

# Build ./ko.txt, the word list passed to trdg via `-dt ./ko.txt` in the generation cells.
ko_dict = pd.read_csv('./NIADic.csv')

# Strip every character that is not a precomposed Hangul syllable (가-힣).
ko_dict['term'] = ko_dict['term'].str.replace("[^가-힣]", '', regex=True)
# Terms that became empty after stripping are marked as missing so dropna() removes them.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection: with pandas Copy-on-Write the selection is a temporary object and
# the in-place call would silently leave ko_dict unchanged.
ko_dict['term'] = ko_dict['term'].replace('', np.nan)
ko_dict = ko_dict.drop_duplicates(subset=['term'])

# NOTE(review): dropna() removes rows with a missing value in ANY column, not only
# 'term' — confirm that dropping terms whose other columns are empty is intended.
ko_dict = ko_dict.dropna()
ko_dict = ko_dict['term']

# One term per line, without header or index column.
ko_dict.to_csv('./ko.txt', header=False, index=False)

In [None]:
# Generate 500,000 (-c) horizontal text images with trdg.
# Words come from ./ko.txt (-dt), fonts from ./font/ (-fd); images are written to ./word_h/.
!trdg -c 500000 -w 4 -f 64 -l ko -e png -na 2 -d 3 -do 2 -k 20 -or 0 -bl 1 -r -rk -rbl -fi -fd ./font/ -dt ./ko.txt --output_dir ./word_h/ -t 8

In [None]:
# Generate 500,000 (-c) vertical text images with trdg.
# Words come from ./ko.txt (-dt), fonts from ./font/ (-fd); images are written to ./word_v/.
!trdg -c 500000 -w 2 -f 64 -l ko -e png -na 2 -d 3 -do 2 -k 20 -or 1 -bl 1 -r -rk -rbl -fi -fd ./font/ -dt ./ko.txt --output_dir ./word_v/ -t 8

In [None]:
# Create the ground-truth (gt) files used to convert the generated images to LMDB.
h_label = './word_h/'
v_label = './word_v/'


def _labels_to_gt(folder):
    """Write `folder`/train.txt from `folder`/labels.txt, replacing the first space of each line with a tab."""
    with open(f'{folder}labels.txt', 'r', encoding='utf8') as src, \
            open(f'{folder}train.txt', 'w', encoding='utf8') as dst:
        # Only the first space is replaced, so spaces inside the rest of the line are kept.
        dst.writelines(line.replace(' ', '\t', 1) for line in src)


for folder in (h_label, v_label):
    _labels_to_gt(folder)

In [None]:
# Create the LMDB dataset for the word_h images from ./word_h/train.txt.

# Windows
!python ./hallymocr/create_lmdb_dataset.py --inputPath ./word_h/ --gtFile ./word_h/train.txt --outputPath ./result/word_h

# Linux (use this command instead of the one above)
# !python3 ./hallymocr/create_lmdb_dataset.py --inputPath ./word_h/ --gtFile ./word_h/train.txt --outputPath ./result/word_h

In [None]:
# Create the LMDB dataset for the word_v images from ./word_v/train.txt.

# Windows
!python ./hallymocr/create_lmdb_dataset.py --inputPath ./word_v/ --gtFile ./word_v/train.txt --outputPath ./result/word_v
#
# Linux (use this command instead of the one above)
# !python3 ./hallymocr/create_lmdb_dataset.py --inputPath ./word_v/ --gtFile ./word_v/train.txt --outputPath ./result/word_v

In [None]:
# Export the first 1000 rows of open/train.csv as a tab-separated gt file
# (no header, no index) for the LMDB conversion.
train_csv_path = 'open/train.csv'
train_csv = pd.read_csv(train_csv_path)
first_rows = train_csv.head(1000)
first_rows.to_csv('open/train.txt', index=False, header=False, sep='\t')

In [None]:
# Create the LMDB dataset for the train data from ./open/train.txt.

# Windows
!python ./hallymocr/create_lmdb_dataset.py --inputPath ./open/ --gtFile ./open/train.txt --outputPath ./result/train
# Linux (use this command instead of the one above)
# !python3 ./hallymocr/create_lmdb_dataset.py --inputPath ./open/ --gtFile ./open/train.txt --outputPath ./result/train

## OCR 모델 준비

In [None]:
import os
import sys
sys.path.append("./hallymocr")

import random
import string

import torch.backends.cudnn as cudnn
import torch.utils.data

from hallymocr.train import train

# Hyperparameter settings, passed as a plain dict to train() and test().
opt = {
    'exp_name': None,  # when falsy, a name is derived from the model settings later in this cell
    'train_data': './result/',  # root folder of the LMDB datasets created above
    'valid_data': './result/train',  # LMDB built from open/train.txt
    'manualSeed': 1111,  # seed applied to random, numpy and torch later in this cell
    'workers': 0,
    'batch_size': 192,
    'num_iter': 100000,
    'valInterval': 1000,
    'saved_model': '',

    'FT': False,
    'adam': False,
    'lr': 1,
    'beta1': 0.9,
    'rho': 0.95,
    'eps': 1e-8,
    'grad_clip': 5,
    'baiduCTC': False,
    'select_data': 'word_h-word_v',  # presumably sub-folders of train_data joined by '-' — TODO confirm
    'batch_ratio': '0.5-0.5',  # presumably one ratio per select_data entry — TODO confirm
    'total_data_usage_ratio': '1',
    'batch_max_length': 25,

    'imgH': 64,
    'imgW': 64,
    'rgb': False,
    # Character set: a space followed by Hangul syllables (replaced by string.printable[:-6] when 'sensitive' is True).
    'character': ' 가각간갇갈감갑값갓강갖같갚갛개객걀걔거걱건걷걸검겁것겉게겨격겪견결겹경곁계고곡곤곧골곰곱곳공과관광괜괴굉교구국군굳굴굵굶굽궁권귀귓규균귤그극근글긁금급긋긍기긴길김깅깊까깍깎깐깔깜깝깡깥깨꺼꺾껌껍껏껑께껴꼬꼭꼴꼼꼽꽂꽃꽉꽤꾸꾼꿀꿈뀌끄끈끊끌끓끔끗끝끼낌나낙낚난날낡남납낫낭낮낯낱낳내냄냇냉냐냥너넉넌널넓넘넣네넥넷녀녁년념녕노녹논놀놈농높놓놔뇌뇨누눈눕뉘뉴늄느늑는늘늙능늦늬니닐님다닥닦단닫달닭닮담답닷당닿대댁댐댓더덕던덜덟덤덥덧덩덮데델도독돈돌돕돗동돼되된두둑둘둠둡둥뒤뒷드득든듣들듬듭듯등디딩딪따딱딴딸땀땅때땜떠떡떤떨떻떼또똑뚜뚫뚱뛰뜨뜩뜯뜰뜻띄라락란람랍랑랗래랜램랫략량러럭런럴럼럽럿렁렇레렉렌려력련렬렵령례로록론롬롭롯료루룩룹룻뤄류륙률륭르른름릇릎리릭린림립릿링마막만많말맑맘맙맛망맞맡맣매맥맨맵맺머먹먼멀멈멋멍멎메멘멩며면멸명몇모목몬몰몸몹못몽묘무묵묶문묻물뭄뭇뭐뭘뭣므미민믿밀밉밌및밑바박밖반받발밝밟밤밥방밭배백뱀뱃뱉버번벌범법벗베벤벨벼벽변별볍병볕보복볶본볼봄봇봉뵈뵙부북분불붉붐붓붕붙뷰브븐블비빌빔빗빚빛빠빡빨빵빼뺏뺨뻐뻔뻗뼈뼉뽑뿌뿐쁘쁨사삭산살삶삼삿상새색샌생샤서석섞선설섬섭섯성세섹센셈셋셔션소속손솔솜솟송솥쇄쇠쇼수숙순숟술숨숫숭숲쉬쉰쉽슈스슨슬슴습슷승시식신싣실싫심십싯싱싶싸싹싼쌀쌍쌓써썩썰썹쎄쏘쏟쑤쓰쓴쓸씀씌씨씩씬씹씻아악안앉않알앓암압앗앙앞애액앨야약얀얄얇양얕얗얘어억언얹얻얼엄업없엇엉엊엌엎에엔엘여역연열엷염엽엿영옆예옛오옥온올옮옳옷옹와완왕왜왠외왼요욕용우욱운울움웃웅워원월웨웬위윗유육율으윽은을음응의이익인일읽잃임입잇있잊잎자작잔잖잘잠잡잣장잦재쟁쟤저적전절젊점접젓정젖제젠젯져조족존졸좀좁종좋좌죄주죽준줄줌줍중쥐즈즉즌즐즘증지직진질짐집짓징짙짚짜짝짧째쨌쩌쩍쩐쩔쩜쪽쫓쭈쭉찌찍찢차착찬찮찰참찻창찾채책챔챙처척천철첩첫청체쳐초촉촌촛총촬최추축춘출춤춥춧충취츠측츰층치칙친칠침칫칭카칸칼캄캐캠커컨컬컴컵컷케켓켜코콘콜콤콩쾌쿄쿠퀴크큰클큼키킬타탁탄탈탑탓탕태택탤터턱턴털텅테텍텔템토톤톨톱통퇴투툴툼퉁튀튜트특튼튿틀틈티틱팀팅파팎판팔팝패팩팬퍼퍽페펜펴편펼평폐포폭폰표푸푹풀품풍퓨프플픔피픽필핏핑하학한할함합항해핵핸햄햇행향허헌험헤헬혀현혈협형혜호혹혼홀홈홉홍화확환활황회획횟횡효후훈훌훔훨휘휴흉흐흑흔흘흙흡흥흩희흰히힘',
    'sensitive': False,
    'PAD': False,
    'data_filtering_off': False,
    'Transformation': 'TPS',  # None|TPS
    'FeatureExtraction': 'VIT',  # VGG|ResNet|RCNN|VIT
    'SequenceModeling': 'BiLSTM',  # None|BiLSTM
    'Prediction': 'Attn',  # CTC|Attn
    'num_fiducial': 20,
    'input_channel': 1,
    'output_channel': 256,  # 256 for VIT, 512 for the other feature extractors
    'hidden_size': 256,
}

# Additional model details: derive an experiment name when none was supplied.
if not opt['exp_name']:
    model_part = '{Transformation}-{FeatureExtraction}-{SequenceModeling}-{Prediction}'.format(**opt)
    seed_part = '-Seed{manualSeed}'.format(**opt)
    opt['exp_name'] = model_part + seed_part

# Directory for this experiment's saved models.
os.makedirs('./saved_models/{exp_name}'.format(**opt), exist_ok=True)

# Vocab / character number configuration.
if opt['sensitive']:
    opt['character'] = string.printable[:-6]

# Seed and GPU setting: seed every random number generator with the same value.
seed = opt['manualSeed']
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)

cudnn.benchmark = True
cudnn.deterministic = True
opt['num_gpu'] = torch.cuda.device_count()

if opt['num_gpu'] > 1:
    print('------ Use multi-GPU setting ------')
    print('if you stuck too long time with multi-GPU setting, try to set --workers 0')
    # check multi-GPU issue https://github.com/clovaai/deep-text-recognition-benchmark/issues/1
    # Scale the worker count and batch size by the number of GPUs.
    for key in ('workers', 'batch_size'):
        opt[key] *= opt['num_gpu']

## 모델 학습

In [None]:
# Train the model with the settings collected in `opt`.
train(opt)

## TextRecognitionDataGenerator 데이터로 학습한 모델로 train data 예측

In [None]:
from hallymocr.test import test

# Point the options at the saved best_accuracy.pth checkpoint and at the data to predict.
opt.update({
    'saved_model': 'saved_models/{Transformation}-{FeatureExtraction}-{SequenceModeling}-{Prediction}-Seed{manualSeed}/best_accuracy.pth'.format_map(opt),
    'test_data': './open/train',
})

# Run the test routine and keep what it returns.
result = test(opt)

In [None]:
import pandas as pd

# Reload open/train.csv and store the model output in its 'text' column.
# NOTE(review): assumes `result` is ordered like the CSV rows — confirm against test().
train_csv_path = 'open/train.csv'
train_csv = pd.read_csv(train_csv_path).assign(text=result)

In [None]:
from datetime import datetime

# Save the re-labelled table under a month-day_hour-minute timestamped name.
now = datetime.now()
time = now.strftime('%m%d_%H%M')
csv_name = f'open/train_{time}.csv'
train_csv.to_csv(path_or_buf=csv_name, encoding='UTF-8', index=False)

In [None]:
# Read back the CSV that was just written; as the cell's last expression it is displayed for inspection.
pd.read_csv(csv_name)

In [None]:
# Reload the saved predictions.
pred_csv_path = csv_name
pred_csv = pd.read_csv(filepath_or_buffer=pred_csv_path)

# Save them as the tab-separated txt file (no header, no index) needed to build the LMDB.
pred_csv.to_csv(path_or_buf='open/train.txt', index=False, header=False, sep='\t')

### train의 레이블을 모델의 예측값으로 교체한 후 해당 모델에 변경된 train data 학습

In [None]:
# Rebuild the train LMDB dataset from the rewritten open/train.txt.
# NOTE(review): the output path ./result/train is the same as in the earlier LMDB cell —
# confirm the script replaces the existing data as intended.

# Windows
!python ./hallymocr/create_lmdb_dataset.py --inputPath ./open/ --gtFile open/train.txt --outputPath ./result/train
# Linux (use this command instead of the one above)
# !python3 ./hallymocr/create_lmdb_dataset.py --inputPath ./open/ --gtFile open/train.txt --outputPath ./result/train

In [None]:
# Second stage: reset the experiment name to the model/seed pattern and
# select only the 'train' dataset with a batch ratio of '1'.
opt.update(
    exp_name='{Transformation}-{FeatureExtraction}-{SequenceModeling}-{Prediction}-Seed{manualSeed}'.format(**opt),
    select_data='train',
    batch_ratio='1',
)

In [None]:
# Train again with the updated `opt` (select_data='train', batch_ratio='1').
train(opt)

## test data 예측

In [None]:
# Point the options at the saved best_accuracy.pth checkpoint and at the test data.
opt.update({
    'saved_model': 'saved_models/{Transformation}-{FeatureExtraction}-{SequenceModeling}-{Prediction}-Seed{manualSeed}/best_accuracy.pth'.format_map(opt),
    'test_data': './open/test',
})

# Run the test routine and keep what it returns.
result = test(opt)

## CSV 파일 생성

In [None]:
# Load the submission template and store the model output in its 'text' column.
# NOTE(review): assumes `result` is ordered like the template rows — confirm against test().
result_csv_path = 'open/sample_submission.csv'
result_csv = pd.read_csv(result_csv_path).assign(text=result)

In [None]:
# Save the submission under a month-day_hour-minute timestamped file name.
now = datetime.now()
time = now.strftime('%m%d_%H%M')
csv_name = f'{time}.csv'
result_csv.to_csv(path_or_buf=csv_name, encoding='UTF-8', index=False)

In [None]:
# Read back the CSV that was just written; as the cell's last expression it is displayed for inspection.
pd.read_csv(csv_name)