In [18]:
import tensorflow as tf
from tensorflow.keras.applications.resnet import preprocess_input

import os
import random
import numpy as np
import pandas as pd
from PIL import Image
from imgaug import augmenters as iaa

from tqdm.notebook import tqdm

In [19]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.compat.v1.disable_eager_execution()

try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print("Limited GPU")
except:
    print("Failed to limit GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Limited GPU


In [52]:
TRAIN_CORPUS = [
    "chinh-phu-ngam-khuc-nlvnpf-0064-NomText.txt",
    "buom-hoa-tan-truyen-2.txt",
    "bai-ca-ran-co-bac.txt",
    "gia-huan-ca.txt",
    "ho-xuan-huong.txt",
    "ngoc-kieu-le-tan-truyen.txt",
    "tale-of-kieu-1870.txt",
    "tale-of-kieu-1871.txt"]

TEST_CORPUS = [
    "tale-of-kieu-1902.txt",
    "luc-van-tien-nlvnpf-0059.txt"]

DATA_PATH = "/data1/trucndt3/OCR/macBERT/Data/"

# Masking LVT

In [20]:
df = pd.read_csv("/data1/trucndt3/OCR/data/Luc-Van-Tien/annotation.csv")
df

Unnamed: 0,label,x1,x2,y1,y2,source_img,img_w,img_h
0,7567,427,462,149,190,nlvnpf-0059-002.jpg,493,760
1,7551,426,464,192,227,nlvnpf-0059-002.jpg,493,760
2,4021,425,462,226,262,nlvnpf-0059-002.jpg,493,760
3,50B3,426,463,261,302,nlvnpf-0059-002.jpg,493,760
4,897F,426,464,304,339,nlvnpf-0059-002.jpg,493,760
...,...,...,...,...,...,...,...,...
14421,610F,291,325,524,560,nlvnpf-0059-105.jpg,494,763
14422,20CE8,292,327,558,592,nlvnpf-0059-105.jpg,494,763
14423,215F6,288,324,591,625,nlvnpf-0059-105.jpg,494,763
14424,9EBB,288,324,625,658,nlvnpf-0059-105.jpg,494,763


In [24]:
class BBox:
    def __init__(self, x1, x2, y1, y2, label):
        self.x1 = x1       
        self.y1 = y1     
        self.x2 = x2        
        self.y2 = y2      
        self.label = label
        
        self.x_center = (self.x1+self.x2)/2
        self.y_center = (self.y1+self.y2)/2
        self.CXCYDifference = self.x_center - self.y_center

def createListOfBBoxes(img_df):
    listOfBBoxes = []
    for _, row in img_df.iterrows():
        x1 = float(row["x1"]) 
        y1 = float(row["y1"]) 
        x2 = float(row["x2"]) 
        y2 = float(row["y2"]) 
        label = row["label"]
        
        # Create object
        
        box = BBox(x1, x2, y1, y2, label)
        listOfBBoxes.append(box)

    return listOfBBoxes

def CalculateBBoxCenterAndSize(bbox_lst):
    left = bbox_lst[0]
    top = bbox_lst[1]
    right = bbox_lst[2]
    bottom = bbox_lst[3]
    center_x = bbox_lst[4]
    center_y = bbox_lst[5]
    
    width = right - left
    height = bottom - top
    xywh = np.array([center_x, center_y, width, height])

    return xywh


def CompareCenterAndSize(xywh_1, xywh_2, hyper_lambda=1.5):
    if (abs(xywh_1[0] - xywh_2[0]) > hyper_lambda*((xywh_1[2] + xywh_2[2])/2)) \
    or (abs(xywh_1[1] - xywh_2[1]) > hyper_lambda*((xywh_1[3] + xywh_2[3])/2)):
        return True
    return False


def SplitLine(bbox_lst, hyper_lambda=1.5):
    sep_lst = []
    # print(type(bbox_lst))
    # print(bbox_lst)
    for idx in range(len(bbox_lst)-1):
        xywh_1 = CalculateBBoxCenterAndSize(bbox_lst[idx])
        xywh_2 = CalculateBBoxCenterAndSize(bbox_lst[idx+1])

        if CompareCenterAndSize(xywh_1, xywh_2, hyper_lambda):
            sep_lst.append("\n")
        else:
            sep_lst.append(" ")
    return sep_lst

def write_txt_file(listOfSortedBBoxes, sep_lst, source_img, path):
    source_img = source_img.split(".")[0]
    result_path = os.path.join(path, f"{source_img}.txt")

    with open(result_path, 'w', encoding="utf-16") as f:
        for i in range(0, len(sep_lst)):
            if listOfSortedBBoxes[i].label == 'UKN':
                f.write("[UKN]")
                f.write(sep_lst[i])    
            else:
                f.write(chr(int(listOfSortedBBoxes[i].label, 16)))
                f.write(sep_lst[i])

In [28]:
source_img_lst = df["source_img"].unique()
RESULT_OCR_PATH = "/data1/trucndt3/OCR/macBERT/Data/LVT"

with tqdm(total = len(source_img_lst)) as pbar:
    for source_img in source_img_lst:
        img_df = df[df['source_img'] == source_img]
        listOfSortedBBoxes = createListOfBBoxes(img_df)
        sorted_bbox_lst = []
        for bbox in listOfSortedBBoxes:
            sorted_bbox_lst.append(np.array([int(bbox.x1), int(bbox.y1), int(bbox.x2), int(bbox.y2), float(bbox.x_center), float(bbox.y_center)]))
        
        sorted_bbox_lst = np.array(sorted_bbox_lst)    
        sep_lst = SplitLine(sorted_bbox_lst, hyper_lambda=1.5)
        write_txt_file(listOfSortedBBoxes, sep_lst, source_img, RESULT_OCR_PATH)
                       
        pbar.update()

  0%|          | 0/104 [00:00<?, ?it/s]

In [26]:
vocab_resnet = pd.read_pickle("/data1/trucndt3/OCR/macBERT/Data/vocab_resnet.pkl")
vocab_resnet

Unnamed: 0,img_path,true_label,top_5
0,/data1/trucndt3/OCR/data/data_character/20027/...,20027,"[20027, 4E09, 5339, 26C1F, 4E9B]"
1,/data1/trucndt3/OCR/data/data_character/20040/...,20040,"[20040, 6B72, 94B1, 22A6E, 5E95]"
2,/data1/trucndt3/OCR/data/data_character/20042/...,20042,"[20042, 6253, 4EC3, 6C40, 7D05]"
3,/data1/trucndt3/OCR/data/data_character/200E9/...,200E9,"[200E9, 2248E, 3431, 73CD, 5F62]"
4,/data1/trucndt3/OCR/data/data_character/20129/...,20129,"[20129, 7D05, 548D, 53F0, 8A3C]"
...,...,...,...
1897,/data1/trucndt3/OCR/data/data_character/9F14/9...,9F14,"[9F14, 9F13, 2A520, 5C90, 215C9]"
1898,/data1/trucndt3/OCR/data/data_character/9F4A/9...,9F4A,"[9F4A, 252F3, 71F6, 85DD, 700B]"
1899,/data1/trucndt3/OCR/data/data_character/9F4B/9...,9F4B,"[9F4B, 864E, 275F1, 21A3A, 796D]"
1900,/data1/trucndt3/OCR/data/data_character/9F8E/9...,9F8E,"[9F8E, 9B54, 9E7F, 28EEB, 8FB0]"


In [36]:
def create_vocab_corpus(corpus):
    vocab = ""

    with tqdm(total = len(corpus)) as pbar:
        for file in corpus:
            file_path = os.path.join(RESULT_OCR_PATH, file)
            with open(file_path, encoding="utf16", mode = "r") as file_handler:
                content = file_handler.readlines()
                for line in content:
                    vocab += line.strip('\n')
                    vocab += " "

            pbar.update()

    vocab_set = list(set(vocab.split(" ")))

    vocab_set = [i for i in vocab_set if i] #remove empty string
    return vocab_set

In [40]:
LVT_CORPUS = os.listdir(RESULT_OCR_PATH)
LVT_vocab = create_vocab_corpus(LVT_CORPUS)
len(LVT_vocab)

  0%|          | 0/104 [00:00<?, ?it/s]

2005

In [47]:
def process_masking(corpus, vocab):
    correct_text_lst = []
    original_text_lst = []
    wrong_ids_lst = []

    with tqdm(total = len(corpus)) as pbar:
        for file in corpus:
            file_path = os.path.join(RESULT_OCR_PATH, file)

            with open(file_path, encoding="utf16", mode = "r") as file_handler:
                content = file_handler.readlines()

            for line in content:
                correct_text = line.strip('\n').replace(" ","")
                if len(correct_text) > 0 and "[UKN]" not in correct_text:
                    original_text = correct_text
                    original_text = list(original_text)

                    num_word_mask = random.randint(0, len(original_text) // 3)
                    chosen_words = np.random.choice(range(len(original_text)), size=num_word_mask, replace=False)
                    wrong_ids_str = " ".join(str(e+1) for e in chosen_words)

                    for word_idx in chosen_words:             
                        vocab_ix = random.randint(0, len(vocab) - 1)
                        original_text[word_idx] = vocab[vocab_ix]

                    correct_text_lst.append(correct_text)
                    original_text_lst.append(''.join(original_text))
                    wrong_ids_lst.append(wrong_ids_str)


            pbar.update()

    df = pd.DataFrame({
      "original_text": original_text_lst,
      "correct_text": correct_text_lst,
      "wrong_ids": wrong_ids_lst
      })
    
    return df

In [105]:
def process_masking_sentences(df, length = 10):
    original_text_lst = []
    correct_text_lst = []
    wrong_ids_lst = []
    
    for idx in range(0, len(df), length):
        ori_txt = df.iloc[idx]["original_text"]
        cor_txt = df.iloc[idx]["correct_text"]
        wrong_ids = df.iloc[idx]["wrong_ids"].split(" ")
        wrong_ids = [int(e) for e in wrong_ids if e]
        idx_sen = idx + 1
        
        step = 1
        increase_length = 0
        while (step < length) and (idx+step < len(df)):
            ori_txt += '\n' + df.iloc[idx+step]["original_text"]
            cor_txt += '\n' + df.iloc[idx+step]["correct_text"]
            current_wrong_ids = df.iloc[idx+step]["wrong_ids"].split(" ")
            increase_length += len(df.iloc[idx+step-1]["correct_text"])
            current_wrong_ids = [int(e)+increase_length for e in current_wrong_ids if e]
            wrong_ids.extend(current_wrong_ids)
            
            step += 1
        
        wrong_ids_str = " ".join(str(e) for e in wrong_ids)
        
        original_text_lst.append(ori_txt)
        correct_text_lst.append(cor_txt)
        wrong_ids_lst.append(wrong_ids_str)

    df = pd.DataFrame({
      "original_text": original_text_lst,
      "correct_text": correct_text_lst,
      "wrong_ids": wrong_ids_lst
      })
    
    return df

In [71]:
train_df = process_masking(LVT_CORPUS, LVT_vocab)
train_df

  0%|          | 0/104 [00:00<?, ?it/s]

Unnamed: 0,original_text,correct_text,wrong_ids
0,畧畑䀡傳西銘,畧畑䀡傳西銘,
1,𡄎唭𠄩𡦂人情喓𠻗,𡄎唭𠄩𡦂人情喓𠻗,
2,埃匕𠳺匕麻𦖑,埃匕𠳺匕麻𦖑,
3,𡨺唯役畧苓𠽮身中,𡨺噒役畧苓𠽮身𡢐,8 2
4,固𠊛扵𤻻臣城,固𠊛扵郡東城,4 5
...,...,...,...
1578,𢧚之沛寫𢁍𠳒吟哦,𢧚之沛寫𢁍𠳒吟哦,
1579,底麻差待儒家,底麻除待儒家,3
1580,沛修飛行買𦋦歷𠊚,沛修節行買𦋦歷𠊚,3
1581,仃香悲𢪀事𠁀,仃群悲𢪀事𠁀,2


In [109]:
train_df_sen = process_masking_sentences(train_df, length = 10)
train_df_sen

Unnamed: 0,original_text,correct_text,wrong_ids
0,畧畑䀡傳西銘\n𡄎唭𠄩𡦂人情喓𠻗\n埃匕𠳺匕麻𦖑\n𡨺唯役畧苓𠽮身中\n固𠊛扵𤻻臣城\n歲𣃣...,畧畑䀡傳西銘\n𡄎唭𠄩𡦂人情喓𠻗\n埃匕𠳺匕麻𦖑\n𡨺噒役畧苓𠽮身𡢐\n固𠊛扵郡東城\n歲𣃣...,28 22 32 33 45 46 55 57
1,徐仙𠓨謝尊師落𧗱\n㐌鮮氣象𥢆蜂精神\n𫢩它及会風雲\n𠊛匕埃拱立身貝尼\n橷𭸓𢏑儉边𩄲\n...,雲仙𠓨謝尊師𦋦𧗱\n㐌鮮氣象吏吹精神\n𫢩它及会風雲\n𠊛匕埃拱立身貝尼\n志𭸓𢏑鴈边𩄲\n...,7 1 14 13 34 31 45 48 57
2,沛朱𠬠法底楚𫉚身\n𭸓滝供𠊡𨖲𰎉拱\n尊横𠓨准後乖\n𠻨仙魚謹𢚸強生𪟽\n之朋哙吏理芇朱明\...,沛朱𠬠法底防𫉚身\n𠖈滝供𠊡𨖲𰎉拱\n尊師𠓨准後堂\n雲仙魚謹𢚸強生𪟽\n之朋哙吏理芇朱明\...,6 5 9 17 21 22 60 62 66
3,云仙霜呐連䜹\n小生会別𬋟慢𠯿\n椿萱歲𨃐悲高\n吀柴排訢音末朱𨴦\n匪師𦖑呐添傷\n因机謀...,云仙𦖑呐連䜹\n小生庄別𬋟湄𠯿\n椿萱歲鶴㐌高\n吀柴排訢音毛朱詳\n尊師𦖑呐添傷\n因机謀...,3 9 12 18 17 27 25 28 45 48 51 57 58 62 65
4,斗星㐌唹紫微添和\n嫌爲馭唉群賖\n𫽄它间俸𬷤它𠰁散\n包徐朱細北方\n𡢐油鄧𡦂顕荣\n要勾...,斗星㐌𤎜紫微添和\n嫌爲馭唉群賖\n兎它路俸𬷤它𠰁散\n包徐朱細北方\n𡢐油鄧𡦂顕荣\n汝勾...,4 17 15 35 37 50 66
...,...,...,...
154,荘啐昆啿紅顔\n媄昆些吏杜承武公\n鼓𠻁打蠟𦟐紅督縁\n𫢩㐌細𣅶状元\n寒江𧗱細侈連埬軍\n...,荘浪昆固紅顔\n媄昆些吏杜承武公\n𤐝𠻁打蠟𦟐紅逴縁\n𫢩㐌細𣅶状元\n寒江𧗱細侈連埬軍\n...,2 4 21 15 47 54 67 66
155,媄昆碎細𦋦侯幔元\n父咽㐌𠖈黄落\n嗔傷𥙩妾𤇧娟分尼\n状浪挷家𡳪苔\n寃家我恒呂衝\n少裙...,媄昆碎細𦋦侯状元\n父親㐌𠖈黄泉\n嗔傷𥙩妾婵娟分尼\n状浪挷鉢渃苔\n寃家女畧呂衝\n少之...,7 14 10 19 27 26 32 31 37 36 43 54 57 64 67
156,𧗱朱嫂𢽙拖𩅀捇𨃐\n媄昆𨅸時春台\n阻𧗱渚及細蛮\n𧡊𠄩翁𤜯𧼋𦋦迍塘\n汰栗扒媄昆娘\n𦊚皮...,𧗱朱嫂嫂拖欺捇𨃐\n媄昆𨅸時春台\n阻𧗱渚及細茄\n𧡊𠄩翁𤜯𧼋𦋦迍塘\n汰調扒媄昆娘\n𦊚皮...,4 6 20 30 37 39 47 48 51 53 60 59 67
157,托它𠅍刧㗂群碑名\n状元𧗱𫴋東成\n六公𫢩㐌𡏦营於廊\n排𦋦紅條産床\n各官鈎户𡠣娘囂娥\n...,托它𠅍刧㗂群碑名\n状元𧗱細東成\n六公𫢩㐌𡏦营於廊\n排𦋦六礼産床\n各官𠫾户𡠣娘月娥\n...,12 26 25 31 35 53 65


In [51]:
test_df = process_masking(LVT_CORPUS, LVT_vocab)
test_df

  0%|          | 0/104 [00:00<?, ?it/s]

Unnamed: 0,original_text,correct_text,wrong_ids
0,畧畑䀡塲西銘,畧畑䀡傳西銘,4
1,𡄎唭𠄩𡦂人情喓𠻗,𡄎唭𠄩𡦂人情喓𠻗,
2,埃匕𠳺匕麻𦖑,埃匕𠳺匕麻𦖑,
3,𡨺噒役畧又𠽮身𡢐,𡨺噒役畧苓𠽮身𡢐,5
4,固𠊛扵郡東城,固𠊛扵郡東城,
...,...,...,...
1578,𢧚之沛饒𢁍𠳒吟哦,𢧚之沛寫𢁍𠳒吟哦,4
1579,底尊除待儒家,底麻除待儒家,2
1580,沛修節行買𦋦歷𠊚,沛修節行買𦋦歷𠊚,
1581,仃群悲𢪀事𠁀,仃群悲𢪀事𠁀,


In [110]:
test_df_sen = process_masking_sentences(test_df, length = 7)
test_df_sen

Unnamed: 0,original_text,correct_text,wrong_ids
0,畧畑䀡塲西銘\n𡄎唭𠄩𡦂人情喓𠻗\n埃匕𠳺匕麻𦖑\n𡨺噒役畧又𠽮身𡢐\n固𠊛扵郡東城\n歲𣃣...,畧畑䀡傳西銘\n𡄎唭𠄩𡦂人情喓𠻗\n埃匕𠳺匕麻𦖑\n𡨺噒役畧苓𠽮身𡢐\n固𠊛扵郡東城\n歲𣃣...,4 25 42 41 45 48
1,𣎃㝵包𬋩乃𮎦劳刀\n文它起鳳滕雲\n武添叁畧六韜埃\n雲仙𠓨謝尊師𦋦𧗱\n㐌鮮氣象吏吹精神\...,𣎃㝵包𬋩功程劳刀\n文它起鳳滕雲\n武添叁畧六韜埃\n雲仙𠓨謝尊師𦋦𧗱\n㐌鮮氣象吏吹精神\...,5 6 48
2,志𭸓𢏑鴈边𩄲\n名碎㐌𤎜㗂柴屯賖\n尊師欺意論盘\n𡄎数兜係科塲群賖\n吹柴傷伵㤕車𥪝𢚸\n...,志𭸓𢏑鴈边𩄲\n名碎㐌𤎜㗂柴屯賖\n尊師欺意論盘\n𡄎𥪝𢼂係科塲群賖\n吹柴傷伵㤕車𥪝𢚸\n...,23 22 42 43 44
3,𠖈滝𣥱𠊡𨖲𰎉拱\n尊師𠓨就枚堂\n雲仙魚謹𢚸強生𫎇\n之朋哙吏理芇朱明\n一䜹朱訢事情\n丕...,𠖈滝供𠊡𨖲𰎉拱\n尊師𠓨准後堂\n雲仙魚謹𢚸強生𪟽\n之朋哙吏理芇朱明\n𠓨䜹朱訢事情\n丕...,3 12 11 21 30 40 46
4,哙浪萬里程途\n些盘𠬠𥒥科期群賖\n蜉仙𦖑呐拠䜹\n小生庄別𬋟湄𠯿\n椿萱歲北㐌高\n吀柴排...,哙浪萬里程途\n柴盘𠬠𭛣科期群賖\n云仙𦖑呐連䜹\n小生庄別𬋟湄𠯿\n椿萱歲鶴㐌高\n吀柴排...,7 10 19 15 31 42 43
...,...,...,...
222,寃䚂女畧呂衝\n少舘𢧚浽麻蒙𠰙隊\n苔明子直調𥋳\n𠽋鴻花窖爫𠿃嘹蜂\n𠳐朱點庒蛉𢚸\n明浪...,寃家女畧呂衝\n少之𢧚浽麻蒙細隊\n漢明子直調𥋳\n𠽋唭花窖爫𠿃嘹蜂\n𠳐朱𢚸庒𢢆𢚸\n明浪...,2 8 13 15 22 33 31 43 41
223,媄昆𠅍時春途\n𡗉螉渚及細茄\n𧡊𠄩翁𤜯𧼋𦋦迍塘\n汰調扒媄卿娘\n𦊚皮𥒥泣莂蓬\n媄昆嘆哭...,媄昆𨅸時春台\n阻𧗱渚及細茄\n𧡊𠄩翁𤜯𧼋𦋦迍塘\n汰調扒媄昆娘\n𦊚皮𥒥泣莂蓬\n媄昆嘆哭...,6 3 8 7 25 47 43
224,唉𠊚爫𡢼𠊚𡢐\n仃嵬抔斫媄昆\n托它𠅍刧㗂群碑名\n𣋚送𧗱細東成\n六公𫢩㐌𡏦营於廊\n𢖵𦋦...,唉𠊚爫𡢼𠊚𡢐\n仃群抔斫媄昆\n托它𠅍刧㗂群碑名\n状元𧗱細東成\n六公𫢩㐌𡏦营於廊\n排𦋦...,8 21 22 38 35 47
225,𡢽爺㐌称𡢽爺\n姓𢆥誰𤎜精神\n嫌爲格阻堆尼\n𢧚之沛饒𢁍𠳒吟哦\n底尊除待儒家\n沛修節行...,𡢽爺㐌称𡢽爺\n𤾓𢆥誰𤎜精神\n嫌爲格阻堆尼\n𢧚之沛寫𢁍𠳒吟哦\n底麻除待儒家\n沛修節行...,7 22 28


In [111]:
train_df_sen.to_excel(os.path.join(DATA_PATH, "train_LVT_sentences.xlsx"), index = False)
test_df_sen.to_excel(os.path.join(DATA_PATH, "test_LVT_sentences.xlsx"), index = False)