In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# OCR part

In [2]:
! pip install -q pytesseract
!apt-get update
!apt-get install -y tesseract-ocr-tha

Get:1 https://packages.cloud.google.com/apt gcsfuse-focal InRelease [1225 B]
Get:2 https://packages.cloud.google.com/apt cloud-sdk InRelease [1616 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease [1581 B]
Get:4 https://packages.cloud.google.com/apt google-fast-socket InRelease [5015 B]
Hit:5 http://archive.ubuntu.com/ubuntu focal InRelease
Get:6 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:7 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:8 https://packages.cloud.google.com/apt gcsfuse-focal/main amd64 Packages [19.5 kB]
Get:9 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [2806 kB]
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  Packages [1426 kB]
Get:11 https://packages.cloud.google.com/apt cloud-sdk/main all Packages [1391 kB]
Hit:12 http://archive.ubuntu.com/ubuntu focal-backports InRelease
Get:13 http://archive.ubuntu.co

In [3]:
! pip install -q easyocr

In [4]:
!pip install deskew

Collecting deskew
  Downloading deskew-1.5.1-py3-none-any.whl.metadata (4.6 kB)
Downloading deskew-1.5.1-py3-none-any.whl (7.7 kB)
Installing collected packages: deskew
Successfully installed deskew-1.5.1


In [5]:
import os
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
import re
from deskew import determine_skew
import easyocr

In [6]:
# Rotate the image around its center
def rotateImage(cvImage, angle: float):
    newImage = cvImage.copy()
    (h, w) = newImage.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    newImage = cv2.warpAffine(newImage, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return newImage


# grayscale
def img2gray(img):
    img = cv2.fastNlMeansDenoisingColored(img, None, 5, 5, 7, 21)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray


# Deskew image
def deskew(img):
    angle = determine_skew(img)
    return rotateImage(img, angle)


def closing_text_region(deskew):
    gray_blur = cv2.GaussianBlur(deskew, (9, 9), 0)
    thresh = cv2.threshold(gray_blur, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    kernel = np.ones((5, 14), np.uint8)
    closing = cv2.erode(thresh, kernel, iterations=2)
    return closing


def img2text(img):
    text = pytesseract.image_to_string(img, lang='tha', config='--oem 3 --psm 7')
    # reader = easyocr.Reader(['th'])
    # text_list = reader.readtext(img)
    # text = ' '.join([result[1] for result in text_list]) # Extract text from each result tuple and join them into a single string
    # print(text)
    return text


def clean_data(txt):
    txt = txt.replace('\n', '')
    txt = txt.replace('|', '')
    txt = txt.replace(' ', '')
    txt = txt.replace('-', '')
    txt = txt.replace(',', '')
    txt = txt.replace("'", '')
    txt = txt.replace(".", '')
    txt = txt.replace('\x0c', '')

    if txt == '':
        txt = '_'
    return txt

In [7]:
def text_detection(img_name, df):
    img = cv2.imread(os.path.join(DATA_PATH, img_name))
    # gray
    gray = img2gray(img)

    # deskew
    deskew_rgb = deskew(img)   # for display
    deskew_img = deskew(gray)

    # closing
    closing_deskew_img = closing_text_region(deskew_img)

    # processing each contour
    contours, hierachy = cv2.findContours(closing_deskew_img, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    w, h = closing_deskew_img.shape
    print(f'{img_name} : found {len(contours)} contours')

    kernel_sharp = np.array([[0, -1, 0],
                             [-1, 5, -1],
                             [0, -1, 0]], dtype=np.float32)

    # Sort contours based on their y-coordinate (top to bottom)
    contours = sorted(contours, key=lambda c: cv2.boundingRect(c)[1])

    text_in_img = []
    for i, cnt in enumerate(contours):

        if i != 0 and i <= 14: #len(contours) - 1:
            (x, y, w, h) = cv2.boundingRect(cnt)
            crop_text = deskew_img[y - 3:y + h + 3, x - 2:x + w + 5]
            h_, w_ = crop_text.shape

            # post-processing gray
            try:
                crop_text = cv2.filter2D(crop_text, -1, kernel_sharp)
                crop_text = cv2.fastNlMeansDenoising(crop_text, None, 20, 7, 21)
            except:
                crop_text = np.zeros_like(deskew_img)
            # crop_text = cv2.resize(crop_text, (w_ * 8, h_ * 8), interpolation=cv2.INTER_NEAREST)

            # process ocr
            text = img2text(crop_text)
            text = clean_data(text)

            df['image_name'].append(img_name)
            df['text'].append(text)
            text_in_img.append(text)

#             #plot
#             cv2.rectangle(deskew_rgb, (x, y), (x + w, y + h), (0, 0, 255), 2)
#             plt.imshow(cv2.cvtColor(deskew_rgb, cv2.COLOR_BGR2RGB))
#             plt.axis('off')

    # len contour should = 15
    if len(contours) < 15:
        for i in range(15 - len(contours)):
            df['image_name'].append(img_name)
            df['text'].append('_')
            text_in_img.append(text)

    # print(pd.DataFrame(df, columns=['image_name', 'text']))
    # plt.show()
    print(f'{text_in_img}, len={len(text_in_img)}')
    return df

In [8]:
# !mkdir -p '/content/drive/MyDrive/Colab Notebooks/SuperAIHackathon/hybrid/text_df'

In [9]:
if __name__ == '__main__':
    DATA_PATH = '/kaggle/input/nithan-chadok-hybrid-ocr-ner/images/images'
    img_list = os.listdir(DATA_PATH)
    print(len(img_list))
    # df = pd.DataFrame(columns=['image_name', 'text'])
    df = {'image_name': [], 'text': []}
    range_file = 500 #from 4612 files
    sorted_img_list = sorted(img_list)
    for i, img_name in enumerate(sorted_img_list):
        df = text_detection(img_name, df)
#         if i % range_file == 0: #backup text_df every 500images
#             text_df = pd.DataFrame(df, columns=['image_name', 'text'])
#             text_df.to_csv(f'/kaggle/working/save_text_{i}_.csv')

    text_df = pd.DataFrame(df, columns=['image_name', 'text'])
    text_df.to_csv(f'/kaggle/working/save_text_{i}.csv')

4612
00000.jpg : found 15 contours
['ใน', 'สมัย', 'หนึ่ง', '๓=', 'พระพุทธเจ้า', 'ประทับ', 'อยู่', 'วัด', 'เชตวัน', '_', 'เมือง', 'สาวัตถี', 'ทรง', 'ปรารภ'], len=14
00001.jpg : found 15 contours
['พระ', 'โกกาลิกะ', 'ผู้', 'ขอบ', 'หลอกลวง', 'ผู้ธ่น', '_', 'ได้', 'ครัส', 'จดีต', 'นิทาน', 'มา', 'สารก', '_'], len=14
00002.jpg : found 14 contours
['วา', '_', 'กาลครั้งหนึ่งนานมาแล้ว', '_', 'พระโหธิสัตว์', 'เกิด', 'เป็น', 'ขาย', 'ขาวนา', 'คน', 'หนึ่ง', '_', 'อาศัย', 'อาศัย'], len=14
00003.jpg : found 15 contours
['อยู', 'ใน', 'หมู่บ้าน', 'แห่ง', 'หนึ่ง', 'ซ๊', 'มื', 'พ่อค้า', 'คน', 'หนึ่ง', 'เที่ยว', 'ค้าขาย', 'ด้วย', 'การ'], len=14
00004.jpg : found 15 contours
['บรรทุก', 'สินค้า', 'บน', 'หลังลา', '_', 'ไป', 'ถึง', 'หมู่บ้าน', 'หนึ่ง', 'แล้ว', 'ก็', 'จะ', 'เอา', 'หนัง'], len=14
00005.jpg : found 15 contours
['ราชสีห์', 'คลุม', 'หลัง', 'ลา', '_', 'ปล่อย', 'ไป', 'กิน', 'ข้าวสาลี', 'และ', 'ข้าวเหนียว', 'ของ', 'ขาวบ้าน', '_'], len=14
00006.jpg : found 15 contours
['ส่วน', 'ตนเอง', 'ก็', 'จะ', 'แบ

In [10]:
# text_df = pd.DataFrame(df, columns=['image_name', 'text'])
# text_df.to_csv(f'/content/drive/MyDrive/Colab Notebooks/SuperAIHackathon/hybrid/text_df/save_text_{i}_1000.csv')

# NER

In [11]:
!pip install -q simpletransformers
! pip install scikit-learn
! pip install torch
! pip install scipy



## Load dataset

In [12]:
import torch
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
import scipy as sp
import pandas as pd

2024-03-11 11:05:24.018654: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 11:05:24.018757: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 11:05:24.290353: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
# ! tar -xvzf AIFORTHAI-LST20Corpus.tar.gz

In [14]:
from datasets import load_dataset
lst20 = load_dataset("lst20", data_dir="/kaggle/input/aiforthai-lst20corpus/LST20_Corpus")

Downloading builder script:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading and preparing dataset lst20/default to /root/.cache/huggingface/datasets/lst20/default-bc043e57afb2a3c5/0.0.0/e1b2a921fb011578ab43ddbbf789f3c500d62cb2df8ae4ed4b60bae8e4c0d3ad...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset lst20 downloaded and prepared to /root/.cache/huggingface/datasets/lst20/default-bc043e57afb2a3c5/0.0.0/e1b2a921fb011578ab43ddbbf789f3c500d62cb2df8ae4ed4b60bae8e4c0d3ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Datasets to Dataframe

In [15]:
train_df = pd.DataFrame(lst20['train'])
validation_df = pd.DataFrame(lst20['validation'])
test_df = pd.DataFrame(lst20['test'])

In [16]:
# Split the input string into lines, then split each line by comma to extract the 'tag' column only
NER_TAGS = [
       "O",
        "B_BRN",        "B_DES",        "B_DTM",        "B_LOC",        "B_MEA",        "B_NUM",        "B_ORG",        "B_PER",        "B_TRM",        "B_TTL",
       "I_BRN",        "I_DES",        "I_DTM",        "I_LOC",        "I_MEA",        "I_NUM",        "I_ORG",        "I_PER",        "I_TRM",        "I_TTL",
        "E_BRN",        "E_DES",        "E_DTM",        "E_LOC",        "E_MEA",        "E_NUM",        "E_ORG",        "E_PER",        "E_TRM",        "E_TTL"]
print(NER_TAGS)

['O', 'B_BRN', 'B_DES', 'B_DTM', 'B_LOC', 'B_MEA', 'B_NUM', 'B_ORG', 'B_PER', 'B_TRM', 'B_TTL', 'I_BRN', 'I_DES', 'I_DTM', 'I_LOC', 'I_MEA', 'I_NUM', 'I_ORG', 'I_PER', 'I_TRM', 'I_TTL', 'E_BRN', 'E_DES', 'E_DTM', 'E_LOC', 'E_MEA', 'E_NUM', 'E_ORG', 'E_PER', 'E_TRM', 'E_TTL']


## Convert Dataframe to Simple Transformer Format

In [17]:
def convert_to_simple_transformer_format(df, field_name, tags):
    sentence_id = []
    words = []
    labels = []

    for (idx, r) in df.iterrows():
        for (i, t) in enumerate(r['tokens']):
            sentence_id.append(idx)
            words.append(t)
            labels.append(tags[r[field_name][i]])
    return pd.DataFrame({"sentence_id": sentence_id, "words": words, "labels": labels})


train_ = convert_to_simple_transformer_format(train_df, "ner_tags", NER_TAGS)
validation_ = convert_to_simple_transformer_format(validation_df, "ner_tags", NER_TAGS)
test_ = convert_to_simple_transformer_format(test_df, "ner_tags", NER_TAGS)

## Training

In [18]:
torch.cuda.is_available()

True

In [19]:
ner_args = NERArgs()
ner_args.train_batch_size = 128
ner_args.use_multiprocessing = True
ner_args.evaluate_during_training = True
ner_args.overwrite_output_dir = True
ner_args.eval_batch_size = 128
ner_args.num_train_epochs = 12
ner_args.gradient_accumulation_steps = 4
ner_args.learning_rate = 1e-5

In [20]:
model = NERModel( "camembert", 
                 "airesearch/wangchanberta-base-att-spm-uncased",
                 args=ner_args, use_cuda=torch.cuda.is_available(),labels=NER_TAGS )

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]



In [21]:
model.train_model(train_, eval_data=validation_)

  0%|          | 0/63310 [00:00<?, ?it/s]

Epoch:   0%|          | 0/12 [00:00<?, ?it/s]

Running Epoch 1 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]



Running Epoch 2 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 3 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 4 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 5 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 6 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 7 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 8 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 9 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 10 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 11 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 12 of 12:   0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/44 [00:00<?, ?it/s]

(1476,
 defaultdict(list,
             {'global_step': [123,
               246,
               369,
               492,
               615,
               738,
               861,
               984,
               1107,
               1230,
               1353,
               1476],
              'train_loss': [0.4087303876876831,
               0.17389841377735138,
               0.14993759989738464,
               0.1907256841659546,
               0.12007780373096466,
               0.14476503431797028,
               0.09824486821889877,
               0.12560708820819855,
               0.12644073367118835,
               0.131383016705513,
               0.1444486528635025,
               0.11512061953544617],
              'eval_loss': [0.4989550167864019,
               0.2343782748688351,
               0.19403299469162116,
               0.1787567368962548,
               0.1703965567391027,
               0.16448945433578707,
               0.16113003140146082,
           

In [22]:
# #Copy best model folderinto gdrive folder
# !cp -r /content/outputs/best_model '/content/drive/MyDrive/Colab Notebooks/SuperAIHackathon/ner/best_model/wangchanberta-base-att-spm-uncased_best_models'

---

## Load data test

In [23]:
import pandas as pd

#Read test file
sample_submission = pd.read_csv('/kaggle/input/nithan-chadok-hybrid-ocr-ner/sample_submission.csv')
# df = pd.read_csv('/kaggle/input/save-combine-hybrid-oem-3-psm-7/save_combine_hybrid_oem 3-psm 7.csv')
# df = df.drop('text_default', axis=1) # for oem 3-psm 7

df = text_df.copy()
df['pred'] = sample_submission['pred']
sample_submission.head()

Unnamed: 0,Id,pred
0,0,0.0
1,1,0.0
2,2,0.0
3,3,
4,4,


In [24]:
# Sentence Tokenizer
def split_into_sentences(tokens, tokens_per_sentence=48): #40
    sentences = []
    for i in range(0, len(tokens), tokens_per_sentence):
        sentence = tokens[i:i+tokens_per_sentence]
        sentences.append(sentence)
    return sentences


# define '' to '_'
def blank_space(x):
    if x == '':
        x = '_'
    return x

In [25]:
texts_test_raw = []
for i in range(len(df)):
    texts_test_raw.append(df.loc[i, "text"].replace('\x0c',''))
#     texts_test_raw.append(df.loc[i, "text_psm7"].replace('\x0c','')) # for oem 3-psm 7

for i in range(len(texts_test_raw)):
    texts_test_raw[i] = blank_space(texts_test_raw[i])

my_token = split_into_sentences(texts_test_raw)
len(my_token[0])

48

In [26]:
len(my_token)

1346

In [27]:
#Check word inside list #Useful function
def data_inside(data_list):
    x = 0
    for i in range(len(data_list)):
        a = len(data_list[i])
        x = x+a
    return x

data_inside(my_token)

64568

## Predict model

In [28]:
import pandas as pd
import numpy as np
tag_list = pd.read_csv('/kaggle/input/nithan-chadok-hybrid-ocr-ner/tag_list.csv')
tags = {row['tag']:row['class'] for _, row in tag_list.iterrows()}
tags

{'O': 0,
 'B_ORG': 1,
 'B_PER': 2,
 'B_LOC': 3,
 'B_MEA': 4,
 'I_DTM': 5,
 'I_ORG': 6,
 'E_ORG': 7,
 'I_PER': 8,
 'B_TTL': 9,
 'E_PER': 10,
 'B_DES': 11,
 'E_LOC': 12,
 'B_DTM': 13,
 'B_NUM': 14,
 'I_MEA': 15,
 'E_DTM': 16,
 'E_MEA': 17,
 'I_LOC': 18,
 'I_DES': 19,
 'E_DES': 20,
 'I_NUM': 21,
 'E_NUM': 22,
 'B_TRM': 23,
 'B_BRN': 24,
 'I_TRM': 25,
 'E_TRM': 26,
 'I_TTL': 27,
 'I_BRN': 28,
 'E_BRN': 29,
 'E_TTL': 30,
 'B_NAME': 31}

In [29]:
# Split the input string into lines, then split each line by comma to extract the 'tag' column only
NER_TAGS = [
       "O",
        "B_BRN",        "B_DES",        "B_DTM",        "B_LOC",        "B_MEA",        "B_NUM",        "B_ORG",        "B_PER",        "B_TRM",        "B_TTL",
       "I_BRN",        "I_DES",        "I_DTM",        "I_LOC",        "I_MEA",        "I_NUM",        "I_ORG",        "I_PER",        "I_TRM",        "I_TTL",
        "E_BRN",        "E_DES",        "E_DTM",        "E_LOC",        "E_MEA",        "E_NUM",        "E_ORG",        "E_PER",        "E_TRM",        "E_TTL"]
print(NER_TAGS)

['O', 'B_BRN', 'B_DES', 'B_DTM', 'B_LOC', 'B_MEA', 'B_NUM', 'B_ORG', 'B_PER', 'B_TRM', 'B_TTL', 'I_BRN', 'I_DES', 'I_DTM', 'I_LOC', 'I_MEA', 'I_NUM', 'I_ORG', 'I_PER', 'I_TRM', 'I_TTL', 'E_BRN', 'E_DES', 'E_DTM', 'E_LOC', 'E_MEA', 'E_NUM', 'E_ORG', 'E_PER', 'E_TRM', 'E_TTL']


In [30]:
from simpletransformers.ner import NERModel, NERArgs
import torch
print(torch.cuda.is_available())

# Test Model
ner_args = NERArgs()
ner_args.eval_batch_size = 128
ner_args.use_multiprocessing = True
model = NERModel(
     "auto", 
    "/kaggle/working/outputs/best_model",
#     "/kaggle/input/wangchanberta-base-att-spm-uncased-best-models/wangchanberta-base-att-spm-uncased_best_models",
     args=ner_args, use_cuda=torch.cuda.is_available(),
     labels= NER_TAGS  # your latest model
)

True


In [31]:
# Make predictions with the model
predictions, raw_outputs = model.predict(my_token, False)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/11 [00:00<?, ?it/s]

In [32]:
#Check result sentence
len(predictions)

1346

In [33]:
#check word prediction
data_inside(predictions)

64568

In [34]:
tag2class = {}
for i in range(len(tag_list)):
    tag2class[tag_list.loc[i, "tag"]] = tag_list.loc[i, "class"]#Extract data value from dict list

final_test_df = []
for i in range(len(predictions)):
    for j in range(len(predictions[i])):
        data = predictions[i][j]
        value = data.values()
        final_test_df += value

In [35]:
final_result = pd.DataFrame(final_test_df)
final_result

Unnamed: 0,0
0,O
1,O
2,O
3,O
4,O
...,...
64563,O
64564,O
64565,O
64566,B_TTL


# Prepare submission file

In [36]:
submisstion_df = pd.read_csv('/kaggle/input/nithan-chadok-hybrid-ocr-ner/sample_submission.csv')
submisstion_df

Unnamed: 0,Id,pred
0,0,0.0
1,1,0.0
2,2,0.0
3,3,
4,4,
...,...,...
64563,64563,
64564,64564,
64565,64565,
64566,64566,


In [37]:
submisstion_df['pred'] = final_result
submisstion_df.tail(20)

Unnamed: 0,Id,pred
64548,64548,O
64549,64549,O
64550,64550,O
64551,64551,O
64552,64552,O
64553,64553,O
64554,64554,O
64555,64555,O
64556,64556,O
64557,64557,O


In [38]:
NER_DICT = {row['tag']:row['class'] for _, row in tag_list.iterrows()}
NER_DICT

{'O': 0,
 'B_ORG': 1,
 'B_PER': 2,
 'B_LOC': 3,
 'B_MEA': 4,
 'I_DTM': 5,
 'I_ORG': 6,
 'E_ORG': 7,
 'I_PER': 8,
 'B_TTL': 9,
 'E_PER': 10,
 'B_DES': 11,
 'E_LOC': 12,
 'B_DTM': 13,
 'B_NUM': 14,
 'I_MEA': 15,
 'E_DTM': 16,
 'E_MEA': 17,
 'I_LOC': 18,
 'I_DES': 19,
 'E_DES': 20,
 'I_NUM': 21,
 'E_NUM': 22,
 'B_TRM': 23,
 'B_BRN': 24,
 'I_TRM': 25,
 'E_TRM': 26,
 'I_TTL': 27,
 'I_BRN': 28,
 'E_BRN': 29,
 'E_TTL': 30,
 'B_NAME': 31}

In [39]:
submisstion_df['pred'] = submisstion_df['pred'].map(NER_DICT)
submisstion_df

Unnamed: 0,Id,pred
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
64563,64563,0
64564,64564,0
64565,64565,0
64566,64566,9


In [40]:
submisstion_df.to_csv("cut48_wangchan.csv", index=False)

---

# Post processing

## Text similarity

In [41]:
df_similarity = df.copy()

In [42]:
filter
df_similarity["text"] = df_similarity["text"].str.replace(r'[“”"\%+*#!:]', '', regex=True)
df_similarity["text"] = df_similarity["text"].str.replace(r'ขาว', 'ชาว', regex=True)
df_similarity["text"] = df_similarity["text"].str.replace(r'ข่าง', 'ช่าง', regex=True)
df_similarity["text"] = df_similarity["text"].str.replace(r'.พระ', 'พระ', regex=True)

# ### for imported 'oem 3-psm 7 file'
# df_similarity["text_psm7"] = df_similarity["text_psm7"].str.replace(r'[“”"\%+*#!:]', '', regex=True)
# df_similarity["text_psm7"] = df_similarity["text_psm7"].str.replace(r'ขาว', 'ชาว', regex=True)
# df_similarity["text_psm7"] = df_similarity["text_psm7"].str.replace(r'ข่าง', 'ช่าง', regex=True)
# df_similarity["text_psm7"] = df_similarity["text_psm7"].str.replace(r'.พระ', 'พระ', regex=True)

In [43]:
def levenshteinDistance(str1, str2):
    # Create a matrix to store Levenshtein distances
    dp = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]

    # Initialize the matrix with base cases
    for i in range(len(str1) + 1):
        dp[i][0] = i
    for j in range(len(str2) + 1):
        dp[0][j] = j

    # Populate the matrix
    for i in range(1, len(str1) + 1):
        for j in range(1, len(str2) + 1):
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])

    return dp[len(str1)][len(str2)]


def levenshteinClosestMatch(str1_list, str2):
    min_distance = float('inf')  # Initialize minimum distance as infinity
    most_similar_text = None

    # Iterate through each candidate string in the list
    for candidate in str1_list:
        distance = levenshteinDistance(candidate, str2)
        similarity_score = 1 - (distance / max(len(candidate), len(str2)))

        # Update most similar text if distance is less than 5
        if distance < 3 and distance < min_distance and np.abs(len(str2)-len(candidate))<=2 and len(str2)>4:
            min_distance = distance
            most_similar_text = candidate

    return most_similar_text, min_distance, similarity_score


# Example usage:
candidate_list = ["พระโพธิสัตว์",
                  "พระราชา",
                  "พระพุทธเจ้า",
                  "พราหมณ์",
                  "พระสุบิน",
                  "พระเจ้า",
                  "พระองค์",
                  "พระพุทธองค์",
                  "พระราชทาน",
                  "พระรัตนตรัย",
                  "พระราชกุมาร",
                  "พราหมณ์",
                  "มหาราช",
                  "เชตวัน",
                  "ราชสีห์",
                  "สาวัตถี",
                  "ไมตรีจิตร",
                  "ไมตรีจิต",
                  "นักปราชญ์",
                  "อาจารย์",
                  "กาลครั้งหนึ่งนานมาแล้ว",
                  "ข้าพเจ้า",
                  "คนธรรพ์",
                  "รุกขเทวดา",
                  "ผู้พิพากษา"]
str2 = "ผู้พิพากษา"
most_similar, distance, similarity_score = levenshteinClosestMatch(candidate_list, str2)
if most_similar:
    print("Most similar text:", most_similar)
    print("Levenshtein distance:", distance)
    print("Similarity score:", similarity_score)
else:
    print("No similar text found within the given Levenshtein distance threshold.")

Most similar text: ผู้พิพากษา
Levenshtein distance: 0
Similarity score: 1.0


In [44]:
df_similarity_edit = df_similarity.copy()
for i in range(len(df_similarity)):

#     df_text = df_similarity_edit['text_psm7'][i]
    df_text = df_similarity_edit['text'][i]
    most_similar, distance, similarity_score = levenshteinClosestMatch(candidate_list, df_text)

    if most_similar is not None:
#         df_similarity_edit.loc[i, 'text_psm7'] = most_similar
        df_similarity_edit.loc[i, 'text'] = most_similar
        if distance > 0:
            print(df_text , most_similar, distance)

พระโหธิสัตว์ พระโพธิสัตว์ 1
ราชสีท์ ราชสีห์ 1
ราพสีห์ ราชสีห์ 1
รายสีห์ ราชสีห์ 1
เขตวัน เชตวัน 1
สาวัดถึ สาวัตถี 2
พระสุบิบ พระสุบิน 1
พระสบิน พระสุบิน 1
ทระสุบัน พระสุบิน 2
พระราชๆ พระราชา 1
สาวัดถี สาวัตถี 1
กาลครั้งหนึ่งบานมาแล้ว กาลครั้งหนึ่งนานมาแล้ว 1
พระราชาต พระราชา 1
ทระราชา พระราชา 1
ข้พระองค์ พระองค์ 2
พระราชาต พระราชา 1
พระราชาต พระราชา 1
กาลครั้งหนึ่งบานมาแล้ว กาลครั้งหนึ่งนานมาแล้ว 1
พราชมณ์ พราหมณ์ 1
สาวัดถี สาวัตถี 1
ข้าพเจ้าขอ ข้าพเจ้า 2
ข้าทเจ้า ข้าพเจ้า 1
เขตวัน เชตวัน 1
สาวัดถี สาวัตถี 1
ข้าหเจ้า ข้าพเจ้า 1
เขตวัน เชตวัน 1
พระโหธิสัตว์ พระโพธิสัตว์ 1
ประสงค์ พระองค์ 2
เวฬวัน เชตวัน 2
ราชคฤห์ ราชสีห์ 2
จ้าพเจ้า ข้าพเจ้า 1
ข้าพเจ้าตา ข้าพเจ้า 2
ข้าหเจ้า ข้าพเจ้า 1
ข้าหเจ้า ข้าพเจ้า 1
พระพุทธเจ้ว พระพุทธเจ้า 1
พราหมผ์ พราหมณ์ 1
จ้าพเจ้า ข้าพเจ้า 1
เขตวัน เชตวัน 1
พระอาญา พระราชา 2
พรของค์ พระองค์ 1
พระราชฤมาร พระราชกุมาร 2
พระราจกุมาร พระราชกุมาร 1
รุกขเทวคา รุกขเทวดา 1
เขตวัน เชตวัน 1
นักปราชณ์ นักปราชญ์ 1
นักปราชผ์ นักปราชญ์ 1
ประสงค์ พระองค์ 2
เขตวัน เชตวัน 1
เขตวั

In [45]:
texts_test_raw = []
for i in range(len(df_similarity_edit)):
    texts_test_raw.append(df_similarity_edit.loc[i, "text"].replace('\x0c',''))
#   texts_test_raw.append(df_similarity_edit.loc[i, "text_psm7"].replace('\x0c',''))

for i in range(len(texts_test_raw)):
    texts_test_raw[i] = blank_space(texts_test_raw[i])

my_sentence = split_into_sentences(texts_test_raw)
len(my_sentence[0])

48

In [46]:
# Make predictions with the model
predictions_sim, raw_outputs_sim = model.predict(my_sentence, False)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/11 [00:00<?, ?it/s]

In [47]:
data_inside(predictions_sim)

64568

In [48]:
final_test_df_sim = []
for i in range(len(predictions_sim)):
    for j in range(len(predictions_sim[i])):
        data_sim = predictions_sim[i][j]
        value_sim = data_sim.values()
        final_test_df_sim += value_sim

In [49]:
final_result_sim = pd.DataFrame(final_test_df_sim)
final_result_sim

Unnamed: 0,0
0,O
1,O
2,O
3,O
4,O
...,...
64563,O
64564,O
64565,O
64566,B_TTL


In [50]:
submisstion_df_sim = pd.read_csv('/kaggle/input/nithan-chadok-hybrid-ocr-ner/sample_submission.csv')

In [51]:
submisstion_df_sim['pred'] = final_result_sim
# submisstion_df.tail(20)

In [52]:
NER_DICT = {row['tag']:row['class'] for _, row in tag_list.iterrows()}

In [53]:
submisstion_df_sim['pred'] = submisstion_df_sim['pred'].map(NER_DICT)
submisstion_df_sim

Unnamed: 0,Id,pred
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
64563,64563,0
64564,64564,0
64565,64565,0
64566,64566,9


In [54]:
submisstion_df_sim.to_csv("cut48_text_similarity_submission.csv", index=False)

---

In [55]:
# def manual_edit(df_new, new_pred):
# #     for idx , _pred in zip(df_new['Unnamed: 0'].tolist() , df_new['text'].tolist()):
#     for idx , _pred in zip(df_new['Unnamed: 0'].tolist() , df_new["text_psm7"].tolist()):
#         text_pred = _pred #df_new.loc[idx, "text_psm7"]

#         if text_pred in ["_"]:
#             old_pred = new_pred[idx]
#             new_pred[idx] = '0'
#             if text_pred == "_" and old_pred != 0:
#                 print(text_pred, old_pred, new_pred[idx])
#     return new_pred
        
# df_new = df_similarity.copy()
# new_pred = submisstion_df_sim.copy()
# new_pred = new_pred['pred'].tolist()
# edit_new_pred = manual_edit(df_new, new_pred)

In [56]:
# save_df = submisstion_df_sim.copy()
# save_df['pred'] = new_pred
# save_df.to_csv("post_cut48_similarity_submission.csv", index=False)