In [107]:
# import all
import torch
import torchtext
import os
import numpy as np
import pandas as pd

In [145]:
#First we import necessary library such as math, nltk, bigram, and collections.
import math
import nltk
import io
import random
from random import shuffle
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
random.seed(999)

In [146]:
# We choose news domain as our dataset
best2010=[]
fp= io.open('BEST2010/news.txt','r',encoding='utf-8')
for i,line in enumerate(fp):
    best2010.append(line.strip()[:-1])
fp.close()
all_vocabulary =set()
total_word_count =0
for line in best2010:
    for word in line.split('|'):        
        all_vocabulary.add(word)
        total_word_count+=1
sentences = best2010

In [147]:
tokenize_sentence = [sentence.split("|") for sentence in sentences]

In [148]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenize_sentence,min_freq = 3)
vocab.insert_token('<unk>', 0) #add <unk> token to index 0
vocab.insert_token('<eos>', 1) #add <eos> token to index 1
# vocab.insert_token('<pad>', 2) #add <eos> token to index 2
vocab.set_default_index(vocab['<unk>']) #make index 0 the default index (when encountering unknown words)
print(f"Vocabulary Size: {len(vocab)}")                         
print(vocab.get_itos()[:10]) #get first 10 words  

Vocabulary Size: 11776
['<unk>', '<eos>', ' ', 'ที่', 'การ', 'ว่า', 'มี', 'ใน', 'และ', 'ได้']


# Load dataset

In [149]:
df = pd.read_json('./error.json', lines=True)

In [150]:
for i in range(30,35):
    print(df[1][i])
    sent_ = df[1][i]
    list_wrong_word = df[2][i]
    for idx,right_word,wrong_class in list_wrong_word:
        print("index : ",idx,"WRONG : ",sent_[idx[0]:idx[1]]," CORRECT : ",right_word," CLASSS : ",wrong_class)

['จะ', 'กลับ', 'ไป', 'แก้', 'ใข้', 'ใน', 'สิ่ง', 'ที่ผ่าน', 'มา']
index :  [3, 5] WRONG :  ['แก้', 'ใข้']  CORRECT :  ['แก้ไข']  CLASSS :  misspelled
['จะ', 'เปลี่ยน', 'ไป', 'ใช้', 'อีก', 'เบอร์', 'ต้อง', 'ทำ', 'ยังไง', 'ค่ะ']
index :  [9, 10] WRONG :  ['ค่ะ']  CORRECT :  ['คะ']  CLASSS :  misspelled
['อยาก', 'ได้', 'ดงิน', 'คืน', 'อ่ะ', 'ค่ะ']
index :  [2, 3] WRONG :  ['ดงิน']  CORRECT :  ['เงิน']  CLASSS :  misspelled
index :  [4, 6] WRONG :  ['อ่ะ', 'ค่ะ']  CORRECT :  ['อะ', 'คะ']  CLASSS :  misspelled
['เห้อ', ' ', 'เหนื่อยใจ']
index :  [0, 1] WRONG :  ['เห้อ']  CORRECT :  ['เฮ้อ']  CLASSS :  misspelled
['เสียดาย', 'อ่ะ', ' ', 'ไม่', 'งัน', 'พี่', 'ปุ๊ก', 'ได้', 'ปิด', 'ซอย', 'เลี้ยง', 'แล้ว']
index :  [1, 2] WRONG :  ['อ่ะ']  CORRECT :  ['อะ']  CLASSS :  misspelled


as we can see this dataset will have the sentence with som wrong words that needs to be correct 
1. End-to-end network : use this as a ground truth for correcting the wrong word  where the input can be a whole sentence or each words
2. We can input a whole sentence and network will predict where is the index of wrong word and correct it combine with dictionary
3. We will use this as testset

# To detect where the mis-spelling is

In [151]:
# generate the correct labels and wrong labels for each token
def generate_label_index(df,k):
    sent_ = df[1][k]
    list_wrong_word = df[2][k]
    labels = [0 for _ in range(len(sent_))]
    for idx,_,_ in list_wrong_word:
        for i in range(idx[0],idx[1]):
            labels[i] = 1
    return labels

In [152]:
for k in range(29):
    print("SENTENCE : ",df[1][k])
    print("LABELS : ",generate_label_index(df,k))
    print("\n")

SENTENCE :  ['หริอ', 'มี', 'ปัญหา', 'อะไร', 'ช่วย', 'ตอบ', 'ด้วย', 'ครับ']
LABELS :  [1, 0, 0, 0, 0, 0, 0, 0]


SENTENCE :  ['อยาก', 'สมัคร', ' ', 'sms', ' ', 'เงิน', 'เข้า', 'เงิน', 'ออก', 'ทาง', 'เน็ต', 'ต้อง', 'ทำ', 'ยังงัย']
LABELS :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


SENTENCE :  ['ไม่', 'ทราบ', 'ว่า', 'คุณ', ' ', 'พัชชาา', ' ', 'ไม่', 'สน', 'โลก', ' ', 'เปลี่ยน', 'เฉพาะ', 'เครื่อง', ' ', 'หรือ', 'เปลี่ยน', 'ทั้ง', 'เครื่อง', 'และ', 'เบอร์', 'คะ']
LABELS :  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


SENTENCE :  ['ทำ', 'รายการ', 'ที่', 'ตู้', 'atm', ' ', 'อะ', 'คะ', ' ', 'เป็น', 'การ', 'ทำ', 'รายการ', 'เปลี่ยน', 'เบอ', 'โทรสับ', 'อะ', 'คะ']
LABELS :  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1]


SENTENCE :  ['ย่า', 'ยัง', 'ดี', 'ที่', 'สิริ', 'ตอบ', 'แบบ', 'นั้น', ' ', 'ดู', 'ของ', 'เค้า', 'ดิ']
LABELS :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]


SENTENCE :  ['จขกท', '.', ' ', '3', '3', '3', '3', '3', '3', '3', ' ', 'เจ้าของ', 'เม้นท์'

# To detect real word spelling error

In [153]:
all_list_string = []+df[1][0]
for i in range(len(df)):
    all_list_string += df[1][i]
all_list_string = "".join(all_list_string)

In [154]:
np_str = np.array(list(all_list_string))
all_char = np.unique(np_str)

sorted(all_char)
print("There are %d unique chars in the data set" % len(all_char))
print(all_char)
char_map = dict(zip(all_char, range(len(all_char))))


There are 139 unique chars in the data set
[' ' '!' '"' "'" '(' ')' '*' '+' ',' '-' '.' '/' '0' '1' '2' '3' '4' '5'
 '6' '7' '9' '?' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'K' 'L' 'M' 'N' 'O'
 'P' 'R' 'S' 'T' 'U' 'V' 'W' 'Z' '^' '_' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h'
 'i' 'k' 'l' 'm' 'n' 'o' 'p' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' 'ก' 'ข'
 'ฃ' 'ค' 'ฆ' 'ง' 'จ' 'ฉ' 'ช' 'ซ' 'ญ' 'ฐ' 'ฑ' 'ฒ' 'ณ' 'ด' 'ต' 'ถ' 'ท' 'ธ'
 'น' 'บ' 'ป' 'ผ' 'ฝ' 'พ' 'ฟ' 'ภ' 'ม' 'ย' 'ร' 'ฤ' 'ล' 'ว' 'ศ' 'ษ' 'ส' 'ห'
 'อ' 'ฮ' 'ฯ' 'ะ' 'ั' 'า' 'ำ' 'ิ' 'ี' 'ึ' 'ื' 'ุ' 'ู' 'ฺ' 'เ' 'แ' 'โ' 'ใ'
 'ไ' 'ๅ' 'ๆ' '็' '่' '้' '๊' '๋' '์' 'ํ' '๐' '๒' '๖']


In [155]:
def count_str(string):
    global all_char, char_map
    result = np.zeros(len(all_char))
    np_str = np.array(list(string))
    str_char, str_char_count = np.unique(np_str, return_counts=True)
    for char, count in zip(str_char, str_char_count):
        result[char_map[char]] = count
    return result

# run example feature transformation
print("Example String to feature conversion")
display(df[1][0])
display(count_str("".join(df[1][0])))

Example String to feature conversion


['หริอ', 'มี', 'ปัญหา', 'อะไร', 'ช่วย', 'ตอบ', 'ด้วย', 'ครับ']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 2., 1., 0., 0., 0., 0., 0., 1., 2., 3., 0.,
       0., 2., 0., 0., 0., 2., 3., 0., 0., 1., 2., 1., 0., 1., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 0.])

In [156]:
# make a correct version and vocab by 
# replace wrong word into the list 
# and make vocab from them

In [173]:
from torch.nn.utils.rnn import pad_sequence
def data2features(data,dict_):
  features = [torch.LongTensor(dict_(sentence)) for sentence in data]
  return features

data = df.to_numpy()

x_ = data2features(data[:,1],vocab)
x_ = pad_sequence(x_, batch_first=True)

temp = [np.vectorize(count_str, otypes=[object])("".join(data[:,1][k])) for k in range(len(data))]
x_f1 = torch.LongTensor([[e for e in sl] for sl in temp])

label = np.array([generate_label_index(df,k) for k in range(len(data))],dtype=object)
data_embed = torch.cat((x_,x_f1),1)
print("Data")
print("Word embbeding shape ",x_.shape)
print("Character feature shape", x_f1.shape)
print("Concatenate feature shape", data_embed.shape)
print("label shape", label.shape)

Data
Word embbeding shape  torch.Size([15597, 108])
Character feature shape torch.Size([15597, 139])
Concatenate feature shape torch.Size([15597, 247])
label shape (15597,)


  x_f1 = torch.LongTensor([[e for e in sl] for sl in temp])


In [176]:
# Select for "Action" Classification Task
from sklearn.model_selection import train_test_split
X_train, X_test_val, y_train, y_test_val = train_test_split(data_embed, label, test_size=0.30, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.50, random_state=42)

In [180]:
print("Training dataset size: ",X_train.shape)
print("Training labels size: ",y_train.shape)
print("Validation dataset size: ",X_val.shape)
print("Validation labels size: ",y_val.shape)
print("Testing dataset size: ",X_test.shape)
print("Testing labels size: ",y_test.shape)

Training dataset size:  torch.Size([10917, 247])
Training labels size:  (10917,)
Validation dataset size:  torch.Size([2340, 247])
Validation labels size:  (2340,)
Testing dataset size:  torch.Size([2340, 247])
Testing labels size:  (2340,)


In [15]:
import torch.nn as nn
from torch.nn import Embedding, Linear, Dropout
import torch.nn.functional as F 
from torch.optim import Adam 
from torch.utils.data import Dataset, DataLoader
class ErrorDetect(Dataset):
  def __init__(self, data, labels): 
    self.data = data
    self.labels = labels

  def __getitem__(self, idx): 
    return self.data[idx],self.labels[idx]

  def __len__(self): 
    return len(self.data) 


train_dataset = ErrorDetect(X_train,) 
train_loader = DataLoader(train_dataset, batch_size=4, num_workers=2)

# CBOW for predict the right word at the error index

# Select Candidate by Character n-gram inverted index

In [None]:
# google --> (tri-grams) --> goo oog ogl gle
# if we consider goo --> google good good-bye 
# this can simply find what the word cause this tri-grams