# CRF
* Preprocess > Train Set by Original .txt 

## frist time install

In [1]:
!pip install python-crfsuite
!pip install pythainlp
!mkdir models
!pip install sklearn

## import library

In [2]:
import glob
import pandas as pd

import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re

import json
import pycrfsuite
from ast import literal_eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag

# Model CRF-Cut
* Train Set [Separate Train,Test][with out sentence =2, >=400]
* Feature: start,stop (Original CRF)
* Feature: n_gram = 3
* trainer.set_params({ 'c1': 1, 'c2': 0, 'max_iterations': 1000, 'feature.possible_transitions': True, })

## Import Data

In [3]:
df = pd.read_csv('Data_Train_CRF[IO].csv')
df

Unnamed: 0.1,Unnamed: 0,word,pos,ner,clause
0,0,"""",PU,O,I_SENT
1,1,สมชาย,NN,B_PER,I_SENT
2,2,"""",PU,O,I_SENT
3,3,ลอยลำ,VV,O,I_SENT
4,4,นายก,NN,O,I_SENT
...,...,...,...,...,...
2773709,2773709,,PU,I_MEA,I_SENT
2773710,2773710,บาท,CL,E_MEA,I_SENT
2773711,2773711,,PU,O,I_SENT
2773712,2773712,เป็นต้น,AV,O,I_SENT


In [4]:
df_TrainCRF = pd.DataFrame(columns = ['word', 'pos', 'ner', 'clause'], data = df)
df_TrainCRF.clause.value_counts()

I_SENT    2710519
O           63195
Name: clause, dtype: int64

## Create Extract feature function

In [5]:
enders = ["ครับ","ค่ะ","คะ","นะคะ","นะ","จ้ะ","จ้า","จ๋า","ฮะ", #ending honorifics
          #enders
          "ๆ","ได้","แล้ว","ด้วย","เลย","มาก","น้อย","กัน","เช่นกัน","เท่านั้น",
          "อยู่","ลง","ขึ้น","มา","ไป","ไว้","เอง","อีก","ใหม่","จริงๆ",
          "บ้าง","หมด","ทีเดียว","เดียว",
          #demonstratives
          "นั้น","นี้","เหล่านี้","เหล่านั้น",
          #questions
          "อย่างไร","ยังไง","หรือไม่","มั้ย","ไหน","อะไร","ทำไม","เมื่อไหร่"]
starters = ["ผม","ฉัน","ดิฉัน","ชั้น","คุณ","มัน","เขา","เค้า",
            "เธอ","เรา","พวกเรา","พวกเขา", #pronouns
            #connectors
            "และ","หรือ","แต่","เมื่อ","ถ้า","ใน",
            "ด้วย","เพราะ","เนื่องจาก","ซึ่ง","ไม่",
            "ตอนนี้","ทีนี้","ดังนั้น","เพราะฉะนั้น","ฉะนั้น",
            "ตั้งแต่","ในที่สุด",
            #demonstratives
            "นั้น","นี้","เหล่านี้","เหล่านั้น"]

In [6]:
def extract_features(doc, window=2, max_n_gram=3):
    doc_features = []
    #paddings for word and POS
    doc = ['xxpad' for i in range(window)] + doc + ['xxpad' for i in range(window)]
    doc_ender = []
    doc_starter = []
    #add enders/starters
    for i in range(len(doc)):
        if doc[i] in enders:
            doc_ender.append('ender')
        elif doc[i] in starters:
            doc_starter.append('starter')
        else:
            doc_ender.append('normal')
            
    #for each word
    for i in tqdm(range(window, len(doc)-window)):
        #bias term
        word_features = ['bias']
        
        #ngram features
        for n_gram in range(1, min(max_n_gram+1,2+window*2)):
            for j in range(i-window,i+window+2-n_gram):
                feature_position = f'{n_gram}_{j-i}_{j-i+n_gram}'
                
                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                word_features += [f'word_{feature_position}={word_}']
                
                ender_ =  f'{"|".join(doc_ender[j:(j+n_gram)])}'
                word_features += [f'ender_{feature_position}={ender_}']
                
                starter_ =  f'{"|".join(doc_starter[j:(j+n_gram)])}'
                word_features += [f'starter_{feature_position}={starter_}']
        
        #append to feature per word
        doc_features.append(word_features)
    return doc_features

* Test Extract feature

In [7]:
words=['ฉัน','ชอบ','กิน','มะนาว',' ','ฉัน','ชอบ','กิน','ส้ม',' ','แต่','ดี']

In [8]:
#try
extract_features(words, window = 2, max_n_gram = 3)

100%|██████████| 12/12 [00:00<00:00, 41838.44it/s]


[['bias',
  'word_1_-2_-1=xxpad',
  'ender_1_-2_-1=normal',
  'starter_1_-2_-1=starter',
  'word_1_-1_0=xxpad',
  'ender_1_-1_0=normal',
  'starter_1_-1_0=starter',
  'word_1_0_1=ฉัน',
  'ender_1_0_1=normal',
  'starter_1_0_1=starter',
  'word_1_1_2=ชอบ',
  'ender_1_1_2=normal',
  'starter_1_1_2=',
  'word_1_2_3=กิน',
  'ender_1_2_3=normal',
  'starter_1_2_3=',
  'word_2_-2_0=xxpad|xxpad',
  'ender_2_-2_0=normal|normal',
  'starter_2_-2_0=starter|starter',
  'word_2_-1_1=xxpad|ฉัน',
  'ender_2_-1_1=normal|normal',
  'starter_2_-1_1=starter|starter',
  'word_2_0_2=ฉัน|ชอบ',
  'ender_2_0_2=normal|normal',
  'starter_2_0_2=starter',
  'word_2_1_3=ชอบ|กิน',
  'ender_2_1_3=normal|normal',
  'starter_2_1_3=',
  'word_3_-2_1=xxpad|xxpad|ฉัน',
  'ender_3_-2_1=normal|normal|normal',
  'starter_3_-2_1=starter|starter|starter',
  'word_3_-1_2=xxpad|ฉัน|ชอบ',
  'ender_3_-1_2=normal|normal|normal',
  'starter_3_-1_2=starter|starter',
  'word_3_0_3=ฉัน|ชอบ|กิน',
  'ender_3_0_3=normal|normal|normal',

## Map word with label

In [9]:
all_tuples = [(row['word'],row['clause']) for i,row in tqdm(df_TrainCRF.iterrows())]
all_tuples[0:4]

2773714it [01:38, 28283.10it/s]


[('"', 'I_SENT'), ('สมชาย', 'I_SENT'), ('"', 'I_SENT'), ('ลอยลำ', 'I_SENT')]

In [10]:
len(all_tuples)

2773714

## Extract feature

In [11]:
#target
y = [l for (w,l) in tqdm(all_tuples)]
#features
x_pre = [w for (w,l) in tqdm(all_tuples)]
x = extract_features(x_pre, window=2, max_n_gram = 3)

100%|██████████| 2773714/2773714 [00:00<00:00, 5012207.49it/s]
100%|██████████| 2773714/2773714 [00:00<00:00, 5358005.45it/s]
100%|██████████| 2773714/2773714 [01:03<00:00, 43763.81it/s]


* Split train and test set at 80/20 proportion

In [12]:
idx = int(len(x)*0.8)
x_train, x_test = x[:idx], x[idx:]
y_train, y_test = y[:idx], y[idx:]
print(f"Length of training set: {len(x_train)}")
print(f"Length of testing set: {len(x_test)}")

Length of training set: 2218971
Length of testing set: 554743


## Train model

* Model CRF

In [16]:
%%time
# Train model
trainer = pycrfsuite.Trainer(verbose=True)
trainer.append(x_train, y_train)

trainer.set_params({
    'c1': 1,
    'c2': 0,
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

trainer.train('models/sub1-crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5047100
Seconds required: 19.728

L-BFGS optimization
c1: 1.000000
c2: 0.000000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 355277.906930
Feature norm: 1.000000
Error norm: 333031.111551
Active features: 854812
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 8.280



IOStream.flush timed out


***** Iteration #2 *****
Loss: 340552.341246
Feature norm: 0.956663
Error norm: 326168.458467
Active features: 615073
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.208

***** Iteration #3 *****
Loss: 310145.679126
Feature norm: 0.439600
Error norm: 859475.646114
Active features: 109295
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 12.488

***** Iteration #4 *****
Loss: 286664.320074
Feature norm: 0.823120
Error norm: 288709.899686
Active features: 210807
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.309

***** Iteration #5 *****
Loss: 258555.140427
Feature norm: 0.723769
Error norm: 220726.603403
Active features: 173843
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.239

***** Iteration #6 *****
Loss: 237167.316604
Feature norm: 0.587633
Error norm: 110402.527493
Active features: 131140
Line search trials: 2
Line search ste

IOStream.flush timed out


***** Iteration #19 *****
Loss: 128482.382431
Feature norm: 4.607233
Error norm: 47970.453316
Active features: 128586
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 8.408

***** Iteration #20 *****
Loss: 127854.493997
Feature norm: 4.806486
Error norm: 75406.881856
Active features: 128154
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.385

***** Iteration #21 *****
Loss: 126321.262059
Feature norm: 5.314837
Error norm: 96955.133430
Active features: 126761
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.422

***** Iteration #22 *****
Loss: 123588.192283
Feature norm: 5.478217
Error norm: 42970.514138
Active features: 126875
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.407

***** Iteration #23 *****
Loss: 122694.298367
Feature norm: 5.885184
Error norm: 75186.042226
Active features: 125958
Line search trials: 1
Line search step

IOStream.flush timed out


***** Iteration #169 *****
Loss: 81379.519560
Feature norm: 86.335939
Error norm: 225.689509
Active features: 44666
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.348

***** Iteration #170 *****
Loss: 81379.225641
Feature norm: 86.344062
Error norm: 134.255641
Active features: 44653
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.389

***** Iteration #171 *****
Loss: 81378.870406
Feature norm: 86.361480
Error norm: 300.455714
Active features: 44647
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.462

***** Iteration #172 *****
Loss: 81378.514627
Feature norm: 86.372289
Error norm: 283.512687
Active features: 44649
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.348

***** Iteration #173 *****
Loss: 81378.033271
Feature norm: 86.389317
Error norm: 263.006533
Active features: 44624
Line search trials: 1
Line search step: 1.000000

IOStream.flush timed out


***** Iteration #182 *****
Loss: 81376.515838
Feature norm: 86.452054
Error norm: 25.631892
Active features: 44582
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.405

***** Iteration #183 *****
Loss: 81376.426022
Feature norm: 86.455810
Error norm: 18.805185
Active features: 44568
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 4.384

***** Iteration #184 *****
Loss: 81376.284264
Feature norm: 86.472190
Error norm: 83.585791
Active features: 44528
Line search trials: 6
Line search step: 0.031250
Seconds required for this iteration: 24.965

***** Iteration #185 *****
Loss: 81376.269293
Feature norm: 86.472283
Error norm: 170.811823
Active features: 44554
Line search trials: 4
Line search step: 0.125000
Seconds required for this iteration: 16.745

***** Iteration #186 *****
Loss: 81376.117914
Feature norm: 86.481478
Error norm: 121.274964
Active features: 44532
Line search trials: 1
Line search step: 1.000000


## Try to Predict and evaluate

In [17]:
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/sub1-crf.model')
y_pred = tagger.tag(x_test)

In [None]:
y_pred

* Evaluate at word-level

In [19]:
labels = {'O': 0, "I_SENT": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for tag in y_pred])
truths = np.array([labels[tag] for tag in y_test])

print(classification_report(
    truths, predictions,
    target_names=["O", "I_SENT"]))

              precision    recall  f1-score   support

           O       0.48      0.61      0.54     12399
      I_SENT       0.99      0.99      0.99    542344

    accuracy                           0.98    554743
   macro avg       0.74      0.80      0.76    554743
weighted avg       0.98      0.98      0.98    554743



# Predict

## Prepare data submission

In [20]:
df_sub = pd.read_csv('ss_test.csv')
df_sub

Unnamed: 0.1,Unnamed: 0,word
0,0,กล่าว
1,1,ชม
2,2,ตร.
3,3,เร่ง
4,4,สำนวน
...,...,...
74400,74400,การ
74401,74401,
74402,74402,2/2004
74403,74403,


In [21]:
df_sub=df_sub.drop([74403,74404])
len(df_sub)

74403

In [22]:
df_sub[df_sub["word"] == ""].count()

Unnamed: 0    0
word          0
dtype: int64

In [23]:
x_sub=list(df_sub['word'])
x_sub[0:20]

['กล่าว',
 'ชม',
 'ตร.',
 'เร่ง',
 'สำนวน',
 'คดี',
 'เร็ว',
 'ดี',
 'ลั่น',
 'จะ',
 'ไม่',
 'หนี',
 'เหมือน',
 'นัก',
 'การ',
 'เมือง',
 'บาง',
 'คน',
 ' ',
 'ตาม']

## Extract Feature

In [24]:
#features
x_ex = extract_features(x_sub, window=2, max_n_gram = 3)

100%|██████████| 74403/74403 [00:01<00:00, 51158.24it/s]


In [25]:
# x_ex

## predict label

In [26]:
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/sub1-crf.model')
y_pred_sub = tagger.tag(x_ex)
len(y_pred_sub)

74403

In [27]:
df_sub["Predicted"] = y_pred_sub
df_sub

Unnamed: 0.1,Unnamed: 0,word,Predicted
0,0,กล่าว,I_SENT
1,1,ชม,I_SENT
2,2,ตร.,I_SENT
3,3,เร่ง,I_SENT
4,4,สำนวน,I_SENT
...,...,...,...
74398,74398,ผู้,I_SENT
74399,74399,จัด,I_SENT
74400,74400,การ,I_SENT
74401,74401,,O


In [28]:
#Cheack clause
df_sub.groupby("Predicted").count()

Unnamed: 0_level_0,Unnamed: 0,word
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
I_SENT,73105,73105
O,1298,1298


In [29]:
df_sub[df_sub["Predicted"] == "O"]

Unnamed: 0.1,Unnamed: 0,word,Predicted
157,157,,O
229,229,,O
250,250,,O
285,285,,O
323,323,,O
...,...,...,...
74349,74349,,O
74357,74357,,O
74387,74387,,O
74396,74396,,O


## Save data frame

**Check Point [df_sub2]**

In [30]:
df_sub2 = pd.DataFrame({'Id':df_sub['Unnamed: 0'],'Predicted':df_sub['Predicted'],'word':df_sub['word']})
#try
df_sub2

Unnamed: 0,Id,Predicted,word
0,0,I_SENT,กล่าว
1,1,I_SENT,ชม
2,2,I_SENT,ตร.
3,3,I_SENT,เร่ง
4,4,I_SENT,สำนวน
...,...,...,...
74398,74398,I_SENT,ผู้
74399,74399,I_SENT,จัด
74400,74400,I_SENT,การ
74401,74401,O,


## Rule Base

* index 74401 is not "O"

In [31]:
df_sub2.Predicted[74401] = "I_SENT"
df_sub2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub2.Predicted[74401] = "I_SENT"


Unnamed: 0,Id,Predicted,word
0,0,I_SENT,กล่าว
1,1,I_SENT,ชม
2,2,I_SENT,ตร.
3,3,I_SENT,เร่ง
4,4,I_SENT,สำนวน
...,...,...,...
74398,74398,I_SENT,ผู้
74399,74399,I_SENT,จัด
74400,74400,I_SENT,การ
74401,74401,I_SENT,


* E > O > B

In [32]:
df_sub3=df_sub2.copy()
#try
df_sub3

Unnamed: 0,Id,Predicted,word
0,0,I_SENT,กล่าว
1,1,I_SENT,ชม
2,2,I_SENT,ตร.
3,3,I_SENT,เร่ง
4,4,I_SENT,สำนวน
...,...,...,...
74398,74398,I_SENT,ผู้
74399,74399,I_SENT,จัด
74400,74400,I_SENT,การ
74401,74401,I_SENT,


In [33]:
# Find index of "O"
index_sub3_EOB=df_sub3[(df_sub3['Predicted']=='O')].index
index_sub3_EOB

Int64Index([  157,   229,   250,   285,   323,   475,   485,   560,   635,
              647,
            ...
            74261, 74264, 74270, 74301, 74319, 74334, 74349, 74357, 74387,
            74396],
           dtype='int64', length=1297)

In [34]:
#fill E,B_SENT
df_sub3.loc[index_sub3_EOB-1, "Predicted"] = "E_SENT"
df_sub3.loc[index_sub3_EOB[:-1]+1, "Predicted"] = "B_SENT"
#Cheack clause
df_sub3.groupby("Predicted").count()

Unnamed: 0_level_0,Id,word
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
B_SENT,1296,1296
E_SENT,1295,1295
I_SENT,70515,70515
O,1297,1297


* index 0 is "B_SENT", index 74402 is "E_SENT"

In [35]:
df_sub4=df_sub3.copy()
#try
df_sub4

Unnamed: 0,Id,Predicted,word
0,0,I_SENT,กล่าว
1,1,I_SENT,ชม
2,2,I_SENT,ตร.
3,3,I_SENT,เร่ง
4,4,I_SENT,สำนวน
...,...,...,...
74398,74398,I_SENT,ผู้
74399,74399,I_SENT,จัด
74400,74400,I_SENT,การ
74401,74401,I_SENT,


In [36]:
df_sub4.Predicted[0] = "B_SENT"
df_sub4.Predicted[74402] = "E_SENT"
df_sub4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub4.Predicted[0] = "B_SENT"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub4.Predicted[74402] = "E_SENT"


Unnamed: 0,Id,Predicted,word
0,0,B_SENT,กล่าว
1,1,I_SENT,ชม
2,2,I_SENT,ตร.
3,3,I_SENT,เร่ง
4,4,I_SENT,สำนวน
...,...,...,...
74398,74398,I_SENT,ผู้
74399,74399,I_SENT,จัด
74400,74400,I_SENT,การ
74401,74401,I_SENT,


In [37]:
df_sub4.groupby("Predicted").count()

Unnamed: 0_level_0,Id,word
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
B_SENT,1297,1297
E_SENT,1296,1296
I_SENT,70513,70513
O,1297,1297


## Id+1

In [38]:
df_sub5=df_sub4.copy()
#try
df_sub5

Unnamed: 0,Id,Predicted,word
0,0,B_SENT,กล่าว
1,1,I_SENT,ชม
2,2,I_SENT,ตร.
3,3,I_SENT,เร่ง
4,4,I_SENT,สำนวน
...,...,...,...
74398,74398,I_SENT,ผู้
74399,74399,I_SENT,จัด
74400,74400,I_SENT,การ
74401,74401,I_SENT,


In [39]:
df_sub5.Id[:] = df_sub5.Id[:]+1
df_sub5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub5.Id[:] = df_sub5.Id[:]+1


Unnamed: 0,Id,Predicted,word
0,1,B_SENT,กล่าว
1,2,I_SENT,ชม
2,3,I_SENT,ตร.
3,4,I_SENT,เร่ง
4,5,I_SENT,สำนวน
...,...,...,...
74398,74399,I_SENT,ผู้
74399,74400,I_SENT,จัด
74400,74401,I_SENT,การ
74401,74402,I_SENT,


## Save submission.csv

In [40]:
df_sub6=df_sub5.copy()
#try
df_sub6

Unnamed: 0,Id,Predicted,word
0,1,B_SENT,กล่าว
1,2,I_SENT,ชม
2,3,I_SENT,ตร.
3,4,I_SENT,เร่ง
4,5,I_SENT,สำนวน
...,...,...,...
74398,74399,I_SENT,ผู้
74399,74400,I_SENT,จัด
74400,74401,I_SENT,การ
74401,74402,I_SENT,


In [41]:
df_sub7=df_sub6.drop(columns=['word'])
df_sub7

Unnamed: 0,Id,Predicted
0,1,B_SENT
1,2,I_SENT
2,3,I_SENT
3,4,I_SENT
4,5,I_SENT
...,...,...
74398,74399,I_SENT
74399,74400,I_SENT
74400,74401,I_SENT
74401,74402,I_SENT


In [42]:
df_sub7.to_csv('submission1.csv',index=False)

# Post Process Rule Base

## มี O ขั้นกลาง > ชื่อ-นามสกุล

In [47]:
df_submission=df_sub6.copy()
#try
df_submission

Unnamed: 0,Id,Predicted,word
0,1,B_SENT,กล่าว
1,2,I_SENT,ชม
2,3,I_SENT,ตร.
3,4,I_SENT,เร่ง
4,5,I_SENT,สำนวน
...,...,...,...
74398,74399,I_SENT,ผู้
74399,74400,I_SENT,จัด
74400,74401,I_SENT,การ
74401,74402,I_SENT,


In [48]:
df_Mr_Id=df_submission[(df_submission['word']=='น.ส.')|
                       (df_submission['word']=='นาย')|
                       (df_submission['word']=='นาง')|
                       (df_submission['word']=='นางสาว')]['Id']

df_Mr_Id=pd.DataFrame(df_Mr_Id)
df_Mr_Id

Unnamed: 0,Id
21,22
45,46
147,148
152,153
183,184
...,...
73806,73807
73892,73893
73995,73996
74251,74252


Ex
* นาย
* ประสิทธิ์
* << check ว่าตรงนี้เป็น "O" มั้ย
* จันทร์โอ

In [49]:
list_Mr_Id=list(df_Mr_Id['Id'])
Mr_id=[]
Mr_word=[]
li = []
for id in list_Mr_Id:
    add_id=id+1
    if df_submission['Predicted'][add_id]=='O':
        # Mr_id.append(range(id,id+3))
        print('***',range(id,id+3))
        # print()
        print(df_submission['word'][id-1:id+3],df_submission['Predicted'][id-2:id+3])
        # Mr_word.append(df_submission['word'][range(id,id+3)])

# df_MrMr=pd.DataFrame({"Id": Mr_id,"word": Mr_word})
# df_MrMr
# Mr_id

//No

## กล่าว E/ กล่าวว่า?

In [50]:
df_submission_copy=df_submission.copy()
#try
df_submission_copy

Unnamed: 0,Id,Predicted,word
0,1,B_SENT,กล่าว
1,2,I_SENT,ชม
2,3,I_SENT,ตร.
3,4,I_SENT,เร่ง
4,5,I_SENT,สำนวน
...,...,...,...
74398,74399,I_SENT,ผู้
74399,74400,I_SENT,จัด
74400,74401,I_SENT,การ
74401,74402,I_SENT,


In [51]:
df_Said_Id=df_submission[(df_submission['word']=='กล่าว')]['Id']

df_Said_Id=pd.DataFrame(df_Said_Id)
df_Said_Id

Unnamed: 0,Id
0,1
69,70
90,91
144,145
255,256
...,...
73895,73896
73958,73959
73999,74000
74254,74255


In [52]:
list_Said_Id=list(df_Said_Id['Id'])
i=0
for id in list_Said_Id:
  # print('id',id)
  if df_submission['word'][id]==" ":
    if df_submission['Predicted'][id]!='O':
        df_submission_copy.loc[id, "Predicted"] = "O"
        df_submission_copy.loc[id-1, "Predicted"] = "E_SENT"
        df_submission_copy.loc[id+1, "Predicted"] = "B_SENT"
        print(id)

In [123]:
# df_submission_copy.loc[6415]