<a href="https://colab.research.google.com/github/nrj130613/myproject/blob/main/NER_word_identity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Download Data

In [None]:
# Install sklearn-crfsuite, the package that provides the CRF model imported later in this notebook.
!pip install sklearn-crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn-crfsuite-0.3.6


In [None]:
# Download the training file from Google Drive; it is saved as train_auto_tok.tsv.
!gdown 1LHZe1etwxaKnP6TWTbDscasBe5pGykjJ

Downloading...
From: https://drive.google.com/uc?id=1LHZe1etwxaKnP6TWTbDscasBe5pGykjJ
To: /content/train_auto_tok.tsv
100% 38.9M/38.9M [00:00<00:00, 116MB/s]


In [None]:
# Peek at the first lines to check the format: one token and its tag per line, separated by a tab.
!head train_auto_tok.tsv

ธรรมนูญ	B_PER
แชมป์	O
สิงห์	O
คลาสสิก	O
กวาด	O
รางวัล	O
แสน	O
สี่	O
หมื่น	O
บาท	O


In [None]:
# Download the development file from Google Drive; it is saved as dev_auto_tok.tsv.
!gdown 1LtdB8q2xVhK7vivJxTU6yYnl3KTG-QGS

Downloading...
From: https://drive.google.com/uc?id=1LtdB8q2xVhK7vivJxTU6yYnl3KTG-QGS
To: /content/dev_auto_tok.tsv
  0% 0.00/3.56M [00:00<?, ?B/s]100% 3.56M/3.56M [00:00<00:00, 167MB/s]


In [None]:
def load_data(file_name):
    """Read a tab-separated token/tag file into parallel per-sentence lists.

    Every non-empty line must have the form ``token<TAB>tag``. An empty line
    closes the current sentence; runs of empty lines never produce empty
    sentences, and a last sentence that is not followed by an empty line is
    still kept.

    Args:
        file_name: Path of the TSV file. The file is decoded as UTF-8.

    Returns:
        A tuple ``(X, Y)``. ``X[i]`` is the list of tokens of sentence ``i``
        and ``Y[i]`` is the list of tags of that sentence, in the same order.

    Raises:
        ValueError: If a non-empty line does not consist of exactly two
            tab-separated fields.
    """
    # The corpus is Thai text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    X, Y = [], []
    sentence, labels = [], []
    for line_no, line in enumerate(lines, start=1):
        if not line:
            # Sentence boundary: flush only if something was collected, so
            # consecutive blank lines are harmless.
            if sentence:
                X.append(sentence)
                Y.append(labels)
                sentence, labels = [], []
            continue

        fields = line.split('\t')
        if len(fields) != 2:
            # Same exception type as a failed tuple unpack, but it tells the
            # reader where the malformed row is.
            raise ValueError(
                f"{file_name}, line {line_no}: expected 'token<TAB>tag', "
                f"got {line!r}"
            )
        word, tag = fields
        sentence.append(word)
        labels.append(tag)

    # The file may end without a trailing blank line.
    if sentence:
        X.append(sentence)
        Y.append(labels)

    return (X, Y)


In [None]:
# Parse both downloaded files into per-sentence token lists (X*) and tag lists (Y*).
# Note that the variables named Xtest/Ytest hold the contents of the dev file.
Xtrain, Ytrain = load_data('train_auto_tok.tsv')
Xtest, Ytest = load_data('dev_auto_tok.tsv')

In [None]:
import sklearn_crfsuite
import sklearn_crfsuite.metrics

# 2. Feature Engineering

In [None]:
# Install PyThaiNLP, the source of the Thai character-class constants imported in the next cell.
!pip install pythainlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pythainlp
  Downloading pythainlp-4.0.0-py3-none-any.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pythainlp
Successfully installed pythainlp-4.0.0


In [None]:
from pythainlp import (
    thai_lead_vowels,
    thai_follow_vowels,
    thai_above_vowels,
    thai_below_vowels,
    thai_consonants,
    thai_vowels,
    thai_tonemarks,
    thai_signs,
    thai_digits,
    thai_punctuations)

In [None]:
# Turn four of the imported PyThaiNLP character collections into sets.
# NOTE(review): none of these sets is referenced in the cells visible here —
# confirm they are used elsewhere before relying on them.
consonants = set(thai_consonants)
vowels = set(thai_vowels)
tonemarks = set(thai_tonemarks)
punc = set(thai_punctuations)

# Word Identity and Position Features

In [None]:
def featurize(sentence):
    """Turn one tokenized sentence into a list of per-token feature dicts.

    Each token is described by:
      * ``'word'``    - the token itself (word identity),
      * ``'bias'``    - the constant ``1.0``,
      * ``'pos'``     - the token's 0-based index in the sentence,
      * ``'pos_inv'`` - ``len(sentence) - index`` (distance from the end).

    Args:
        sentence: A sequence of token strings.

    Returns:
        A flat list with one feature dict per token, in sentence order.
        Callers build one such list per sentence, so the result must not be
        wrapped in an additional outer list.
    """
    sentence_length = len(sentence)
    return [
        {
            'word': token,
            'bias': 1.0,
            'pos': i,
            'pos_inv': sentence_length - i,
        }
        for i, token in enumerate(sentence)
    ]

from sklearn_crfsuite import CRF, metrics

def train_and_evaluate(Xtrain, Ytrain, Xtest, Ytest):
    """Fit a default CRF on the training sentences and score it on the test ones.

    Both sentence collections are passed through ``featurize`` one sentence at
    a time. The model is then fitted on the training features and tags, and
    its predictions for the test features are compared with ``Ytest``.

    Args:
        Xtrain: Training sentences, each a list of tokens.
        Ytrain: Tag lists aligned with ``Xtrain``.
        Xtest: Evaluation sentences, each a list of tokens.
        Ytest: Tag lists aligned with ``Xtest``.

    Returns:
        The weighted flat F1 score over all labels the fitted model knows.
    """
    train_features = [featurize(tokens) for tokens in Xtrain]
    test_features = [featurize(tokens) for tokens in Xtest]

    model = CRF()
    model.fit(train_features, Ytrain)
    predicted = model.predict(test_features)

    return metrics.flat_f1_score(
        Ytest,
        predicted,
        average='weighted',
        labels=model.classes_,
        zero_division=1,
    )


In [None]:
# Take the second training sentence (index 1) as a small example for inspecting features.
sample = Xtrain[1]

In [None]:
# Display the features generated for the sample sentence.
featurize(sample)

[{'word': 'ธรรมนูญ', 'bias': 1.0, 'pos': 0, 'pos_inv': 44},
 {'word': ' ', 'bias': 1.0, 'pos': 1, 'pos_inv': 43},
 {'word': 'ศรี', 'bias': 1.0, 'pos': 2, 'pos_inv': 42},
 {'word': 'โรจน์', 'bias': 1.0, 'pos': 3, 'pos_inv': 41},
 {'word': ' ', 'bias': 1.0, 'pos': 4, 'pos_inv': 40},
 {'word': 'เก็บ', 'bias': 1.0, 'pos': 5, 'pos_inv': 39},
 {'word': 'เพิ่ม', 'bias': 1.0, 'pos': 6, 'pos_inv': 38},
 {'word': ' ', 'bias': 1.0, 'pos': 7, 'pos_inv': 37},
 {'word': '4', 'bias': 1.0, 'pos': 8, 'pos_inv': 36},
 {'word': ' ', 'bias': 1.0, 'pos': 9, 'pos_inv': 35},
 {'word': 'อันเดอร์', 'bias': 1.0, 'pos': 10, 'pos_inv': 34},
 {'word': 'พาร์', 'bias': 1.0, 'pos': 11, 'pos_inv': 33},
 {'word': ' ', 'bias': 1.0, 'pos': 12, 'pos_inv': 32},
 {'word': '68', 'bias': 1.0, 'pos': 13, 'pos_inv': 31},
 {'word': ' ', 'bias': 1.0, 'pos': 14, 'pos_inv': 30},
 {'word': 'เข้า', 'bias': 1.0, 'pos': 15, 'pos_inv': 29},
 {'word': 'ป้าย', 'bias': 1.0, 'pos': 16, 'pos_inv': 28},
 {'word': 'รับ', 'bias': 1.0, 'pos': 17

# 3. Train and evaluate models

In [None]:
from sklearn_crfsuite import CRF, metrics

def featurize(sentence):
    """Build the baseline features for one sentence: word identity plus a bias.

    Args:
        sentence: An iterable of token strings.

    Returns:
        A list with one dict per token, in order, of the form
        ``{'word': <token>, 'bias': 1.0}``.
    """
    return [{'word': tok, 'bias': 1.0} for tok in sentence]

def train_and_evaluate(Xtrain, Ytrain, Xtest, Ytest):
    """Train a default CRF and report its weighted flat F1 on the test data.

    Args:
        Xtrain: Training sentences, each a list of tokens.
        Ytrain: Tag lists aligned with ``Xtrain``.
        Xtest: Evaluation sentences, each a list of tokens.
        Ytest: Tag lists aligned with ``Xtest``.

    Returns:
        The weighted flat F1 score computed over every label in the fitted
        model's ``classes_``.
    """
    # Featurize the training set first, then the test set, one sentence each.
    train_features = list(map(featurize, Xtrain))
    test_features = list(map(featurize, Xtest))

    tagger = CRF()
    tagger.fit(train_features, Ytrain)
    predictions = tagger.predict(test_features)

    score = metrics.flat_f1_score(
        Ytest,
        predictions,
        average='weighted',
        labels=tagger.classes_,
        zero_division=1,
    )
    return score


In [None]:
# Train on at most the first 20,000 training sentences and evaluate on at most the
# first 20,000 sentences loaded from the dev file (a slice past the end just returns
# everything available). The returned weighted F1 is shown as the cell output.
train_and_evaluate(Xtrain[0:20000], Ytrain[0:20000], Xtest[0:20000], Ytest[0:20000])

0.8726523969836957