<a href="https://colab.research.google.com/github/nrj130613/myproject/blob/main/NER_pos_window_ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bigram (context window of 2)

# 1. Download Data

In [None]:
!pip install sklearn-crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn-crfsuite-0.3.6


In [None]:
# Fetch the training split from Google Drive (saved as train_auto_tok.tsv).
!gdown 1LHZe1etwxaKnP6TWTbDscasBe5pGykjJ

Downloading...
From: https://drive.google.com/uc?id=1LHZe1etwxaKnP6TWTbDscasBe5pGykjJ
To: /content/train_auto_tok.tsv
100% 38.9M/38.9M [00:00<00:00, 129MB/s]


In [None]:
# Peek at the raw format: one token and its NER tag per line, tab-separated.
!head train_auto_tok.tsv

ธรรมนูญ	B_PER
แชมป์	O
สิงห์	O
คลาสสิก	O
กวาด	O
รางวัล	O
แสน	O
สี่	O
หมื่น	O
บาท	O


In [None]:
# Fetch the development split from Google Drive (saved as dev_auto_tok.tsv).
!gdown 1LtdB8q2xVhK7vivJxTU6yYnl3KTG-QGS

Downloading...
From: https://drive.google.com/uc?id=1LtdB8q2xVhK7vivJxTU6yYnl3KTG-QGS
To: /content/dev_auto_tok.tsv
  0% 0.00/3.56M [00:00<?, ?B/s]100% 3.56M/3.56M [00:00<00:00, 122MB/s]


In [None]:
def load_data(file_name):
    """Read a token-per-line TSV file into parallel sentence/label lists.

    Each non-empty line must hold exactly one ``word<TAB>tag`` pair; one or
    more empty lines end the current sentence.

    Args:
        file_name: Path of the TSV file to read.

    Returns:
        A tuple ``(X, Y)`` where ``X[i]`` is the list of words of sentence
        ``i`` and ``Y[i]`` is the list of tags aligned with it.

    Raises:
        ValueError: If a non-empty line does not split into exactly two
            tab-separated fields.
    """
    # Decode explicitly as UTF-8: the corpus is Thai text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    X, Y = [], []
    sentence, labels = [], []
    for line in lines:
        if not line:
            # Blank line = sentence boundary; consecutive blanks are ignored.
            if sentence:
                X.append(sentence)
                Y.append(labels)
                sentence, labels = [], []
        else:
            word, tag = line.split('\t')
            sentence.append(word)
            labels.append(tag)
    # The file may not end with a blank line; keep the last sentence anyway.
    if sentence:
        X.append(sentence)
        Y.append(labels)

    return (X, Y)


In [None]:
# Xtrain/Xtest are lists of token lists; Ytrain/Ytest are the aligned tag lists.
# The dev split serves as the test set in the cells that follow.
Xtrain, Ytrain = load_data('train_auto_tok.tsv')
Xtest, Ytest = load_data('dev_auto_tok.tsv')

In [None]:
import sklearn_crfsuite
import sklearn_crfsuite.metrics

# 2. Feature Engineering

In [None]:
!pip install pythainlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pythainlp
  Downloading pythainlp-4.0.0-py3-none-any.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pythainlp
Successfully installed pythainlp-4.0.0


# POS Features


In [None]:
import pythainlp
from pythainlp.tag import pos_tag

def pos_tagging(sentences):
  """Apply ``pos_tag`` to every sentence and collect the results.

  Args:
    sentences: Sequence of sentences; each one is handed to ``pos_tag``
      unchanged (presumably a list of word tokens -- confirm with caller).

  Returns:
    A list holding one ``pos_tag`` result per input sentence, in input order.
  """
  return [pos_tag(tokens) for tokens in sentences]

In [None]:
# Smoke-test the tagger on the first five training sentences.
test = pos_tagging(Xtrain[0:5])
test

In [None]:
def featurize(sent, window_size):
    """Turn a POS-tagged sentence into a list of per-token feature dicts.

    Args:
        sent: Sequence of ``(word, pos)`` pairs.
        window_size: How many positions to look at on each side of a token.

    Returns:
        One dict per token, containing:
          * ``"<word>_<index>_pos"`` -> the token's own POS tag;
          * ``"prev_word_<j>"`` / ``"next_word_<j>"`` -> the POS tag (not the
            word, despite the key name) of the token ``j`` positions before /
            after, when that position exists;
          * ``"conjunctive_<j>"`` -> ``"<left word>_<word>_<right word>"``
            when both neighbours at distance ``j`` exist;
          * ``"token_<word>"`` -> ``word.isdigit()``.
    """
    n_tokens = len(sent)
    feature_seq = []
    for idx, (word, pos) in enumerate(sent):
        features = {"{}_{}_pos".format(word, idx): pos}

        for offset in range(1, window_size + 1):
            left, right = idx - offset, idx + offset
            has_left = left >= 0
            has_right = right < n_tokens
            if has_left:
                features['prev_word_{}'.format(offset)] = sent[left][1]
            if has_right:
                features['next_word_{}'.format(offset)] = sent[right][1]
            if has_left and has_right:
                features['conjunctive_{}'.format(offset)] = '_'.join(
                    (sent[left][0], word, sent[right][0])
                )

        features['token_{}'.format(word)] = word.isdigit()
        feature_seq.append(features)
    return feature_seq


In [None]:
# Second tagged sentence of the smoke-test sample.
test[1]

In [None]:
# Inspect the feature dicts produced for test[1] with a window of 2.
featurize(test[1], 2)

# 3. Train and evaluate models

In [None]:
# Only the first 5,000 training sentences are POS-tagged, so any label slice
# paired with train_set_pos must cover the same range. The dev split is tagged in full.
train_set_pos = pos_tagging(Xtrain[0:5000])
test_set_pos = pos_tagging(Xtest)

In [None]:
import sklearn_crfsuite

In [None]:
from sklearn_crfsuite import CRF, metrics

def train_and_evaluate(Xtrain, Ytrain, Xtest, Ytest):
    """Fit a CRF on the training sentences and predict tags for the test ones.

    Both sentence collections are converted with ``featurize`` using a
    window of 2.

    Args:
        Xtrain: Training sentences in the form ``featurize`` accepts.
        Ytrain: Tag sequences aligned with ``Xtrain``.
        Xtest: Test sentences in the form ``featurize`` accepts.
        Ytest: Accepted for signature symmetry; not used by this function.

    Returns:
        The value of ``crf.predict`` on the featurized test sentences.

    NOTE(review): a later cell defines another function with this same name
    that returns an F1 score instead; whichever cell ran last wins.
    """
    def as_features(sentences):
        return [featurize(tagged, 2) for tagged in sentences]

    train_feats = as_features(Xtrain)
    test_feats = as_features(Xtest)

    model = CRF()
    model.fit(train_feats, Ytrain)
    return model.predict(test_feats)

In [None]:
# NOTE(review): `sample` is not referenced by any other visible cell.
sample = Xtest[0:5000]

In [None]:
# Labels are sliced to the same 5,000 sentences that train_set_pos covers.
y_pred = train_and_evaluate(train_set_pos, Ytrain[0:5000], test_set_pos, Ytest)

In [None]:
# Show only the first few predicted tag sequences instead of dumping all of them.
y_pred[:5]

In [None]:
# For every test sentence, pair each token with the tag predicted for it.
predicted = [list(zip(Xtest[i], y_pred[i])) for i in range(len(Xtest))]

In [None]:
# Show only the first few (token, tag) sentences instead of dumping all of them.
predicted[:5]

In [None]:
def extract_entities(sentence):
    """Collect the entities found in one tagged sentence.

    Tags are read by their first character:
      * ``B`` closes any entity in progress and opens a new one;
      * ``I`` adds the token to the entity in progress;
      * ``E`` adds the token and closes the entity;
      * ``O`` closes the entity in progress, if any.
    For ``B`` and ``I`` tags longer than one character the entity type is the
    part after the first underscore (``B_PER`` -> ``PER``); a bare ``B`` opens
    an entity of type ``MISC``.

    Args:
        sentence: Iterable of ``(token, ner_tag)`` pairs.

    Returns:
        List of ``(entity_type, text)`` tuples in order of appearance, where
        ``text`` is the entity's tokens joined without a separator.
        ``entity_type`` is ``None`` if no ``B``/``I`` tag supplied a type
        before the entity was closed.

    Raises:
        ValueError: If a ``B``/``I`` tag longer than one character contains
            no underscore.
    """
    entities = []
    entity_sofar = []
    type_sofar = None
    for token, ner_tag in sentence:
        if ner_tag[0] == 'B':
            if type_sofar is not None:
                entities.append((type_sofar, ''.join(entity_sofar)))
                entity_sofar = []
                type_sofar = None
            if len(ner_tag) > 1:
                # maxsplit=1 keeps further underscores inside the type name
                # instead of breaking the two-value unpacking.
                _, type_sofar = ner_tag.split('_', 1)
            else:
                type_sofar = 'MISC'
            entity_sofar.append(token)
        elif ner_tag[0] == 'I':
            if len(ner_tag) > 1:
                _, type_sofar = ner_tag.split('_', 1)
            entity_sofar.append(token)
        elif ner_tag[0] == 'E':
            entity_sofar.append(token)
            entities.append((type_sofar, ''.join(entity_sofar)))
            entity_sofar = []
            type_sofar = None
        elif ner_tag == 'O':
            if entity_sofar:
                entities.append((type_sofar, ''.join(entity_sofar)))
                entity_sofar = []
                type_sofar = None

    # An entity that runs up to the last token has no following tag to close
    # it; emit it here so it is not silently lost.
    if entity_sofar:
        entities.append((type_sofar, ''.join(entity_sofar)))
    return entities


In [None]:
# Entity tuples for every predicted sentence; the count is printed as a sanity check.
output_list = [extract_entities(tagged_sentence) for tagged_sentence in predicted]
print(len(output_list))

5620


In [None]:
import json
# Write inside a context manager so the file is flushed and closed
# deterministically instead of leaving the handle open.
with open('pos_test11111.json', encoding='utf8', mode='w') as out_file:
    json.dump(output_list, out_file)

In [None]:
from sklearn_crfsuite import metrics

def train_and_evaluate(Xtrain, Ytrain, Xtest, Ytest, window_size=2):
    """Train a CRF on POS-tagged sentences and score it on the test set.

    Args:
        Xtrain: Training sentences in the form ``featurize`` accepts.
        Ytrain: Tag sequences aligned with ``Xtrain``.
        Xtest: Test sentences in the form ``featurize`` accepts.
        Ytest: Gold tag sequences aligned with ``Xtest``.
        window_size: Context window passed to ``featurize``. Defaults to 2,
            the value that used to be hard-coded here.

    Returns:
        The weighted-average F1 returned by ``metrics.flat_f1_score`` over
        the labels in ``crf.classes_``.

    NOTE(review): ``crf.classes_`` presumably includes the ``O`` tag, which
    can dominate a weighted average -- confirm this is intended.
    """
    X_train_feats = [featurize(sent, window_size) for sent in Xtrain]
    X_test_feats = [featurize(sent, window_size) for sent in Xtest]

    crf = sklearn_crfsuite.CRF()
    crf.fit(X_train_feats, Ytrain)
    y_pred = crf.predict(X_test_feats)

    return metrics.flat_f1_score(
        Ytest, y_pred, average='weighted', labels=crf.classes_, zero_division=1
    )

In [None]:
# The 60000 bound exceeds the 5620 dev sentences counted above, so Ytest is
# effectively used in full, matching test_set_pos.
train_and_evaluate(train_set_pos, Ytrain[0:5000], test_set_pos, Ytest[0:60000])

0.8610347083464913

# Trigram (context window of 3)

In [None]:
import sklearn_crfsuite

In [None]:
from sklearn_crfsuite import metrics

def train_and_evaluate_tri(Xtrain, Ytrain, Xtest, Ytest, window_size=3):
    """Train a CRF with a wider feature window and score it on the test set.

    Args:
        Xtrain: Training sentences in the form ``featurize`` accepts.
        Ytrain: Tag sequences aligned with ``Xtrain``.
        Xtest: Test sentences in the form ``featurize`` accepts.
        Ytest: Gold tag sequences aligned with ``Xtest``.
        window_size: Context window passed to ``featurize``. Defaults to 3,
            the value that used to be hard-coded here.

    Returns:
        The weighted-average F1 returned by ``metrics.flat_f1_score`` over
        the labels in ``crf.classes_``.
    """
    X_train_feats = [featurize(sent, window_size) for sent in Xtrain]
    X_test_feats = [featurize(sent, window_size) for sent in Xtest]

    crf = sklearn_crfsuite.CRF()
    crf.fit(X_train_feats, Ytrain)
    y_pred = crf.predict(X_test_feats)

    return metrics.flat_f1_score(
        Ytest, y_pred, average='weighted', labels=crf.classes_, zero_division=1
    )

In [None]:
# train_set_pos holds only the first 5,000 tagged sentences, so the labels
# are sliced to the same range to stay aligned with the features.
train_and_evaluate_tri(train_set_pos, Ytrain[0:5000], test_set_pos, Ytest[0:60000])