In [2]:
!pip install python-crfsuite
!pip install scikit-learn
!pip install nltk gensim

Collecting python-crfsuite
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/993.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/993.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m952.3/993.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.9


In [3]:
import pycrfsuite

# Colab-specific: mount Google Drive so that the dataset and model files
# referenced below under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# All notebook inputs/outputs live under one folder on the mounted Drive.
_DRIVE_DIR = "/content/drive/MyDrive/Colab Notebooks"
_DATASET_DIR = _DRIVE_DIR + "/Dataset"

# Tab-separated token/label files read by read_dataset().
train_path = _DATASET_DIR + "/train.txt"
test_path = _DATASET_DIR + "/test.txt"
# One tag per line, read by get_from_txt().
tags_path = _DATASET_DIR + "/tags.txt"
# Location of the trained CRFsuite model file.
model_path = _DRIVE_DIR + "/Models/crf/ner.crfsuite"

In [5]:
def read_dataset(file_path):
    """Read a tab-separated sequence-labelling file.

    The file is split into sentences on blank lines ("\\n\\n"). Each non-empty
    line of a sentence is split on tabs; the first field is the token and the
    second field is its label. Lines whose first field is "*" are skipped.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A tuple ``(data_list, label_list)`` of two parallel lists. Each entry
        of ``data_list`` is the list of tokens of one sentence, and the entry
        at the same index in ``label_list`` holds the matching labels. A
        sentence block that contains no usable line yields an empty list in
        both outputs.

    Raises:
        ValueError: If a kept line has no tab-separated label field.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        raw_sentences = file.read().split("\n\n")

    data_list = []
    label_list = []

    for sent_idx, raw_sent in enumerate(raw_sentences):
        words = []
        labels = []
        for line in raw_sent.split("\n"):
            if line == "":
                continue
            fields = line.split("\t")
            if fields[0] == "*":
                continue
            if len(fields) < 2:
                # Fail loudly with context instead of printing and exiting
                # the interpreter with a success status.
                raise ValueError(
                    "Malformed line %r in sentence #%d of %s: expected "
                    "'<token>\\t<label>'" % (line, sent_idx, file_path)
                )
            words.append(fields[0])
            labels.append(fields[1])
        data_list.append(words)
        label_list.append(labels)

    return data_list, label_list


def get_from_txt(file_path):
    """Return the lines of a UTF-8 text file as a list of stripped strings.

    Leading and trailing whitespace (including the newline) is removed from
    every line; blank lines are kept as empty strings.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

In [6]:
import pycrfsuite
import warnings
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Silences every Python warning for the rest of the session.
# NOTE(review): this also hides warnings that may be informative (for example
# about labels unknown to a binarizer) -- consider narrowing the filter.
warnings.filterwarnings("ignore")
# Fetch the NLTK resource used by nltk.pos_tag in CRFNER.get_pos_tag.
# NOTE(review): the resource name may differ across NLTK releases -- confirm
# against the installed version.
nltk.download('averaged_perceptron_tagger')

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer


class CRFNER:
    """Named-entity tagger backed by python-crfsuite.

    A ``pycrfsuite.Trainer`` is configured at construction time; a
    ``pycrfsuite.Tagger`` is opened lazily from ``model_path`` the first time
    a prediction or evaluation is requested.
    """

    # Tags excluded from the evaluation report.
    _IGNORED_TAGS = ("[PAD]", "O")

    def __init__(self, model_path=None):
        """Create the trainer.

        Args:
            model_path: File the model is written to by ``train`` and read
                from by ``load_model``.
        """
        self.model_path = model_path
        self.model = pycrfsuite.Trainer(verbose=False)
        self.model.set_params({
            "c1": 1.0,
            "c2": 1e-3,
            "max_iterations": 100,
            "feature.possible_transitions": True
        })
        self.tagger = None
        # Memo of token -> POS tag so each distinct token is tagged only once.
        self._pos_cache = {}

    def train(self, train_data, train_label):
        """Fit the CRF and write the model to ``self.model_path``.

        Args:
            train_data: Iterable of sentences, each a sequence of tokens.
            train_label: Iterable of label sequences aligned with
                ``train_data``.

        Raises:
            ValueError: If no ``model_path`` was given.
        """
        if self.model_path is None:
            raise ValueError("model_path must be set before training")
        for xseq, yseq in zip(train_data, train_label):
            sent_features = [self.word2features(xseq, i) for i in range(len(xseq))]
            self.model.append(sent_features, yseq)
        self.model.train(self.model_path)

    def word2features(self, sent, i):
        """Build the feature dictionary for the token at position ``i``.

        Args:
            sent: Sequence of tokens forming one sentence.
            i: Index of the token to describe.

        Returns:
            A dict of features: the token, its neighbours (``"<s>"`` /
            ``"</s>"`` at the sentence boundaries), two concatenations,
            position flags, character-class flags and a POS tag.
        """
        # NOTE(review): indexing with [0] takes the first element of each
        # token. For a str token that is its first character, so tokens
        # longer than one character are truncated. Kept as-is because a
        # model already saved at model_path depends on these exact features
        # -- confirm that the data is tokenised per character.
        word = sent[i][0]
        prev_word = "<s>" if i == 0 else sent[i-1][0]
        next_word = "</s>" if i == len(sent)-1 else sent[i+1][0]

        # Feature dictionary
        features = {
            "word": word,
            "prev_word": prev_word,
            "next_word": next_word,
            "word+prev_word": word + prev_word,
            "word+next_word": word + next_word,
            "is_first": i == 0,
            "is_last": i == len(sent) - 1,
            "is_numeric": word.isdigit(),
            "is_alpha": word.isalpha(),
            "pos_tag": self.get_pos_tag(word),  # POS tag used as a feature
        }

        return features

    def get_pos_tag(self, word):
        """Return the NLTK part-of-speech tag of a single token.

        The token is tagged in isolation (a one-element sequence). Results
        are memoised per instance.
        """
        cached = self._pos_cache.get(word)
        if cached is None:
            cached = nltk.pos_tag([word])[0][1]
            self._pos_cache[word] = cached
        return cached

    def load_model(self):
        """Open the model stored at ``self.model_path`` into ``self.tagger``.

        Raises:
            ValueError: If no ``model_path`` was given.
        """
        if self.model_path is None:
            raise ValueError("model_path must be set before loading a model")
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(self.model_path)

    def predict(self, sent):
        """Tag one sentence and return its label sequence.

        Args:
            sent: Sequence of tokens. A plain string is accepted and is split
                into its characters by ``list(sent)``.

        Returns:
            The labels produced by the tagger, one per token.
        """
        if self.tagger is None:
            self.load_model()
        sent = list(sent)
        sent_features = [self.word2features(sent, i) for i in range(len(sent))]
        labels = self.tagger.tag(sent_features)
        return labels

    def evaluate(self, test_data, test_label, tags2idx):
        """Print a classification report and the micro-averaged F1 score.

        Predictions and gold labels are binarised per sentence with
        ``MultiLabelBinarizer``, so the report reflects which tags occur in
        each sentence rather than per-token agreement.

        Args:
            test_data: Iterable of sentences (sequences of tokens).
            test_label: Iterable of gold label sequences.
            tags2idx: Mapping whose keys are the tag names. It is not
                modified; the "[PAD]" and "O" tags are left out of the report.
        """
        if self.tagger is None:
            self.load_model()

        # Work on a filtered copy of the keys so the caller's mapping stays
        # intact and the method can be called repeatedly.
        custom_classes = [tag for tag in tags2idx if tag not in self._IGNORED_TAGS]

        pred = [self.predict(sent) for sent in test_data]
        pred = MultiLabelBinarizer(classes=custom_classes).fit_transform(pred)
        test_label = MultiLabelBinarizer(classes=custom_classes).fit_transform(test_label)

        print(classification_report(test_label, pred, target_names=custom_classes))
        print("f1-score: ", classification_report(test_label, pred, target_names=custom_classes, output_dict=True)["micro avg"]["f1-score"])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [9]:
# Load the train/test splits as parallel lists of token and label sequences.
train_data, train_label = read_dataset(train_path)
test_data, test_label = read_dataset(test_path)

print(len(train_data))
print(len(train_label))
print(len(test_data))
print(len(test_label))

# Tag vocabulary: "[PAD]" is placed at index 0, followed by the tags in file order.
tags = get_from_txt(tags_path)
tags = ["[PAD]"] + tags
tags2idx = {tag: idx for idx, tag in enumerate(tags)}

ner = CRFNER(model_path)
# Training is skipped here; evaluation loads the model stored at model_path.
# ner.train(train_data, train_label)
# Pass a copy so tags2idx stays complete and this cell can be re-run safely.
ner.evaluate(test_data, test_label, dict(tags2idx))

50075
50075
14775
14775
              precision    recall  f1-score   support

       B-Tim       0.93      1.00      0.97      9773
       I-Tim       0.97      1.00      0.99      3721
       E-Tim       0.97      1.00      0.98     10121
       S-Tim       0.00      0.00      0.00         1
       B-Org       0.93      0.96      0.94      1468
       I-Org       0.85      0.92      0.88        37
       E-Org       0.93      0.96      0.94      1469
       S-Org       0.83      0.68      0.75       396
       B-Sym       1.00      1.00      1.00     13434
       I-Sym       0.99      0.98      0.98      5788
       E-Sym       1.00      1.00      1.00     13428
       S-Sym       0.96      0.88      0.92      1128
       B-Abb       0.99      1.00      1.00      3056
       I-Abb       0.99      0.94      0.96      2985
       E-Abb       0.99      0.97      0.98      3161
       S-Abb       0.00      0.00      0.00      1176
       B-Exa       0.99      0.99      0.99      1025
   

In [24]:
# test
sent = "昨天開始全身起紅疹。"
print(ner.predict(sent))

['B-Tim', 'E-Tim', 'O', 'O', 'B-Org', 'E-Org', 'O', 'B-Sym', 'E-Sym', 'O']
