In [None]:
!pip install simpletransformers
!pip install transformers



In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and model paths used by the later cells
# (all located under this mount point) become readable and writable.
mount_point = "/content/drive"
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory the fine-tuned model is written to by train() and loaded from by test().
model_path = "/content/drive/MyDrive/mer/Models/bert/"

# Label inventory: one tag per line, read by get_from_txt().
tags_path = "/content/drive/MyDrive/mer/Dataset/tags.txt"

# Annotated corpora, read by read_dataset().
train_path = "/content/drive/MyDrive/mer/Dataset/train.txt"
test_path = "/content/drive/MyDrive/mer/Dataset/test.txt"

In [None]:
import os
# Must stay above the library imports below: the variable is read from the
# environment, so it only has an effect if it is set before the library that
# consults it is loaded (presumably TensorFlow's native logging -- TODO confirm).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import logging
import warnings
# NOTE(review): this silences *every* Python warning for the rest of the
# session, including ones that may point at real data or metric problems.
warnings.filterwarnings("ignore")

from simpletransformers.ner import NERModel
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# Only records of level ERROR and above from the "transformers" logger are emitted.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

import pandas as pd


def conll_format(train_data, train_labels):
    """Flatten tokenised sentences into a one-token-per-row DataFrame.

    Args:
        train_data: list of sentences, each sentence being a list of tokens.
        train_labels: list of label lists, parallel to ``train_data``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence_id`` (position of
        the originating sentence in ``train_data``), ``words`` and ``labels``,
        one row per token in input order.
    """
    # Repeat each sentence's position once per token it contains.
    flat_ids = [
        sent_idx
        for sent_idx, tokens in enumerate(train_data)
        for _ in tokens
    ]
    flat_words = [token for tokens in train_data for token in tokens]
    flat_labels = [tag for tags in train_labels for tag in tags]

    # Assemble the frame column by column.
    frame = pd.DataFrame()
    frame["sentence_id"] = flat_ids
    frame["words"] = flat_words
    frame["labels"] = flat_labels
    return frame

def read_dataset(file_path):
    """Read a tab-separated token/label file into parallel sentence lists.

    The file is expected to hold one ``token<TAB>label`` pair per line, with
    sentences separated by a blank line. Lines whose first field is ``*`` are
    skipped. Fields after the second one are ignored.

    Args:
        file_path: path of the UTF-8 encoded dataset file.

    Returns:
        A tuple ``(data_list, label_list)``: ``data_list[i]`` is the list of
        tokens of the i-th non-empty sentence and ``label_list[i]`` the
        matching list of labels.

    Raises:
        ValueError: if a token line carries no tab-separated label. The
            previous implementation printed the sentence and called
            ``exit(0)``, which hid the cause, reported success and, inside a
            notebook, shut the kernel down.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        raw_sentences = file.read().split("\n\n")

    data_list = []
    label_list = []

    for sent_no, raw_sentence in enumerate(raw_sentences):
        words = []
        labels = []
        for line in raw_sentence.split("\n"):
            if line == "":
                continue
            fields = line.split("\t")
            # Marker lines are dropped before the label is looked at, so a
            # bare "*" line without a label is not an error.
            if fields[0] == "*":
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{file_path}: sentence {sent_no}: line {line!r} "
                    "has no tab-separated label"
                )
            words.append(fields[0])
            labels.append(fields[1])

        # Trailing or repeated blank lines yield chunks without any token;
        # skip them instead of emitting empty sentences.
        if words:
            data_list.append(words)
            label_list.append(labels)

    return data_list, label_list


def get_from_txt(file_path):
    """Return the lines of a UTF-8 text file with surrounding whitespace removed.

    Args:
        file_path: path of the file to read.

    Returns:
        A list with one stripped string per line, in file order. Blank lines
        are kept as empty strings.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

In [None]:
def train():
    """Fine-tune ``bert-base-chinese`` for NER and save the result to ``model_path``.

    Reads the training corpus from ``train_path`` and the label inventory from
    ``tags_path``, converts the corpus to the token-per-row frame produced by
    ``conll_format`` and trains a ``NERModel`` on it. The test corpus is not
    needed for training and is therefore no longer loaded here.

    Returns:
        None. The trained model is written to ``model_path`` (``output_dir``).
    """
    # Local import keeps this cell self-contained; torch is already a
    # dependency of simpletransformers.
    import torch

    # Read and reshape the training data.
    train_data, train_labels = read_dataset(train_path)
    train_df = conll_format(train_data, train_labels)

    # Read tags
    tags = get_from_txt(tags_path)

    # Fall back to CPU when no GPU is attached instead of failing while the
    # model is constructed; mixed precision is only enabled together with CUDA.
    use_cuda = torch.cuda.is_available()

    # Define hyperparameters
    train_args = {
        "output_dir": model_path, # save model
        "overwrite_output_dir": True, # overwrite model
        "reprocess_input_data": True, # reprocess input data
        "num_train_epochs": 5, # number of epochs
        # NOTE(review): confirm that NERModel honours "sliding_window".
        "sliding_window": True, # sliding window
        "max_seq_length": 128, # max sequence length
        "train_batch_size": 64, # batch size
        "fp16": use_cuda, # use mixed precision (GPU only)
    }

    # Create a NERModel starting from the pretrained checkpoint.
    model = NERModel(
        "bert",
        "bert-base-chinese",
        args=train_args,
        labels=tags,
        use_cuda=use_cuda
    )

    # Train the model
    model.train_model(train_df)

In [None]:
def test(sent):
    """Evaluate the saved model on the test set and tag one sample sentence.

    Loads the fine-tuned model from ``model_path``, runs ``eval_model`` on the
    corpus at ``test_path``, prints the returned metrics, prints a per-tag
    classification report, and finally prints the predictions for ``sent``.
    The training corpus is not needed for evaluation and is therefore no
    longer loaded here.

    Args:
        sent: the sentence to tag after evaluation. It is passed to
            ``predict`` with ``split_on_space=False``.
            NOTE(review): with that flag the string appears to be consumed
            element by element, i.e. character by character -- confirm
            against the simpletransformers documentation.

    Returns:
        None. All results are printed.
    """
    # Local import keeps this cell self-contained; torch is already a
    # dependency of simpletransformers.
    import torch

    # Read and reshape the test data.
    test_data, test_labels = read_dataset(test_path)
    test_df = conll_format(test_data, test_labels)

    # Read tags
    tags = get_from_txt(tags_path)

    # Fall back to CPU when no GPU is attached instead of failing while the
    # model is constructed; mixed precision is only enabled together with CUDA.
    use_cuda = torch.cuda.is_available()

    # Define hyperparameters
    train_args = {
        "output_dir": model_path, # save model
        "overwrite_output_dir": True, # overwrite model
        "reprocess_input_data": True, # reprocess input data
        "num_train_epochs": 5, # number of epochs
        # NOTE(review): confirm that NERModel honours "sliding_window".
        "sliding_window": True, # sliding window
        "max_seq_length": 128, # max sequence length
        "train_batch_size": 64, # batch size
        "fp16": use_cuda, # use mixed precision (GPU only)
    }

    # Load the fine-tuned model written by train().
    model = NERModel(
        "bert",
        model_path,
        args=train_args,
        labels=tags,
        use_cuda=use_cuda
    )

    # Get test ground truth groupby sentence id
    test_labels = test_df.groupby("sentence_id")["labels"].apply(list).values.tolist()

    # Evaluate the model
    result, _, predictions = model.eval_model(test_df)
    print(result)

    # remove "O" tags
    custom_classes = [tag for tag in tags if tag != "O"]

    # NOTE(review): MultiLabelBinarizer turns each sentence into one binary
    # row marking which tags occur in it, so the report below measures
    # sentence-level tag presence, not token-level accuracy.
    predictions = MultiLabelBinarizer(classes=custom_classes).fit_transform(predictions)
    test_labels = MultiLabelBinarizer(classes=custom_classes).fit_transform(test_labels)

    print(classification_report(test_labels, predictions, target_names=custom_classes))
    print("f1-score: ", classification_report(test_labels, predictions, target_names=custom_classes, output_dict=True)["micro avg"]["f1-score"])

    # Tag the caller-supplied sentence.
    predictions, _ = model.predict([sent], split_on_space=False)
    print(predictions)

In [None]:
# Fine-tune the model, then evaluate it and tag one sample sentence.
sample_sentence = "雙腰痛多天，今天開始小便不適，解血尿。"
train()
test(sample_sentence)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/781 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/781 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/781 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/781 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/781 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1847 [00:00<?, ?it/s]

{'eval_loss': 0.169800819883743, 'precision': 0.9657096550127344, 'recall': 0.9302984342240264, 'f1_score': 0.9476733618104153}
              precision    recall  f1-score   support

       B-Tim       0.92      1.00      0.96      9773
       I-Tim       0.97      1.00      0.98      3721
       E-Tim       0.95      1.00      0.97     10121
       S-Tim       0.00      0.00      0.00         1
       B-Org       0.95      0.98      0.96      1468
       I-Org       0.81      0.95      0.88        37
       E-Org       0.95      0.98      0.96      1469
       S-Org       0.90      0.81      0.85       396
       B-Sym       1.00      1.00      1.00     13434
       I-Sym       0.99      0.99      0.99      5788
       E-Sym       1.00      1.00      1.00     13428
       S-Sym       0.96      0.89      0.93      1128
       B-Abb       0.98      0.99      0.99      3056
       I-Abb       0.98      0.92      0.95      2985
       E-Abb       0.98      0.96      0.97      3161
       

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'雙': 'O'}, {'腰': 'B-Sym'}, {'痛': 'E-Sym'}, {'多': 'B-Tim'}, {'天': 'E-Tim'}, {'，': 'O'}, {'今': 'B-Tim'}, {'天': 'E-Tim'}, {'開': 'O'}, {'始': 'O'}, {'小': 'B-Sym'}, {'便': 'I-Sym'}, {'不': 'I-Sym'}, {'適': 'E-Sym'}, {'，': 'O'}, {'解': 'B-Sym'}, {'血': 'I-Sym'}, {'尿': 'E-Sym'}, {'。': 'O'}]]
