In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('/home/chendian/pure')

import time
import logging
import jsonlines
from glob import glob

In [6]:
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines

    @classmethod
    def _read_text(self,input_file):
        lines = []
        with open(input_file,'r') as f:
            words = []
            labels = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if words:
                        lines.append({"words":words,"labels":labels})
                        words = []
                        labels = []
                else:
                    # splits = line.split(" ")
                    splits = line.split()
                    words.append(splits[0])
                    if len(splits) > 1:
                        labels.append(splits[-1].replace("\n", ""))
                    else:
                        # Examples could have no label for mode = "test"
                        labels.append("O")
            if words:
                lines.append({"words":words,"labels":labels})
        return lines

    @classmethod
    def _read_json(self,input_file):
        lines = []
        with open(input_file,'r') as f:
            for line in f:
                line = json.loads(line.strip())
                text = line['text']
                label_entities = line.get('label',None)
                words = list(text)
                labels = ['O'] * len(words)
                if label_entities is not None:
                    for key,value in label_entities.items():
                        for sub_name,sub_index in value.items():
                            for start_index,end_index in sub_index:
                                assert  ''.join(words[start_index:end_index+1]) == sub_name
                                if start_index == end_index:
                                    labels[start_index] = 'S-'+key
                                else:
                                    labels[start_index] = 'B-'+key
                                    labels[start_index+1:end_index+1] = ['I-'+key]*(len(sub_name)-1)
                lines.append({"words": words, "labels": labels})
        return lines

In [7]:
dp = DataProcessor()

In [3]:
def get_entity_bios(seq):
    """Gets entities from sequence.
    note: BIOS
    Args:
        seq (list): sequence of labels.
    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).
    Example:
        # >>> seq = ['B-PER', 'I-PER', 'O', 'S-LOC']
        # >>> get_entity_bios(seq)
        [['PER', 0,1], ['LOC', 3, 3]]
    """
    chunks = []
    chunk = [-1, -1, -1]
    for indx, tag in enumerate(seq):
        if tag.startswith("S-"):
            if chunk[2] != -1:
                chunks.append(chunk)
            chunk = [-1, -1, -1]
            chunk[1] = indx
            chunk[2] = indx
            chunk[0] = tag.split('-')[1]
            chunks.append(chunk)
            chunk = (-1, -1, -1)
        if tag.startswith("B-"):
            if chunk[2] != -1:
                chunks.append(chunk)
            chunk = [-1, -1, -1]
            chunk[1] = indx
            chunk[0] = tag.split('-')[1]
        elif tag.startswith('I-') and chunk[1] != -1:
            _type = tag.split('-')[1]
            if _type == chunk[0]:
                chunk[2] = indx
            if indx == len(seq) - 1:
                chunks.append(chunk)
        else:
            if chunk[2] != -1:
                chunks.append(chunk)
            chunk = [-1, -1, -1]
    return chunks

def get_entity_bio(seq):
    """Gets entities from sequence.
    note: BIO
    Args:
        seq (list): sequence of labels.
    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).
    Example:
        seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        get_entity_bio(seq)
        #output
        [['PER', 0,1], ['LOC', 3, 3]]
    """
    chunks = []
    chunk = [-1, -1, -1]
    for indx, tag in enumerate(seq):
        if tag.startswith("B-"):
            if chunk[2] != -1:
                chunks.append(chunk)
            chunk = [-1, -1, -1]
            chunk[1] = indx
            chunk[0] = tag.split('-')[1]
            chunk[2] = indx
            if indx == len(seq) - 1:
                chunks.append(chunk)
        elif tag.startswith('I-') and chunk[1] != -1:
            _type = tag.split('-')[1]
            if _type == chunk[0]:
                chunk[2] = indx

            if indx == len(seq) - 1:
                chunks.append(chunk)
        else:
            if chunk[2] != -1:
                chunks.append(chunk)
            chunk = [-1, -1, -1]
    return chunks

In [9]:
!ls -lht /home/chendian/BERT-NER-Pytorch/datasets/

total 20K
drwxrwxr-x 2 chendian chendian 4.0K May  7 05:25 msra
drwxrwxr-x 2 chendian chendian 4.0K May  6 20:27 onto4
drwxrwxr-x 2 chendian chendian 4.0K May  6 20:11 resume
drwxrwxr-x 2 chendian chendian 4.0K May  6 15:56 cner
drwxrwxr-x 2 chendian chendian 4.0K May  6 15:56 cluener


In [21]:
!ls -lht /data/chendian/bert_ner_output/onto4_output/macbert

total 40K
-rw-rw-r-- 1 chendian chendian 7.3K May  6 20:31 bert-onto4-2022-05-06-20:30:58.log
-rw-rw-r-- 1 chendian chendian  29K May  6 20:27 bert-onto4-2022-05-06-20:27:13.log


In [None]:
import json
from tqdm import tqdm
from pprint import pprint

is_tagger_model = False
boundary_error_examples = []

for task_name in ['msra', 'resume', 'onto4']:
    for bert_type in ['bert', 'macbert']:
        tagger_str = '_tagger' if is_tagger_model else ''
        
        print(f"{bert_type}{tagger_str} model on {task_name} dataset: ")
        
        if task_name == 'msra':
            ge_func = get_entity_bio
        else:
            ge_func = get_entity_bios

        test_answer_path = f'/home/chendian/BERT-NER-Pytorch/datasets/{task_name}/test.ner'
        test_answers = dp._read_text(test_answer_path)  # a list of {'words': [], 'labels': []}

        test_log_path = f'/data/chendian/bert_ner_output/{task_name}_{bert_type}{tagger_str}_output/bert/test_results.txt'
        test_json_path = f'/data/chendian/bert_ner_output/{task_name}_{bert_type}{tagger_str}_output/bert/test_prediction.json'
        
        model_logs = [line.strip() for line in open(test_log_path, 'r')]
        lines = [line.strip() for line in open(test_json_path, 'r')]
        # print(len(lines), len(test_answers))


        hit = 0
        total = 0
        others = 0
        err_type = 0
        mis_report = 0
        err_boundary = 0

        for line_id, line in tqdm(enumerate(lines)):
            res = json.loads(line)  # prediction results
            pred = res['entities']
            answers = test_answers[line_id+1]
            truth_labels = [t.replace('E-', 'I-').replace('M-', 'I-') for t in answers['labels']][:510]
            try:
                assert len(res['tag_seq'].split()) == len(truth_labels)
            except:
                print(">", line_id)
                print(len(res['tag_seq'].split()), len(truth_labels))
                print(res['tag_seq'].split())
                print(truth_labels)
            truth = ge_func(truth_labels)
            # print(line_id, pred, truth)
            # print(res['tag_seq'].split())
            # print(truth_labels)
            ner_offset = [0 for _ in range(len(truth_labels))]
            span2tag = {(l, r): t for t, l, r in truth}
            for truth_ner in truth:
                t, l, r = truth_ner
                l, r = int(l), int(r)
                for idx in range(l, r+1):
                    ner_offset[idx] = 1
            for predict_ner in pred:
                total += 1
                t, l, r = predict_ner
                if predict_ner in truth:
                    hit += 1
                elif span2tag.get((l, r)) is not None:
                    err_type += 1
                elif sum([ner_offset[_i] for _i in range(l, r+1)]) >= 1:
                    err_boundary += 1
                    if bert_type == 'macbert':
                        boundary_error_examples.append({
                            'sample_id': line_id,
                            'task_name': task_name,
                            'predict': predict_ner,
                            'truth': truth,
                            'sentence': ''.join(answers['words']),
                            'predict_entity': [''.join(answers['words'][predict_ner[1]:predict_ner[2]+1])]
                        })
                        # pprint(boundary_error_examples[-1])
                        # raise ValueError()
                elif sum([ner_offset[_i] for _i in range(l, r+1)]) == 0:
                    mis_report += 1
                else:
                    others += 1

        print("	".join(map(str, [total, hit, err_type, err_boundary, mis_report, others])))
        print(model_logs)

In [None]:
test_answers[1]

In [119]:
sent2item = {item['sentence']: item for item in boundary_error_examples}

In [151]:
for model_path in ['msra_origin_macB_prop10_bs8_220505', 'resume_macB_prop10_bs8_220505', 'onto4_macB_prop10_bs8_220505']:
    prediction_file = '/data/chendian/pure_output_dir/' + model_path + 
    print(prediction_file)

/data/chendian/pure_output_dir/msra_origin_macB_prop10_bs8_220505
/data/chendian/pure_output_dir/resume_macB_prop10_bs8_220505
/data/chendian/pure_output_dir/onto4_macB_prop10_bs8_220505


In [155]:
!tail /data/chendian/pure_output_dir/onto4_macB_prop10_bs8_220505/train.log

Extracted 4346 samples from 1 documents, with 7684 NER labels, 47.875 avg input length, 271 max length
Max Length: 271, max NER: 32
Span Candidates' Count: 1858719, Cover: 7684
Evaluating...
Accuracy: 0.998594
Cor: 6159, Pred TOT: 7403, Gold TOT: 7684
P: 0.83196, R: 0.80154, F1: 0.81646
Used time: 107.451279
Total pred entities: 7403
Output predictions to /data/chendian/pure_output_dir/onto4_macB_prop10_bs8_220505/ent_pred_test.json..


In [450]:
from collections import defaultdict

for model_path in [
    'msra_origin_macB_times20_220406', 
    'resume_macB_times20_220430', 
    'onto4_macB_times20_220430'
    'weibo_macB_ratio20_221003']:
# for model_path in ['msra_origin_macB_prop10_bs8_220505', 'resume_macB_prop10_bs8_220505', 'onto4_macB_prop10_bs8_220505']:
    hit = 0
    total = 0
    others = 0
    err_type = 0
    mis_report = 0
    err_boundary = 0

    prediction_file = '/data/chendian/pure_output_dir/' + model_path + '/ent_pred_test.json'
    ret = json.load(open(prediction_file, 'r'))
    # print(ret.keys())
    sentences = ret['sentences']
    ner = ret['ner']
    sentences = ret['sentences']
    predicted_ner = ret['predicted_ner']
    offset = 0
    for pred, truth, sent in zip(predicted_ner + [''] * 5, ner, sentences):
        # print(pred)
        # print(truth)
        # print(sent)
        text = ''.join(sent)
        if text in sent2item:
            print(sent2item[text]['task_name'], '-', sent2item[text]['sample_id'])
            print(text)
            _t, _l, _r = sent2item[text]['predict']  # err_span
            truth_no_off = sent2item[text]['truth']
            crf_prediction = sent2item[text]['predict'] + sent2item[text]['predict_entity']
            print("MacBERT-CRF:", crf_prediction)
            for _a, _b, _c in zip(truth_no_off, pred, truth):
                if set(range(int(_l), int(_r)+1)).intersection(set(range(int(_a[1]), int(_a[2])+1))):
                    _b = [_b[2], int(_b[0])-offset, int(_b[1])-offset]
                    _b.append(''.join([sent[_ci] for _ci in range(_b[1], _b[2]+1)]))
                    _c = [_c[2], int(_c[0])-offset, int(_c[1])-offset]
                    _c.append(''.join([sent[_ci] for _ci in range(_c[1], _c[2]+1)]))
                    if _b[1:] == crf_prediction[1:]:
                        del sent2item[text]
                        break
                    print("Ours:", _b)
                    print("Truth:", _c)
            print("解释：")
            print(pred)
            # print(truth)
            raise ValueError()
        ner_offset = defaultdict(int)
        span2tag = {(l, r): t for l, r, t in truth}
        for truth_ner in truth:
            l, r, t = truth_ner
            l, r = int(l), int(r)
            for idx in range(l, r+1):
                ner_offset[idx] = 1
        for predict_ner in pred:
            total += 1
            l, r, t = predict_ner
            if predict_ner in truth:
                hit += 1
            elif span2tag.get((l, r)) is not None:
                err_type += 1
            elif sum([ner_offset[_i] for _i in range(l, r+1)]) >= 1:
                err_boundary += 1
            elif sum([ner_offset[_i] for _i in range(l, r+1)]) == 0:
                mis_report += 1
            else:
                others += 1
        offset += len(sent)

    # print(f"MacBert-Ratio20 model on {model_path.split('_')[0]} dataset: ")
    print(f"MacBert-Prop model on {model_path.split('_')[0]} dataset: ")
    print("	".join(map(str, [total, hit, err_type, err_boundary, mis_report, others])))

MacBert-Prop model on msra dataset: 
6153	5931	15	120	87	0
MacBert-Prop model on resume dataset: 
1633	1577	1	46	9	0
MacBert-Prop model on onto4 dataset: 
7400	6141	162	411	686	0


In [16]:
import json
from collections import defaultdict

for model_path in ['scierc_220612', 'ace04_220612', 'ace05_220612']:
    hit = 0
    total = 0
    others = 0
    err_type = 0
    mis_report = 0
    err_boundary = 0

    prediction_file = '/data/chendian/pure_output_dir/' + model_path + '/ent_pred_test.json'
    for line in open(prediction_file, 'r'):
        ret = json.loads(line)
        ner = ret['ner']
        sentences = ret['sentences']
        predicted_ner = ret['predicted_ner']
        for pred, truth, sent in zip(predicted_ner, ner, sentences):
            # print(pred)
            # print(truth)
            # print(sent)
            ner_offset = defaultdict(int)
            span2tag = {(l, r): t for l, r, t in truth}
            for truth_ner in truth:
                l, r, t = truth_ner
                l, r = int(l), int(r)
                for idx in range(l, r+1):
                    ner_offset[idx] = 1
            for predict_ner in pred:
                total += 1
                l, r, t = predict_ner
                if predict_ner in truth:
                    hit += 1
                elif span2tag.get((l, r)) is not None:
                    err_type += 1
                elif sum([ner_offset[_i] for _i in range(l, r+1)]) >= 1:
                    err_boundary += 1
                elif sum([ner_offset[_i] for _i in range(l, r+1)]) == 0:
                    mis_report += 1
                else:
                    others += 1
    print(f"Span-based Model on {model_path.split('_')[0]} dataset: ")
    print('\t'.join("total, hit, err_type, , , err_boundary, , , mis_report, others".split(', ')))
    errs = total - hit
    print("	".join(map(str, [total, hit, err_type, err_type/errs, err_boundary, err_boundary/errs, mis_report, mis_report/errs, others])))

Span-based Model on scierc dataset: 
total	hit	err_type			err_boundary			mis_report	others
1660	1114	210	0.38461538461538464	178	0.326007326007326	158	0.2893772893772894	0
Span-based Model on ace04 dataset: 
total	hit	err_type			err_boundary			mis_report	others
2952	2542	157	0.3829268292682927	181	0.44146341463414634	72	0.17560975609756097	0
Span-based Model on ace05 dataset: 
total	hit	err_type			err_boundary			mis_report	others
2957	2537	152	0.3619047619047619	145	0.34523809523809523	123	0.29285714285714287	0


In [446]:
del sent2item[text]

In [232]:
text

'请问这位秘书长是：（１）萨利姆（２）穆加贝（３）埃西５．６月６日，孟加拉国、印度、斯里兰卡、泰国四国签署宣言，宣布成立孟、印、斯、泰经济合作组织。'

In [110]:
!gpustat

[1m[37mc21[m  Tue May 10 01:14:03 2022
[36m[0][m [34mNVIDIA GeForce GTX 1080 Ti[m |[31m 29'C[m, [32m  0 %[m | [36m[1m[33m    0[m / [33m11178[m MB |
[36m[1][m [34mNVIDIA GeForce GTX 1080 Ti[m |[31m 33'C[m, [32m  0 %[m | [36m[1m[33m    0[m / [33m11178[m MB |
[36m[2][m [34mNVIDIA GeForce GTX 1080 Ti[m |[31m 27'C[m, [32m  0 %[m | [36m[1m[33m    0[m / [33m11178[m MB |
[36m[3][m [34mNVIDIA GeForce GTX 1080 Ti[m |[31m 29'C[m, [32m  0 %[m | [36m[1m[33m    0[m / [33m11178[m MB |
