In [33]:
import gzip
import itertools


def get_ner_reader(data):
    fin = gzip.open(data, 'rt') if data.endswith('.gz') else open(data, 'rt',encoding='utf-8')
    for is_divider, lines in itertools.groupby(fin, _is_divider):
        if is_divider:
            continue
        lines = [line.strip().replace('\u200d', '').replace('\u200c', '') for line in lines]

        metadata = lines[0].strip() if lines[0].strip().startswith('# id') else None
        fields = [line.split() for line in lines if not line.startswith('# id')]
        fields = [list(field) for field in zip(*fields)]


        yield fields, metadata


def _assign_ner_tags(ner_tag, rep_):
    ner_tags_rep = []
    token_masks = []

    sub_token_len = len(rep_)
    token_masks.extend([True] * sub_token_len)
    if ner_tag[0] == 'B':
        in_tag = 'I' + ner_tag[1:]

        ner_tags_rep.append(ner_tag)
        ner_tags_rep.extend([in_tag] * (sub_token_len - 1))
    else:
        ner_tags_rep.extend([ner_tag] * sub_token_len)
    return ner_tags_rep, token_masks


def extract_spans(tags):
    cur_tag = None
    cur_start = None
    gold_spans = {}

    def _save_span(_cur_tag, _cur_start, _cur_id, _gold_spans):
        if _cur_start is None:
            return _gold_spans
        _gold_spans[(_cur_start, _cur_id - 1)] = _cur_tag  # inclusive start & end, accord with conll-coref settings
        return _gold_spans

    # iterate over the tags
    for _id, nt in enumerate(tags):
        indicator = nt[0]
        if indicator == 'B':
            gold_spans = _save_span(cur_tag, cur_start, _id, gold_spans)
            cur_start = _id
            cur_tag = nt[2:]
            pass
        elif indicator == 'I':
            # do nothing
            pass
        elif indicator == 'O':
            gold_spans = _save_span(cur_tag, cur_start, _id, gold_spans)
            cur_tag = 'O'
            cur_start = _id
            pass
    _save_span(cur_tag, cur_start, _id + 1, gold_spans)
    return gold_spans


def _is_divider(line: str) -> bool:
    empty_line = line.strip() == ''
    if empty_line:
        return True

    first_token = line.split()[0]
    if first_token == "-DOCSTART-":# or line.startswith('# id'):  # pylint: disable=simplifiable-if-statement
        return True

    return False


def get_tags(tokens, tags, tokenizer=None, start_token_pattern='▁'):
    token_results, tag_results = [], []
    index = 0
    token_word = []
    tokens = tokenizer.convert_ids_to_tokens(tokens)
    for token, tag in zip(tokens, tags):
        if token == tokenizer.pad_token:
            # index += 1
            continue

        if index == 0:
            tag_results.append(tag)

        elif token.startswith(start_token_pattern) and token != '▁́':
            tag_results.append(tag)

            if tokenizer is not None:
                token_results.append(''.join(token_word).replace(start_token_pattern, ''))
            token_word.clear()

        token_word.append(token)

        index += 1
    token_results.append(''.join(token_word).replace(start_token_pattern, ''))

    return token_results, tag_results

import pandas as pd

def conll_to_df(file = './/training_data//EN-English//en_train.conll'):
    reader=get_ner_reader(file)
    id_list=[]
    tokens_list=[]
    ner_tags_list=[]
    for r in reader:
        id =  r[1].split('\t')[0]
        id_list.append(id)
        tokens = r[0][0]
        tokens_list.append(tokens)
        ner_tags = r[0][-1]
        ner_tags_list.append(ner_tags)
    df = pd.DataFrame({"id":id_list,"tokens":tokens_list,"ner_tags":ner_tags_list})

    return df


from seqeval.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    predict_tags =[]
    true_tags = []
    for idx in range(len(eval_tokenized_datasets)):
        predict_tag = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:1+len(true_predictions[idx])], true_predictions[idx])
        true_tag = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:1+len(true_labels[idx])], true_labels[idx])
        predict_tags.append(predict_tag)
        true_tags.append(true_tag)


    return {
        "precision":precision_score(predict_tags,true_tags),
        "recall": recall_score(predict_tags,true_tags),
        "f1": f1_score(predict_tags,true_tags),
        "accuracy":  accuracy_score(predict_tags,true_tags)
         }

def get_tags_by_wordid(word_ids, tags):
    tags_results = []
    if len(word_ids) != len(tags):
        print (word_ids,tags) 
        assert len(word_ids)==len(tags)
    pre_id=-1
    for word_id,tag_id in zip(word_ids,range(len(tags))):
        if word_id != pre_id:
            tags_results.append(tags[tag_id])
        pre_id = word_id
            
    
    return  tags_results

def load_dict(file='./dict/entity_vocab.tsv'):
    return [  l.strip().split('\t')[0].lower()
            for l in open(file,'r',encoding='utf-8').readlines() 
            if len(l.strip().split('\t')[0])>1 and len(l.strip().split('\t')[0].split())<=15]

def is_substr(query:list(),target:list()):
     #1 是否自子集 2、是否是字串 保证顺序 用集合运算来判断太慢了
    #return set(query) < set(target) and " ".join(query) in " ".join(target)
    return "-"+"-".join(query)+"-" in "-"+"-".join(target)+"-"

def is_contain_entity(entity_vocab:list(),target:list()):
    matched = []
    for entity in entity_vocab:
        if is_substr(entity,target):
            matched.append(entity)
        if len(matched)>3:#限制最多匹配多少个entity
            break
    return matched
            


In [34]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
# model_checkpoint = "distilbert-base-uncased" #英文baseline
# model_checkpoint = "bert-base-uncased"
# model_checkpoint = "roberta-base"
# model_checkpoint = "albert-base-v2"
model_checkpoint = "bert-large-uncased-whole-word-masking"
# model_checkpoint = "roberta-large"
# model_checkpoint = "studio-ousia/luke"
batch_size = 16
#language = 'ZH-Chinese'
language = 'EN-English'

cache_dir = f'/home/rao/disk1/data0/chu/project/AI/.cache/{model_checkpoint}'

In [35]:
from datasets import Dataset
df = conll_to_df(file = f'./data/{language}/{language.split("-")[0].lower()}_train.conll')
import collections
all_ner_list=[]
for idx in df.index: 
    all_ner_list+=df['ner_tags'][idx]
count=collections.Counter(all_ner_list)
#df= origin_df.copy()

In [61]:
# label_list = {0: 'O',1: 'B-CORP',2: 'I-CORP',3: 'B-CW', 4: 'I-CW',5: 'B-GRP',6: 'I-GRP',
#  7: 'B-LOC',8: 'I-LOC',9:'B-PER',10: 'I-PER',11: 'B-PROD',12: 'I-PROD'}
# ner_mapping={ key:id   for id,key in label_list.items()}
import re 
ner_mapping = { key:id   for id,key in enumerate(count.keys())}
label_list = {v:k for k,v in ner_mapping.items()}

'Wikipedia的entit词典'
entity_vocab = list(set(load_dict()))
print ("词典大小",len(entity_vocab))


词典大小 499806


In [84]:
'0 整理词典'
import re
adjust_entity_words=[]
for v in entity_vocab:
    w=re.split('\(|,',v)
    if w:
        adjust_entity_words.append(w[0])
print (len(list(set(adjust_entity_words))))


477633


In [77]:
'1分析标签占比'
print("标签  样本数")
for type in ["PROD","CW","CORP","GRP","PER","LOC"]:
    df[f'{type}_flag'] = df['ner_tags'].apply(lambda x: f"B-{type}" in x or f"I-{type}" in x)
    print (f'{type}_flag',len(df[df[f'{type}_flag']==True]))

标签  样本数
PROD_flag 2527
CW_flag 3190
CORP_flag 2802
GRP_flag 3122
PER_flag 3956
LOC_flag 3183


In [94]:
'2查看entity'
print('实体 总个数')
entity_type_words_dict={}
short_entity = []
for type in ["PROD","CW","CORP","GRP","PER","LOC"]:
    
    sub_df=df[df[f'{type}_flag']==True]
    entity_words=[]
    
    for idx in sub_df.index:
        tmp_entity=[]
        for token,tag in zip(sub_df['tokens'][idx],sub_df['ner_tags'][idx]):
            if tag == f"B-{type}":
                tmp_entity=[token]
            elif tag ==  f"I-{type}":
                tmp_entity.append(token)
            else:
                if tmp_entity:
                    entity_words.append(" ".join(tmp_entity))
                    if len(" ".join(tmp_entity))<5:
                        print (f'--------{idx}--------')
                        print (tmp_entity)
                        print (sub_df["tokens"][idx])
                        print (sub_df["ner_tags"][idx])
                        short_entity.append(" ".join(tmp_entity))
                    tmp_entity=[]
        if tmp_entity:
            entity_words.append(" ".join(tmp_entity))
            if len(" ".join(tmp_entity))<5:
                print (f'--------{idx}--------')
                print (tmp_entity)
                print (sub_df["tokens"][idx])
                print (sub_df["ner_tags"][idx])
                short_entity.append(" ".join(tmp_entity))
            tmp_entity=[]
    print (f'{type}_entity总计',len(entity_words))
    entity_type_words_dict[type]=list(set(entity_words))
    print (f'{type}_entity去重',len(entity_type_words_dict[type]))


实体 总个数
--------12--------
['pie']
['for', 'example', ',', 'in', 'one', 'advertisement', 'a', 'woman', 'wearing', 'a', 'yellow', 'shirt', 'and', 'a', 'pin', 'is', 'juxtaposed', 'with', 'a', 'similarly', 'colored', 'piece', 'of', 'pie', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PROD', 'O']
--------39--------
['wood']
['bamboo', ',', 'like', 'wood', ',', 'is', 'a', 'natural', 'composite', 'material', 'with', 'a', 'high', 'strength', 'to', 'weight', 'ratio', 'useful', 'for', 'structures', '.']
['O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------50--------
['loan']
['a', 'term', 'loan', 'is', 'a', 'monetary', 'loan', 'that', 'is', 'repaid', 'in', 'regular', 'payments', 'over', 'a', 'set', 'period', 'of', 'time', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------73---

--------15110--------
['cake']
['they', 'are', 'usually', 'an', 'alternative', 'to', 'a', 'biscuit', '(', 'cookie', ')', 'or', 'cake', ',', 'and', 'textures', 'range', 'from', 'soft', 'and', 'moist', 'to', 'dry', 'and', 'crisp', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PROD', 'O', 'B-PROD', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------15156--------
['pump']
['hydrostatically', 'lubricated', 'bearings', 'are', 'lubricated', 'by', 'an', 'external', 'pump', 'that', 'maintains', 'a', 'static', 'amount', 'of', 'pressure', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------15221--------
['vada']
['vada', 'meaning']
['B-PROD', 'O']
PROD_entity总计 2912
PROD_entity去重 1749
--------181--------
['saga']
['the', 'series', 'also', 'produced', 'spin', 'off', 'titles', 'including', 'saga', 'and', 'mana', ',', 'and', 'in', 'turn', 'influenced', 'later', 'game', 'developers', 'and', 'studios', '.']
['O

--------1364--------
['dhl']
['it', 'provides', 'service', 'in', '220', 'plus', 'countries', 'and', 'territories', 'all', 'over', 'the', 'world', 'through', 'their', 'parent', 'company', "'s", 'parcel', 'service', 'dhl', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CORP', 'O']
--------1381--------
['spa']
['other', 'variations', 'of', 'this', 'type', 'of', 'business', 'include', 'hair', 'salons', 'and', 'spa', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CORP', 'O']
--------1432--------
['wwe']
['karen', 'was', 'never', 'involved', 'in', 'a', 'storyline', 'of', 'wwe', 'while', 'her', 'then', 'husband', 'kurt', 'angle', 'was', 'under', 'contract', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CORP', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O']
--------1562--------
['nhk']
['he', 'appeared', 'on', 'an', 'nhk', 'drama', 'series', 'in', '2000', ',', 'and', 'has', 'starred', 'in', 'several', '

--------1639--------
['snfu']
['snfu', 'formed', 'in', 'edmonton', 'in', '1981', 'and', 'also', 'later', 'relocated', 'to', 'vancouver', '.']
['B-GRP', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']
--------1882--------
['nme']
['mark', 'beaumont', 'of', 'nme', 'called', 'it', 'a', 'surrealist', ',', 'mystical', 'odyssey', 'of', 'self', 'discovery', ',', 'maximal', 'hedonism', 'and', 'jaws', 'of', 'death', 'revelation', '.']
['B-PER', 'I-PER', 'O', 'B-GRP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------2392--------
['lgbt']
['amaki', 'is', 'a', 'supporter', 'of', 'lgbt', 'rights', '.']
['O', 'O', 'O', 'O', 'O', 'B-GRP', 'O', 'O']
--------3075--------
['korn']
['she', 'also', 'starred', 'in', 'the', 'music', 'video', 'for', 'korn', 'did', 'my', 'time', ',', 'which', 'was', 'used', 'to', 'promote', 'the', 'sequel', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GRP', 'B-CW', 'I-CW', 'I-CW', 'O', 'O', 'O', 'O'

--------14327--------
['pink']
['christina', 'aguilera', ',', 'pink', ',', "lil'", 'kim', 'and', 'mýa', '–', 'lady', 'marmalade']
['B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-CW', 'I-CW']
--------14327--------
['mýa']
['christina', 'aguilera', ',', 'pink', ',', "lil'", 'kim', 'and', 'mýa', '–', 'lady', 'marmalade']
['B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-CW', 'I-CW']
--------14417--------
['eve']
['sara', 'gilbert', ',', 'sharon', 'osbourne', ',', 'sheryl', 'underwood', ',', 'eve', ',', 'carrie', 'ann', 'inaba', ',', 'and', 'marie', 'osmond']
['B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER']
--------14456--------
['jitō']
['the', 'slat', 'dated', 'back', 'to', 'the', 'reign', 'of', 'temmu', 'and', 'jitō', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'B-PER', 'O']
--------14696--------
['yi', 'u']
['seo', 'later', 

In [80]:
'3、查看entity长度分布'
entity_words=[]
for type in ["PROD","CW","CORP","GRP","PER","LOC"]:
    entity_words+=entity_type_words_dict[type]
len_type_entity=[len(w.split()) for w in entity_words]
import numpy as np
print('按实体包含token数')
print('max min median')
print (max(len_type_entity),min(len_type_entity),np.median(len_type_entity))
print ("总计entitywords",len(entity_words))

按实体包含token数
max min median
15 1 2.0
总计entitywords 17182


In [125]:
'4 整理 输出调整后的entity词典保存'
final_adjust_entity_words=[ w for w in list(set(entity_words)|set(adjust_entity_words)) if len(w)>1 and w.strip(" ") not in ['the','an']]
print ('final_adjust_entity_words',len(final_adjust_entity_words))
with open('./dict/adjust_entity.tsv','w',encoding='utf-8') as wf:
    wf.write("\n".join(final_adjust_entity_words))


final_adjust_entity_words 478358


In [130]:
'it' in final_adjust_entity_words

False

In [8]:
#! pip3 install pandarallel 
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=16)

df['entity_tokens'] = df['tokens'].parallel_apply(lambda x: is_contain_entity(entity_vocab,x))


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
df['origin_tokens'] = df['tokens'].copy() #数据集给到的tokens
df['ner_tags_ids'] = df['ner_tags'].apply(lambda x : [ ner_mapping[xx] for xx in x])
for idx in df.index:
    tokens = df['origin_tokens'][idx].copy()
    tokens.append('[SEP]')
    for entity in df['entity_tokens'][idx]:
        if len("".join(entity))>1:
            tokens+=entity
    df['ner_tags_ids'][idx] += [-100]*(len(tokens)-len(df['origin_tokens'][idx]))
    df['tokens'][idx] = tokens
    assert len(df['ner_tags_ids'][idx])==len(df['tokens'][idx])

In [None]:
idx =0
df['tokens'][idx],df['ner_tags'][idx],df['ner_tags_ids'][idx]

In [12]:
#df['sentence'] = df['tokens'].apply(lambda x: " ".join(x))
#df[['sentence']].to_csv(f'./training_data/{language}/{language.split("-")[0].lower()}_train.csv',encoding='utf-8',index=False)

In [13]:

# from datasets import load_dataset
# wnut_datasets = load_dataset("wnut_17")
# df_wnut_train=wnut_datasets['train'].to_pandas()
# df_wnut_val=wnut_datasets['validation'].to_pandas()
# df_wnut_test=wnut_datasets['test'].to_pandas()

# df_wnut = df_wnut_train.append(df_wnut_test).append(df_wnut_test)
# df_wnut['tokens'] = df_wnut['tokens'].apply(lambda x: [str(xx).lower() for xx in x])
# df_wnut['ner_tags_ids'] = df_wnut['ner_tags']
# df_wnut['ner_tags'] = df_wnut['ner_tags'].apply(lambda x: [ label_list[xx] for xx in x])

#将wnut数据集全部加入训练集
#df = df.append(df_wnut)

In [14]:

#只加入CW,PROD的数据
# df_wnut['flag']=df_wnut['ner_tags'].apply(lambda x: len(set(x).intersection(set([ 'B-CW',  'I-CW', 'B-PROD', 'I-PROD'])))>0)
# df_wnut_CWPROD=df_wnut[df_wnut['flag'] == True][['id','tokens','ner_tags','ner_tags_ids']]
# df = df.append(df_wnut_CWPROD)

In [15]:
eval_df = conll_to_df(file = f'.//training_data/{language}/{language.split("-")[0].lower()}_dev.conll')
eval_df['ner_tags_ids'] = eval_df['ner_tags'].apply(lambda x : [ ner_mapping[xx] for xx in x])

eval_df['entity_tokens'] = eval_df['tokens'].parallel_apply(lambda x: is_contain_entity(entity_vocab,x))

eval_df['origin_tokens'] = eval_df['tokens'].copy() #数据集给到的tokens

for idx in eval_df.index:
    tokens = eval_df['origin_tokens'][idx].copy()
    tokens.append('[SEP]')
    for entity in eval_df['entity_tokens'][idx]:
        if len("".join(entity))>1:
            tokens+=entity
    eval_df['ner_tags_ids'][idx] += [-100]*(len(tokens)-len(eval_df['origin_tokens'][idx]))
    eval_df['tokens'][idx] = tokens
    assert len(eval_df['ner_tags_ids'][idx])==len(eval_df['tokens'][idx])

In [16]:
datasets = Dataset.from_pandas(df)
eval_datasets = Dataset.from_pandas(eval_df)

In [17]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [20]:
#show_random_elements(eval_datasets)


In [21]:
from transformers import AutoTokenizer
if 'roberta' in model_checkpoint:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True,cache_dir=cache_dir)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,cache_dir=cache_dir)

In [22]:
label_all_tokens = True

In [23]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True )
    word_id_list=[]

    labels = []
    for i, label in enumerate(examples[f"{task}_tags_ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        word_id_list.append(word_ids)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    tokenized_inputs['word_ids'] = word_id_list
    return tokenized_inputs

In [24]:
tokenize_and_align_labels(datasets[:1])


{'input_ids': [[101, 2010, 2377, 9863, 2950, 13584, 21146, 18933, 3600, 1010, 1043, 4143, 1010, 2406, 27071, 2015, 1998, 1996, 12536, 1038, 1012, 1045, 1012, 1043, 1012, 102, 1996, 12536, 1038, 1012, 1045, 1012, 1043, 1012, 13584, 21146, 18933, 3600, 2406, 1043, 4143, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 1, 2, 2, 2, 0, 1, 1, 0, 3, 4, 4, 0, 1, 2, 2, 2, 2, 2, 2, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'word_ids': [[None, 0, 1, 1, 2, 3, 4, 4, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12, 13, 13, 13, 13, 13, 13, 14, 15, 16, 17, 17, 17, 17, 17, 17, 18, 19, 19, 19, 20, 21, 21, None]]}

In [25]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

In [26]:
eval_tokenized_datasets = eval_datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [1]:
#label_list = {v:k  for k, v in ner_mapping.items()}
import torch
torch.cuda.is_available(), torch.cuda.device_count(),torch.cuda.current_device()

  return torch._C._cuda_getDeviceCount() > 0


RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list.keys()))

In [155]:
import os
os.environ["WANDB_DISABLED"] = "true"
special = 'entity'
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}{special}",
     overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    weight_decay=0.01,
    push_to_hub=False,
    #save_steps=4000,
    save_strategy='epoch'
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [156]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [152]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets,
    eval_dataset=eval_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [59]:
# model_name = 'bert-large-uncased-whole-word-masking'
# num = 7656
# special= "-onlysemdata"
# model_checkpoint = f"{model_name}-finetuned-{task}{special}/checkpoint-{num}"
# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list.keys()))
# args = TrainingArguments(
#     f"{model_name}-finetuned-{task}{special}",
#     evaluation_strategy = "epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=25,
#     weight_decay=0.01,
#     push_to_hub=False,
#     #save_steps=4000,
#     save_strategy='epoch'
# )

# trainer = Trainer(
#     model,
#     args,
#     train_dataset=tokenized_datasets,
#     eval_dataset=eval_tokenized_datasets,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )
trainer.evaluate()

loading configuration file bert-large-uncased-whole-word-masking-finetuned-ner-onlysemdata/checkpoint-7656/config.json
Model config BertConfig {
  "_name_or_path": "bert-large-uncased-whole-word-masking",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
   

{'eval_loss': 0.20742078125476837,
 'eval_precision': 0.9032520325203252,
 'eval_recall': 0.8810467882632831,
 'eval_f1': 0.8920112404656763,
 'eval_accuracy': 0.9758436152817006,
 'eval_runtime': 3.0587,
 'eval_samples_per_second': 261.547,
 'eval_steps_per_second': 16.347}

In [60]:

predictions, labels, _ = trainer.predict(eval_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]


predict_tags =[]
true_tags = []
for idx in range(len(eval_tokenized_datasets)):
    predict_tag = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:-1], true_predictions[idx])
    true_tag = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:-1], true_labels[idx])
    predict_tags.append(predict_tag)
    true_tags.append(true_tag)
    

report = classification_report(
        y_true=true_tags,
        y_pred=predict_tags,
        digits=5)

print (report)

{
        "precision":precision_score(predict_tags,true_tags),
        "recall": recall_score(predict_tags,true_tags),
        "f1": f1_score(predict_tags,true_tags),
        "accuracy":  accuracy_score(predict_tags,true_tags),
    }

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags_ids, id, word_ids, ner_tags, tokens.
***** Running Prediction *****
  Num examples = 800
  Batch size = 16


              precision    recall  f1-score   support

        CORP    0.87766   0.85492   0.86614       193
          CW    0.82659   0.81250   0.81948       176
         GRP    0.84038   0.94211   0.88834       190
         LOC    0.90164   0.94017   0.92050       234
         PER    0.96959   0.98966   0.97952       290
        PROD    0.79592   0.79592   0.79592       147

   micro avg    0.88105   0.90325   0.89201      1230
   macro avg    0.86863   0.88921   0.87832      1230
weighted avg    0.88106   0.90325   0.89158      1230



{'precision': 0.9032520325203252,
 'recall': 0.8810467882632831,
 'f1': 0.8920112404656763,
 'accuracy': 0.9758436152817006}

PER : Person

LOC : Location

GRP : Group

CORP : Corporation

PROD : Product

CW: Creative Work

In [61]:
#wf = open(f'./res/{language.split("-")[0].lower()}.pred.conll','w',encoding='utf-8')
wf = open(f'./res/{language.split("-")[0].lower()}.dev.badcases.connl','w',encoding='utf-8')
total_num = len(eval_tokenized_datasets)
wrong_num = 0
for idx in range(len(eval_tokenized_datasets)):

    #tokens = tokenizer.convert_ids_to_tokens(eval_tokenized_datasets[idx]['input_ids'])[1:-1]
    tokens = eval_tokenized_datasets[idx]['tokens']
    #assert len(tokens) == len( true_predictions[idx])
    #tokens,predict_tags,true_tags= get_tags(tokens, true_predictions[idx],true_labels[idx])
    predict_tags = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:-1], true_predictions[idx])
    true_tags = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:-1], true_labels[idx])
    assert len(true_tags) == len(predict_tags)
    
    
    if true_tags!= predict_tags:
        wrong_num += 1
        wf.write(eval_tokenized_datasets[idx]['id']+'\t'+'domain=dev\n')
        cur = 'start'
        for token,label,pred in zip(tokens,true_tags,predict_tags):
            pre = cur
            cur = pred
            if cur.startswith('I') and pre not in [cur.replace("I","B"),cur,'start']:
                #当前出现I，但是type和前一个B不一致，那么该位置预测纠正为O
                pred = "O"
                cur = "O"
                print ("bad predict:",predict_tags)
            
            wf.write(" _ _ ".join([token,label,pred])+'\n')
            #wf.write(" _ _ ".join([pred])+'\n')
        wf.write('\n')
wf.close()

print (wrong_num,wrong_num/total_num)

bad predict: ['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
bad predict: ['B-CW', 'I-CW', 'O', 'B-CW', 'O', 'B-CW', 'O', 'B-PROD', 'I-PROD', 'O', 'B-PROD', 'I-PROD', 'O', 'B-PROD', 'I-PROD', 'I-PROD', 'O', 'I-PROD', 'I-PROD']
bad predict: ['B-CW', 'I-CW', 'O', 'B-CW', 'O', 'B-CW', 'O', 'B-PROD', 'I-PROD', 'O', 'B-PROD', 'I-PROD', 'O', 'B-PROD', 'I-PROD', 'I-PROD', 'O', 'I-PROD', 'I-PROD']
bad predict: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'I-LOC', 'O']
bad predict: ['O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-CORP', 'O']
bad predict: ['O', 'O', 'O', 'O', 'B-LOC', 'I-GRP', 'O', 'O', 'O', 'O', 'O', 'B-GRP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
bad predict: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-CW', 'I-GRP', 'O', 'O']
bad predict: ['O', 'O', 'O', 'O', 'O', 'I-LO

In [65]:
wf = open(f'./res/{language.split("-")[0].lower()}.pred.conll','w',encoding='utf-8')
dev_wf = open(f'./res/{language.split("-")[0].lower()}.dev.conll','w',encoding='utf-8')
total_num = len(eval_tokenized_datasets)
wrong_num = 0
for idx in range(len(eval_tokenized_datasets)):
    tokens = eval_tokenized_datasets[idx]['tokens']
    predict_tags = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:-1], true_predictions[idx])
    true_tags = get_tags_by_wordid(eval_tokenized_datasets[idx]['word_ids'][1:-1], true_labels[idx])
    assert len(true_tags) == len(predict_tags)
    
    cur = 'start'
    for token,label,pred in zip(tokens,true_tags,predict_tags):
        
        if not pred:
            print (eval_tokenized_datasets[idx])
            break
        pre = cur
        cur = pred
        
        if cur.startswith('I') and pre not in [cur.replace("I","B"),cur,'start']:
            #当前出现I，但是type和前一个B不一致，那么该位置预测纠正为O
            pred = "O"
            cur = "O"
            #print ("bad predict:",predict_tags)
        wf.write(" _ _ ".join([pred])+'\n')
        dev_wf.write(" _ _ ".join([label])+'\n')
    wf.write('\n')
    dev_wf.write('\n')
wf.close()
dev_wf.close()
