In [None]:
import os
import numpy as np
import random
import mindspore
import mindspore.nn as nn
import mindspore.dataset as ds
from mindnlp.modules import CRF
from tqdm import tqdm

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    mindspore.set_seed(seed)
    mindspore.dataset.config.set_seed(seed)

# 读取文本，返回词典，索引表，句子，标签
def read_data(path):
    sentences = []
    labels = []
    with open(path, 'r', encoding='utf-8') as f:
        sent = []
        label = []
        for line in f:
            parts = line.split()
            if len(parts) == 0:
                if len(sent) != 0:
                    sentences.append(sent)
                    labels.append(label)
                sent = []
                label = []
            else:
                sent.append(parts[0])
                label.append(parts[-1])
                
    return (sentences, labels)

# 返回词典映射表、词数字典
def get_dict(sentences):
    max_number = 1
    char_number_dict={}

    id_indexs={}
    id_indexs['paddding']=0
    id_indexs['unknow']=1
    
    for sent in sentences:
        for c in sent:
            if c not in char_number_dict:
                char_number_dict[c]=0
            char_number_dict[c]+=1
                
    for c,n in char_number_dict.items():
        if n>=max_number:
            id_indexs[c]=len(id_indexs)
            
    return char_number_dict, id_indexs

def get_entity(decode):
    starting=False
    p_ans=[]
    for i,label in enumerate(decode):
        if label > 0:
            if label%2==1:
                starting=True
                p_ans.append(([i],labels_text_mp[label//2]))
            elif starting:
                p_ans[-1][0].append(i)
        else:
            starting=False
    return p_ans

# 处理数据 
class Feature(object):
    def __init__(self,sent, label):
        self.or_text = sent  #文本原句
        self.seq_length = len(sent) if len(sent) < Max_Len else Max_Len
        self.labels = [LABEL_MAP[c] for c in label][:Max_Len] + [0]*(Max_Len - len(label)) # 标签
        self.token_ids = self.tokenizer(sent)[:Max_Len]  + [0]*(Max_Len - len(sent)) #文本token
        self.entity = get_entity(self.labels)
        
    def tokenizer(self, sent):
        token_ids = []
        for c in sent:
            if c in id_indexs.keys():
                token_ids.append(id_indexs[c])
            else:
                token_ids.append(id_indexs['unknow'])
        return token_ids

class GetDatasetGenerator:
    def __init__(self, data):
        self.features = [Feature(data[0][i], data[1][i]) for i in range(len(data[0]))]
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, index):
        feature = self.features[index]
        token_ids = feature.token_ids
        labels = feature.labels
        
        return (token_ids, feature.seq_length, labels)
    
def debug_dataset(dataset):
    dataset = dataset.batch(batch_size=16)
    for data in dataset.create_dict_iterator():
        print(data["data"].shape, data["label"].shape)
        break
        
def get_metric(P_ans, valid):
    predict_score = 0 # 预测正确个数
    predict_number = 0 # 预测结果个数
    totol_number = 0 # 标签个数
    for i in range(len(P_ans)):
        predict_number += len(P_ans[i])
        totol_number += len(valid.features[i].entity)
        pred_true = [x for x in valid.features[i].entity if x in P_ans[i]]
        predict_score += len(pred_true)
    P = predict_score/predict_number if predict_number>0 else 0.
    R = predict_score/totol_number if totol_number>0 else 0.
    f1=(2*P*R)/(P+R) if (P+R)>0 else 0.
    print(f'f1 = {f1}， P(准确率) = {P}, R(召回率) = {R}')

In [3]:
class LSTM_CRF(nn.Module):
    def __init__(self,embedding_num,embedding_dim,num_labels):
        super().__init__()
        self.num_labels = num_labels
        self.embedding_num = embedding_num
        self.embedding_dim = embedding_dim
        self.model_name = 'LSTM_CRF'
        self.em = nn.Embedding(vocab_size=self.embedding_num,embedding_size=self.embedding_dim, padding_idx=0)
        self.bilstm = nn.LSTM(embedding_dim, embedding_dim//2, batch_first=True, bidirectional=True)
        self.crf_hidden_fc = nn.Dense(embedding_dim, self.num_labels)
        self.crf = CRF(self.num_labels, batch_first=True, reduction='mean')

    def construct(self, ids, seq_length=None, labels=None):
        seq=self.em(ids)
        lstm_feat, _ = self.bilstm(seq)
        emissions = self.crf_hidden_fc(lstm_feat)
        loss_crf = self.crf(emissions, tags=labels, seq_length=seq_length)
        return loss_crf

In [4]:
seed = 42
seed_everything(seed)
Max_Len = 113
Entity = ['PER', 'LOC', 'ORG', 'MISC']
labels_text_mp={k:v for k,v in enumerate(Entity)}
LABEL_MAP = {'O': 0}
for i, e in enumerate(Entity):
    LABEL_MAP[f'B-{e}'] = 2 * (i+1) - 1
    LABEL_MAP[f'I-{e}'] = 2 * (i+1)

In [5]:
# !wget https://data.deepai.org/conll2003.zip --no-check-certificate -O conll2003.zip
# !unzip -o conll2003.zip -d conll2003

In [6]:
train = read_data('conll2003/train.txt')
test = read_data('conll2003/test.txt')
dev = read_data('conll2003/valid.txt')
char_number_dict, id_indexs = get_dict(train[0])

In [7]:
Epoch = 2
batch_size = 16
dataset_generator = GetDatasetGenerator(train)
dataset = ds.GeneratorDataset(dataset_generator, ["data", "length", "label"], shuffle=False)
dataset_train = dataset.batch(batch_size=batch_size)

In [8]:
model = LSTM_CRF(embedding_num=len(id_indexs), embedding_dim=256, num_labels=len(Entity)*2+1)
optimizer = nn.Adam(model.trainable_params(), learning_rate=0.001)
grad_fn = mindspore.value_and_grad(model, None, optimizer.parameters)

In [9]:
@mindspore.jit
def train_step(token_ids, seq_length, labels):
    loss, grads = grad_fn(token_ids, seq_length, labels)
    optimizer(grads)
    return loss

In [10]:
# 训练
size = dataset_train.get_dataset_size()
steps = size
tloss = []
for epoch in range(Epoch):
    model.set_train()
    with tqdm(total=steps) as t:
        for batch, (token_ids, seq_length, labels) in enumerate(dataset_train.create_tuple_iterator()):
            loss = train_step(token_ids, seq_length, labels)
            tloss.append(loss.asnumpy())
            t.set_postfix(loss=np.array(tloss).mean())
            t.update(1)

100%|█████████████████████████████████████████| 937/937 [04:59<00:00,  3.13it/s, loss=5.65]
100%|█████████████████████████████████████████| 937/937 [04:54<00:00,  3.18it/s, loss=3.71]


In [11]:
# 预测：train
dataset_generator = GetDatasetGenerator(train)
dataset = ds.GeneratorDataset(dataset_generator, ["data", "length", "label"], shuffle=False)
dataset_train = dataset.batch(batch_size=batch_size)

size = dataset_train.get_dataset_size()
steps = size
decodes=[]
model.set_train(False)
with tqdm(total=steps) as t:
    for batch, (token_ids, seq_length, labels) in enumerate(dataset_train.create_tuple_iterator()):
        score, history = model(token_ids, seq_length=seq_length)
        best_tags = CRF.post_decode(score, history, seq_length)
        decode = [[y.asnumpy().item() for y in x] for x in best_tags]
        decodes.extend(list(decode))
        t.update(1)
        
v_pred = [get_entity(x) for x in decodes]
get_metric(v_pred, dataset_generator)

100%|████████████████████████████████████████████████████| 937/937 [06:18<00:00,  2.48it/s]


f1 = 0.8882163554410817， P(准确率) = 0.8960634013251917, R(召回率) = 0.8805055534278055


In [12]:
# 预测：dev
dev_dataset_generator = GetDatasetGenerator(dev)
dataset_dev = ds.GeneratorDataset(dev_dataset_generator, ["data", "length", "label"], shuffle=False)
dataset_dev = dataset_dev.batch(batch_size=batch_size)

size = dataset_dev.get_dataset_size()
steps = size
decodes=[]
model.set_train(False)
with tqdm(total=steps) as t:
    for batch, (token_ids, seq_length, labels) in enumerate(dataset_dev.create_tuple_iterator()):
        score, history = model(token_ids, seq_length=seq_length)
        best_tags = model.crf.post_decode(score, history, seq_length)
        decode = [[y.asnumpy().item() for y in x] for x in best_tags]
        decodes.extend(list(decode))
        t.update(1)
v_pred = [get_entity(x) for x in decodes]
get_metric(v_pred, dev_dataset_generator)

100%|████████████████████████████████████████████████████| 217/217 [01:31<00:00,  2.37it/s]

f1 = 0.7575705437026841， P(准确率) = 0.7749032030975008, R(召回率) = 0.7409962975429149





In [13]:
# 预测：test
test_dataset_generator = GetDatasetGenerator(test)
dataset_test = ds.GeneratorDataset(test_dataset_generator, ["data", "length", "label"], shuffle=False)
dataset_test = dataset_test.batch(batch_size=batch_size)

size = dataset_test.get_dataset_size()
steps = size
decodes_pred=[]
model.set_train(False)
with tqdm(total=steps) as t:
    for batch, (token_ids, seq_length, labels) in enumerate(dataset_test.create_tuple_iterator()):
        score, history = model(token_ids, seq_length=seq_length)
        best_tags = model.crf.post_decode(score, history, seq_length)
        decode = [[y.asnumpy().item() for y in x] for x in best_tags]
        decodes_pred.extend(list(decode))
        t.update(1)
        

pred = [get_entity(x) for x in decodes_pred]
get_metric(pred, test_dataset_generator)

100%|████████████████████████████████████████████████████| 231/231 [01:33<00:00,  2.47it/s]

f1 = 0.6655755591925804， P(准确率) = 0.6838565022421524, R(召回率) = 0.6482465462274176



