In [None]:
from keras_bert import load_trained_model_from_checkpoint
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras_contrib.layers import CRF
from keras_bert import AdamWarmup, calc_train_steps

!wget -q https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
!unzip -o chinese_L-12_H-768_A-12.zip

In [None]:
import os

# Files inside the unpacked pretrained BERT directory.
pretrained_path = 'chinese_L-12_H-768_A-12'
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# TF_KERAS must be added to environment variables in order to use TPU
# os.environ['TF_KERAS'] = '1'

# Build the BERT model from the config and checkpoint files.
bert_model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    seq_len=None,
)

# Mark every BERT layer as trainable so it is updated together with the tagging head.
for layer in bert_model.layers:
    layer.trainable = True
   

In [None]:
import codecs
import pandas as pd
from tqdm import tqdm

# Training reviews and their aspect/opinion label rows.
train_reviews = pd.read_csv('Train_reviews.csv')
train_labels = pd.read_csv('Train_labels.csv')

# Vocabulary: each stripped line of vocab.txt becomes a token, indexed in file order.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        vocab_token = raw_line.strip()
        # The current dictionary size is the index assigned to this token.
        token_dict[vocab_token] = len(token_dict)


In [None]:
# Character-level BIO tagging of every training review.
# Tag set: O, B-opinion, I-opinion, B-aspect, I-aspect
# Every entry appended to output_res is [sentence id, tag, character].
output_res = []

# Group the label rows by review id once instead of filtering the whole frame per review.
labels_by_id = {review_id: rows for review_id, rows in train_labels.groupby('id')}

for id_, to_cut in enumerate(tqdm(train_reviews['Reviews'])):
    sentence_id = id_ + 1  # labels are matched against the 1-based row number of the review

    aspect_positions, opinion_positions = set(), set()
    # First character offset of every labelled term. Stored as int so the lookup below
    # does not depend on whether pandas parsed the offset columns as text or as numbers.
    span_starts = set()

    temp_labels = labels_by_id.get(sentence_id)
    if temp_labels is not None:
        for _, temp in temp_labels.iterrows():
            # '_' marks a label row without an aspect (or opinion) term
            if temp['AspectTerms'] != '_':
                a_start, a_end = int(temp['A_start']), int(temp['A_end'])
                span_starts.add(a_start)
                aspect_positions.update(range(a_start, a_end))
            if temp['OpinionTerms'] != '_':
                o_start, o_end = int(temp['O_start']), int(temp['O_end'])
                span_starts.add(o_start)
                opinion_positions.update(range(o_start, o_end))

    last_tag = 'O'
    for pivot, char in enumerate(to_cut):
        tag = 'O'
        if pivot in aspect_positions:
            # Continue the running aspect unless a new labelled term starts exactly here.
            continues_aspect = last_tag in ('B-aspect', 'I-aspect') and pivot not in span_starts
            tag = 'I-aspect' if continues_aspect else 'B-aspect'
        if pivot in opinion_positions:
            # Checked second, so the opinion tag wins when a character is in both kinds of span.
            continues_opinion = last_tag in ('B-opinion', 'I-opinion') and pivot not in span_starts
            tag = 'I-opinion' if continues_opinion else 'B-opinion'
        output_res.append([sentence_id, tag, char])
        last_tag = tag

In [None]:
import pandas as pd

# One row per character: the sentence it belongs to, its BIO tag, and the character itself.
tagged_columns = ['Sentence #', 'Tag', 'Word']
data_train = pd.DataFrame(output_res)
data_train.columns = tagged_columns
data_train

In [None]:
# Two variable-length inputs fed to BERT (BERT takes the place of a plain embedding layer).
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])

# Per-position tanh projection followed by a CRF over the 6 tag classes.
x = TimeDistributed(Dense(50, activation="tanh"))(x)
crf = CRF(6, sparse_target=False)
x = crf(x)
model = Model([x1_in, x2_in], x)

# The warmup/decay schedule has to span the whole training run, so these values mirror
# the training cell: 2600 training rows, batch_size=16, epochs=4. Passing fewer epochs
# here than model.fit actually runs makes the decay finish early, leaving the remaining
# epochs at the optimizer's minimum learning rate.
TRAIN_EXAMPLES = 2600
TRAIN_BATCH_SIZE = 16
TRAIN_EPOCHS = 4

decay_steps, warmup_steps = calc_train_steps(
    TRAIN_EXAMPLES,
    batch_size=TRAIN_BATCH_SIZE,
    epochs=TRAIN_EPOCHS,
)

LR = 5e-5

model.compile(
    optimizer=AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
    loss=crf.loss_function,
    metrics=[crf.accuracy],
)
model.summary()

In [None]:
# @title Convert Data to Array
import os
import numpy as np
from tqdm import tqdm
from keras_bert import Tokenizer

# Length passed to the tokenizer as max_len and used to pad the label sequences.
SEQ_LEN = 75

# One-hot label vector per tag; the position of the 1 is the tag's class index.
TAG_NAMES = ['[PAD]', 'O', 'B-aspect', 'I-aspect', 'B-opinion', 'I-opinion']
tag2idx = {
    tag_name: [1 if column == class_index else 0 for column in range(len(TAG_NAMES))]
    for class_index, tag_name in enumerate(TAG_NAMES)
}

class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def _tokenize(self, text):
        """Map each character of `text` to a vocabulary token, one token per character."""
        tokens = []
        for char in text:
            if char in self._token_dict:
                token = char
            elif self._is_space(char):
                # whitespace is represented by the untrained [unused1] token
                token = '[unused1]'
            else:
                # every other character becomes [UNK]
                token = '[UNK]'
            tokens.append(token)
        return tokens


# Shared tokenizer instance built on the BERT vocabulary.
tokenizer = OurTokenizer(token_dict)

def load_data():
    """Encode every training review and build its padded one-hot label sequence.

    Reads the module-level `train_reviews`, `data_train`, `tokenizer`, `tag2idx`
    and `SEQ_LEN`.

    Returns:
        A pair ``([token_ids, segment_ids], labels)``. ``token_ids`` stacks the ids
        returned by ``tokenizer.encode(text, max_len=SEQ_LEN)``, ``segment_ids`` is an
        all-zero array of the same shape, and ``labels`` holds SEQ_LEN one-hot
        vectors per review: an 'O' label for the leading slot, one label per
        character, then '[PAD]' up to SEQ_LEN.
    """
    leading_label = tag2idx['O']      # label of the slot in front of the first character
    pad_label = tag2idx['[PAD]']

    # Collect the tags of each sentence in one pass instead of filtering
    # data_train once per review.
    tags_by_sentence = {}
    for sentence_id, tag in zip(data_train['Sentence #'], data_train['Tag']):
        tags_by_sentence.setdefault(sentence_id, []).append(tag)

    indices, seqs = [], []
    for row, text in enumerate(train_reviews['Reviews']):
        ids, _ = tokenizer.encode(text, max_len=SEQ_LEN)
        # sentence ids are the 1-based row numbers of the reviews
        seq = [leading_label] + [tag2idx[k] for k in tags_by_sentence.get(row + 1, [])]
        # Long reviews: the ids are capped at SEQ_LEN by the tokenizer, so cap the labels
        # too. Keeping at most SEQ_LEN-1 of them leaves the final slot as padding, exactly
        # like the slot right after the last character of a short review.
        seq = seq[:SEQ_LEN - 1]
        seq += [pad_label] * (SEQ_LEN - len(seq))
        indices.append(ids)
        seqs.append(seq)

    indices = np.array(indices)
    return [indices, np.zeros_like(indices)], np.array(seqs)
  
def load_test_data():
    """Encode every review in the module-level `test_reviews` frame.

    Returns:
        Array stacking the token ids from ``tokenizer.encode(text, max_len=SEQ_LEN)``,
        one row per review. Segment ids are not returned.
    """
    encoded_ids = [
        tokenizer.encode(review_text, max_len=SEQ_LEN)[0]
        for review_text in test_reviews['Reviews']
    ]
    return np.array(encoded_ids)
  
# Read the unlabeled test reviews, then encode the training and the test sets.
test_reviews = pd.read_csv('Test_reviews.csv')
(train_x, train_y), test_x = load_data(), load_test_data()

In [None]:
# Validation mode: the first 2600 encoded reviews are used for training,
# the remaining ones are held out for evaluation.
x_train, x_test = train_x[0][:2600, :], train_x[0][2600:, :]
y_train, y_test = train_y[:2600, :, :], train_y[2600:, :, :]

# Submission mode: train on every review and predict the test set instead.
# x_train=train_x[0]
# y_train=train_y
# x_test=test_x

In [None]:
# Train on the token ids; the second model input is an all-zero array of the same shape.
model.fit(
    x=[x_train, np.zeros(x_train.shape)],
    y=y_train,
    batch_size=16,
    epochs=4,
)

In [None]:
# Per-position class scores for the evaluation inputs (second input is all zeros again).
pre = model.predict(x=[x_test, np.zeros(x_test.shape)])

In [None]:
# Sentence-level exact match: a sequence only counts when the argmax class agrees
# with the reference at every position.
pre_idx0 = [[np.argmax(scores, axis=0) for scores in sentence] for sentence in pre]
pre_idx_test = [[np.argmax(one_hot, axis=0) for one_hot in sentence] for sentence in y_test]

cnt = len(pre_idx0)
same = sum(1 for row in range(cnt) if pre_idx0[row] == pre_idx_test[row])
print(same*1.0/cnt)

In [None]:
# Class index -> tag name (the inverse of the one-hot positions used for the labels).
idx2tag = {
    0: '[PAD]',
    1: 'O',
    2: 'B-aspect',
    3: 'I-aspect',
    4: 'B-opinion',
    5: 'I-opinion',
}

# Most likely class at every position, then its tag name.
pre_idx0 = [[np.argmax(k, axis=0) for k in i] for i in pre]
pre_idx = [[idx2tag[j] for j in i] for i in pre_idx0]

# `pre` was predicted from the held-out tail of the training set (encoded rows 2600
# onward), so the characters and ids must come from those same rows of train_reviews.
# When predicting on Test_reviews.csv instead, use `test_data = test_reviews`.
HOLDOUT_START = 2600
test_data = train_reviews.iloc[HOLDOUT_START:].reset_index(drop=True)
if 'id' not in test_data.columns:
    # Fall back to the 1-based row number, the id convention used when tagging the reviews.
    test_data = test_data.assign(
        id=range(HOLDOUT_START + 1, HOLDOUT_START + 1 + len(test_data))
    )
assert len(test_data) == len(pre_idx), (
    'predictions and reviews are misaligned: %d predictions vs %d reviews'
    % (len(pre_idx), len(test_data))
)

# Each review as a list of single characters.
test_review_list = [list(review) for review in test_data['Reviews']]

# One tag per character of every review.
p_all_tags_new = []
for sentence_tags, chars in zip(pre_idx, test_review_list):
    # Position 0 is the extra leading slot, so character k sits at position k + 1.
    char_tags = ['O' if t == '[PAD]' else t for t in sentence_tags[1:len(chars) + 1]]
    # Reviews longer than the predicted sequence get 'O' for the uncovered tail, so that
    # every character has a tag and later per-character indexing cannot run off the end.
    char_tags += ['O'] * (len(chars) - len(char_tags))
    p_all_tags_new.append(char_tags)

In [None]:
def rtn_AO_list(s_id,l_word,l_tag):
    """Extract [sentence id, aspect, opinion] rows from one tagged clause.

    A term starts at a ``B-aspect`` / ``B-opinion`` tag. If the next tag is the
    matching ``I-`` tag, the term absorbs that character and every directly
    following ``I-*`` character; otherwise it is the single ``B`` character.
    ``I-*`` tags that do not continue a term started by a ``B`` tag are ignored.

    Terms are then paired in order of appearance: an aspect and an opinion that
    are neighbours in the term sequence form one row; a term whose neighbour is
    of the same kind (or that has no neighbour left) is emitted alone with ``'_'``
    on the missing side.

    Args:
        s_id: Sentence id copied into every returned row.
        l_word: Characters (strings) of the clause.
        l_tag: Tag of each character; positions beyond the shorter of the two
            lists are ignored.

    Returns:
        List of ``[s_id, aspect_phrase, opinion_phrase]`` lists.
    """
    n = min(len(l_word), len(l_tag))
    inside_tags = ('I-aspect', 'I-opinion')

    # 1) Collect terms as (kind, phrase) pairs so kind and text can never drift apart.
    terms = []
    i = 0
    while i < n:
        tag = l_tag[i]
        if tag == 'B-aspect':
            kind, matching_inside = 'A', 'I-aspect'
        elif tag == 'B-opinion':
            kind, matching_inside = 'O', 'I-opinion'
        else:
            # 'O', or an I-tag without a preceding B-tag: not part of any term
            i += 1
            continue
        j = i + 1
        if j < n and l_tag[j] == matching_inside:
            while j < n and l_tag[j] in inside_tags:
                j += 1
        terms.append((kind, ''.join(l_word[i:j])))
        i = j

    # 2) Pair neighbouring aspect/opinion terms.
    def _alone(term):
        """Row for a term that found no partner."""
        term_kind, term_phrase = term
        if term_kind == 'A':
            return [s_id, term_phrase, '_']
        return [s_id, '_', term_phrase]

    res = []
    pending = None  # term still waiting for a partner of the other kind
    for kind, phrase in terms:
        if pending is None:
            pending = (kind, phrase)
        elif pending[0] == kind:
            # Two terms of the same kind in a row: the earlier one stays unpaired.
            res.append(_alone(pending))
            pending = (kind, phrase)
        elif kind == 'O':
            res.append([s_id, pending[1], phrase])
            pending = None
        else:
            res.append([s_id, phrase, pending[1]])
            pending = None
    if pending is not None:
        res.append(_alone(pending))
    return res

In [None]:
import re

# Characters that end a clause; aspect/opinion pairs are only formed inside one clause.
# Written as a raw-string character class so '.' and '?' need no backslash escapes.
CLAUSE_DELIMITER = re.compile(r'[,，。、.!！?？；; ]')

res = []
sentence_ids = list(test_data['id'])  # built once, not once per sentence
for sentence_index in tqdm(range(len(test_review_list))):
    s_id = sentence_ids[sentence_index]
    sentence_tags = p_all_tags_new[sentence_index]
    l_word, l_tag = [], []
    for word_index, this_word in enumerate(test_review_list[sentence_index]):
        if CLAUSE_DELIMITER.search(this_word) is None:
            l_word.append(this_word)
            # A character without a predicted tag (review longer than the tag list)
            # is treated as 'O' instead of raising IndexError.
            l_tag.append(sentence_tags[word_index] if word_index < len(sentence_tags) else 'O')
        else:
            # Delimiter reached: extract pairs from the finished clause and start a new one.
            res += rtn_AO_list(s_id, l_word, l_tag)
            l_word, l_tag = [], []
    # The last clause of the sentence has no trailing delimiter.
    res += rtn_AO_list(s_id, l_word, l_tag)

In [None]:
# Predicted rows; columns 0, 1, 2 hold the sentence id, aspect phrase and opinion phrase.
df_res = pd.DataFrame(res)


def _as_triple(row):
    """Return the first three columns of a row as one tuple."""
    return (row[0], row[1], row[2])


df_res['tuple'] = df_res.apply(_as_triple, axis=1)
df_res

In [None]:
# Reference (id, aspect, opinion) triples for the held-out reviews (ids above 2600).
df_true = pd.read_csv('Train_labels.csv')
# .copy() so that adding a column below does not write into a slice of the full frame
df_true = df_true.loc[df_true['id'] > 2600, ['id', 'AspectTerms', 'OpinionTerms']].copy()
# Columns are addressed by name; positional lookups on a name-labelled row are deprecated.
df_true['tuple'] = list(zip(df_true['id'], df_true['AspectTerms'], df_true['OpinionTerms']))

# Build the lookup set once instead of once per predicted triple.
true_tuples = set(df_true['tuple'])

# Number of predicted triples that also appear among the reference triples.
# NOTE(review): a triple predicted more than once is counted each time — confirm this
# matches the intended metric.
c = sum(1 for predicted in df_res['tuple'] if predicted in true_tuples)

precision = c/len(df_res) if len(df_res) else 0.0
recall = c/len(df_true) if len(df_true) else 0.0
print(c)
print(precision)
print(recall)
# F1; defined as 0.0 when nothing matched, so the cell does not divide by zero.
2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.0