In [1]:
class TC_LSTM(object):
    '''
    基于双向LSTM的多视角分析程序
    @2016.11.29
    '''
    def __init__(self,w2v,output_dim=3):
        from sklearn.preprocessing import LabelBinarizer
        self.w2v=w2v
        self.word_dim=w2v.shape[1]
        self.output_dim=output_dim
        self.lb=LabelBinarizer()
        self.lb.fit([0,1,2])
        self.build()
        
    def build(self):
        from keras.layers import Embedding,Input,merge,Merge
        import keras.backend as K
        import theano
        import theano.tensor as T
        from keras import backend as K
        from keras.layers import Dense,Dropout,Lambda,LSTM
        from yuml.keras.layers import MaskMeanLayer,ConnectAspectLayer
        from keras.layers import Dense
        from keras.models import Model
        import keras

        left_input=Input(shape=(None,),dtype='int32',name='left_input')
        right_input=Input(shape=(None,),dtype='int32',name='right_input')
        aspect_input=Input(shape=(None,),dtype='int32',name='aspect_input')   #32x100

        #词向量Embedding
        layer=Embedding(input_dim=self.w2v.shape[0],output_dim=self.word_dim,weights=[self.w2v],mask_zero=True,name='WordEmbedding')
        left_x=layer(left_input) #32x24x100
        right_x=layer(right_input) #32x104x100
        aspect_x=layer(aspect_input) #32x4x100

        aspect_vector=MaskMeanLayer()(aspect_x) #32x1x100

        left_mx=ConnectAspectLayer()([left_x,aspect_vector]) #32x24x200

        right_mx=ConnectAspectLayer()([right_x,aspect_vector]) #32x24x200

        left_vector=LSTM(output_dim=200,dropout_W=0.3,dropout_U=0.3,activation='tanh')(left_mx) #32x100
        right_vector=LSTM(output_dim=200,go_backwards=True,dropout_W=0.3,dropout_U=0.3,activation='tanh')(right_mx) #32x100

        lstm_output=merge(inputs=[left_vector,right_vector],mode='concat')
        x=Dropout(0.5)(lstm_output)
        x=Dense(50,activation='tanh')(x)
        x=Dropout(0.5)(x)
        output=Dense(self.output_dim,activation='softmax')(x)

        model=Model(input=[left_input,right_input,aspect_input],output=output)
        model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
        self.get_lstm_output=K.function(inputs=[left_input,right_input,aspect_input,K.learning_phase()],outputs=[lstm_output,output])
        self.model=model
        

    def train(self,patched_docs,patched_ys,epoch=500,class_weight=None):
        model=self.model
        n_batch=len(patched_docs)
        indexes=np.random.randint(low=0,high=n_batch,size=(epoch,))
        loss,acc,total=0,0,0
        for n,i in enumerate(indexes):
            val=model.train_on_batch(patched_docs[i],patched_ys[i],class_weight=class_weight)
            num=len(patched_docs[i])
            loss,acc,total=loss+val[0]*num,acc+val[1]*num,total+num
            if (n+1)%100==0:
                print('\r%d/%d'%(n+1,epoch),val[0],val[1],end='')
        loss,acc=loss/total,acc/total
        return loss,acc

    def test(self,patched_docs,patched_ys):
        model=self.model
        loss,acc,total=0,0,0
        for x_test,y_test in zip(patched_docs,patched_ys):
            val=model.test_on_batch(x_test,y_test)
            num=len(x_test)
            loss,acc,total=loss+val[0]*num,acc+val[1]*num,total+num
        loss,acc=loss/total,acc/total
        return loss,acc

    def predict(self,patched_docs,patched_ids):
        
        results=[]
        for x_test,ids in zip(patched_docs,patched_ids):
            val=self.model.predict_on_batch(x_test)
            results.extend(zip(val,ids))
        return results

    def fit(self,train_data,valid_data=None,class_weight=None,n_earlystop=30,filename='best.model',
            cnt_in_epoch=100,n_epoch=500,best_type='best_loss'):
        model=self.model
        best_loss=1000
        best_epoch=0
        best_acc=0
        early_stop=0
        n_stop=n_earlystop
        import datetime
        for i in range(n_epoch):
            early_stop+=1
            val=self.train(train_data[0],train_data[1],cnt_in_epoch,class_weight)
            print('\r',i+1,'train',val)
            if valid_data:
                print('testing...',end='')
                val=self.test(valid_data[0],valid_data[1])
                if (val[0]<best_loss and best_type=='best_loss') or (val[1]>best_acc and best_type=='best_acc'):
                    print(best_type)
                    best_loss=val[0]
                    best_epoch=i
                    best_acc=val[1]
                    model.save_weights(filename)
                    early_stop=0
                t=datetime.datetime.now().strftime('%H:%M:%S')
                print('\r',i+1,'test',t,'loss:%f, acc:%f'%val)
                print('-----')
                if early_stop>n_stop:
                    print('early stop')
                    break
        if valid_data:
            print('best:',best_epoch,best_loss,best_acc)
            self.model.load_weights(filename)
        else:
            best_epoch=n_epoch
            best_loss=val[0]
            best_acc=val[1]
            self.model.save_weights(filename)
        return best_epoch,best_loss,best_acc
    
    def get_patched_data(self,valid_data,is_train=True):
        patched_data=[]
        batch_size=32
        opinions=['neg','neu','pos']
        n_batch=int((len(valid_data)-1)/batch_size+1)
        for i in range(n_batch):
            items=valid_data[i*batch_size:(i+1)*batch_size]
            doc=patchMatrix(items.WordIds.tolist())
            wpos=patchMatrix(items.POSID.tolist())
            position=patchMatrix(items.Position.tolist())
            distance=patchMatrix(items.Distances.tolist())
            sentiments=patchMatrix(items.Sentiment.tolist())

            left=patchMatrix([list(a)+list(b) for a,b in zip(items.LeftIds.tolist(),items.ViewIds.tolist())])
            right=patchMatrix([list(a)+list(b) for a,b in zip(items.ViewIds.tolist(),items.RightIds.tolist())])
            if is_train:
                ys=items.Opinion.apply(lambda x:opinions.index(x)).tolist()
            else:
                ys=items.Opinion.apply(lambda x:-1).tolist()
            views=patchMatrix(items.ViewIds.tolist())
            patched_data.append((left,right,views,items.index.tolist(),items.SentenceId.tolist(),ys))
        return patched_data
    

    def get_xs(self,train_df,is_train=True):
        train_data=self.get_patched_data(train_df,is_train)
        train_xs=[list(item)[:3] for item in train_data]
        train_ys=[self.lb.transform(item[-1]) for item in train_data]
        return train_xs,train_ys
    

In [1]:
'''
TC_LSTM神经网络模型
输入process_data处理好的pkl文件，包括(data,vocs,id2words,w2v)
输出情感分析结果
by: liyumeng
@ 2016/11/28

'''

import numpy as np
np.random.seed(100)
import pickle
from sklearn.preprocessing import LabelBinarizer
from yuml.datasets.gridsum2016 import load_data,patchMatrix,get_patched_data
from collections import Counter
import pandas as pd


if __name__=='__main__':
    # Run configuration: input pickle, weight file, result file, and whether
    # to train from scratch ('1') or reuse previously saved weights.
    input_filename='data/car_review_data.pkl'
    model_filename='best.tc_lstm.model'
    output_filename='answer.csv'
    retrain='1'

    print('input',input_filename)
    if retrain!='1':
        print('load model from ',model_filename)
    else:
        print('retrain and save model to',model_filename)
    print('output result to',output_filename)

    #----------------------------------------------------------
    # Load the train / validation / test frames and the embedding matrix.
    train_df,valid_df,test_df,vocs,w2v,POS_dict=load_data(input_filename,pos_rate=0.2,neg_rate=0.2)

    #----------------------------------------------------------
    # Build the network and batch every split.
    model=TC_LSTM(w2v)
    train_data=model.get_xs(train_df)
    valid_data=model.get_xs(valid_df)
    test_data=model.get_xs(test_df,False)

    # Train (checkpointing on validation accuracy) or load saved weights.
    if retrain!='1':
        model.model.load_weights(model_filename)
    else:
        model.fit(train_data=train_data,valid_data=valid_data,filename=model_filename,best_type='best_acc')

    #-------------------------------------------------------------------
    # Predict the test split and write the result file.
    res=model.predict(test_data[0],test_data[1])

    # Each result is (probability_row, paired_item); the label is the arg-max class.
    opinions=['neg','neu','pos']
    yp=[opinions[np.argmax(scores)] for scores,_ in res]

    print('输出的各类别比例：')
    test_num=Counter(yp)
    for key,count in test_num.items():
        print(key,count/len(test_df),count)

    test_df.loc[:,'Opinion']=yp
    answer_columns=['SentenceId','RawView','Opinion']
    test_df.loc[:,answer_columns].to_csv(output_filename,index=False,sep=',',encoding='utf8',header=True)
    print('运行完毕！已输出到',output_filename)
    


input data/car_review_data.pkl
retrain and save model to best.tc_lstm.model
output result to answer.csv


AttributeError: 'DataFrame' object has no attribute 'POS'

In [4]:
# NOTE(review): `data` is not defined by any cell shown in this notebook, so
# this cell raises NameError on a fresh kernel (see the recorded output below).
# Looks like a leftover scratch cell -- confirm, then remove it or replace it
# with the variable that was meant to be inspected.
data

NameError: name 'data' is not defined