In [1]:
import os
import pandas as pd
import jieba
import re
import jieba.posseg as pseg
import numpy as np
from collections import Counter

def read_train(f_train,f_label):
    '''Load one labelled dataset and join the labels with their review text.

    Both inputs are read as tab-separated files. Every label row is
    left-joined to the review table on the ``SentenceId`` column; label rows
    whose ``Content`` ends up null (no review text available) are dropped.

    Args:
        f_train: path or file-like object of the review table
            (must provide ``SentenceId`` and ``Content`` columns).
        f_label: path or file-like object of the label table
            (must provide a ``SentenceId`` column).

    Returns:
        DataFrame of the joined rows with a fresh 0..n-1 index.
    '''
    label_table=pd.read_csv(f_label,sep='\t')
    review_table=pd.read_csv(f_train,sep='\t')
    joined=label_table.merge(review_table,on='SentenceId',how='left')
    has_content=joined['Content'].notnull()
    return joined.loc[has_content].reset_index(drop=True)

train_d1=read_train('data/CarReview1.csv','data/CarAspect1.csv')
train_d2=read_train('data/CarReview2.csv','data/CarAspect2.csv')
# Drop rows of the first dataset whose View string does not occur in the
# review Content (plain substring test). The second dataset is not filtered.
has_view=[
    view in content
    for view,content in zip(train_d1['View'],train_d1['Content'])
]

train_d1=train_d1[has_view].reset_index(drop=True)
train_data=pd.concat([train_d1,train_d2],ignore_index=True)

# Report the total number of rows and the count / share of each Opinion label.
total_rows=len(train_data)
print('训练集数据总数',total_rows)
vals=Counter(train_data.Opinion.tolist())
for key,count in vals.items():
    print(key,count,count/total_rows)
# `data` is another name for the same DataFrame object, not a copy.
data=train_data

训练集数据总数 20581
neu 14288 0.6942325445799524
neg 1527 0.07419464554686361
pos 4766 0.23157280987318402


# 样本扩展完毕

In [2]:
# Split every review into the text left and right of the first occurrence of
# its View string and store the two parts in the 'Left' / 'Right' columns.
#
# The columns are built as plain lists and assigned once, which avoids
# DataFrame.set_value (deprecated in pandas 0.21, removed in 1.0) and does not
# depend on `data` having a 0..n-1 index.
left_parts=[]
right_parts=[]
for sentence_id,content,view in zip(data['SentenceId'],data['Content'],data['View']):
    index=content.find(view)
    if index<0:
        # View not present in the text: report the sentence id and leave both
        # contexts empty. Slicing with index == -1 would silently store the
        # content minus its last character as 'Left' and a meaningless tail
        # as 'Right'.
        print(sentence_id)
        left_parts.append('')
        right_parts.append('')
        continue
    left_parts.append(content[:index])
    right_parts.append(content[index+len(view):])

data['Left']=left_parts
data['Right']=right_parts

In [4]:
views=pd.read_csv('data/CarAspectDict.csv',header=None)[0].tolist()
# Custom dictionary: every non-empty, space-separated term found in the first
# column of the aspect dictionary file is added to jieba with the tag 'nz'.
for entry in views:
    for term in filter(None,entry.split(' ')):
        jieba.add_word(term,tag='nz')
# Word segmentation with part-of-speech tagging.
print('分词处理...')

# Each *Words column holds the pseg.lcut() result for the matching text column.
for text_col,words_col in (('Left','LeftWords'),('Right','RightWords'),('View','ViewWords')):
    data[words_col]=data[text_col].apply(pseg.lcut)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.901 seconds.
Prefix dict has been built succesfully.


分词处理...


In [6]:
# gensim's Word2Vec class is only needed from this cell onwards.
from gensim.models.word2vec import Word2Vec
# Load the saved word2vec model from data/car100_v2.w2v.
# NOTE(review): presumably pre-trained on car-review text — confirm provenance.
w2v_model=Word2Vec.load('data/car100_v2.w2v')

In [7]:
# Build the vocabulary.
# vocs maps a word to its integer id and starts out empty. id2words and w2v
# are parallel lists indexed by that id; position 0 of each is pre-filled with
# an empty string and an all-zero vector respectively.
vocs=dict()
id2words=['']
word_dim=w2v_model.vector_size
w2v=[np.zeros(word_dim)]

def get_sentence_vector(w2v_model,words,i,default_dim,random=False,window_size=5):
    '''Build a substitute embedding for the word at position ``i``.

    The substitute is the average of the embeddings of the words lying at
    most ``window_size`` positions to the left or right of ``i`` that are
    present in the embedding model. If none of them is known, or if
    ``random`` is true, a small random vector is returned instead.

    Args:
        w2v_model: embedding lookup supporting ``word in model`` and
            ``model[word]``. When the object has a ``wv`` attribute (as
            gensim ``Word2Vec`` models do) the lookup goes through it,
            because gensim 4 no longer supports indexing the model itself.
        words: token sequence. Items may be plain strings or objects with a
            ``word`` attribute (e.g. the pairs produced by ``jieba.posseg``).
        i: position in ``words`` of the word that has no embedding.
        default_dim: length of the random fallback vector.
        random: when true, skip the context average and return the random
            vector directly.
        window_size: number of neighbours considered on each side of ``i``.

    Returns:
        1-D numpy array: the context average, or a vector of length
        ``default_dim`` drawn uniformly from [-0.01, 0.01).
    '''
    if random:
        return np.random.uniform(low=-0.01,high=0.01,size=(default_dim,))
    lookup=getattr(w2v_model,'wv',w2v_model)
    start=max(0,i-window_size)
    # +1 so that the right-hand side covers as many words as the left-hand
    # side; the slice end is exclusive.
    end=min(i+window_size+1,len(words))
    # Tagged tokens never compare equal to the string keys of the model, so
    # reduce every item to its surface string before the lookup.
    tokens=[getattr(w,'word',w) for w in words[start:end]]
    vectors=[lookup[t] for t in tokens if t in lookup]
    if len(vectors)==0:
        return np.random.uniform(low=-0.01,high=0.01,size=(default_dim,))
    return np.average(vectors,axis=0)

def get_wids(words):
    '''Translate a sequence of tagged tokens into vocabulary ids.

    Only the ``.word`` attribute of each token is used. A word seen for the
    first time is registered in the module-level ``vocs`` / ``id2words`` /
    ``w2v`` structures: it gets the id ``len(vocs)+1`` and its embedding is
    taken from ``w2v_model`` when the word is known there, otherwise from
    ``get_sentence_vector``.

    Args:
        words: sequence of objects exposing a ``word`` attribute.

    Returns:
        List with one integer id per token, in input order.
    '''
    id_list=[]
    for position,token in enumerate(words):
        text=token.word
        if text in vocs:
            id_list.append(vocs[text])
            continue
        # First occurrence: allocate the next id and store the embedding.
        new_id=len(vocs)+1
        vocs[text]=new_id
        id2words.append(text)
        if text in w2v_model:
            vector=w2v_model[text]
        else:
            vector=get_sentence_vector(w2v_model,words,position,word_dim)
        w2v.append(vector)
        id_list.append(new_id)
    return id_list
# Convert the token columns into id columns. The order Left -> Right -> View
# matters: ids are handed out in first-seen order while get_wids runs.
for words_col,ids_col in (('LeftWords','LeftIds'),('RightWords','RightIds'),('ViewWords','ViewIds')):
    data.loc[:,ids_col]=data[words_col].apply(get_wids)

# Turn the collected list of vectors into a single float32 array.
w2v=np.array(w2v,dtype=np.float32)

In [10]:
# Save the processed training data to a file.
# The pickle holds the list [data, vocs, id2words, w2v]. The `with` block
# guarantees the file handle is flushed and closed even if dumping fails.
import pickle
with open('data/car_review_data.pkl','wb') as pkl_file:
    pickle.dump([data,vocs,id2words,w2v],pkl_file)