In [5]:
import numpy as np
np.random.seed(1337)  # fix NumPy's global RNG seed so repeated runs match

import pandas as pd

# Read the review file; the two named columns are the label and the
# whitespace-tokenised review text.
df = pd.read_csv('data/DoubanZH.txt', names=['label', 'content'])
# Binarise the label: a raw value of 50 becomes 1, anything else becomes 0.
df['label'] = df['label'].apply(lambda score: 1 if score == 50 else 0)
# Keep only the rows whose review text is present.
df = df[df['content'].notnull()]
df.head()

Unnamed: 0,label,content
0,1,智取 威虎山 之 寻找 梁家辉
1,1,燃爆 了 ！ ！ ！
2,1,硬到 骨子里
3,0,红色 电影 新 马甲 。
4,0,看 完 影评 我 觉得 我 是 一个 人 不能 更 糟心 的 片子


In [6]:
# Build the vocabulary for the whole corpus: every distinct token gets its
# own integer id. Id 0 is held by the '\s' placeholder entry, so real tokens
# start at 1.
vocabs={'\\s':0}
for content in df.content:
    for word in content.split():
        # Assign an id only the first time a token is seen. Re-assigning on
        # every occurrence would set a repeated token to the current dict
        # size, which the next new token then receives too, so distinct
        # tokens would share an id.
        if word not in vocabs:
            vocabs[word]=len(vocabs)
print('词表中共包含单词%d个'%len(vocabs))

# Map every tokenised sentence to the list of its vocabulary ids.
data=[[vocabs[token] for token in text.split()] for text in df.content]

print('最长句子长度为%d个词'%np.max(list(map(len,data))))

# Turn the id lists into a fixed-width matrix (80 ids per row) and pull out
# the labels as an array.
from keras.preprocessing import sequence

xs=sequence.pad_sequences(data, maxlen=80)
ys=df['label'].values

词表中共包含单词124374个
最长句子长度为140个词


# LSTM模型

In [9]:
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Activation, Embedding,Input
from keras.layers import LSTM
from keras.datasets import imdb

# --- vocabulary / sequence settings ---
maxlen = 80  # sequence length handed to the Embedding layer as input_length
max_features = len(vocabs) + 1  # vocabulary size plus one spare slot
embedding_dims = 50  # width of each embedding vector

# --- training settings ---
nb_epoch = 2
batch_size = 32

In [None]:
# Binary classifier: Embedding -> LSTM -> Dense(1) -> sigmoid, trained with
# binary cross-entropy on (xs, ys).

# Fail early with a readable message if the padded data and the configured
# sequence length have drifted apart (xs is padded in an earlier cell).
assert xs.shape[1]==maxlen, \
    'xs has %d columns but maxlen is %d'%(xs.shape[1],maxlen)

# The input width follows `maxlen` instead of a separate hard-coded 80, so it
# always agrees with the `input_length` given to the Embedding layer below.
# NOTE(review): `input` shadows the Python builtin; the name is kept here
# because other cells may refer to it. Consider renaming it notebook-wide.
input=Input(shape=(maxlen,),dtype='int32')

# Token ids -> dense vectors of size `embedding_dims`, with dropout of 0.2.
x=Embedding(max_features+1,
            embedding_dims,
            input_length=maxlen,
            dropout=0.2,)(input)

# A single LSTM layer with 128 units; dropout_W / dropout_U are both 0.2.
x=LSTM(128, dropout_W=0.2, dropout_U=0.2)(x)

# One output unit squashed by a sigmoid.
x=Dense(1)(x)
output=Activation('sigmoid')(x)

model=Model(input=[input],output=output)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# validation_split=0.2 is passed so 20% of the samples are used for
# validation rather than training.
model.fit(xs, ys,
          batch_size=batch_size,
          nb_epoch=nb_epoch,validation_split=0.2,verbose=1)
