In [1]:
import collections
import os

import nltk
import numpy as np
from keras.callbacks import TensorBoard
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import codecs

Using TensorFlow backend.


In [2]:
# Filesystem locations: input corpus directory and TensorBoard log directory.
DATA_DIR, LOG_DIR = "./data", "./logs"

# Model / training hyper-parameters.
EMBEDDING_SIZE, HIDDEN_LAYER_SIZE = 128, 64   # embedding width, LSTM units
BATCH_SIZE, NUM_EPOCHS = 32, 10               # used by model.fit / model.evaluate

In [3]:
# Download the NLTK 'punkt' package (a no-op message is printed when it is
# already up to date). Presumably needed by nltk.word_tokenize, which the
# corpus-reading cells call -- confirm against the NLTK documentation.
# NOTE(review): nltk is already imported in the first cell; this re-import is redundant.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/coly/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# First pass over the training corpus. Collects:
#   maxlen     - token count of the longest sentence
#   word_freqs - occurrence count of every (lower-cased) token
#   num_recs   - number of records read
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
with codecs.open(os.path.join(DATA_DIR, "umich-sentiment-train.txt"), "r", "utf-8") as ftrain:
    for line in ftrain:
        line = line.strip()
        if not line:
            # A blank line carries no record; skip it rather than fail on
            # the two-way unpack below.
            continue
        # Records are "<label>\t<sentence>". Split on the first tab only so
        # that a tab inside the sentence cannot break the unpack.
        label, sentence = line.split("\t", 1)
        # Lower-case before tokenizing so that word forms are normalized.
        words = nltk.word_tokenize(sentence.lower())
        maxlen = max(maxlen, len(words))
        # Accumulate token occurrences over all sentences.
        word_freqs.update(words)
        num_recs += 1

In [11]:
# Corpus statistics, one per line: longest sentence (in tokens),
# number of distinct tokens, number of records.
for stat in (maxlen, len(word_freqs), num_recs):
    print(stat)

42
2326
7086


In [6]:
# These two values are simply the parameters chosen for this experiment.
MAX_SENTENCE_LENGTH = 40
MAX_FEATURES = 2000
# Vocabulary size: the smaller of the distinct-token count and MAX_FEATURES,
# plus two extra ids (reserved for the "PAD" and "UNK" entries).
vocab_size = 2 + min(len(word_freqs), MAX_FEATURES)

In [7]:
# Token -> id and id -> token lookup tables, filled in by the following cells.
word2index, index2word = {}, {}

In [8]:
# Give the MAX_FEATURES most frequent tokens consecutive ids starting at 2
# (ids 0 and 1 are left free for the reserved entries).
word2index.update(
    (word, rank + 2)
    for rank, (word, _count) in enumerate(word_freqs.most_common(MAX_FEATURES))
)

In [9]:
# Reserved entries:
#   "PAD" (0) - value used for padding
#   "UNK" (1) - stands in for tokens that are not in the vocabulary
word2index.update({"PAD": 0, "UNK": 1})

In [10]:
# Invert word2index so ids can be mapped back to their tokens.
index2word.update((idx, word) for word, idx in word2index.items())

In [12]:
# Pre-allocate the per-record containers and reset the record cursor.
# X holds one Python list of token ids per record; y holds the labels.
i = 0
y = np.zeros(num_recs)
X = np.empty(num_recs, dtype=list)

In [19]:
# Second pass over the training corpus: turn every sentence into the list of
# vocabulary ids of its tokens (X) and store its integer label (y).
X = np.empty((num_recs, ), dtype=list)
y = np.zeros((num_recs, ))
i = 0
with codecs.open(os.path.join(DATA_DIR, "umich-sentiment-train.txt"), 'r', 'utf-8') as ftrain:
    for line in ftrain:
        line = line.strip()
        if not line:
            # A blank line carries no record; skip it rather than fail on
            # the two-way unpack below.
            continue
        # Records are "<label>\t<sentence>". Split on the first tab only so
        # that a tab inside the sentence cannot break the unpack.
        label, sentence = line.split("\t", 1)
        # Lower-case before tokenizing so that word forms are normalized.
        words = nltk.word_tokenize(sentence.lower())
        # Tokens in the vocabulary map to their id; everything else maps to
        # the id of the "UNK" (unknown) entry.
        X[i] = [word2index.get(word, word2index["UNK"]) for word in words]
        y[i] = int(label)
        i += 1

# Every pre-allocated slot must have been filled; a mismatch means num_recs
# was computed from different data than what was just read.
assert i == num_recs, "encoded {} records but num_recs is {}".format(i, num_recs)


In [21]:
# Bring every id sequence to a common length of MAX_SENTENCE_LENGTH.
# Sequences shorter than that are padded with 0.
# NOTE(review): sequences longer than MAX_SENTENCE_LENGTH are presumably
# truncated by pad_sequences -- confirm against the Keras documentation.
X = sequence.pad_sequences(sequences=X, maxlen=MAX_SENTENCE_LENGTH)

In [23]:
# Split the data 8:2 -- 20% is held out, the rest is used for training.
# The fixed random_state keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [24]:
# Build the model: embedding -> LSTM -> one output unit -> sigmoid.
model = Sequential([
    Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH),
    LSTM(HIDDEN_LAYER_SIZE, dropout=0.5, recurrent_dropout=0.5),
    Dense(1),
    Activation("sigmoid"),
])
# Binary classification: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [25]:
# Train the model and write TensorBoard logs to LOG_DIR.
# Note: the held-out split (Xtest, ytest) is also used as validation data here.
history = model.fit(
    Xtrain,
    ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(Xtest, ytest),
    callbacks=[TensorBoard(LOG_DIR)],
)

Train on 5668 samples, validate on 1418 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Evaluate on the held-out split; the result unpacks into the loss value
# followed by the accuracy metric.
test_metrics = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
score, acc = test_metrics
print("Test score: {},accuracy: {}".format(*test_metrics))


Test score: 0.07267036836013745,accuracy: 0.9774330040631745


In [29]:
# Print "<predicted>\t<actual>\t<sentence>" for five randomly drawn
# held-out samples.
for i in range(5):
    idx = np.random.randint(len(Xtest))
    # Add a leading batch axis of size 1; -1 lets NumPy infer the sequence
    # length from the data instead of hard-coding it to 40.
    xtest = Xtest[idx].reshape(1, -1)
    ylabel = ytest[idx]
    ypred = model.predict(xtest)[0][0]
    # Id 0 is the padding entry, so it is dropped before mapping the
    # remaining ids back to their tokens.
    sent = " ".join(index2word[x] for x in xtest[0].tolist() if x != 0)
    # "{:.0f}" rounds the sigmoid output to 0 or 1 for display.
    print("{:.0f}\t{:.0f}\t{}".format(ypred, ylabel, sent))

0	0	oh , and brokeback mountain is a terrible movie ...
0	0	i hate harry potter , it 's retarted , gay and stupid and there 's only one black guy ...
1	1	i am going to start reading the harry potter series again because that is one awesome story .
1	0	then we drove to bayers lake for the da vinci code , which as expected , tom hanks sucks ass in that movie , but the dramatic last 2 minutes were good .
1	1	i love brokeback mountain .


In [2]:
# Ad-hoc check of whether a file exists at this literal path; the recorded
# result is False.
# NOTE(review): the execution count (In [2]) is out of sequence, so this cell
# looks like a leftover from a separate session. The path is hard-coded rather
# than built from DATA_DIR, and the name ends in "-tests.txt" -- confirm the
# intended file name.
import os
os.path.exists("./data/umich-sentiment-tests.txt")

False