In [1]:
import csv
import numpy as np
import keras
import datetime

print("%s start" % datetime.datetime.today())
###
# 1. Data preparation
###
print("initializing...")
n = 8500              # number of input-layer elements (= maximum characters per sample)
k = 3                 # number of output-layer elements (number of classes)

max_rows = 50000  # maximum number of rows to read
train_ratio = 0.9 # fraction of the rows read that is used for training
print("max_rows=%s" % max_rows)
print("train_ratio=%s" % train_ratio)


def _encode_text(text, length):
    """Encode `text` as a fixed-length float32 vector of per-character codes.

    Each character is mapped to the big-endian integer value of its UTF-8
    byte sequence. Text longer than `length` is truncated; shorter text is
    padded with zeros.

    Note: codes above 2**24 (4-byte UTF-8 sequences) are not exactly
    representable in float32.
    """
    # The builtin int is used here; the np.int alias was deprecated in
    # NumPy 1.20 and is not available in later releases.
    codes = [int.from_bytes(ch.encode('utf-8'), 'big') for ch in text[:length]]
    vec = np.zeros(length, dtype='float32')
    vec[:len(codes)] = codes
    return vec


print("reading data..")
# numpy.append is slow, so rows are collected in Python lists first and
# copied into numpy matrices afterwards.
xlist=[]
ylist=[]
# newline='' is the form the csv module documentation recommends for reader input.
# NOTE(review): no explicit encoding is passed, so the platform default applies —
# confirm the encoding of the data file.
with open('reviews-ccnn-train-data04.txt', 'r', newline='') as f:
    reader = csv.reader(f,delimiter = '\t')
#    header = next(reader)  # enable this to skip a header row

    for i,row in enumerate(reader):
        if i >= max_rows:
            break
        if (i+1) % 10000 == 0:
            print("%s %s rows read .. " % (datetime.datetime.today(),i+1))
        if len(row) < 2:
            # Blank or malformed line: there is no label/text pair to use.
            continue
        # Column 0 is the class label, column 1 is the text.
        xlist.append(_encode_text(row[1], n))
        ylist.append(float(row[0]))

total_rows = len(xlist)
print("%s total %s rows read" % (datetime.datetime.today(),total_rows) )



Using TensorFlow backend.


2018-02-16 00:51:18.641887 start
initializing...
max_rows=50000
train_ratio=0.9
reading data..
2018-02-16 00:51:25.990378 10000 rows read .. 
2018-02-16 00:51:33.215029 20000 rows read .. 
2018-02-16 00:51:39.841229 30000 rows read .. 
2018-02-16 00:51:46.899224 40000 rows read .. 
2018-02-16 00:51:53.344282 50000 rows read .. 
2018-02-16 00:51:53.345430 total 50000 rows read


In [2]:

# Split sizes: the leading train_ratio share of the rows is used for training,
# the remainder for evaluation.
train_rows = int(total_rows * train_ratio)
test_rows = total_rows - train_rows
print("%s train_rows=%s" % (datetime.datetime.today(), train_rows))
print("%s test_rows=%s" % (datetime.datetime.today(), test_rows))

# Zero-filled buffers that a later cell fills row by row:
# inputs are (rows, n), one-hot outputs are (rows, k).
X = np.zeros(shape=(train_rows, n), dtype=float)
Y = np.zeros(shape=(train_rows, k), dtype=float)
X_test = np.zeros(shape=(test_rows, n), dtype=float)
Y_test = np.zeros(shape=(test_rows, k), dtype=float)



2018-02-16 00:51:53.362871 train_rows=45000
2018-02-16 00:51:53.363171 test_rows=5000


In [3]:
# Work on 2-D (rows, n) arrays while filling. Reshaping first also lets this
# cell be re-run after X / X_test have already been reshaped to 3-D below.
X = X.reshape(train_rows, n)
X_test = X_test.reshape(test_rows, n)

# The first train_rows rows become training data, the rest evaluation data.
# TODO: shuffling the rows before splitting would be preferable.
for i, x in enumerate(xlist):
    if i < train_rows:
        X[i] = x
    else:
        X_test[i - train_rows] = x

# One-hot encode all labels at once through the public Keras utility
# (keras.utils.np_utils is a private module path that newer Keras lacks).
labels = np.asarray(ylist, dtype=int)
Y[...] = keras.utils.to_categorical(labels[:train_rows], k)
Y_test[...] = keras.utils.to_categorical(labels[train_rows:], k)

# Add a trailing axis of size 1: the first Conv1D layer of the model is
# declared with input_shape=(n, 1).
X = X.reshape(train_rows, n, 1)
X_test = X_test.reshape(test_rows, n, 1)
# Normalization.
# NOTE(review): characters are encoded as the integer value of their UTF-8
# bytes, which exceeds 0x1FFFFF for 3- and 4-byte sequences, so the scaled
# values can be larger than 1. The cell that decodes predictions multiplies
# by this same constant, so both must be changed together.
X /= 0x1FFFFF
X_test /= 0x1FFFFF

print("%s initialized" % datetime.datetime.today())


2018-02-16 00:51:56.927071 initialized


In [4]:
# Check the number of training samples (first element of the displayed shape).
X.shape


(45000, 8500, 1)

In [5]:
# Check the number of evaluation samples (first element of the displayed shape).
X_test.shape

(5000, 8500, 1)

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adadelta
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.layers.core import Dropout
from keras.initializers import TruncatedNormal
from keras.layers import Flatten, Conv1D, MaxPooling1D

###
# 2. Model definition
###
print("%s building model start" % datetime.datetime.today())
kernel_size = 3  # width of each 1-D convolution window

# The layers are listed in the order they are applied.
model = Sequential([
    # Input convolution: takes sequences of n steps with one channel each.
    Conv1D(32, kernel_size=kernel_size, activation='relu', input_shape=(n, 1)),
    # Second convolution layer.
    Conv1D(64, kernel_size, activation='relu'),
    # Pooling layer followed by dropout.
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    # Dense classifier head.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # Output layer: one softmax unit per class.
    Dense(k, activation='softmax'),
])

# Categorical cross-entropy loss, optimized with Adadelta.
model.compile(optimizer=Adadelta(), loss='categorical_crossentropy', metrics=['accuracy'])

print("%s model built" % datetime.datetime.today())


2018-02-16 00:51:57.703336 building model start
2018-02-16 00:51:57.952060 model built


In [7]:
###
# 3. Model training
###
print("%s trainning model start" % datetime.datetime.today())

epochs = 50       # number of passes over the training data
batch_size = 120  # samples per gradient update
# No validation data is passed, so the returned history holds training metrics only.
hist = model.fit(x=X, y=Y, batch_size=batch_size, epochs=epochs)

print("%s training finish" % datetime.datetime.today())


2018-02-16 00:51:57.967621 trainning model start
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

KeyboardInterrupt: 

In [None]:
###
# 4-1. Evaluation
###
print("%s evaluate start" % datetime.datetime.today())
# Evaluate on the held-out rows; the result follows the compile() settings
# (loss first, then the 'accuracy' metric).
loss_and_metrics = model.evaluate(X_test, Y_test)
print()
print(loss_and_metrics)
# Per-epoch history used by the plotting cell.
# NOTE: despite the val_ prefix these are training metrics; fit() was called
# without validation data.
val_loss = hist.history['loss']
# The accuracy key is 'acc' in older Keras releases and 'accuracy' in newer
# ones, so accept either instead of raising KeyError.
val_acc = hist.history['acc'] if 'acc' in hist.history else hist.history['accuracy']
print(val_loss)
print(val_acc)

print("%s evaluate finish" % datetime.datetime.today())


In [None]:
###
# 4-2. Prediction
###

# Run the trained model on the evaluation inputs; `predicted` is indexed per
# evaluation row by the inspection cell that follows.
predicted = model.predict(X_test)


In [None]:
# Inspect about ten randomly chosen predictions.
# Sample indices from the actual evaluation-set size: max_rows - train_rows
# overshoots it whenever the data file holds fewer than max_rows rows.
num_test = X_test.shape[0]
sample_ids = np.random.randint(0, num_test, 10) if num_test > 0 else []
for i in sample_ids:
    print(i,Y_test[i],predicted[i])
    # Turn the normalized codes back into text.
    # Round before converting to integers: (v / c) * c can land just below v
    # in floating point, and a truncating cast would then corrupt the bytes.
    # int64 keeps 4-byte UTF-8 codes from overflowing a 32-bit default int.
    codes = np.rint(X_test[i].reshape(n) * 0x1FFFFF).astype(np.int64)
    # A zero code (padding) encodes to zero bytes and so contributes nothing.
    # errors='replace' keeps one undecodable code from aborting the cell.
    ss = ''.join(
        int(code).to_bytes((int(code).bit_length() + 7) // 8, 'big').decode('utf-8', errors='replace')
        for code in codes
    )

    print(ss)




In [None]:
import os

###
# 5. Visualize training
###

plt.rc('font', family='serif')
fig = plt.figure()
# Accuracy (left y-axis). The x range follows the recorded history length,
# which can be shorter than `epochs` if training stopped early.
ax_acc = fig.add_subplot(111)
ax_acc.plot(range(len(val_acc)), val_acc, label='acc', color='black')
# Label the x-axis on the primary axes: the twin created below has an
# invisible x-axis, so a label set on it would not be drawn.
ax_acc.set_xlabel('epochs')
ax_acc.set_ylabel('acc')
# Loss (right y-axis), sharing the x-axis with the accuracy plot.
ax_loss = ax_acc.twinx()
ax_loss.plot(range(len(val_loss)), val_loss, label='loss', color='gray')
ax_loss.set_ylabel('loss')
#file,ext = os.path.splitext(os.path.basename(__file__))
#plt.savefig(file+'.eps')

plt.show()