In [20]:
import csv
import numpy as np
import keras
import datetime

print("start at %s" % datetime.datetime.today())
###
# 1.データ準備
###
print("initializing...")
n = 8500              # 入力層の要素数(=データの最大文字数)
k = 3                 # 出力層の要素数(いくつに分類するか)

max_rows = 10000  # 読み取り行数最大
learn_ratio = 0.9 # 読み取ったデータのうち学習に使う比率
learn_rows = max_rows * learn_ratio # 学習に使うデータの行数
test_rows = max_rows - learn_rows
print("max_rows=%s" % max_rows)
print("learn_rows=%s" % learn_rows)

print("reading data..")
# numpy.appendは遅いので先にlistに取り込んでからnumpyに展開
xlist=[]
ylist=[]
with open('reviews-ccnn-train-data04.txt', 'r') as f:
    reader = csv.reader(f,delimiter = '\t')
#    header = next(reader)  # ヘッダーを読み飛ばしたい時

    for i,row in enumerate(reader):
#        print(i,row[0],row[1])          # 1行づつ取得できる
        if (i+1) % 10000 == 0:
            print("%s rows read .. %s" % (i+1,datetime.datetime.today()))
        x=[]
        for c in row[1]:
            x.append(float(np.int.from_bytes(c.encode('utf-8'),'big')))
        xx = np.array(x)
        xx.resize(n,refcheck=False)
        xx = xx.astype('float32')
        y = float(row[0])
        xlist.append(xx)
        ylist.append(y)
        if i > max_rows:
            break

print("total %s rows read" % (i-1) )

X=np.zeros((i,n),float) # 学習用入力データの行列
Y=np.zeros((i,k),float)  # 学習用出力データの行列
X_test=np.zeros((i,n),float) # 評価用入力データの行列
Y_test=np.zeros((i,k),float) # 評価用出力データの行列
train_start = 0
test_start = 0
for i in range(len(xlist)):
    x = xlist[i]
    y = ylist[i]
#    n_sub_samples = len(x.shape[0]
    # 上位learn_rows分を学習データ　それ以外を評価データに使う
    # ホントはシャッフルしたほうがいい
    if i < learn_rows:
        X[start:start+1,:] = x
        Y[start:start+1,:] = keras.utils.np_utils.to_categorical(y, k)
        train_start += 1
    else:
        X_test[start:start+1,:] = x
        Y_test[start:start+1,:] = keras.utils.np_utils.to_categorical(y, k)
        test_start += 1

X.resize(learn_rows,refcheck=False)
Y.resize(learn_rows,refcheck=False)
X_test.resize(learn_rows)
Y_test.resize(learn_rows)

# Xを３次元の行列に変換。この意味はわからない・・Kerasの都合か。
X=X.reshape(X.shape[0],n,1)
X_test=X_test.reshape(X_test.shape[0],n,1)
# 正規化
X /= 0x1FFFFF
X_test /= 0x1FFFFF

print("initialized at %s" % datetime.datetime.today())


start at 2018-02-08 08:06:21.637696
initializing...
max_rows=10000
learn_rows=9000.0
reading data..
10000 rows read .. 2018-02-08 08:06:28.793102
total 10000 rows read


TypeError: 'float' object cannot be interpreted as an integer

In [15]:
# 学習データの件数確認
X.shape

(10001, 8500, 1)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adadelta
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.layers.core import Dropout
from keras.initializers import TruncatedNormal
from keras.layers import Flatten, Conv1D, MaxPooling1D

###
# 2.モデル設定
###
print("building model start at %s" % datetime.datetime.today())
kernel_size = (3)
model = Sequential()
# 入力層
model.add(Conv1D(10, kernel_size=kernel_size,
                 activation='relu',
                 input_shape=(n,1)
                 )
        )
# 畳み込み層
model.add(Conv1D(20, kernel_size, activation='relu'))
# プーリング層
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.5))
# 出力層
model.add(Dense(k, activation='softmax'))

# 確率的勾配降下法
model.compile(loss='categorical_crossentropy', optimizer=Adadelta(), metrics=['accuracy'])

print("model built at %s" % datetime.datetime.today())


In [None]:
print("learning model start at %s" % datetime.datetime.today())

###
# 3.モデル学習
###
epochs = 12     # 学習繰り返し数
batch_size = 120
hist = model.fit(X, Y, epochs=epochs, batch_size=batch_size,)
#               validation_data=(X_validation ,Y_validation))

print("learned at %s" % datetime.datetime.today())


In [None]:
###
# 4-1.評価
###
loss_and_metrics = model.evaluate(X_test, Y_test)
print()
print(loss_and_metrics)
# 可視化
val_loss = hist.history['loss']
val_acc = hist.history['acc']
print(val_loss)
print(val_acc)

In [None]:
###
# 4-2.予測
###

predicted = model.predict(X_test)


In [None]:
# ランダムに１０個ほど予測結果を確認
for i in np.random.randint(0,max_rows-learn_rows,10):
    print(i,Y_test[i],predicted[i])
    # ID化したデータを文字列に戻す
    ss = str()
    for xx in (X_test[i] * 0x1FFFFF).reshape(n).astype(int):
        ixx = int(xx)
        ss += (ixx.to_bytes((ixx.bit_length() + 7) // 8,'big')).decode(encoding='utf-8')

    print(ss)




In [None]:
import os

###
# 5.学習の可視化
###

plt.rc('font', family='serif')
fig = plt.figure()
# 精度
ax_acc = fig.add_subplot(111)
ax_acc.plot(range(epochs), val_acc, label='acc', color='black')
# 損失
ax_loss = ax_acc.twinx()
ax_loss.plot(range(epochs), val_loss, label='loss', color='gray')
plt.xlabel('epochs')
#file,ext = os.path.splitext(os.path.basename(__file__))
#plt.savefig(file+'.eps')

plt.show()