In [0]:
# 経過時間を確認
!cat /proc/uptime | awk '{print $1 /60 " mins (" $1 "sec)"}'

In [0]:
# google-drive-ocamlfuseのインストール
!apt install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt -y install -qq google-drive-ocamlfuse fuse
# Colab用のAuth token作成
from google.colab import auth
auth.authenticate_user()
# Drive FUSE library用のcredential生成
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# drive/ を作り、そこにGoogle Driveをマウントする
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:
# List the current working directory (the `drive` mount point should appear here
# once the mount cell has run).
!ls

In [0]:
# Move into the `drive` directory so the relative paths used by later cells
# (the CSV being read and the saved model file) resolve inside it.
%cd drive

In [0]:
# Load the review CSV and build parallel lists of review texts and labels
import numpy as np
import csv

# Minimum review length (in characters) to keep. 0 keeps every scored review;
# raise it (e.g. to 50) to drop very short reviews.
min_text_len = 0

# Read the CSV file
with open("eiga_score_texts.csv", 'r', encoding="utf-8") as f:
    csv_file = csv.reader(f, delimiter=',')
    next(csv_file, None)  # discard the first row (presumably the header); no-op on an empty file
    # csv.reader yields [] for blank lines; drop them so row[0] below cannot fail.
    issues = [row for row in csv_file if row]

# Column 0 is the score ("-" when the review has no score), column 1 the text.
# Both lists are derived in a single pass so they are guaranteed to stay aligned.
scored = [
    (row[1], int(float(row[0]) // 3))
    for row in issues
    if row[0] != "-" and len(row[1]) >= min_text_len
]
texts = [text for text, _ in scored]     # review texts
labels = [label for _, label in scored]  # score floor-divided by 3 (0 or 1, assuming scores lie in 0-5)

In [0]:
# Turn the review texts into fixed-length integer sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.utils.np_utils import to_categorical

max_words = 80000  # vocabulary size limit handed to the Tokenizer
maxlen = 400       # length (in tokens) that every review is padded to

# Fit the vocabulary on the reviews; fitting is what populates word_index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # the word -> integer index dictionary

# Replace each word by its index, then zero-pad so all rows share one length.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)

In [0]:
# Convert the integer class labels into a one-hot (binary) matrix.
# `labels` is overwritten with the encoded matrix, so the ndim guard keeps this
# cell idempotent: re-running it will not encode an already one-hot array again.
if np.ndim(labels) == 1:
    categorical_labels = to_categorical(labels)  # one-hot representation
    labels = np.asarray(categorical_labels)      # ensure a NumPy array

In [0]:
# Build and compile the classification model
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Dropout, GRU
from keras.optimizers import Adam, SGD

# Layers are listed in execution order; passing the list to the constructor is
# equivalent to calling model.add() once per layer.
model = Sequential([
    Embedding(max_words, 100, input_length=maxlen),  # 100-dimensional embedding per token index
    GRU(32),
    Dropout(0.5),
    Dense(2, activation='softmax'),  # two output units with softmax activation
])

# Optimizer, loss function and evaluation metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=1e-4),
    metrics=['acc'],
)

# Print a summary of the model.
model.summary()

In [0]:
# Training inputs are the padded sequences; targets are the encoded labels.
x_train, y_train = data, labels

In [0]:
import tensorflow as tf

# Ask TensorFlow for the GPU device name and show it as the cell's result.
gpu_device_name = tf.test.gpu_device_name()
gpu_device_name

In [0]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Shrink the learning rate by `factor` when val_loss stops improving.
# min_lr is the floor for the reduced rate and must be BELOW the optimizer's
# starting rate (1e-4, set in compile()); with a floor at or above the current
# rate the callback never lowers anything.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1, mode='min', min_lr=1e-6)

# Stop when val_loss has not improved for `patience` epochs. The patience is
# deliberately larger than reduce_lr's, so a lowered learning rate gets a few
# epochs to help before training is halted.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# NOTE(review): validation_split takes the validation samples from the END of
# x_train/y_train without shuffling -- confirm the CSV is not ordered by score.
history = model.fit(x_train, y_train, epochs=50, batch_size=5000, validation_split=0.19, callbacks=[early_stopping, reduce_lr])


Train on 32502 samples, validate on 7624 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [0]:
%matplotlib inline

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [0]:
# Write the model to model.h5 (path is relative to the current working directory)
model.save(filepath='model.h5')

In [0]:
from keras import backend as K

# Reset the model: drop the notebook's reference to it, then clear the Keras
# backend session.
del model
K.clear_session()