# RNN (recurrent neural network), 순환 신경망 


In [9]:
from keras.preprocessing import sequence
from keras.datasets import imdb
from keras import layers, models

## 데이터 설명

영화평(리뷰)와 긍정리뷰또는 부정리뷰의 레이블 정보가 있는 데이터셋

파일로 다운로드를 받으면 영어로된 원본 영화평을 볼 수 있습니다.


http://ai.stanford.edu/~amaas/data/sentiment/



긍정리뷰의 예

ONE DARK NIGHT is a highly overlooked and little known film from the early 80's that deserves an audience that I fear it will never get, and that's a damn shame. I have seen this film compared to others that have gotten a bigger name over the years, most notably PHANTASM, HELL NIGHT and MAUSOLEUM. This is a much different film than those and I don't see the comparisons other than the mausoleum, which is a bit similar to the one in PHANTASM, but not enough to make any real comparisons. ...

In [10]:
max_features=20000
maxlen=80

- 딴어의 빈도수가 max_features 등 안에 드는 단어만 취급
- maxlen : 한 문장의 최대 단어수
- 그외의 옵션은 케라스 홈페이지에서 확인할 수 있습니다.

https://keras.io/datasets/

path: if you do not have the data locally (at '~/.keras/datasets/' + path), it will be downloaded to this location.
num_words: integer or None. Top most frequent words to consider. Any less frequent word will appear as oov_char value in the sequence data.
skip_top: integer. Top most frequent words to ignore (they will appear as oov_char value in the sequence data).
maxlen: int. Maximum sequence length. Any longer sequence will be truncated.
seed: int. Seed for reproducible data shuffling.
start_char: int. The start of a sequence will be marked with this character. Set to 1 because 0 is usually the padding character.
oov_char: int. words that were cut out because of the num_words or skip_top limit will be replaced with this character.
index_from: int. Index actual words with this index and higher.

In [11]:
(train_X, train_Y), (test_X, test_Y) = imdb.load_data(num_words=max_features)

In [12]:
train_X.shape

(25000,)

In [13]:
type(train_X), type(train_X[0])

(numpy.ndarray, list)

In [14]:
print(train_X[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [15]:
import numpy as np

In [16]:
np.max(np.max(train_X))

19972

In [17]:
tmp_x = sequence.pad_sequences(train_X, maxlen=maxlen)
tmp_x.shape

(25000, 80)

In [18]:
tmp_x[0].shape

(80,)

In [19]:
train_Y

array([1, 0, 0, ..., 0, 1, 0])

In [20]:
def prepare_imdb_data(max_features=20000, maxlen=80):
    (train_X, train_Y), (test_X, test_Y) = imdb.load_data(num_words=max_features,
                                                         )
    train_X = sequence.pad_sequences(train_X, maxlen=maxlen)
    test_X = sequence.pad_sequences(test_X, maxlen=maxlen)
    return (train_X, train_Y), (test_X, test_Y)

In [21]:
(train_X, train_Y), (test_X, test_Y) = prepare_imdb_data()

train_X.shape

(25000, 80)

In [22]:
class LSTM(models.Model):
    def __init__(self, max_features, maxlen):
        x = layers.Input((maxlen,))
        print(x)
        h = layers.Embedding(max_features, 128)(x)
        print(h)
        h = layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2)(h)
        print(h)
        y = layers.Dense(1, activation='sigmoid')(h)
        print(y)
        super().__init__(x, y)
        
        self.compile(loss='binary_crossentropy',
                        optimizer='adam', metrics=['accuracy'])
        
        
class LSTMMachine:
    def __init__(self, max_features=20000, maxlen=80):
        self.data = prepare_imdb_data(max_features, maxlen)
        print(self.data)
        self.model = LSTM(max_features, maxlen)
        
    def run(self, epochs=3, batch_size=32):
        data = self.data
        (train_X, train_Y), (test_X, test_Y) = data
        model = self.model
        model.fit(train_X, train_Y, batch_size=batch_size, epochs=epochs,
                         validation_data=(test_X, test_Y))
        score, acc = model.evaluate(test_X, test_Y, batch_size=batch_size)
        print('Test performance: accuracy={}, loss={}'.format(acc, score))
        


In [23]:
def main():
    machine = LSTMMachine()
    machine.run()
    
main()

((array([[   15,   256,     4, ...,    19,   178,    32],
       [  125,    68,     2, ...,    16,   145,    95],
       [  645,   662,     8, ...,     7,   129,   113],
       ..., 
       [  529,   443, 17793, ...,     4,  3586,     2],
       [  286,  1814,    23, ...,    12,     9,    23],
       [   97,    90,    35, ...,   204,   131,     9]], dtype=int32), array([1, 0, 0, ..., 0, 1, 0])), (array([[   0,    0,    0, ...,   14,    6,  717],
       [1669,  398,  229, ...,  125,    4, 3077],
       [ 687,    2,  203, ...,    9,   57,  975],
       ..., 
       [   0,    0,    0, ...,   21,  846, 5518],
       [   8,   97,   14, ..., 2302,    7,  470],
       [ 718,    2,    9, ...,   34, 2005, 2643]], dtype=int32), array([0, 1, 1, ..., 0, 0, 0])))
Tensor("input_1:0", shape=(?, 80), dtype=float32)
Tensor("embedding_1/Gather:0", shape=(?, 80, 128), dtype=float32)
Tensor("lstm_1/TensorArrayReadV3:0", shape=(?, 128), dtype=float32)
Tensor("dense_1/Sigmoid:0", shape=(?, 1), dtype=float32