## 네이버 영화평을 이용한 감정 분석

In [60]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pprint import pprint
from tensorflow.python.keras.preprocessing.text import Tokenizer
import codecs
%matplotlib inline

print(tf.__version__)
tf.enable_eager_execution()

1.13.1


In [61]:
def read_data(filename):
    with open(filename, 'r', encoding='utf8') as f:
        result = [line.split('\t') for line in f.read().splitlines()]
        result = result[1:]   # header 제외
    return result

In [62]:
train_tmp = read_data('./data/ratings_train.txt')
test_tmp = read_data('./data/ratings_test.txt')

In [63]:
def kor_movie(max_num_words=1000):
    # 데이터 구조: ID | 리뷰데이터 | 감성라벨
    # 필요없는 id 제외. 학습용.
    train_x = []
    train_y = []
    for i in range(len(train_tmp)):
        train_x.append(train_tmp[i][1])      # 리뷰 데이터
        train_y.append(int(train_tmp[i][2])) # 라벨
        
    # 테스트용.
    test_x = []
    test_y = []
    for i in range(len(test_tmp)):
        test_x.append(test_tmp[i][1])
        test_y.append(int(test_tmp[i][2]))
    
    # 단어사전 만들고, 문장을 단어사전에 맞게 자연수로 변형
    # 빈도수가 높은 단어순으로 max_num_words 개의 단어가 들어있는 사전 생성
    tokenizer = Tokenizer(num_words=max_num_words-1)
    tokenizer.fit_on_texts(train_x)
    
    # 위에서 만든 단어사전을 기준으로 텍스트 데이터를 자연수 수열로 변환
    token_train_x = tokenizer.texts_to_sequences(train_x)
    token_test_x = tokenizer.texts_to_sequences(test_x)
    
    return (token_train_x, train_y), (token_test_x, test_y)

In [64]:
print('...전처리...')
max_num_words = 5000 # 단어사전 크기
maxlen = 100 # 문장 최대 길이
batch_size = 32

...전처리...


In [65]:
(x_train, y_train), (x_test, y_test) = kor_movie(max_num_words)
print(x_train[0:10])

[[23, 936, 4, 1097], [602], [], [73, 356, 27, 33], [107, 1, 852, 568], [592, 2290, 51, 4218, 409], [1098, 2234, 134], [111, 1254, 58, 2741, 3], [714, 96, 37, 4915, 1], [1187, 40, 285, 3285, 2, 928]]


In [66]:
print(y_train[0:10])  # 0: 부정, 1: 긍정

[0, 1, 0, 0, 1, 0, 0, 0, 1, 1]


In [67]:
# 각 영화 리뷰 문장들의 길이가 다름. 딥러닝을 위해 통일. 100 길이 안에서 데이터 채우고, 길이가 부족해 빈 영역 생기는 경우 0으로 채움.
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')

In [68]:
x_train[0:10]

array([[  23,  936,    4, 1097,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [ 602,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [69]:
x_train.shape

(150000, 100)

In [70]:
x_test[0:10]

array([[ 639,   49,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [71]:
one_hot = np.eye(max_num_words)
one_hot[0:3]

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [87]:
model = Sequential()
model.add(layers.Embedding(input_dim=max_num_words, output_dim=max_num_words, input_length=100,
                           embeddings_initializer=keras.initializers.Constant(one_hot)))

In [73]:
matrix1 = model(x_train[0:10])
matrix1

<tf.Tensor: id=14438218, shape=(10, 100, 5000), dtype=float32, numpy=
array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
 

In [88]:
model.add(tf.keras.layers.LSTM(units=516))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 5000)         25000000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 516)               11387088  
Total params: 36,387,088
Trainable params: 36,387,088
Non-trainable params: 0
_________________________________________________________________


In [75]:
matrix2 = model(x_train[0:10])

In [76]:
matrix2

<tf.Tensor: id=14450460, shape=(10, 516), dtype=float32, numpy=
array([[-0.00862178,  0.00303746, -0.02889573, ..., -0.00537256,
         0.02085857,  0.01743507],
       [-0.00862178,  0.00303746, -0.02889573, ..., -0.00537256,
         0.02085857,  0.01743508],
       [-0.00862178,  0.00303746, -0.02889573, ..., -0.00537256,
         0.02085857,  0.01743508],
       ...,
       [-0.00862178,  0.00303746, -0.02889573, ..., -0.00537256,
         0.02085857,  0.01743507],
       [-0.00862178,  0.00303746, -0.02889573, ..., -0.00537256,
         0.02085857,  0.01743507],
       [-0.00862178,  0.00303746, -0.02889573, ..., -0.00537256,
         0.02085857,  0.01743507]], dtype=float32)>

In [77]:
model.add(layers.Dense(units=516, activation=tf.nn.relu))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 5000)         25000000  
_________________________________________________________________
lstm_2 (LSTM)                (None, 516)               11387088  
_________________________________________________________________
dense_4 (Dense)              (None, 516)               266772    
Total params: 36,653,860
Trainable params: 36,653,860
Non-trainable params: 0
_________________________________________________________________


In [78]:
matrix3 = model(x_train[0:10])
matrix3

<tf.Tensor: id=14462355, shape=(10, 516), dtype=float32, numpy=
array([[0.        , 0.0264582 , 0.01817428, ..., 0.        , 0.        ,
        0.0354516 ],
       [0.        , 0.0264582 , 0.01817428, ..., 0.        , 0.        ,
        0.0354516 ],
       [0.        , 0.0264582 , 0.01817429, ..., 0.        , 0.        ,
        0.0354516 ],
       ...,
       [0.        , 0.0264582 , 0.01817429, ..., 0.        , 0.        ,
        0.0354516 ],
       [0.        , 0.0264582 , 0.01817429, ..., 0.        , 0.        ,
        0.0354516 ],
       [0.        , 0.0264582 , 0.01817428, ..., 0.        , 0.        ,
        0.03545159]], dtype=float32)>

In [89]:
num_classes = 2
model.add(layers.Dense(units=num_classes, activation=tf.nn.softmax))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 5000)         25000000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 516)               11387088  
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 1034      
Total params: 36,388,122
Trainable params: 36,388,122
Non-trainable params: 0
_________________________________________________________________


In [80]:
matrix4 = model(x_train[0:10])
matrix4

<tf.Tensor: id=14474253, shape=(10, 2), dtype=float32, numpy=
array([[0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ],
       [0.49230698, 0.507693  ]], dtype=float32)>

In [90]:
def next_batch(num, data, labels):
    '''
    Return a total of 'num' random samples and labels.
    '''
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[i] for i in idx]
    labels_shuffle = [labels[i] for i in idx]
    
    return np.asarray(data_shuffle), np.asarray(labels_shuffle)

In [91]:
a = [1, 2, 3, 4, 5, 6, 7]

c = [3, 4, 5]

b = [a[i] for i in c]
b
# a의 3, 4, 5 index 값 출력

# idx (인덱스)를 셔플하고, data와 label을 매칭을 유지하면서 셔플하려고 이렇게 하는구나!

[4, 5, 6]

In [92]:
X, Y = next_batch(100, x_train, y_train)
X

array([[1842,   41,   19, ...,    0,    0,    0],
       [3559,    1,    0, ...,    0,    0,    0],
       [1001,   67, 3168, ...,    0,    0,    0],
       ...,
       [ 335,    0,    0, ...,    0,    0,    0],
       [   3,  688,  987, ...,    0,    0,    0],
       [4080, 4453, 4452, ...,    0,    0,    0]])

In [93]:
Y

array([1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1])

In [94]:
learning_rate = .001
opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

In [95]:
import tensorflow.contrib.eager as tfe

for epoch in range(10):
    for batch in range(15000//100):
        X, Y = next_batch(100, x_train, y_train)
        with tf.GradientTape() as tape:
            hypothesis = model(X)
            cost = tf.losses.sparse_softmax_cross_entropy(labels=Y, logits=hypothesis)
            
        grads = tape.gradient(target=cost, sources=model.variables)
        opt.apply_gradients(grads_and_vars=zip(grads, model.variables))

        print('step: {:3}, cost: {:.3f}'.format(batch, cost))

step:   0, cost: 0.693
step:   1, cost: 0.698
step:   2, cost: 0.691
step:   3, cost: 0.691
step:   4, cost: 0.694
step:   5, cost: 0.693
step:   6, cost: 0.696
step:   7, cost: 0.694
step:   8, cost: 0.693
step:   9, cost: 0.696
step:  10, cost: 0.692
step:  11, cost: 0.696
step:  12, cost: 0.693
step:  13, cost: 0.693
step:  14, cost: 0.692
step:  15, cost: 0.694
step:  16, cost: 0.696
step:  17, cost: 0.698
step:  18, cost: 0.695
step:  19, cost: 0.693
step:  20, cost: 0.693
step:  21, cost: 0.693
step:  22, cost: 0.695
step:  23, cost: 0.695
step:  24, cost: 0.690
step:  25, cost: 0.694
step:  26, cost: 0.695


KeyboardInterrupt: 