In [26]:
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False  # 마이너스 표시 해결
# 한글설정
matplotlib.rcParams['font.family'] = 'Malgun Gothic' # windows 사용자
# matplotlib.rcParams['font.family'] = 'AppleGothic Gothic' # Mac사용자
matplotlib.rcParams['font.size'] = '10' # 글자크기

In [42]:
# imdb데이터 불러오기
# 단어사전은 500개 까지 만 가져옴
# 원핫인코딩하면 500개 컬럼이 만들어짐.
(train_input,train_target),(test_input,test_target) = keras.datasets.imdb.load_data(
    num_words = 2000
)

In [55]:
train_input[0]

[1,
 73,
 89,
 81,
 25,
 60,
 967,
 6,
 20,
 141,
 17,
 14,
 31,
 127,
 12,
 60,
 28,
 1360,
 1107,
 66,
 45,
 6,
 20,
 15,
 497,
 8,
 79,
 17,
 491,
 8,
 112,
 6,
 2,
 20,
 17,
 614,
 691,
 4,
 436,
 20,
 9,
 2,
 6,
 762,
 7,
 493,
 2,
 6,
 185,
 250,
 24,
 55,
 2,
 5,
 23,
 350,
 7,
 15,
 82,
 24,
 15,
 821,
 66,
 10,
 10,
 45,
 578,
 15,
 4,
 20,
 805,
 8,
 30,
 17,
 821,
 5,
 1621,
 17,
 614,
 190,
 4,
 20,
 9,
 43,
 32,
 99,
 1214,
 18,
 15,
 8,
 157,
 46,
 17,
 1436,
 4,
 2,
 5,
 2,
 9,
 32,
 1796,
 5,
 1214,
 267,
 17,
 73,
 17,
 2,
 36,
 26,
 400,
 43,
 2,
 83,
 4,
 1873,
 247,
 74,
 83,
 4,
 250,
 540,
 82,
 4,
 96,
 4,
 250,
 2,
 8,
 32,
 4,
 2,
 9,
 184,
 2,
 13,
 384,
 48,
 14,
 16,
 147,
 1348,
 59,
 62,
 69,
 2,
 12,
 46,
 50,
 9,
 53,
 2,
 74,
 1930,
 11,
 14,
 31,
 151,
 10,
 10,
 4,
 20,
 9,
 540,
 364,
 352,
 5,
 45,
 6,
 2,
 589,
 33,
 269,
 8,
 2,
 142,
 1621,
 5,
 821,
 17,
 73,
 17,
 204,
 5,
 2,
 19,
 55,
 1763,
 2,
 92,
 66,
 104,
 14,
 20,
 93,
 76,
 1488,
 151

In [43]:
print(train_input.shape,test_input.shape)

(25000,) (25000,)


In [44]:
# 2진 분류 : 긍정 - 1, 부정 - 0
print(train_target[:20])

[1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1]


In [45]:
from sklearn.model_selection import train_test_split
train_input,val_input,train_target,val_target = train_test_split(
    train_input,train_target,test_size=0.2, random_state=42
)

In [46]:
print(train_input.shape,val_input.shape)

(20000,) (5000,)


In [32]:
lengths = np.array([len(x) for x in train_input])
lengths

array([259, 520, 290, ..., 300,  70,  77])

In [33]:
print(np.mean(lengths),np.median(lengths))

239.00925 178.0


In [34]:
np.max(lengths)

1854

In [47]:
# 시퀀스 패딩
# 글자의 수를 제한해서 없는 부분은 0으로 채워줌
# 최대 글자수 100개 제한
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_seq = pad_sequences(train_input,maxlen=300)

In [48]:
# 20000개 샘플, 크기 100
train_seq.shape

(20000, 300)

In [49]:
# 검증세트 - 시퀀스패딩
val_seq = pad_sequences(val_input,maxlen=300)

In [38]:
train_seq[0][0:20]

array([  24,   15,  821,   66,   10,   10,   45,  578,   15,    4,   20,
        805,    8,   30,   17,  821,    5, 1621,   17,  614])

단어 임베딩

In [50]:
model = keras.Sequential()
# 임베딩층 추가
model.add(keras.layers.Embedding(2000,16,input_length=300))
model.add(keras.layers.SimpleRNN(8))
model.add(keras.layers.Dense(1,activation='sigmoid'))
model.summary()

In [51]:
# 인공신경망 훈련 동일
# Flatten 필요없음.
# 원핫인코딩을 사용하지 않음. - 임베딩층 사용
# train_oh -> train_seq 데이터 사용
rmsprop = keras.optimizers.RMSprop(learning_rate=0.0001)
model.compile(optimizer=rmsprop,loss='binary_crossentropy',metrics=['accuracy'])
checkpoint_cb = keras.callbacks.ModelCheckpoint('simpleRnn_embedding_model.keras')
early_stopping_cb = keras.callbacks.EarlyStopping(patience=3,
                                                  restore_best_weights=True )
history = model.fit(train_seq,train_target,batch_size=64,epochs=100,
                validation_data=(val_seq,val_target), 
                callbacks=[checkpoint_cb,early_stopping_cb])

Epoch 1/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.5764 - loss: 0.6738 - val_accuracy: 0.7232 - val_loss: 0.5921
Epoch 2/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.7509 - loss: 0.5733 - val_accuracy: 0.7854 - val_loss: 0.5445
Epoch 3/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.7929 - loss: 0.5329 - val_accuracy: 0.8112 - val_loss: 0.5117
Epoch 4/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.8165 - loss: 0.5011 - val_accuracy: 0.8216 - val_loss: 0.4865
Epoch 5/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.8369 - loss: 0.4676 - val_accuracy: 0.8330 - val_loss: 0.4656
Epoch 6/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.8516 - loss: 0.4435 - val_accuracy: 0.8342 - val_loss: 0.4485
Epoch 7/100
[1m

In [52]:
# stop위치 출력 - 2번 전의 모델이 가장 좋음.
early_stopping_cb.stopped_epoch

17

In [53]:
model.evaluate(val_seq,val_target)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8553 - loss: 0.3647


[0.3600843548774719, 0.859000027179718]

In [54]:
# test_input
# 시퀀스 패딩
test_seq = pad_sequences(test_input,maxlen=100)
# 원핫인코딩
# test_oh = keras.utils.to_categorical(test_seq)
model.evaluate(test_seq,test_target)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8183 - loss: 0.4103


[0.4097102880477905, 0.8181999921798706]

In [None]:
# 딥러닝 훈련
# 데이터전처리 train_seq = pad_sequences(train_input,maxlen=100)

# 모델결정 model = keras.Sequential()
# 순환 신경망 (RNN)
# model2.add(keras.layers.Embedding(500,16,input_length=100))
# model.add(keras.layers.SimpleRNN(8))
# model.add(keras.layers.Dense(1,activation='sigmoid'))
# model.summary()


# 인공신경망 훈련
# 인공신경망 훈련 동일
# Flatten 필요없음. 원핫인코딩
# 모델설정 옵티마이저 - rmsprop
# rmsprop = keras.optimizers.RMSprop(learning_rate=0.0001)
# model.compile(optimizer=rmsprop,loss='binary_crossentropy',metrics=['accuracy'])
# 콜백모델체크포인트 checkpoint_cb = keras.callbacks.ModelCheckpoint('simpleRnn_model.keras')
# 종기종료 early_stopping_cb = keras.callbacks.EarlyStopping(patience=3,restore_best_weights=True )
# 모델훈련 history = model.fit(train_seq,train_target,batch_size=64,epochs=100,
#                 validation_data=(val_oh,val_target), 
#                 callbacks=[checkpoint_cb,early_stopping_cb])
# 모델평가 model.evaluate(val_seq,val_target)

# test세트 검증 - 3차원행렬로 변경해서 test진행해야 함.
# # 시퀀스 패딩
# test_seq = pad_sequences(test_input,maxlen=100)
# # 원핫인코딩
# #test_oh = keras.utils.to_categorical(test_seq)
# model.evaluate(test_seq,test_target)