### 데이터 준비

In [9]:
import requests

# Download the zipped IMDB review dataset and save it as 'imdb.zip' in the
# current working directory.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(
    'https://github.com/euphoris/datasets/raw/master/imdb.zip',
    timeout=30,
)
# Fail here on an HTTP error (404, 5xx, ...) so that an error page is never
# written to disk under the name 'imdb.zip'.
res.raise_for_status()

with open('imdb.zip', 'wb') as f:
    f.write(res.content)

In [1]:
import pandas as pd

# Load the review data from the 'imdb.zip' archive into a DataFrame.
# Later cells read its 'review' column (text) and 'sentiment' column (label).
df = pd.read_csv('imdb.zip')

In [2]:
import tensorflow as tf

# Word-level tokenizer: num_words=2000 caps the vocabulary used when encoding,
# and '<unk>' is the token substituted for out-of-vocabulary words.
tk = tf.keras.preprocessing.text.Tokenizer(
    num_words=2000,
    oov_token='<unk>',
)

2022-11-07 09:40:30.062000: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-07 09:40:30.062033: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# Build the tokenizer's vocabulary from every review in the DataFrame.
# NOTE(review): this runs before the train/test split, so the vocabulary also
# reflects reviews that later end up in the test set.
tk.fit_on_texts(df['review'])

In [4]:
import joblib

# Persist the fitted tokenizer to 'tokenizer.pkl'.
# The bare call is the cell's last expression, so its return value (the list
# of written file names) is shown as the cell output.
joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

In [5]:
import joblib

# Restore the tokenizer from 'tokenizer.pkl', rebinding `tk`.
# NOTE(review): joblib.load unpickles the file, so only load files that come
# from a trusted source.
tk = joblib.load('tokenizer.pkl')

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
review_train, review_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Encode each training review as a list of integer word indices.
seqs = tk.texts_to_sequences(review_train)

In [None]:
# Encode the held-out test reviews with the same tokenizer.
seqs_test = tk.texts_to_sequences(review_test)

In [8]:
# Display the first training review as raw text.
review_train.iloc[0]

'It is an insane game.'

In [9]:
# Display the first three encoded training reviews (lists of word indices).
seqs[:3]

[[9, 6, 33, 1258, 214],
 [178, 5, 28, 35, 23, 168, 713, 591, 3, 713, 1, 10, 1, 280],
 [206, 336, 4]]

### 순방향 순환신경망

In [39]:
# maxlen: None (default); with e.g. 10, sequences are cut/padded to length 10.
# padding: 'pre' (default) or 'post' (zeros appended at the end); 'pre' is recommended.
# truncating: 'pre' (default); only takes effect when maxlen is set.
#             Choose whichever performs better after testing.

import tensorflow as tf

# Pad the encoded training reviews into one rectangular integer array.
pads = tf.keras.preprocessing.sequence.pad_sequences(
    seqs,
    maxlen=None,
    padding='pre',
    truncating='pre',
)

In [None]:
# Pad the encoded test reviews with the same settings as the training data.
# NOTE(review): maxlen is None here as well, so the width of this array is
# determined by the test sequences and may differ from the training array's.
pads_test = tf.keras.preprocessing.sequence.pad_sequences(
    seqs_test,
    maxlen=None,
    padding='pre',
    truncating='pre',
)

In [11]:
# Display the number of padded training sequences and the array's shape.
len(pads), pads.shape

(800, (800, 73))

In [12]:
# Vocabulary size passed as the first argument of every Embedding layer below:
# the tokenizer's num_words setting plus one.
NUM_WORDS = tk.num_words + 1

In [27]:
# Forward LSTM sentiment classifier: embedding -> LSTM -> sigmoid output.
model = tf.keras.Sequential()
# mask_zero=True tells the following layers to ignore the 0-valued padding.
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
model.add(tf.keras.layers.LSTM(8))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [28]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 8)           16008     
                                                                 
 lstm_3 (LSTM)               (None, 8)                 544       
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 16,561
Trainable params: 16,561
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# the Adam optimizer, and accuracy as the reported metric.
# `metrics` is passed as a list, which is the documented form for Model.compile.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [30]:
# Train for 10 epochs on the padded training sequences and their labels.
# The bare call is the cell's last expression, so the returned History object
# is shown as the cell output.
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f431d088e20>

In [None]:
# Evaluate the trained model on the held-out test sequences and labels.
model.evaluate(pads_test, y_test.values)

### 역방향 순환신경망
역방향 순환신경망에서는 패딩을 post(뒤쪽)로, 순방향 순환신경망에서는 pre(앞쪽)로 처리하는 것이 좋음. 두 경우 모두 패딩이 먼저 처리되고 실제 단어가 마지막에 처리되도록 하기 위함.

#### padding = 'pre'

In [40]:
# Backward LSTM sentiment classifier: embedding -> reversed LSTM -> sigmoid output.
# Rebinds `model`, replacing the previously built network.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
# go_backwards=True makes the LSTM process each sequence in reverse order.
model.add(tf.keras.layers.LSTM(8, go_backwards=True))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [41]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# the Adam optimizer, and accuracy as the reported metric.
# `metrics` is passed as a list, which is the documented form for Model.compile.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
# Train the backward LSTM for 10 epochs on `pads` and the training labels.
# NOTE(review): `pads` holds whatever the most recently executed padding cell
# produced; this section's heading expects the 'pre'-padded array.
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f42ec4d61a0>

#### padding = 'post'

In [44]:
import tensorflow as tf

# Re-pad the training sequences with zeros appended at the end ('post').
# This rebinds `pads`, so every later cell that reads `pads` sees the
# post-padded array rather than the earlier pre-padded one.
pads = tf.keras.preprocessing.sequence.pad_sequences(
    seqs,
    padding='post',
)

In [45]:
# Backward LSTM sentiment classifier, rebuilt from scratch for the
# post-padding experiment: embedding -> reversed LSTM -> sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
# go_backwards=True makes the LSTM process each sequence in reverse order.
model.add(tf.keras.layers.LSTM(8, go_backwards=True))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [46]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# the Adam optimizer, and accuracy as the reported metric.
# `metrics` is passed as a list, which is the documented form for Model.compile.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [47]:
# Train the backward LSTM for 10 epochs on the post-padded `pads` array.
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f42cb484e80>

### 양방향 순환신경망

In [48]:
# Bidirectional LSTM sentiment classifier:
# embedding -> Bidirectional(LSTM) -> sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, 8, mask_zero=True))
# The Bidirectional wrapper runs the wrapped LSTM over the sequence in both directions.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [49]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, None, 8)           16008     
                                                                 
 bidirectional (Bidirectiona  (None, 16)               1088      
 l)                                                              
                                                                 
 dense_8 (Dense)             (None, 1)                 17        
                                                                 
Total params: 17,113
Trainable params: 17,113
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Configure training: binary cross-entropy loss, the Adam optimizer, and
# accuracy as the reported metric.
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [53]:
# Train the bidirectional model for 10 epochs.
# NOTE(review): `pads` was last rebound by the 'post' padding cell, so when the
# notebook runs top to bottom this trains on post-padded input.
model.fit(pads, y_train.values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f42c87f3d60>