<a href="https://colab.research.google.com/github/panghanwu/machine_learning_Elwing/blob/main/RNN_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import glob
import pandas as pd

def read(path):
  with open(path, 'r', encoding='utf-8') as f:
    content = f.read()
  return content

# get path
dn = os.path.dirname(dataset)
pattern = os.path.join(dn, 'aclImdb', 'train', 'pos', '*')
pos = glob.glob(pattern)
pattern = os.path.join(dn, 'aclImdb', 'train', 'neg', '*')
neg = glob.glob(pattern)
sentiments = [1]*len(pos) + [0]*len(neg)

contents = map(read, pos+neg)


def get_data(t):
  dn = os.path.dirname(dataset)
  pattern = os.path.join(dn, "aclImdb", t, "pos", "*.txt")
  pos = glob.glob(pattern)
  pattern = os.path.join(dn, "aclImdb", t, "neg", "*.txt")
  neg = glob.glob(pattern)
  sentiments = [1] * len(pos) + [0] * len(neg)
  contents = map(read, pos + neg)
  df = pd.DataFrame({
      "contents":contents,
      "sentiment":sentiments
  })
  return df

train_df = get_data('train')
test_df = get_data('test')

# parameter
TOK = 3000
LEN = 512
EM = 128

tok = Tokenizer(num_words=TOK)
tok.fit_on_texts(train_df['contents'])

x_train_seq = tok.texts_to_sequences(train_df['contents'])
x_test_seq = tok.texts_to_sequences(test_df['contents'])

x_train_pad = pad_sequences(x_train_seq, LEN)
x_test_pad = pad_sequences(x_test_seq, LEN)

# build RNN 
layers = [
      Embedding(TOK+1, EM, mask_zero=True, input_length=LEN),
      SimpleRNN(64),
      Dense(2, activation="softmax")  
]

model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 128)          384128    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 64)                12352     
_________________________________________________________________
dense (Dense)                (None, 2)                 130       
Total params: 396,610
Trainable params: 396,610
Non-trainable params: 0
_________________________________________________________________


In [3]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model.compile(loss=SparseCategoricalCrossentropy(),
       optimizer=Adam(),
       metrics=["accuracy"])

y_train = train_df["sentiment"]
y_test = test_df["sentiment"]

callbacks = [
    ModelCheckpoint("model.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
model.fit(x_train_pad, 
     y_train,
     batch_size=100,
     epochs=50,
     validation_split=0.1,
     callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


<tensorflow.python.keras.callbacks.History at 0x7f55281d8da0>

In [4]:
model.evaluate(x_test_pad, y_test)



[0.39333149790763855, 0.8313599824905396]

不推薦RNN，無法平行運算，效果也沒有特別好。