In [1]:
import tensorflow as tf

# Download (and cache) the IMDB review archive, then unpack it.
# The archive is fetched over HTTPS rather than plain HTTP so that the tarball
# which gets extracted onto the local disk cannot be altered in transit.
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

In [2]:
import os
import glob
import pandas as pd

def read(fp):
    """Return the full contents of the text file at ``fp``.

    The file is opened in text mode and decoded as UTF-8; the whole file is
    returned as a single string.
    """
    with open(fp, mode="r", encoding="utf-8") as handle:
        return handle.read()

def read_data(base):
    """Load the reviews stored under ``base`` into a DataFrame.

    Every entry matched by ``<base>/neg/*`` is labelled 0 and every entry
    matched by ``<base>/pos/*`` is labelled 1. Rows are ordered with all
    negative entries first, followed by all positive entries.

    Returns a DataFrame with the columns ``path`` (file path), ``target``
    (0 or 1) and ``content`` (the file's text, loaded with ``read``).
    """
    neg_paths = glob.glob(os.path.join(base, "neg", "*"))
    pos_paths = glob.glob(os.path.join(base, "pos", "*"))

    labels = [0] * len(neg_paths) + [1] * len(pos_paths)
    frame = pd.DataFrame({"path": neg_paths + pos_paths, "target": labels})

    # Read each file's text into its own column.
    frame["content"] = frame["path"].apply(read)
    return frame

In [None]:
# Locate the directory that contains the unpacked "aclImdb" folder.
# Depending on the Keras version, get_file(..., extract=True) returns either
# the path of the downloaded archive (the data is unpacked next to it) or the
# extraction directory itself. Handle both layouts; assuming only the first
# one would make the globs in read_data match nothing and silently produce
# empty frames on the other.
dirname = dataset if os.path.isdir(dataset) else os.path.dirname(dataset)

# Training split.
base = os.path.join(dirname, "aclImdb", "train")
train_df = read_data(base)

# Test split.
base = os.path.join(dirname, "aclImdb", "test")
test_df = read_data(base)
test_df

In [None]:
# Preprocessing step 1: turn the text into numbers.
from tensorflow.keras.preprocessing.text import Tokenizer

# Rare words can be ignored: `num_words` restricts the vocabulary to the most
# frequent words (per the Keras docs, the top num_words - 1 are kept when
# texts are converted to sequences).
tok = Tokenizer(
    num_words=2000,
)
# Build the word index from the training reviews only.
tok.fit_on_texts(train_df["content"])

In [None]:
# To inspect the index assigned to each word, look at: tok.word_index

In [None]:
# Convert the reviews of both splits into lists of word indices using the
# tokenizer fitted above (train first, then test).
x_train_seq, x_test_seq = (
    tok.texts_to_sequences(frame["content"]) for frame in (train_df, test_df)
)
pd.DataFrame(x_train_seq)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every sequence of both splits to a fixed length of 256 so they can be
# stacked into a single 2-D array per split.
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=256) for sequences in (x_train_seq, x_test_seq)
)
pd.DataFrame(x_train_pad)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dropout, Dense

# Embedding -> flatten -> one hidden dense layer -> 2-way softmax classifier.
layers = [
    # Embedding table size: 2001 * 64 = 128064 parameters.
    # NOTE(review): mask_zero=True is followed directly by Flatten/Dense;
    # confirm the mask has any effect in this architecture.
    Embedding(input_dim=2001, output_dim=64, mask_zero=True, input_length=256),
    Flatten(),
    Dense(units=256, activation="relu"),
    Dropout(rate=0.25),
    # Two output units, one per class (target 0 / target 1).
    Dense(units=2, activation="softmax"),
]

model = Sequential(layers=layers)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the monitored metric (validation loss, the Keras default)
# has not improved for 3 consecutive epochs, and roll the model back to the
# weights of its best epoch.
callbacks = [
    EarlyStopping(
        monitor="val_loss",
        patience=3,
        restore_best_weights=True,
    ),
]

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Targets are integer class ids (0 / 1) and the network ends in a softmax,
# hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [None]:
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Keras takes the `validation_split` fraction from the END of the arrays,
# before any shuffling. read_data() stores every negative review ahead of
# every positive one, so without reordering the last 10% would consist of
# positive reviews only, and that one-class validation set would steer
# EarlyStopping. Shuffle features and labels together first, with a fixed seed
# so the split is reproducible.
y_train = np.array(train_df['target'])
shuffle_idx = np.random.default_rng(42).permutation(len(y_train))

model.fit(x_train_pad[shuffle_idx],
          y_train[shuffle_idx],
          batch_size=200,
          epochs=100,  # upper bound; EarlyStopping normally ends training sooner
          validation_split=0.1,
          verbose=2,
          callbacks=callbacks)

In [None]:
# Score the trained model on the held-out test split; the result holds the
# loss followed by the metrics passed to compile().
model.evaluate(
    x=x_test_pad,
    y=np.array(test_df["target"]),
)