# 1-3 文本数据建模流程

## 1 数据集描述
IMDB数据集的目标是根据电影评论的文本内容预测评论的情感标签
* 训练集有20000条电影评论文本
* 测试集有5000条电影评论文本
* 一般的处理步骤包括了：构建词典、编码转换、序列填充、构建数据管道
* 中文的文本处理还包括了中文的文本分割

一般TensorFlow之中的文本数据处理方式有两种
* tf.keras.preprocessing中的Tokenizer词典构建工具和tf.keras.utils.Sequence构建文本数据生成器管道
* tf.data.TextLineDataset搭配tf.keras.layers.experimental.preprocessing.TextVectorization预处理层

## 2 数据导入

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras import models, layers, preprocessing, optimizers, losses, metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re, string

In [2]:
# Locations of the tab-separated IMDB train/test files and the
# hyper-parameters shared by the input pipeline and the model below.
train_data_path = '../data/imdb/train.csv'
test_data_path = '../data/imdb/test.csv'
max_words = 1000  # vocabulary size (also the Embedding input dimension)
max_len = 200  # number of tokens kept per sample
batch_size = 20

# Separate the label from the text
def split_line(line):
    """Split one tab-separated line into a ``(text, label)`` pair.

    The first field is parsed as a number and cast to int32 to form the
    label; the second field is the raw review text.  Both results get a
    new leading axis, so each has shape ``(1,)``.
    """
    fields = tf.strings.split(line, '\t')
    label_value = tf.strings.to_number(fields[0])
    label = tf.expand_dims(tf.cast(label_value, tf.int32), axis=0)
    text = tf.expand_dims(fields[1], axis=0)
    return text, label

# Load a dataset
def load_dataset(file_path):
    """Build a batched ``tf.data`` pipeline from the text file at *file_path*.

    Each line is parsed with ``split_line``, the examples are shuffled with
    a 1000-element buffer, grouped into batches of ``batch_size`` and
    prefetched.
    """
    autotune = tf.data.experimental.AUTOTUNE
    lines = tf.data.TextLineDataset(filenames=[file_path])
    examples = lines.map(split_line, num_parallel_calls=autotune)
    shuffled = examples.shuffle(buffer_size=1000)
    return shuffled.batch(batch_size).prefetch(autotune)

# Batched (text, label) pipelines for the training and test files.
ds_train = load_dataset(train_data_path)
ds_test = load_dataset(test_data_path)

# Build the vocabulary
def clean_text(text):
    """Standardize review text before tokenization.

    Lower-cases the input, replaces every ``<br />`` tag with a space and
    removes all ASCII punctuation characters.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

# Layer that standardizes text with clean_text, splits it on whitespace and
# maps each token to an integer id, producing sequences of length max_len.
vectorize_layer = TextVectorization(
    standardize=clean_text,
    split='whitespace',
    max_tokens=max_words - 1,
    output_mode='int',
    output_sequence_length=max_len,
)

# Fit the vocabulary on the training texts only (labels are dropped).
ds_text = ds_train.map(lambda text, _label: text)
vectorize_layer.adapt(ds_text)

# Show the first 100 vocabulary entries.
vocabulary = vectorize_layer.get_vocabulary()
print(vocabulary[:100])

# Show one raw sample together with its integer encoding.
for text in ds_text.unbatch().take(1):
    print("text: ", text)
    sample_batch = tf.expand_dims(text, 0)
    print(vectorize_layer(sample_batch))

[b'the', b'and', b'a', b'of', b'to', b'is', b'in', b'it', b'i', b'this', b'that', b'was', b'as', b'for', b'with', b'movie', b'but', b'film', b'on', b'not', b'you', b'his', b'are', b'have', b'be', b'he', b'one', b'its', b'at', b'all', b'by', b'an', b'they', b'from', b'who', b'so', b'like', b'her', b'just', b'or', b'about', b'has', b'if', b'out', b'some', b'there', b'what', b'good', b'more', b'when', b'very', b'she', b'even', b'my', b'no', b'would', b'up', b'time', b'only', b'which', b'story', b'really', b'their', b'were', b'had', b'see', b'can', b'me', b'than', b'we', b'much', b'well', b'get', b'been', b'will', b'into', b'people', b'also', b'other', b'do', b'bad', b'because', b'great', b'first', b'how', b'him', b'most', b'dont', b'made', b'then', b'them', b'films', b'movies', b'way', b'make', b'could', b'too', b'any', b'after', b'characters']
text:  tf.Tensor([b"I only saw this recently but had been aware of it for a number of years and have always been intrigued by its title. It now be

In [3]:
# Build the token-id datasets
def _vectorize(text, label):
    """Replace the raw text with its token ids; the label passes through."""
    return vectorize_layer(text), label


# Keep only the first 100 elements of each pipeline to reduce the data volume.
ds_train = ds_train.map(_vectorize).take(100)
ds_test = ds_test.map(_vectorize).take(100)

In [4]:
# Build the model; it is simple enough that no class wrapper is needed.
tf.keras.backend.clear_session()

x_input = layers.Input(shape=(max_len,))
embedded = layers.Embedding(
    max_words, output_dim=7, input_length=max_len
)(x_input)  # output: (None, max_len, 7)
encoded = layers.LSTM(16)(embedded)  # output: (None, 16)
hidden = layers.Dense(32)(encoded)  # output: (None, 32)
x = layers.Dense(1, activation='sigmoid')(hidden)  # output: (None, 1)

model = tf.keras.Model(inputs=[x_input], outputs=[x])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 7)            7000      
_________________________________________________________________
lstm (LSTM)                  (None, 16)                1536      
_________________________________________________________________
dense (Dense)                (None, 32)                544       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 9,113
Trainable params: 9,113
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Train with Adam on binary cross-entropy and report accuracy.
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.binary_crossentropy,
    metrics=['accuracy'],
)
# Training takes a long time; the run below was interrupted manually.
model.fit(ds_train, epochs=20, validation_data=ds_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

KeyboardInterrupt: 