## 引入依赖

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 训练数据

In [2]:
df = pd.read_csv('./labeledTrainData.tsv', sep='\t')
df

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


## 切分数据

In [3]:
train, test = train_test_split(df, test_size=0.15, random_state=41)
train, val = train_test_split(train, test_size=0.15, random_state=41)

## 处理文本

文本分词，保留词频最高的5000个词，用数字编号，得到词表

In [4]:
# 英文分词，保留频次最高的5000个词
vocab_size = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train['review'])

# 补充：如果中文则需要自己对样本先jieba分词，然后通过tokenizer.fit_on_sequences输入分词列表即可，后续都是一样的。

In [5]:
# 查看词表
conf = tokenizer.get_config()
conf.keys() 
# index_word: 词ID -> 词
# word_index: 词 -> 词ID

dict_keys(['num_words', 'filters', 'lower', 'split', 'char_level', 'oov_token', 'document_count', 'word_counts', 'word_docs', 'index_docs', 'index_word', 'word_index'])

词表可以导出json，工程侧可以加载词表进行文本预处理

In [6]:
# 保存词表为Json字符串
json_conf = tokenizer.to_json()

# 从json字符串加载词表
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json_conf)

In [7]:
# 根据词表将样本转换为词id列表
train_review = tokenizer.texts_to_sequences(train['review'])
test_review = tokenizer.texts_to_sequences(test['review'])

In [8]:
#train_review

每个样本仅保留首先出现的256个词

In [9]:
# 每条评论只保留256个词
train_review = tf.keras.preprocessing.sequence.pad_sequences(train_review, value=0, padding='post',maxlen=256)
test_review = tf.keras.preprocessing.sequence.pad_sequences(test_review, value=0, padding='post',maxlen=256)

In [10]:
#train_review

## 转换dataset

In [11]:
# 处理成dataset
train_ds = tf.data.Dataset.from_tensor_slices(({'review': train_review}, train['sentiment'])).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices(({'review': test_review}, test['sentiment'])).batch(32)
#for features, label in train_ds:
#    print(features, label)

## 特征预处理（embedding)

In [12]:
# 特征预处理
review_cate_col = tf.feature_column.categorical_column_with_vocabulary_list('review', range(vocab_size+1),default_value=0)
review_embedding_col = tf.feature_column.embedding_column(review_cate_col, dimension=16)

## 构造模型

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures([review_embedding_col]), # 预处理层
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'), # sigmoid激活到0~1以拟合目标值
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # 用交叉熵算损失

## 训练模型

In [14]:
model.fit(train_ds, batch_size=512, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fe4ecd272d0>

## 评估模型

In [15]:
model.evaluate(test_ds, verbose=2)

118/118 - 0s - loss: 1.3793 - accuracy: 0.8605


[1.3792898654937744, 0.8605333566665649]