# 使用TensorFlow实现RNN模型进行垃圾短信预测

In [3]:
import os
import re
import io
import csv
import requests
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from zipfile import ZipFile
from tensorflow.python.framework import ops
ops.reset_default_graph()

sess = tf.Session()

### RNN config parameters
短信固定长度为25，超过部分被截取，不够的部分用0填充<p>
RNN模型有10个单元组成

In [4]:
epochs = 50
batch_size = 250
max_sequence_length = 25
rnn_size = 10
embedding_size = 50
min_word_frequency = 10
learning_rate = 0.0005
dropout_keep_prob = tf.placeholder(tf.float32)

这个数据用的是06节NLP中的垃圾短信。可以直接调用。

In [5]:
save_file_name = os.path.join('temp','temp_spam_data.csv')

# Create directory if it doesn't exist
if not os.path.exists('temp'):
    os.makedirs('temp')

if os.path.isfile(save_file_name):
    text_data = []
    with open(save_file_name, 'r',newline='') as temp_output_file:
        reader = csv.reader(temp_output_file)
        for row in reader:
            text_data.append(row)
else:
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('SMSSpamCollection')

    text_data = file.decode()
    text_data = text_data.encode('ascii',errors='ignore')
    text_data = text_data.decode().split('\n')
    text_data = [x.split('\t') for x in text_data if len(x)>=1]
    
    with open(save_file_name, 'w',newline='') as temp_output_file:
        writer = csv.writer(temp_output_file)
        writer.writerows(text_data)

text_data_train = [x[1] for x in text_data]
target = [x[0] for x in text_data]

数据清理,单词构建索引，用索引来表示文档

In [6]:
def clean_text(text_string):
    text_string = re.sub(r'([^\s\w]|_|[0-9])+', '', text_string)
    text_string = " ".join(text_string.split())
    text_string = text_string.lower()
    return text_string

text_data_train = [clean_text(x) for x in text_data_train]

vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sequence_length,
                                                                     min_frequency=min_word_frequency)
text_processed = np.array(list(vocab_processor.fit_transform(text_data_train)))

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.


打乱数据集

In [7]:
text_processed = np.array(text_processed)
text_data_target = np.array([1 if x == 'ham' else 0 for x in target])
shuffled_index = np.random.permutation(np.arange(len(target)))
x_shuffled = text_processed[shuffled_index]
y_shuffled = text_data_target[shuffled_index]

分割数据集 80%/20%<p>
也可以用scikit-learn的train_test_split(）

In [8]:
ix_cutoff = int(len(y_shuffled)*0.80)
x_train, x_test = x_shuffled[:ix_cutoff], x_shuffled[ix_cutoff:]
y_train, y_test = y_shuffled[:ix_cutoff], y_shuffled[ix_cutoff:]
vocab_size = len(vocab_processor.vocabulary_)
print("Vocabulary Size: {:d}".format(vocab_size))
print("80%-20% Train Test split: {:d} -- {:d}".format(len(y_train), len(y_test)))

Vocabulary Size: 933
80%-20% Train Test split: 4459 -- 1115


占位符，词嵌入

In [9]:
x_data = tf.placeholder(shape=[None,max_sequence_length], dtype=tf.float32)
y_target = tf.placeholder(shape=[None], dtype=tf.int32)

embedding_mat = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), dtype=tf.float32)
embedding_output = tf.nn.embedding_lookup(embedding_mat, x_data)

TypeError: Value passed to parameter 'indices' has DataType float32 not in list of allowed values: int32, int64