In [None]:
import collections
import io
import time
import urllib
import urllib.request
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets
import sklearn.metrics
import tensorflow as tf

# Make printed numpy arrays easier to read: two decimal places, no scientific
# notation for small values, and lines up to 100 characters wide.
np.set_printoptions(precision=2, suppress=True, linewidth=100)

In [None]:
# Download the SMS spam dataset and turn every message into a
# (bag-of-words vector, label) pair, where the label is 1 for spam, 0 otherwise.
try:  # Prevent redownloading data if code block is repeatedly executed.
    data
except NameError:
    url = ('https://archive.ics.uci.edu/'
           + 'ml/machine-learning-databases/00228/smsspamcollection.zip')
    # Read the whole archive into memory and close the HTTP response right
    # away instead of leaving it open until it is garbage collected.
    with urllib.request.urlopen(url) as response:
        archive_bytes = response.read()

    # Parse into a temporary list. `data` is only bound at the very end, so a
    # failed download or parse leaves it undefined and the next execution of
    # this cell retries instead of silently continuing with partial data.
    messages = []
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
        # Uncomment the following lines for a description of the dataset.
        # with archive.open('readme') as f:
        #     print(f.read().decode('ISO-8859-1'))
        with archive.open('SMSSpamCollection') as f:
            for line in f:
                # The first whitespace-delimited token is the label, the
                # remainder of the line is the message text.
                label, text = line.decode('ISO-8859-1').split(maxsplit=1)
                words = tf.keras.preprocessing.text.text_to_word_sequence(text)
                messages.append((words, label == 'spam'))

    # Count all words, then restrict the vocabulary to the 300 most frequent
    # ones and assign word ids.
    vocab_size = 300
    word_counter = collections.Counter()
    for words, is_spam in messages:
        word_counter.update(words)

    # Sort by descending count; ties are broken alphabetically so the word ids
    # do not depend on the order in which words were first seen.
    sorted_word_counts = sorted(word_counter.items(),
                                key=lambda x: (-x[1], x[0]))[:vocab_size]
    id_to_word = [word for word, count in sorted_word_counts]
    word_to_id = {word: i for i, word in enumerate(id_to_word)}

    # Transform lists of words into bag-of-words vectors. Words outside the
    # vocabulary are ignored, and a message without any words (the
    # preprocessing turns the SMS ':)' into an empty list) becomes an all-zero
    # vector.
    examples = []
    for words, is_spam in messages:
        bag_of_words = np.zeros(shape=[vocab_size])
        for word in words:
            word_id = word_to_id.get(word)
            if word_id is not None:
                bag_of_words[word_id] += 1
        examples.append((bag_of_words, int(is_spam)))

    # Everything succeeded: publish the result under the guarded name.
    data = examples

# Shuffle the examples in place, then cut the list at the 60% mark: the first
# part is used for training, the remaining 40% for testing.
np.random.shuffle(data)
split_index = int(0.6 * len(data))
train_data, test_data = data[:split_index], data[split_index:]
print('Num training examples:', len(train_data))
print('Num testing examples:', len(test_data))

In [None]:
# Hyperparameters
learning_rate = 100
num_epochs = 100
num_features = len(train_data[0][0])  # Length of one bag-of-words vector.
batch_size = 100

# Model definition: logistic regression on the bag-of-words features.
# Inputs: one row of features per example and one integer label per example.
batch_x = tf.placeholder(tf.float32, shape=[None, num_features])
batch_y = tf.placeholder(tf.int32, shape=[None])

# Trainable parameters: one weight per feature plus a scalar bias.
initial_weights = tf.random_normal(shape=[num_features], mean=0, stddev=1)
w = tf.Variable(initial_weights)
b = tf.Variable(0.0)

# A positive logit is predicted as class 1, everything else as class 0.
logits = tf.tensordot(batch_x, w, axes=1) + b
is_positive = tf.greater(logits, 0)
y_prediction = tf.cast(is_positive, tf.int32)

loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=batch_y,
                                       logits=logits)
# The above is equivalent to the following (with batch_y cast to float) but
# without numerical instability.
# loss = tf.reduce_mean(batch_y * -tf.log(tf.nn.sigmoid(logits))
#                       + (1 - batch_y) * -tf.log(1 - tf.nn.sigmoid(logits)))

optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss)

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Training: mini-batch gradient descent over reshuffled training data.
    losses = []  # Average loss per epoch, kept for plotting later.
    time_before = time.time()
    for epoch in range(1, num_epochs + 1):
        np.random.shuffle(train_data)
        weighted_loss_sum = 0
        for start in range(0, len(train_data), batch_size):
            xs, ys = zip(*train_data[start:start + batch_size])
            batch_loss, _ = sess.run(
                [loss, train_op],
                feed_dict={batch_x: xs, batch_y: ys})
            # Weight by batch length so a shorter final batch does not skew
            # the epoch average.
            weighted_loss_sum += batch_loss * len(xs)
        epoch_loss = weighted_loss_sum / len(train_data)
        losses.append(epoch_loss)
        if epoch % 5 == 0:  # Report every fifth epoch.
            print('Epoch: {}, Loss: {}'.format(epoch, epoch_loss))
    time_after = time.time()
    print('Training took {:.2f}s.'.format(time_after - time_before))

    # Introspection: pull the trained parameters out of the session.
    _w, _b = sess.run((w, b))

    # Prediction: classify the complete training and test sets in one run each.
    train_xs, train_ys = zip(*train_data)
    train_ys_prediction = sess.run(y_prediction,
                                   feed_dict={batch_x: train_xs})

    test_xs, test_ys = zip(*test_data)
    test_ys_prediction = sess.run(y_prediction,
                                  feed_dict={batch_x: test_xs})

In [None]:
# Plot the average training loss of every epoch (x axis is the 0-based epoch
# index).
fig, ax = plt.subplots(dpi=150)
ax.set_title('Loss over Time')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.plot(range(len(losses)), losses, color='#458588')
plt.show()

In [None]:
def _print_scores(split_name, ys, ys_prediction):
    """Print precision, recall, F1-score and accuracy for one data split.

    Args:
        split_name: Word inserted into the printed labels, e.g. 'Training'.
        ys: Actual labels.
        ys_prediction: Predicted labels, in the same order as `ys`.
    """
    scorers = [
        ('Precision', sklearn.metrics.precision_score),
        ('Recall', sklearn.metrics.recall_score),
        ('F1-Score', sklearn.metrics.f1_score),
        ('Accuracy', sklearn.metrics.accuracy_score),
    ]
    for metric_name, scorer in scorers:
        print('{} on {} data:'.format(metric_name, split_name),
              scorer(ys, ys_prediction))


def _plot_class_distribution(split_name, num_spam, num_spam_prediction,
                             num_ham, num_ham_prediction):
    """Draw a bar chart comparing actual and predicted class counts.

    Args:
        split_name: Word inserted into the figure title, e.g. 'Training'.
        num_spam: Number of examples actually labelled spam.
        num_spam_prediction: Number of examples predicted as spam.
        num_ham: Number of examples actually labelled ham.
        num_ham_prediction: Number of examples predicted as ham.
    """
    plt.figure(dpi=150)
    plt.title(
        'Class Distribution Actual vs Predicted: {} Data'.format(split_name))
    plt.ylabel('Frequency')
    # The x positions leave a gap between the spam pair and the ham pair.
    # Only two colors are given for four bars; presumably matplotlib repeats
    # them so actual and predicted bars alternate colors.
    plt.bar([1, 2, 3.5, 4.5],
            [num_spam, num_spam_prediction, num_ham, num_ham_prediction],
            tick_label=['Actual Spam', 'Predicted Spam',
                        'Actual Ham', 'Predicted Ham'],
            color=['#458588', '#CC241D'])
    plt.show()


# Scores for both splits, separated by a blank line.
_print_scores('Training', train_ys, train_ys_prediction)
print()
_print_scores('Testing', test_ys, test_ys_prediction)

# Labels are 1 for spam and 0 for ham, so summing counts the spam examples and
# the remainder is ham.
train_num_spam = np.sum(train_ys)
train_num_spam_prediction = np.sum(train_ys_prediction)
train_num_ham = len(train_ys) - train_num_spam
train_num_ham_prediction = len(train_ys_prediction) - train_num_spam_prediction
test_num_spam = np.sum(test_ys)
test_num_spam_prediction = np.sum(test_ys_prediction)
test_num_ham = len(test_ys) - test_num_spam
test_num_ham_prediction = len(test_ys_prediction) - test_num_spam_prediction

_plot_class_distribution('Training',
                         train_num_spam, train_num_spam_prediction,
                         train_num_ham, train_num_ham_prediction)
_plot_class_distribution('Testing',
                         test_num_spam, test_num_spam_prediction,
                         test_num_ham, test_num_ham_prediction)

In [None]:
# A larger weight pushes the logit towards the spam class, so rank the word
# ids by descending weight and keep the top 50.
most_spammy_words = np.argsort(_w)[::-1][:50]
print('Most Spammy Words:')
for rank, word_id in enumerate(most_spammy_words, start=1):
    print('{:2d}. {}'.format(rank, id_to_word[word_id]))