In [1]:
################
### PREAMBLE ###
################

from __future__ import division
import tensorflow as tf
import numpy as np
import tarfile
import os
import matplotlib.pyplot as plt
import time


In [2]:
###################
### IMPORT DATA ###
###################

def csv_to_numpy_array(filePath, delimiter):
    """Read a delimited text file into a NumPy array.

    Args:
        filePath: source to read; forwarded unchanged to ``np.genfromtxt``.
        delimiter: string that separates the columns on each line.

    Returns:
        The array produced by ``np.genfromtxt``. ``dtype=None`` asks NumPy to
        infer the column types from the file contents.
    """
    parsed = np.genfromtxt(filePath, dtype=None, delimiter=delimiter)
    return parsed

def import_data():
    """Load the train/test feature and label matrices from ``data/``.

    If the current working directory has no entry named ``data``, the archive
    ``data.tar.gz`` is extracted into it first. The four tab-separated files
    ``data/trainX.csv``, ``data/trainY.csv``, ``data/testX.csv`` and
    ``data/testY.csv`` are then parsed with ``csv_to_numpy_array``.

    Returns:
        tuple: ``(trainX, trainY, testX, testY)`` as NumPy arrays.
    """
    if "data" not in os.listdir(os.getcwd()):
        # Untar directory of data if we haven't already. The context manager
        # closes the archive even if extraction fails part-way through.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive, so only run this against a trusted data.tar.gz.
        with tarfile.open("data.tar.gz") as tarObject:
            tarObject.extractall()
        print("Extracted tar to current directory")

    print("loading training data")
    trainX = csv_to_numpy_array("data/trainX.csv", delimiter="\t")
    trainY = csv_to_numpy_array("data/trainY.csv", delimiter="\t")
    print("loading test data")
    testX = csv_to_numpy_array("data/testX.csv", delimiter="\t")
    testY = csv_to_numpy_array("data/testY.csv", delimiter="\t")
    return trainX, trainY, testX, testY

# Load the four data matrices once; the cells below read them as globals.
(trainX, trainY,
 testX, testY) = import_data()

loading training data
loading test data


In [3]:
# Training hyper-parameters
# NOTE(review): none of these four values is read by the other cells shown in
# this notebook (the optimizer is built with a literal 0.01 and the training
# loop is driven by numEpochs) -- confirm whether they are still needed.
learning_rate = 1e-3
training_iters = 20000
batch_size = 128
display_step = 10

In [4]:
#########################
### GLOBAL PARAMETERS ###
#########################

# DATA SET PARAMETERS
# Get our dimensions for our different variables and placeholders:
# numFeatures = the number of words extracted from each email
# Second dimension of the feature matrix = number of input features per row
_, numFeatures = trainX.shape[0], trainX.shape[1]
# numLabels = number of classes we are predicting (here just 2: Ham or Spam);
# taken from the second dimension of the label matrix
_, numLabels = trainY.shape[0], trainY.shape[1]

# TRAINING SESSION PARAMETERS
# number of times we iterate through training data
# tensorboard shows that accuracy plateaus at ~25k epochs
#numEpochs = 27000
# a smarter learning rate for gradientOptimizer
#learningRate = tf.train.exponential_decay(learning_rate=0.0008,
                                          #global_step= 1,
                                          #decay_steps=trainX.shape[0],
                                          #decay_rate= 0.95,
                                          #staircase=True)

In [5]:
# Display the number of label columns read from trainY (trainY.shape[1]).
numLabels

2

In [6]:
####################
### PLACEHOLDERS ###
####################

# X: feature matrix fed in at run time, one row per email and one column per
# feature. The leading dimension is None so any number of rows can be fed.
X = tf.placeholder(dtype=tf.float32,
                   shape=[None, numFeatures])
# yGold: matrix of correct labels, one row per email: [1,0] for SPAM or [0,1]
# for HAM. The leading dimension is None so any number of rows can be fed.
yGold = tf.placeholder(dtype=tf.float32,
                       shape=[None, numLabels])

In [7]:
#################
### VARIABLES ###
#################

# Values are randomly sampled from a zero-mean Gaussian with a standard
# deviation of:
#     sqrt(6 / (numInputNodes + numOutputNodes + 1))
# The denominator has to be parenthesised: `6/numFeatures+numLabels+1` is
# evaluated as (6/numFeatures) + numLabels + 1, which is not this formula.
init_stddev = np.sqrt(6.0 / (numFeatures + numLabels + 1))

# `name` is given to tf.Variable (not to the random_normal initializer op) so
# that the variables themselves are called "weights" and "bias" in the graph.
weights = tf.Variable(tf.random_normal([numFeatures, numLabels],
                                       mean=0,
                                       stddev=init_stddev),
                      name="weights")

bias = tf.Variable(tf.random_normal([1, numLabels],
                                    mean=0,
                                    stddev=init_stddev),
                   name="bias")

In [8]:
######################
### PREDICTION OPS ###
######################

# Op that initializes all variables (weights and bias) when run in a session
init_OP = tf.initialize_all_variables()

# FEEDFORWARD PASS: activation = sigmoid(X . weights + bias)
apply_weights_OP = tf.matmul(
    X, weights,
    name="apply_weights")
add_bias_OP = tf.add(
    apply_weights_OP, bias,
    name="add_bias")
activation_OP = tf.nn.sigmoid(
    add_bias_OP,
    name="activation")

In [12]:
#####################
### EVALUATION OP ###
#####################

# COST FUNCTION i.e. HALVED SUM OF SQUARED ERRORS
# tf.nn.l2_loss(t) computes sum(t ** 2) / 2 over all elements, so this is a
# summed (not averaged) squared error between predictions and gold labels.
cost = tf.nn.l2_loss(
    activation_OP - yGold,
    name="squared_error_cost",
)
#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(activation_OP, yGold))

In [13]:
#######################
### OPTIMIZATION OP ###
#######################

# TRAINING OP: one plain gradient-descent update of `cost` per run
# NOTE(review): the step size is the literal 0.01, not the `learning_rate`
# value defined in the parameters cell -- confirm which one is intended.
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=0.01
).minimize(cost)
#optimizer = tf.train.AdamOptimizer(learning_rate=0.1).minimize(cost)

# A row is correct when the highest-scoring output column matches the column
# holding the largest value in the gold label row.
correct_pred = tf.equal(
    tf.argmax(activation_OP, 1),
    tf.argmax(yGold, 1),
)
# Accuracy = fraction of correct rows (booleans cast to float, then averaged)
accuracy = tf.reduce_mean(tf.cast(correct_pred, "float"))

In [14]:
# Number of gradient-descent updates; each one is fed the whole training set
numEpochs = 2000
# Training-set error (1 - accuracy) recorded after every update
errors = []
# The same full-batch feed is used for every update and every evaluation
train_feed = {X: trainX, yGold: trainY}

# Launch the graph
with tf.Session() as sess:
    sess.run(init_OP)
    print('Initialized Session.')
    for epoch in range(numEpochs):
        # apply one optimizer update
        sess.run(optimizer, feed_dict=train_feed)
        # then measure accuracy on the training data with the updated variables
        train_accuracy = sess.run(accuracy, feed_dict=train_feed)
        errors.append(1 - train_accuracy)
    print('Optimization Finished!')
    test_accuracy = sess.run(accuracy, feed_dict={X: testX, yGold: testY})
    print("final accuracy on test set: %s" % str(test_accuracy))

Initialized Session.
Optimization Finished!
final accuracy on test set: 0.933333
