In [74]:
# Deep Learning
# Let's build a spam classifier

import pandas as pd

# Load the labelled SMS messages; the file is decoded with the latin-1 codec.
dataset = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

# Preview the first rows as plain text.
preview = dataset.head()
print(preview)

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [75]:
# Replace every missing value with an empty string.
# Rebinding the name (instead of mutating in place) keeps the cell easy to chain.
dataset = dataset.fillna(value='')


In [76]:
# List the column names found in the CSV file
column_names = dataset.columns.values
print(column_names)

['v1' 'v2' 'Unnamed: 2' 'Unnamed: 3' 'Unnamed: 4']


In [77]:
# Pick the model input (message text) and the target (label) columns
totalX = dataset['v2']  # SMS text
totalY = dataset['v1']  # spam or ham

In [78]:
import tflearn
from tflearn.data_utils import pad_sequences, VocabularyProcessor

import numpy as np

# Convert the strings in the input into integers corresponding to the dictionary positions.
# The processor already pads every message to the fixed length with zeros,
# so there is no need to call pad_sequences manually.
MAX_DOCUMENT_LENGTH = 30  # max document length
vocab_proc = VocabularyProcessor(MAX_DOCUMENT_LENGTH)
encoded_messages = list(vocab_proc.fit_transform(totalX))
totalX = np.array(encoded_messages)

In [79]:
# Display the encoded messages: one row of vocabulary indices per SMS, zero-padded
totalX

array([[    1,     2,     3, ...,     0,     0,     0],
       [   21,    22,    23, ...,     0,     0,     0],
       [   27,    28,     8, ...,    49,    50,    51],
       ..., 
       [11153,   415,     8, ...,     0,     0,     0],
       [  174,  3484,   286, ...,     0,     0,     0],
       [ 4643,   660,  1641, ...,     0,     0,     0]])

In [80]:
# We will have 2 classes in total for prediction, indices from 0 to 1.
# Each label is a single word, so a document length of 1 is enough; the
# resulting ids are shifted down by one so that the classes become 0 and 1.
vocab_proc2 = VocabularyProcessor(1)
label_ids = list(vocab_proc2.fit_transform(totalY))
totalY = np.array(label_ids) - 1

In [81]:
# Display the encoded labels: one class index (0 or 1) per SMS
totalY

array([[0],
       [0],
       [1],
       ..., 
       [0],
       [0],
       [0]])

In [82]:
# keras.utils.np_utils is an internal module path that is not available in
# recent Keras releases; prefer the public location and fall back to the old
# path only for Keras versions that predate it.
try:
    from keras.utils import to_categorical
except ImportError:
    from keras.utils.np_utils import to_categorical

# here totalY holds one class index per message (0 or 1)
# Convert the indices into 2 dimensional one-hot vectors.
# NOTE: this overwrites totalY, so run this cell only once after the
# label-encoding cell; re-running it would encode the one-hot matrix again.
totalY = to_categorical(totalY, 2)
# here totalY is a binary matrix
totalY

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [83]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Split into training and testing data (10% is held out for testing).
# No random_state is passed, so the shuffled split differs on every run.
trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)

# Show the first encoded message and the first one-hot label of each partition
print(trainX[0])
print(testX[0])
print(trainY[0])
print(testY[0])

[  330   118   555   111    63 10566    77   118   771  5257    81    63
  5591    53   583   648   111    91  4389     0     0     0     0     0
     0     0     0     0     0     0]
[ 452  548 1348   33  150 5709    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
[ 1.  0.]
[ 1.  0.]


In [86]:
import tensorflow as tf

# Clear the default TensorFlow graph before (re)building the network.
tf.reset_default_graph()

# Network building
SEQUENCE_LENGTH = 30     # words per message, same length given to the VocabularyProcessor
VOCABULARY_SIZE = 20000  # dictionary has 20k words max
EMBEDDING_SIZE = 128     # size of each dense word vector
LSTM_UNITS = 128

# 30 words max, so 30 input values per message
net = tflearn.input_data([None, SEQUENCE_LENGTH])

# Turns positive integers (indexes) into dense vectors of fixed size.
net = tflearn.embedding(
    net,
    input_dim=VOCABULARY_SIZE,
    output_dim=EMBEDDING_SIZE,
)

# Long Short Term Memory Recurrent Layer.
# Each input has a size of 30x128 and each of these 128-sized
# vectors is fed into the LSTM layer one at a time.
net = tflearn.lstm(net, LSTM_UNITS, dropout=0.8)

# The LSTM output is then sent to a fully connected layer that gives us our final 2 classes
net = tflearn.fully_connected(net, 2, activation='softmax')

# We use the adam optimizer instead of standard SGD since it converges much faster
net = tflearn.regression(
    net,
    optimizer='adam',
    learning_rate=0.001,
    loss='categorical_crossentropy',
)

In [87]:


# Training
# Wrap the network definition in a trainable model.
model = tflearn.DNN(net, tensorboard_verbose=0)

# uncomment to load saved model
# model.load('spam_model.tfl')

# comment to use saved model
held_out = (testX, testY)  # validated against after every epoch
model.fit(
    trainX,
    trainY,
    validation_set=held_out,
    show_metric=True,
    batch_size=32,
)

# Persist the trained weights so they can be reloaded with model.load above.
model.save('spam_model.tfl')

print("done training")

Training Step: 1569  | total loss: 0.00039 | time: 12.106s
| Adam | epoch: 010 | loss: 0.00039 - acc: 1.0000 -- iter: 4992/5014
Training Step: 1570  | total loss: 0.00040 | time: 13.187s
| Adam | epoch: 010 | loss: 0.00040 - acc: 1.0000 | val_loss: 0.09412 - val_acc: 0.9821 -- iter: 5014/5014
--
INFO:tensorflow:/Users/miquel/dev/github/deep-learning/spam_model.tfl is not in all_model_checkpoint_paths. Manually adding it.
done training


In [42]:
# Test predictions

testIdx = 11

# Predict on totalX, not trainX: totalX[testIdx] is the encoded form of
# dataset.v2[testIdx], whereas trainX was shuffled by train_test_split and its
# rows no longer line up with the dataset or with totalY.
# Reshape to a batch of exactly one message; -1 keeps whatever sequence length
# the VocabularyProcessor produced instead of hard-coding a stale value.
# NOTE: this message may have been part of the training split, so this is a
# sanity check rather than an evaluation.
prediction = model.predict(np.reshape(totalX[testIdx], (1, -1)))

print(dataset.v2[testIdx])   # original SMS text
print(dataset.v1[testIdx])   # original label (spam / ham)
print(totalY[testIdx])       # one-hot label
print(prediction[0])         # predicted [ham, spam] probabilities

SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info
spam
[ 0.  1.]
[ 0.00839915  0.99160087]


In [44]:
# A spam SMS (the same text as dataset row 11 printed by the test-prediction cell)
# stored as manual input, then echoed back for inspection.
user_input = "SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"
user_input

'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info'

In [72]:
user_input = "Hello Bye WIN CASH TODAY FREE pounds info ajd;asdasd"

# Transform manual input into vocabulary numbers.
# Use transform(), not fit_transform(): the vocabulary was already fitted on
# the training messages and must not be re-fitted on the text being classified.
# Words that are not in the vocabulary are mapped to index 0.
input_array = np.array(list(vocab_proc.transform(np.array([user_input]))))

# reverse to check if the input matches what the dictionary has
reverse = np.array(list(vocab_proc.reverse(input_array)))
print(input_array)
print(reverse)

# perform prediction: column 0 is the ham probability, column 1 the spam probability
prediction = model.predict(input_array)
print("Ham: {0:.2f}".format(prediction[0][0]))
print("Spam: {0:.2f}".format(prediction[0][1]))

[[ 481 8041 2311  200 2048  177  205  218    0    0    0    0    0    0
     0]]
[ 'Hello Bye WIN CASH TODAY FREE pounds info <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>']
Ham: 0.14
Spam: 0.86
