In [46]:
import codecs
import os

import numpy as np
import pandas as pd
import scipy as sp

from keras import metrics
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer

from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC  


from time import time

First we need to retrieve the files that will be used for training. Let's store the file contents and their labels (spam or ham) in two parallel lists:

In [47]:
def get_training_files(dir):
    """Collect the paths of all spam/ham data files under a directory tree.

    :param dir: root directory to search recursively
    :return: sorted list of paths (each joined onto the directory it was found
             in) for every file whose name contains 'spam' or 'ham'; an empty
             list if nothing matches
    """
    matches = []
    for root, _dirs, files in os.walk(dir):
        for name in files:
            if 'spam' in name or 'ham' in name:
                matches.append(os.path.join(root, name))
    # os.walk yields entries in an arbitrary, file-system-dependent order;
    # sorting makes the result identical from run to run and machine to machine.
    return sorted(matches)

all_files = 'data'  # root directory that is searched for spam/ham files
train_data = []
labels = []
for f in get_training_files(all_files):
    with codecs.open(f, 'r', encoding='utf-8', errors='ignore') as fdata:
        # Flatten each message onto a single line: newlines become spaces and
        # carriage returns are dropped. Undecodable bytes are skipped.
        train_data.append(fdata.read().replace('\n', ' ').replace('\r', ''))
    # Derive the label from the file name only (the same thing the file filter
    # inspects), so a parent directory whose name happens to contain "spam"
    # cannot mislabel every message beneath it.
    labels.append('spam' if 'spam' in os.path.basename(f) else 'ham')

print(len(train_data), len(labels))

33716 33716


Now we can convert the two lists to a pandas DataFrame, which makes the data easier to work with:

In [48]:
# One row per message: the flattened text and its spam/ham label.
df = pd.DataFrame(
    [(text, label) for text, label in zip(train_data, labels)],
    columns=['text', 'label'],
)

Let's do some analysis of the data:

In [49]:
# Boolean masks selecting the rows of each class.
is_ham = df['label'] == 'ham'
is_spam = df['label'] == 'spam'

# Report the overall size and the per-class counts.
print("Total number of data points:", len(df))
print("Number of ham data points:", int(is_ham.sum()))
print("Number of spam data points:", int(is_spam.sum()))

Total number of data points: 33716
Number of ham data points: 16545
Number of spam data points: 17171


The two classes are fairly well balanced.

We need to vectorize the text, i.e. convert it to a numeric representation suitable for training a classifier. First we shuffle the data and hold out 10% of it as a test set. Then we apply the TF-IDF vectorizer (Term Frequency — Inverse Document Frequency), a weighting scheme that scores each term by how important it is to a text relative to the rest of the corpus. We will also consider bigrams (sequences of two words), lowercase the texts and eliminate stopwords.

In [60]:
# Shuffle the rows with a fixed seed. Sorting by index first puts the rows back
# in their pre-shuffle order (assuming df still carries the integer index it was
# built with), so re-running this cell reproduces the same permutation instead
# of shuffling an already-shuffled frame.
df = df.sort_index().sample(frac=1, random_state=1)

# Hold out 10% of the messages for testing. The raw text gets its own names so
# that X_train / X_test only ever refer to the vectorised features.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.1, random_state=1)

# Unigrams and bigrams, lower-cased, English stop words removed. The vocabulary
# and IDF weights are learned from the training split only and then reused,
# unchanged, to transform the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

  (0, 1444138)	0.012421620199937693
  (0, 1570112)	0.48445104864588057
  (0, 374150)	0.03888352178585067
  (0, 487665)	0.12697100370886735
  (0, 677916)	0.08458536474045623
  (0, 1569849)	0.0742571012466618
  (0, 1488339)	0.13200753728611364
  (0, 670361)	0.0681761191221993
  (0, 999921)	0.05835243300525048
  (0, 1385691)	0.1054244482004252
  (0, 1638358)	0.09543283458501586
  (0, 1604651)	0.13200753728611364
  (0, 625015)	0.12062572182204993
  (0, 1182916)	0.11644619149091767
  (0, 347325)	0.06549938788990445
  (0, 1533233)	0.06686372864277365
  (0, 1386712)	0.04391144418757016
  (0, 1066186)	0.042866829031315064
  (0, 602080)	0.06916127581364155
  (0, 504888)	0.07301390253617075
  (0, 713018)	0.04502376539637998
  (0, 1056263)	0.0412652637854407
  (0, 957270)	0.1047144480924745
  (0, 1380031)	0.09253095963057424
  (0, 601061)	0.06253526739816576
  :	:
  (30343, 1352948)	0.23983095625783568
  (30343, 1074571)	0.07994365208594521
  (30343, 1270615)	0.07994365208594521
  (30343, 1494049

Let us write a function for printing out evaluation results:

In [51]:
def evaluate(y, pred):
    """
    Print accuracy, weighted recall, weighted precision and macro/micro F1,
    all computed with sklearn.

    :param y: gold labels
    :param pred: predicted labels
    """
    scorers = (
        ("accuracy: ", lambda: accuracy_score(y, pred)),
        ("recall: ", lambda: recall_score(y, pred, average='weighted')),
        ("precision: ", lambda: precision_score(y, pred, average='weighted')),
        ("f1_score macro: ", lambda: f1_score(y, pred, average='macro')),
        ("f1_score micro: ", lambda: f1_score(y, pred, average='micro')),
    )
    # Each score is computed immediately before it is printed, in the order listed.
    for caption, compute in scorers:
        print(caption, compute())

Let's try several different classification algorithms. 

We can start with **Naive Bayes**, which is often used as a baseline for spam filtering. 


In [52]:
# Multinomial Naive Bayes on the vectorised training features.
clf = MultinomialNB()

# Time the fit (wall-clock seconds, rounded to 3 decimals).
fit_started = time()
clf.fit(X_train, y_train)
fit_seconds = round(time() - fit_started, 3)
print("Training time:", fit_seconds, "s")

# Time the prediction on the held-out split.
predict_started = time()
y_pred = clf.predict(X_test)
predict_seconds = round(time() - predict_started, 3)
print("Prediction time:", predict_seconds, "s")

print("Results for Naive Bayes: ")
evaluate(y_test, y_pred)

Training time: 0.384 s
Prediction time: 0.036 s
Results for Naive Bayes: 
accuracy:  0.9911032028469751
recall:  0.9911032028469751
precision:  0.9911200136450838
f1_score macro:  0.991101950745128
f1_score micro:  0.9911032028469751



We can try **Logistic Regression**, a binary classifier that takes a linear combination of the features and applies a non-linear function (the sigmoid) to it. Logistic regression offers many ways to regularize the model, and, unlike with Naive Bayes, you don’t have to worry as much about your features being correlated.

In [53]:
# Logistic regression with the library's default settings.
clf = linear_model.LogisticRegression()

# Time the fit (wall-clock seconds, rounded to 3 decimals).
fit_started = time()
clf.fit(X_train, y_train)
fit_seconds = round(time() - fit_started, 3)
print("Training time:", fit_seconds, "s")

# Time the prediction on the held-out split.
predict_started = time()
y_pred = clf.predict(X_test)
predict_seconds = round(time() - predict_started, 3)
print("Prediction time:", predict_seconds, "s")

print("Results for Logistic Regression: ")
evaluate(y_test, y_pred)



Training time: 5.616 s
Prediction time: 0.006 s
Results for Logistic Regression: 
accuracy:  0.9845788849347569
recall:  0.9845788849347569
precision:  0.9848221925751746
f1_score macro:  0.9845726110704431
f1_score micro:  0.9845788849347569


**K-nearest neighbors** can be useful when the data is not linearly separable. Its training phase is fast because it only stores the training set, but prediction can be slow and memory-hungry, since every query has to be compared against the entire stored training dataset.

In [54]:
# K-nearest neighbours, voting over the 3 closest training points.
clf = KNeighborsClassifier(n_neighbors=3)

# Time the fit (wall-clock seconds, rounded to 3 decimals).
fit_started = time()
clf.fit(X_train, y_train)
fit_seconds = round(time() - fit_started, 3)
print("Training time:", fit_seconds, "s")

# Time the prediction on the held-out split.
predict_started = time()
y_pred = clf.predict(X_test)
predict_seconds = round(time() - predict_started, 3)
print("Prediction time:", predict_seconds, "s")

print("Results for K-nearest neighbours: ")
evaluate(y_test, y_pred)

Training time: 0.086 s
Prediction time: 8.506 s
Results for K-nearest neighbours: 
accuracy:  0.982502965599051
recall:  0.982502965599051
precision:  0.982524519915733
f1_score macro:  0.9825024100654853
f1_score micro:  0.982502965599051


**Support vector machines** are another popular family of algorithms; an SVM chooses the decision boundary that maximizes the margin, i.e. the distance to the nearest data points of each class.

In [55]:
# Support vector classifier with a linear kernel.
# NOTE(review): this is by far the slowest cell in the recorded run; a dedicated
# linear SVM implementation may be worth evaluating, but it would not be a
# drop-in equivalent, so it is not swapped in here.
clf = SVC(kernel='linear')

# Time the fit (wall-clock seconds, rounded to 3 decimals).
fit_started = time()
clf.fit(X_train, y_train)
fit_seconds = round(time() - fit_started, 3)
print("Training time:", fit_seconds, "s")

# Time the prediction on the held-out split.
predict_started = time()
y_pred = clf.predict(X_test)
predict_seconds = round(time() - predict_started, 3)
print("Prediction time:", predict_seconds, "s")

print("Results for SVM: ")
evaluate(y_test, y_pred)

Training time: 941.163 s
Prediction time: 28.025 s
Results for SVM: 
accuracy:  0.9916963226571768
recall:  0.9916963226571768
precision:  0.9917207114158839
f1_score macro:  0.99169503422579
f1_score micro:  0.9916963226571768


Finally, we can try training a **neural network**. Let's build a simple feed-forward neural network.

In [56]:
# Upper bound on the tokenizer vocabulary; it is also the width of the count
# matrices below and of the network's input layer.
num_max = 2000

# Encode the string labels as integers for the network's single output unit.
le = LabelEncoder()
tags = le.fit_transform(df['label'])

# Split BEFORE fitting the tokenizer so that the held-out texts cannot
# influence which words make it into the vocabulary (no test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(df['text'], tags, test_size = 0.1, random_state = 1)

tokenizer = Tokenizer(num_words=num_max)
tokenizer.fit_on_texts(X_train)  # build the word index from the training texts only

# Per-document word-count matrices: these are what the dense network consumes.
mat_texts_tr = tokenizer.texts_to_matrix(X_train, mode='count')
mat_texts_tst = tokenizer.texts_to_matrix(X_test, mode='count')

# Integer-sequence encodings of the same texts.
# NOTE(review): not referenced by any other cell visible in this notebook;
# drop them if nothing else needs them.
train_data_seq = tokenizer.texts_to_sequences(X_train)
test_data_seq = tokenizer.texts_to_sequences(X_test)

Let's specify the architecture of the model. It will be a simple sequential model that will use an input layer with 2000 input neurons (this number was chosen experimentally), two hidden layers for internal transformation and one output layer that gives us a scalar prediction value indicating if we have spam or ham.

In [57]:
def get_simple_model():
    """Build, summarise and compile the feed-forward spam/ham classifier.

    The network takes ``num_max`` input features and stacks Dense(512, relu),
    Dense(256, relu) and a single sigmoid output unit. It is compiled with
    binary cross-entropy loss, the Adam optimizer and two accuracy metrics
    ('acc' and ``metrics.binary_accuracy``). The layer summary is printed as a
    side effect.

    :return: the compiled Sequential model
    """
    layers = [
        Dense(512, activation='relu', input_shape=(num_max,)),
        Dense(256, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.summary()
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc', metrics.binary_accuracy],
    )
    return model

Now let's train the network.

In [58]:
# Build a fresh network, then train it for 10 epochs in mini-batches of 32,
# holding back 30% of the training matrix for validation.
m = get_simple_model()
m.fit(
    x=mat_texts_tr,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.3,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 512)               1024512   
_________________________________________________________________
dense_23 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 257       
Total params: 1,156,097
Trainable params: 1,156,097
Non-trainable params: 0
_________________________________________________________________
Train on 21240 samples, validate on 9104 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1329d81d0>

Finally, let's evaluate the performance of the network on the test data:

In [59]:
# Score the trained network on the held-out count matrix, then print the
# metric names followed by the corresponding values.
results = m.evaluate(x=mat_texts_tst, y=y_test)
print(m.metrics_names)
print('Test result: ', results)

['loss', 'acc', 'binary_accuracy']
Test result:  [0.08104704655032939, 0.9899169632265717, 0.9899169632265717]


Overall, I think I will choose the Naive Bayes algorithm for this task. Its training and prediction times are very short, and its accuracy, precision and recall on the test data are very good (over 99%). SVM and the neural network also perform well, but their training times are far longer than that of Naive Bayes. Also, Naive Bayes is quite intuitive to understand, so its results are easier to interpret.