# Spam classification

In this example, we will build a spam classifier for text messages (SMS).

In [91]:
import pandas as pd
import csv
import matplotlib as plt
% matplotlib inline
import numpy as np
import numpy.linalg as la
from scipy import sparse
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.utils import shuffle

We first need to read in the data. The data is contained in a file called SMSSpamCollection. This file consists of 5574 rows, and each row contains a label (ham or spam) and the text message. The label and the text message are separated by a TAB (the TAB key on the keyboard, denoted by '\t' in Python). A convenient way to read the data into Python is by using the read_csv function of the Pandas library.

In [62]:
# Read in messages using the Pandas library
messages = pd.read_csv('data/SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE, names=["label", "message"])
messages = shuffle(messages) # Use random order
print messages

     label                                            message
5429  spam  Santa Calling! Would your little ones like a c...
3279   ham                        What happened in interview?
5369   ham  Just trying to figure out when I'm suppose to ...
5318   ham                       Jus finish watching tv... U?
4119   ham  Watch lor. I saw a few swatch one i thk quite ...
3894   ham                     Have you heard from this week?
1181   ham                         I'm in chennai velachery:)
5421   ham  I'm at bruce &amp; fowler now but I'm in my mo...
1389   ham     Oh k.i think most of wi and nz players unsold.
3937   ham     They r giving a second chance to rahul dengra.
588    ham  "Pete can you please ring meive hardly gotany ...
1472   ham  Oh. U must have taken your REAL Valentine out ...
2327   ham  The Xmas story is peace.. The Xmas msg is love...
3411   ham  Joy's father is John. Then John is the ____ of...
2653   ham                      No need for the drug anymore.
4176   h

In [61]:
# Extract some important information
messages.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4827
ham,unique,4518
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


In [63]:
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(messages['message'])
print bag_of_words.shape

(5574, 8713)


In [64]:
# We can explore the vocabulary
count_vectorizer.vocabulary_

{u'raining': 6259,
 u'yellow': 8645,
 u'four': 3354,
 u'malarky': 4884,
 u'woods': 8533,
 u'hanging': 3753,
 u'looking': 4739,
 u'electricity': 2900,
 u'scold': 6698,
 u'lord': 4748,
 u'rp176781': 6574,
 u'callin': 1840,
 u'09063440451': 237,
 u'screaming': 6714,
 u'disturb': 2672,
 u'prize': 6113,
 u'nottingham': 5448,
 u'wednesday': 8367,
 u'oooh': 5579,
 u'specially': 7167,
 u'nigh': 5378,
 u'tired': 7779,
 u'snuggles': 7073,
 u'clubmoby': 2110,
 u'second': 6730,
 u'08718720201': 157,
 u'txtno': 7993,
 u'scraped': 6708,
 u'2kbsubject': 455,
 u'scallies': 6686,
 u'laxinorficated': 4563,
 u'cooking': 2271,
 u'fingers': 3231,
 u'maraikara': 4908,
 u'hero': 3857,
 u'y87': 8623,
 u'here': 3856,
 u'specialise': 7166,
 u'china': 2032,
 u'dogwood': 2711,
 u'dorm': 2737,
 u'08718711108': 156,
 u'087187262701': 161,
 u'84122': 773,
 u'w111wx': 8255,
 u'kids': 4441,
 u'84128': 774,
 u'eastenders': 2845,
 u'09058091870': 202,
 u'spotty': 7217,
 u'golden': 3588,
 u'replace': 6437,
 u'brought': 1

In [65]:
# Or a row of the bag-of-words matrix and the corresponding messages
print bag_of_words[10,:]
print messages['message'][10]

  (0, 8668)	1
  (0, 1853)	1
  (0, 5832)	1
  (0, 5917)	1
  (0, 6522)	1
  (0, 4996)	1
  (0, 3773)	1
  (0, 3617)	1
  (0, 2345)	1
I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.


In [66]:
# Normalize messages so that the rows have length 1
tfidf = TfidfTransformer()
normalized_bow = tfidf.fit_transform(bag_of_words)
normalized_bow.shape

(5574, 8713)

In [81]:
# Transform 'spam' and 'ham' into 1 and -1
bin_labels = np.array([1. if x =='spam' else -1. for x in messages['label']])

In [83]:
# Separate training data from test data
train, train_l, test, test_l = normalized_bow[:1000], bin_labels[:1000], normalized_bow[1000:], bin_labels[1000:]

Now that we have our data and labels set up, we can run the optimization! Here, we will use a naive implementation of Stochastic Gradient Descent

In [87]:
def sgd_bt(f, df, X, y, x0, tol, maxiter=1000, rho=0.5, c=0.1):
    """
    Stochastic Gradient Descent with backtracking
    """
    x = np.vstack((x0+2*tol*np.ones(x0.shape),x0)).transpose()
    i = 1
    m, n = X.shape
    while ( la.norm(x[:,i]-x[:,i-1]) > tol ) and ( i < maxiter ):
        # Choose random index
        ind = np.random.randint(m)
        p = -df(x[:,i], X[ind], y[ind])
        # Start backtracking
        alpha = 1
        xnew = x[:,i] + alpha*p
        while (f(xnew, X[ind], y[ind]) >= f(x[:,i], X[ind], y[ind]) + alpha*c*np.dot(p, df(x[:,i], X[ind], y[ind]))):
            alpha = alpha*rho
            xnew = x[:,i] + alpha*p
        x = np.concatenate((x,xnew.reshape((len(x0),1))), axis=1)
        i += 1
    return x[:,1:]

In [138]:
def f(w, X, y):
    Y = np.concatenate((X.toarray(),
    return np.log(1+np.exp(-y*(sparse.csc_matrix.dot(X.transpose(),w[:-1])+w[-1])))

def df(w, X, y):
    Y = np.concatenate((X.toarray(),[1.]))
    return -Y*y*np.exp(-y*(np.dot(Y,w)))/(1+np.exp(-y*np.dot(Y,w)))

In [151]:
x = np.ones(8713)
np.concatenate((normalized_bow[10].toarray()[0],np.array([1.])))

array([ 0.,  0.,  0., ...,  0.,  0.,  1.])

In [140]:
x0 = np.ones(8714)
W = sgd_bt(f, df, train, train_l, x0, 1e-2)

ValueError: all the input arrays must have same number of dimensions