# Spam Classification

## Some background information for text processing

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

### Bag of words? Bag of whaa...?

<img src="figures/bowjpg" width="100%">


In [2]:
X = ["It was a bright cold day in April, and the clocks were striking thirteen",
    "The sky above the port was the color of television, tuned to a dead channel"]

In [4]:
len(X)

2

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X)


CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
vectorizer.vocabulary_

{u'above': 0,
 u'and': 1,
 u'april': 2,
 u'bright': 3,
 u'channel': 4,
 u'clocks': 5,
 u'cold': 6,
 u'color': 7,
 u'day': 8,
 u'dead': 9,
 u'in': 10,
 u'it': 11,
 u'of': 12,
 u'port': 13,
 u'sky': 14,
 u'striking': 15,
 u'television': 16,
 u'the': 17,
 u'thirteen': 18,
 u'to': 19,
 u'tuned': 20,
 u'was': 21,
 u'were': 22}

In [8]:
X_bag_of_words = vectorizer.transform(X)

In [9]:
X_bag_of_words

<2x23 sparse matrix of type '<type 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [10]:
X_bag_of_words.shape

(2, 23)

In [11]:
X_bag_of_words.toarray()

array([[0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1],
       [1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 3, 0, 1, 1, 1, 0]])

In [12]:
vectorizer.get_feature_names()

[u'above',
 u'and',
 u'april',
 u'bright',
 u'channel',
 u'clocks',
 u'cold',
 u'color',
 u'day',
 u'dead',
 u'in',
 u'it',
 u'of',
 u'port',
 u'sky',
 u'striking',
 u'television',
 u'the',
 u'thirteen',
 u'to',
 u'tuned',
 u'was',
 u'were']

In [13]:
vectorizer.inverse_transform(X_bag_of_words)

[array([u'and', u'april', u'bright', u'clocks', u'cold', u'day', u'in',
        u'it', u'striking', u'the', u'thirteen', u'was', u'were'], 
       dtype='<U10'),
 array([u'above', u'channel', u'color', u'dead', u'of', u'port', u'sky',
        u'television', u'the', u'to', u'tuned', u'was'], 
       dtype='<U10')]

### TF-IDF
In information retrieval, tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus. 

Typically, the tf-idf weight is composed by two terms: the first computes the normalized Term Frequency (TF), aka. the number of times a word appears in a document, divided by the total number of words in that document; the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

**TF**: Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization: 

**TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).**

**IDF**: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following: 

**IDF(t) = log_e(Total number of documents / Number of documents with term t in it).**



In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [15]:
import numpy as np
np.set_printoptions(precision=2)

print(tfidf_vectorizer.transform(X).toarray())

[[ 0.    0.29  0.29  0.29  0.    0.29  0.29  0.    0.29  0.    0.29  0.29
   0.    0.    0.    0.29  0.    0.21  0.29  0.    0.    0.21  0.29]
 [ 0.26  0.    0.    0.    0.26  0.    0.    0.26  0.    0.26  0.    0.
   0.26  0.26  0.26  0.    0.26  0.55  0.    0.26  0.26  0.18  0.  ]]


### Bigrams and N-Grams
Entirely discarding word order is not always a good idea, as composite phrases often have specific meaning, and modifiers like "not" can invert the meaning of words.
A simple way to include some word order are n-grams, which don't only look at a single token, but at all pairs of neighborhing tokens:

In [17]:
# look at sequences of tokens of minimum length 2 and maximum length 2
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_vectorizer.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [18]:
bigram_vectorizer.get_feature_names()

[u'above the',
 u'and the',
 u'april and',
 u'bright cold',
 u'clocks were',
 u'cold day',
 u'color of',
 u'day in',
 u'dead channel',
 u'in april',
 u'it was',
 u'of television',
 u'port was',
 u'sky above',
 u'striking thirteen',
 u'television tuned',
 u'the clocks',
 u'the color',
 u'the port',
 u'the sky',
 u'to dead',
 u'tuned to',
 u'was bright',
 u'was the',
 u'were striking']

In [19]:
bigram_vectorizer.transform(X).toarray()

array([[0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 1],
       [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
        0, 1, 0]])

In [20]:
gram_vectorizer = CountVectorizer(ngram_range=(1, 2))
gram_vectorizer.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [21]:
gram_vectorizer.get_feature_names()

[u'above',
 u'above the',
 u'and',
 u'and the',
 u'april',
 u'april and',
 u'bright',
 u'bright cold',
 u'channel',
 u'clocks',
 u'clocks were',
 u'cold',
 u'cold day',
 u'color',
 u'color of',
 u'day',
 u'day in',
 u'dead',
 u'dead channel',
 u'in',
 u'in april',
 u'it',
 u'it was',
 u'of',
 u'of television',
 u'port',
 u'port was',
 u'sky',
 u'sky above',
 u'striking',
 u'striking thirteen',
 u'television',
 u'television tuned',
 u'the',
 u'the clocks',
 u'the color',
 u'the port',
 u'the sky',
 u'thirteen',
 u'to',
 u'to dead',
 u'tuned',
 u'tuned to',
 u'was',
 u'was bright',
 u'was the',
 u'were',
 u'were striking']

In [22]:
gram_vectorizer.transform(X).toarray()

array([[0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 1, 1],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 3, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        0, 1, 0, 0]])

### Character N-grams

Sometimes it is also helpful to not look at words, but instead single character.
That is particularly useful if you have very noisy data, want to identify the language, or we want to predict something about a single word.
We can simply look at characters instead of words by setting ``analyzer="char"``.
Looking at single characters is usually not very informative, but looking at longer n-grams of characters can be:

In [24]:
char_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="char")
char_vectorizer.fit(X)

CountVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [25]:
print(char_vectorizer.get_feature_names())

[u' a', u' b', u' c', u' d', u' i', u' o', u' p', u' s', u' t', u' w', u', ', u'a ', u'ab', u'ad', u'an', u'ap', u'as', u'ay', u'bo', u'br', u'ch', u'ck', u'cl', u'co', u'd ', u'da', u'de', u'e ', u'ea', u'ed', u'ee', u'el', u'en', u'er', u'ev', u'f ', u'g ', u'gh', u'ha', u'he', u'hi', u'ht', u'ig', u'ik', u'il', u'in', u'io', u'ir', u'is', u'it', u'ki', u'ks', u'ky', u'l,', u'ld', u'le', u'lo', u'n ', u'n,', u'nd', u'ne', u'ng', u'nn', u'o ', u'oc', u'of', u'ol', u'on', u'or', u'ov', u'po', u'pr', u'r ', u're', u'ri', u'rt', u's ', u'si', u'sk', u'st', u't ', u'te', u'th', u'to', u'tr', u'tu', u'un', u've', u'vi', u'wa', u'we', u'y ']


## Moving on to the problem at hand

In [26]:
import os
with open(os.path.join("data","SMSSpamCollection")) as f:
    lines = [line.strip().split("\t") for line in f.readlines()]
text = [x[1] for x in lines]
y = [x[0] == "ham" for x in lines]

In [28]:
from sklearn.cross_validation import train_test_split

text_train, text_test, y_train, y_test = train_test_split(text, y, random_state=42)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(text_train)

X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)

In [30]:
print(len(vectorizer.vocabulary_))

7464


In [31]:
print(vectorizer.get_feature_names()[:20])


[u'00', u'000', u'000pes', u'008704050406', u'0089', u'0121', u'01223585236', u'01223585334', u'02', u'0207', u'02072069400', u'02073162414', u'02085076972', u'03', u'04', u'0430', u'05', u'050703', u'0578', u'06']


In [32]:
print(vectorizer.get_feature_names()[3000:3020])

[u'getting', u'getzed', u'gf', u'ghodbandar', u'ghost', u'gibbs', u'gibe', u'gift', u'gifted', u'gifts', u'giggle', u'gimme', u'gimmi', u'gin', u'girl', u'girlfrnd', u'girlie', u'girls', u'gist', u'giv']


In [33]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [34]:
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [35]:
clf.score(X_test, y_test)

0.9813486370157819

In [36]:
clf.score(X_train, y_train)

0.99880382775119614