In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import model_selection
from sklearn import metrics

data set: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [None]:
# Parse the corpus file: every line holds a label ('ham' or 'spam'), whitespace,
# then the message text.
labels = []
documents = []
# Pin the encoding so decoding does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the download.
with open('SMSSpamCollection.txt', encoding='utf-8') as f:
    for line in f:
        # Take the first whitespace-delimited token as the label instead of
        # relying on a fixed 4-character prefix.
        fields = line.split(None, 1)
        if not fields:
            continue  # blank line: no label, so no sample to record
        labels.append(fields[0])
        documents.append(fields[1].strip() if len(fields) > 1 else '')

len(documents)

Each text is labeled spam or ham (not spam)

In [None]:
# Peek at the first five messages, each followed by its label and a blank line.
for document, label in zip(documents[:5], labels[:5]):
    print(document, label, '', sep='\n')

In [None]:
# Share of messages labeled spam. The denominator is the number of labels that
# were actually loaded rather than a hard-coded corpus size, so the figure stays
# correct if the file changes.
n_spam = labels.count('spam')
print('fraction spam:', n_spam / len(labels) if labels else 0.0)

# transform texts into vectors

let's use TF-IDF (term frequency, inverse document frequency):

- give more weight to words that occur a lot within a document
- give less weight to words that occur in many documents

In [None]:
# TF-IDF vectorizer with library-default settings; it is fitted on the corpus
# in the next code cell.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the corpus and transform every message in one call.
X = vectorizer.fit_transform(documents)
# Labels as a NumPy array so they can be indexed with fold index arrays later.
y = np.asarray(labels)

# Sanity check: both should have one entry per message.
print(*(arr.shape for arr in (X, y)))

Each text becomes a vector of N numbers, where N is the size of the vocabulary.

In [None]:
# Display the label array built from `labels` (one label string per message).
y

# instantiate classifier

naive Bayes:

$$probability(spam | document) = probability(document | spam) \times probability(spam) / probability(document)$$

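$$ \propto prob(word_1|spam) \times prob(word_2|spam) \times \ldots \times prob(word_n|spam) \times prob(spam)$$

(The "naive" assumption is that words are conditionally independent given the class; the denominator $probability(document)$ is the same for spam and ham, so it can be dropped when comparing the two.)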

In [None]:
# Bernoulli naive Bayes classifier with default settings.
# NOTE(review): per the scikit-learn docs, BernoulliNB binarizes its input by
# default (binarize=0.0), so the TF-IDF weights would be reduced to word
# presence/absence here — confirm this is intended (MultinomialNB would use
# the weights themselves).
clf = BernoulliNB()

# cross validation

In [None]:
# Stratified 5-fold splitter used by the cross-validation loop below.
cv = model_selection.StratifiedKFold(n_splits=5)

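Split the data into 5 stratified folds (each fold keeps roughly the same spam/ham ratio as the full data set). Each fold is used once as the test set while the other four are used for training.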

In [None]:
# Per-fold precision and recall for the class at index 1 of the per-class
# score arrays.
# NOTE(review): index 1 is presumably 'spam' (labels in sorted order) — verify
# against clf.classes_.
precision, recall = [], []
for train_idx, test_idx in cv.split(X, y):
    # Refit on this fold's training rows, then score the held-out rows.
    clf.fit(X[train_idx], y[train_idx])
    predicted = clf.predict(X[test_idx])
    fold_scores = metrics.precision_recall_fscore_support(y[test_idx], predicted)
    precision.append(fold_scores[0][1])
    recall.append(fold_scores[1][1])

print(precision)
print(recall)

# average precision / recall across k-folds

- precision: of predicted spam, how many are actual spam?
- recall: of the actual spam, how many are predicted to be spam?

In [None]:
# Report mean +/- standard deviation of each metric across the folds.
for metric_name, fold_values in (('precision:', precision), ('recall:', recall)):
    print(metric_name, np.mean(fold_values), '+/-', np.std(fold_values))

# try on new spam message

In [None]:
# Keep the raw message and its vectorized form under separate names; `sample`
# ends up holding the transformed input that the next cell classifies.
sample_text = 'URGENT! We are trying to contact U.Todays draw shows that you have won a 2000 prize GUARANTEED. Call 090 5809 4507 from land line. Claim 3030. Valid 12hrs only'
# transform() (not fit_transform) so the vocabulary learned from the corpus is reused.
sample = vectorizer.transform([sample_text])

In [None]:
# Classify the vectorized sample message. Note that clf was last fitted inside
# the cross-validation loop, i.e. on the training split of the final fold only.
clf.predict(sample)

# most spammy words

In [None]:
# Per-feature difference of class-conditional log probabilities (class 1 minus
# class 0); larger values mean the word is relatively more likely under class 1.
# NOTE(review): class 1 is presumably 'spam' — verify against clf.classes_.
class_log_prob = clf.feature_log_prob_
probs = class_log_prob[1] - class_log_prob[0]
len(probs)

In [None]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back to the old
# method only on versions that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
len(features)

In [None]:
# Pair each word with its log-probability difference, sort from highest to
# lowest, and show the 25 top-ranked words.
ranked = sorted(zip(probs, features), reverse=True)
ranked[:25]