# Machine Learning is not that complicated (in Python)

In [1]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import cross_validation
from sklearn import metrics



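data set: `./reviews.csv` — one review per line, `|`-separated, with the rating (1–5) in the fourth field and the review text in the fifth. Ratings 1–3 are labelled `neg`, ratings 4–5 `pos`.

(The SMS Spam Collection, https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection, appears to be the data set of an earlier spam-filtering version of this notebook; it is not loaded below.)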

In [96]:
# Read the pipe-separated review file and turn each rating into a binary sentiment label.
# Field 3 holds the rating as text and field 4 the review text; ratings 1-3 become 'neg',
# ratings 4-5 become 'pos', and lines with any other value in field 3 are skipped.
rating_to_label = {'1': 'neg', '2': 'neg', '3': 'neg', '4': 'pos', '5': 'pos'}

labels = []
documents = []
with open('./reviews.csv') as reviews_file:
    for row in reviews_file:
        fields = row.split('|')
        sentiment = rating_to_label.get(fields[3])
        if sentiment is None:
            continue
        labels.append(sentiment)
        documents.append(fields[4])

# number of labelled reviews kept
len(documents)

2431

In [97]:
for document, label in zip(documents, labels)[:10]:
    print document
    print label
    print

Well, I never read this as a teen. The reason being that my middle school didn’t think it was appropriate to put in our reading curriculum, probably due to rape  . So with the   about to come out, I decided to finally read this one. I wanted to go to Barnes and Noble and get their special leather-bound edition of it, but was told that it was no longer available. I bought it on Amazon instead. What we have here is part coming of age school novel, part courtroom drama, and part small town mythos.  While Scout recounts her early years at school (which seem to blend together without regard for timing), it’s a learning process for all. Her earliest teacher makes bureaucratic, though well-meaning, mistakes by shaming Scout for already knowing how to read and doesn’t know the faux pas of the town social nuances. What surprised me most about the court case was how swift it was decided. I don’t know how fast these things went in the old days (to see if the book was an accurate portrayal for the

In [98]:
print 'number neg:', len([item for item in labels if item == 'neg'])

number neg: 416


# transform texts into vectors

let's use TF-IDF (term frequency, inverse document frequency):

- give more weight to words that occur a lot within a document
- give less weight to words that occur in many documents

In [99]:
# TF-IDF vectorizer with default settings; it is fitted on the reviews in the next cell.
vectorizer = TfidfVectorizer()

In [100]:
X = vectorizer.fit_transform(documents)
y = np.array(labels)

print X.shape, y.shape

(2431, 38515) (2431,)


In [101]:
y

array(['neg', 'pos', 'pos', ..., 'pos', 'pos', 'pos'],
      dtype='|S3')

# instantiate classifier

naive Bayes:

$$probability(spam | document) = probability(document | spam) \times probability(spam) / probability(document)$$

$$ \approx prob(word_1|spam) \times prob(word_2|spam) \times \cdots \times prob(word_n|spam) \times prob(spam) / probability(document)$$

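"naive" = "wrong": the model assumes that the words of a document occur independently of each other given the class, which is not true of real text (but often works well enough in practice)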

In [102]:
# Bernoulli naive Bayes classifier with default settings.
# NOTE(review): by default BernoulliNB binarizes its input (binarize=0.0), so the TF-IDF
# weights are reduced to "word present / absent" -- confirm this is intended, or consider
# MultinomialNB (already imported), which uses the weights themselves.
clf = BernoulliNB()

# cross validation

In [103]:
# Stratified 5-fold split built from the labels in y.
# Iterating over cv yields (train, test) pairs that are used to index X and y.
cv = cross_validation.StratifiedKFold(y,5)

In [104]:
# Cross-validated evaluation: fit on each training split, predict the held-out split,
# and collect per-fold precision and recall for the 'pos' class.
#
# The class order is passed to the metric explicitly, so the row picked from the
# per-class scores does not depend on which labels happen to occur in a fold.
class_order = ['neg', 'pos']
pos_idx = class_order.index('pos')  # == 1, the row the scores are read from

precision = []
recall = []
for train, test in cv:
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    # fit() re-estimates the model from scratch, so reusing `clf` across folds is fine;
    # after the loop `clf` is the model trained on the last fold's training split.
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    # returns (precision, recall, fscore, support), each with one entry per class
    p, r, _, _ = metrics.precision_recall_fscore_support(y_test, y_hat, labels=class_order)
    precision.append(p[pos_idx])
    recall.append(r[pos_idx])

In [106]:
recall

[0.86848635235732008,
 0.89330024813895781,
 0.90074441687344908,
 0.85607940446650121,
 0.81637717121588094]

# average precision / recall across k-folds

- precision: of the reviews predicted to be positive, how many are actually positive?
- recall: of the actually positive reviews, how many are predicted to be positive?

(the loop above records index 1 of the per-class scores, i.e. the `pos` class)

In [107]:
# print vectorizer
# print clf
print 'precision:',np.average(precision), '+/-', np.std(precision)
print 'recall:', np.average(recall), '+/-', np.std(recall)

precision: 0.843354920254 +/- 0.0063734628691
recall: 0.86699751861 +/- 0.0300483880957


# try on new positive review

In [118]:
sample = 'absolute recommendation unforgettable grand outstanding captivating marvelous'
print sample
sample = vectorizer.transform([sample])
#print sample

absolute recommendation unforgettable grand outstanding captivating marvelous


In [120]:
# Class probabilities for the sample; columns follow clf.classes_ (sorted labels: 'neg', then 'pos').
clf.predict_proba(sample)

array([[  7.81608867e-62,   1.00000000e+00]])

# most positive words

(words with the largest log-probability ratio of `pos` over `neg`)

In [110]:
# Per-term log-probability ratio between the two classes: row 1 ('pos') minus row 0 ('neg').
# Large positive values mark terms that are much more likely in 'pos' reviews.
neg_log_prob, pos_log_prob = clf.feature_log_prob_
probs = pos_log_prob - neg_log_prob
len(probs)

38515

In [111]:
# Vocabulary terms in column order: features[i] names column i of X (and entry i of probs).
features=vectorizer.get_feature_names()
len(features)

38515

In [117]:
# Pair every term with its log-probability ratio, order by ratio (largest first; ties
# fall back to the term itself) and show the top 70.
ranked_terms = list(zip(probs, features))
ranked_terms.sort(reverse=True)
ranked_terms[:70]

[(2.0111786214512817, u'absolute'),
 (1.8933955857948983, u'wide'),
 (1.8933955857948983, u'endure'),
 (1.6857562210166535, u'recommendation'),
 (1.6663381351595521, u'lee'),
 (1.6465355078633728, u'prisoners'),
 (1.6465355078633728, u'portrait'),
 (1.6465355078633728, u'abraham'),
 (1.5631538989243214, u'unforgettable'),
 (1.5187021363534878, u'forgiveness'),
 (1.5187021363534878, u'eternal'),
 (1.5187021363534878, u'calvin'),
 (1.5187021363534878, u'atrocities'),
 (1.4721821207185948, u'grand'),
 (1.4721821207185943, u'steinbeck'),
 (1.4721821207185943, u'pen'),
 (1.4721821207185943, u'grinch'),
 (1.4721821207185943, u'grandfather'),
 (1.4721821207185943, u'fantastic'),
 (1.4233919565491622, u'pressure'),
 (1.4233919565491622, u'owner'),
 (1.4233919565491622, u'cabinet'),
 (1.372098662161612, u'searching'),
 (1.372098662161612, u'limits'),
 (1.372098662161612, u'hobbes'),
 (1.372098662161612, u'hadrian'),
 (1.372098662161612, u'guilt'),
 (1.372098662161612, u'drawing'),
 (1.372098662

![](cat.jpg)