# Machine Learning is not that complicated (in Python)

In [121]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import cross_validation
from sklearn import metrics

data set: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [122]:
labels=[]
documents=[]
with open('./reviews.csv') as f:
    for line in f:
        line_list = line.split('|')
        if line_list[3] == '1' or line_list[3] == '2':
            labels.append('neg')
            documents.append(line_list[4])

        elif line_list[3] == '4' or line_list[3] == '5':
            labels.append('pos')
            documents.append(line_list[4])


len(documents)

6120

In [123]:
for document, label in zip(documents, labels)[:5]:
    print document
    print label
    print

This book was a bizarre experience for me. It reads much like a traditional, classic English novel, except with loads of descriptive sex and vulgar words mixed in for shock value. Instead of being shocked, though, I just found it all a bit tiresome and rather silly.  Maybe it was the fact that Lawrence sometimes used words like "thee" and "thy" and "dost" mixed in with modern day vulgarities that added to the overall unintentional humor of it for me, or perhaps it was that the vulgarities were simply used so darned often. In any event, I found myself laughing out loud often. I also found myself cringing. C and F words aside, did anyone tell Lawrence the word "bowels" is not particularly appealing? Anyway, I can see why some people felt at the time this was quite simply a trashy romance disguised as literature. It kind of is. Well-written and intelligent, for the most part, but still a bit trashy nonetheless. The story is essentially this: Lady Chatterley's young husband is paralyzed fr

In [125]:
print 'number pos:', len([item for item in labels if item == 'pos'])

number pos: 4306


# transform texts into vectors

let's use TF-IDF (term frequency, inverse document frequency):

- give more weight to words that occur a lot within a document
- give less weight to words that occur in many documents

In [208]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(3, 1))

In [209]:
X = vectorizer.fit_transform(documents)
y = np.array(labels)

print X.shape, y.shape

(6120, 58125) (6120,)


In [210]:
y

array(['neg', 'pos', 'pos', ..., 'neg', 'pos', 'pos'],
      dtype='|S3')

# instantiate classifier

naive Bayes:

$$probability(spam | document) = probability(document | spam) \times probability(spam) / probability(document)$$

$$ \approx prob(word_1|spam) \times prob(word_2|spam) \times ... \times prob(word_n|spam) \times prob(spam)$$

"naive" = "wrong"

In [211]:
clf = BernoulliNB()

# cross validation

In [212]:
cv = cross_validation.StratifiedKFold(y,5)

In [213]:
precision=[]
recall=[]
for train, test in cv:
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
#     print len(y_train), len(y_test)
#     clf = BernoulliNB()
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    p,r,_,_ = metrics.precision_recall_fscore_support(y_test, y_hat)
    precision.append(p[1])
    recall.append(r[1])

In [214]:
precision

[0.78357487922705316,
 0.75485661424606842,
 0.79327398615232447,
 0.7530516431924883,
 0.72810218978102192]

# average precision / recall across k-folds

- precision: of predicted negative reviews, how many are actually negative reviews?
- recall: of the actual negative reviews, how many are predicted to be negative reviews?

In [215]:
print vectorizer
print clf
print 'precision:',np.average(precision), '+/-', np.std(precision)
print 'recall:', np.average(recall), '+/-', np.std(recall)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
precision: 0.76257186252 +/- 0.0233326598442
recall: 0.935669956965 +/- 0.00755781622531


# try on new negative review

In [216]:
neg_sample = 'yuck sickening overblown yawn overuse scrooge unoriginal'
neg_sample = vectorizer.transform([neg_sample])
#print sample

In [217]:
clf.predict_proba(neg_sample)

array([[  9.99990107e-01,   9.89285759e-06]])

In [218]:
pos_sample = 'delightful masterfully accessible bittersweet examines terrific playful'
pos_sample = vectorizer.transform([pos_sample])

In [219]:
clf.predict_proba(pos_sample)

array([[  4.49125392e-11,   1.00000000e+00]])

# most spammy words

In [220]:
stop_words = vectorizer.get_stop_words()
len(stop_words)

318

In [221]:
probs=clf.feature_log_prob_[1] - clf.feature_log_prob_[0]
len(probs)

58125

In [222]:
features=vectorizer.get_feature_names()
len(features)

58125

In [223]:
sorted(zip(probs,features), reverse=True)[:50]

[(2.5041099215630735, u'delightful'),
 (2.4690186017518041, u'visited'),
 (2.432650957580929, u'accessible'),
 (2.3556899164448009, u'madame'),
 (2.3148679219245452, u'masterfully'),
 (2.3148679219245452, u'coast'),
 (2.3148679219245452, u'bittersweet'),
 (2.2723083075057495, u'samuel'),
 (2.2723083075057495, u'levin'),
 (2.2278565449349159, u'clare'),
 (2.1813365293000224, u'nazis'),
 (2.1813365293000224, u'examines'),
 (2.1813365293000224, u'diverse'),
 (2.1325463651305903, u'rosaleen'),
 (2.1325463651305903, u'cleopatra'),
 (2.0812530707430401, u'woke'),
 (2.0812530707430401, u'varied'),
 (2.0812530707430401, u'terrific'),
 (2.0812530707430401, u'playful'),
 (2.0812530707430401, u'occupation'),
 (2.0812530707430401, u'nations'),
 (2.0812530707430401, u'lean'),
 (2.0812530707430401, u'histories'),
 (2.0812530707430401, u'ease'),
 (2.0812530707430401, u'bovary'),
 (2.0812530707430401, u'adam'),
 (2.0271858494727644, u'tragedies'),
 (2.0271858494727644, u'september'),
 (2.0271858494727

![](cat.jpg)