# Machine Learning is not that complicated (in Python)

In [1]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import cross_validation
from sklearn import metrics



data set: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [2]:
# Load the SMS corpus: the first four characters of every line are read as
# the class label and the remainder of the line as the message text.
labels, documents = [], []
with open('./smsspamcollection/SMSSpamCollection') as corpus:
    for raw_line in corpus:
        label_field, text_field = raw_line[:4], raw_line[4:]
        # strip() drops surrounding whitespace, including the line ending
        labels.append(label_field.strip())
        documents.append(text_field.strip())

# Number of messages read.
len(documents)

5574

In [3]:
for document, label in zip(documents, labels)[:10]:
    print document
    print label
    print

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham

Ok lar... Joking wif u oni...
ham

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
spam

U dun say so early hor... U c already then say...
ham

Nah I don't think he goes to usf, he lives around here though
ham

FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
spam

Even my brother is not like to speak with me. They treat me like aids patent.
ham

As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
ham

WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spa

In [4]:
print 'fraction spam:', len([item for item in labels if item == 'spam'])/5574.0

fraction spam: 0.134015069968


# transform texts into vectors

let's use TF-IDF (term frequency, inverse document frequency):

- give more weight to words that occur a lot within a document
- give less weight to words that occur in many documents

In [5]:
# TF-IDF vectoriser with all default settings (no arguments are passed).
vectorizer = TfidfVectorizer()

In [6]:
X = vectorizer.fit_transform(documents)
y = np.array(labels)

print X.shape, y.shape

(5574, 8713) (5574,)


In [7]:
# Display the label array built from `labels`.
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'],
      dtype='|S4')

# instantiate classifier

naive Bayes:

$$probability(spam | document) = probability(document | spam) \times probability(spam) / probability(document)$$

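$$ \approx prob(word_1|spam) \times prob(word_2|spam) \times ... \times prob(word_n|spam) \times prob(spam) / prob(document)$$

"naive" = "wrong": the model assumes that the words of a document are independent of each other given the class, which is not true of real text, but it makes the probabilities easy to estimate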

In [8]:
# Bernoulli naive Bayes classifier with default parameters.
# NOTE(review): BernoulliNB binarises its input by default (binarize=0.0), so
# the TF-IDF weights in X are presumably reduced to word presence/absence —
# confirm this is intended; MultinomialNB is imported as well but not used here.
clf = BernoulliNB()

# cross validation

In [9]:
# Stratified k-fold splitter built from the labels y. The second positional
# argument (5) is presumably the number of folds — TODO confirm against the
# installed scikit-learn version.
cv = cross_validation.StratifiedKFold(y,5)

In [10]:
# Per-fold scores, one entry appended per cross-validation fold.
precision, recall = [], []
for train_idx, test_idx in cv:
    # Fit on this fold's training rows only, then predict its held-out rows.
    clf.fit(X[train_idx], y[train_idx])
    y_test = y[test_idx]
    y_hat = clf.predict(X[test_idx])
    per_class = metrics.precision_recall_fscore_support(y_test, y_hat)
    fold_precision, fold_recall = per_class[0], per_class[1]
    # Index 1 is presumably the 'spam' class (per-class results are expected in
    # sorted label order, 'ham' < 'spam') — confirm.
    precision.append(fold_precision[1])
    recall.append(fold_recall[1])
# NOTE: clf is left fitted on the last fold's training split; later cells reuse it.

# average precision / recall across k-folds

- precision: of predicted spam, how many are actual spam?
- recall: of the actual spam, how many are predicted to be spam?

In [11]:
# print vectorizer
# print clf
print 'precision:',np.average(precision), '+/-', np.std(precision)
print 'recall:', np.average(recall), '+/-', np.std(recall)

precision: 0.984691806825 +/- 0.00979760340781
recall: 0.870111856823 +/- 0.0254286121657


# try on new spam message

In [12]:
sample = 'URGENT! We are trying to contact U.Todays draw shows that you have won a 2000 prize GUARANTEED. Call 090 5809 4507 from land line. Claim 3030. Valid 12hrs only'
print sample
sample = vectorizer.transform([sample])
#print sample

URGENT! We are trying to contact U.Todays draw shows that you have won a 2000 prize GUARANTEED. Call 090 5809 4507 from land line. Claim 3030. Valid 12hrs only


In [13]:
# Class probabilities for the vectorised sample message.
# NOTE(review): clf was last fitted inside the cross-validation loop, so this
# is the model from the final fold's training split, not one trained on the
# whole corpus. Columns presumably follow clf.classes_ (ham, spam) — confirm.
clf.predict_proba(sample)

array([[  7.94578899e-22,   1.00000000e+00]])

# most spammy words

In [14]:
# Per-feature difference of the fitted log-probabilities: row 1 minus row 0 of
# clf.feature_log_prob_. Despite the name, these are log ratios, not
# probabilities; larger values mean the feature is relatively more likely
# under class index 1 (presumably 'spam' — confirm against clf.classes_).
probs=clf.feature_log_prob_[1] - clf.feature_log_prob_[0]
# One score per feature.
len(probs)

8713

In [15]:
# Feature names (vocabulary terms) from the fitted vectoriser, to be paired
# position-by-position with the scores in `probs`.
features=vectorizer.get_feature_names()
# Should match len(probs) for the pairing to be valid.
len(features)

8713

In [16]:
# Fifty highest-scoring (score, word) pairs, best first. Tuples are compared
# by score and then by word, so words with equal scores appear in descending
# word order.
sorted(zip(probs,features), reverse=True)[:50]

[(6.3284366587708458, u'claim'),
 (6.1110237821656215, u'prize'),
 (5.9734024042895735, u'150p'),
 (5.6237286558098241, u'18'),
 (5.5514079942301979, u'cs'),
 (5.5260901862459084, u'guaranteed'),
 (5.5260901862459084, u'500'),
 (5.5001146998426478, u'www'),
 (5.4178766016056752, u'tone'),
 (5.3590361015827419, u'1000'),
 (5.3590361015827419, u'100'),
 (5.2298243701027367, u'awarded'),
 (5.2298243701027367, u'150ppm'),
 (5.212432627390867, u'uk'),
 (5.1206250781377438, u'ringtone'),
 (5.0405823704642074, u'000'),
 (4.9980227560454118, u'service'),
 (4.9535709934745782, u'weekly'),
 (4.9535709934745782, u'16'),
 (4.9070509778396847, u'mob'),
 (4.8582608136702525, u'tones'),
 (4.8582608136702525, u'national'),
 (4.8582608136702525, u'http'),
 (4.8582608136702525, u'bonus'),
 (4.8582608136702525, u'5000'),
 (4.8069675192827024, u'vouchers'),
 (4.8069675192827024, u'valid'),
 (4.8069675192827024, u'collection'),
 (4.8069675192827024, u'10p'),
 (4.7529002980124266, u'entry'),
 (4.75290029801

![](cat.jpg)