In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

try:
    # cross_val_score lives in model_selection since scikit-learn 0.18.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Fallback for older releases that only ship sklearn.cross_validation.
    from sklearn.cross_validation import cross_val_score

%matplotlib inline

### 2

In [2]:
# Path to the raw dataset: one message per line, label and text separated by a tab.
filename = 'SMSSpamCollection.txt'
# The file has no header row, so the columns are addressed by position (0 and 1).
# NOTE(review): default quoting is in effect, so a message containing an
# unbalanced '"' could swallow the following lines — confirm the row count
# against the dataset's documented size.
df = pandas.read_csv(
    filename,
    header=None,
    sep='\t',
)
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 3

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [4]:
# Binary target built from the label column: 1 where the label is 'spam', 0 otherwise.
y = np.where(df[0].eq('spam'), 1, 0)
# Raw message texts (second column) are the model input.
X = df[1]

### 4

In [5]:
# Bag-of-words features with the default CountVectorizer settings.
# The fitted `vectorizer` is kept so new messages can be transformed later.
vectorizer = CountVectorizer()
X_transformed = vectorizer.fit_transform(X)

In [6]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
X_transformed

<5572x8713 sparse matrix of type '<class 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>

### 5

In [7]:
# Baseline: mean F1 over 10 cross-validation folds for logistic regression
# on the raw token counts. Note that `scores` holds the mean, not the
# per-fold values.
scores = np.mean(
    cross_val_score(
        LogisticRegression(),
        X_transformed,
        y,
        scoring='f1',
        cv=10,
    )
)
print('mean score ', scores)

mean score  0.932640298361


### 6

In [8]:
# Hand-written messages used to probe the fitted classifier.
(test1, test2, test3, test4, test5) = (
    'FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB',
    'FreeMsg: Txt: claim your reward of 3 hours talk time',
    'Have you visited the last lecture on physics?',
    'Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$',
    'Only 99$',
)

In [9]:
# Train logistic regression (default hyper-parameters) on the full count matrix.
# `fit` returns the estimator itself, so the chained form binds the fitted model.
clf = LogisticRegression().fit(X_transformed, y)
# Trailing expression keeps the estimator repr as the cell output.
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Encode the probe messages with the vocabulary learned on the training texts.
X_test = vectorizer.transform([
    test1,
    test2,
    test3,
    test4,
    test5,
])
# Labels follow the encoding of `y`: 1 = spam, 0 = not spam.
predicted = clf.predict(X_test)
print(predicted)

[1 1 0 0 0]


### 7

In [11]:
def ngram_range_dependency(clf, ngram_range, scoring='f1', cv=10):
    """Print the mean cross-validated score of `clf` on n-gram count features.

    The module-level message texts `X` are vectorized with a
    `CountVectorizer` limited to `ngram_range`, and `clf` is evaluated
    against the module-level labels `y`.

    Parameters
    ----------
    clf : estimator
        Unfitted classifier handed to `cross_val_score`.
    ngram_range : tuple of (int, int)
        Passed as `ngram_range` to `CountVectorizer`.
    scoring : str, default 'f1'
        Forwarded unchanged to `cross_val_score`.
    cv : default 10
        Forwarded unchanged to `cross_val_score`.

    Returns
    -------
    None
        The mean score is printed, not returned.
    """
    # The vocabulary is fitted on all of X before the folds are split, so
    # held-out folds contribute to the vocabulary.
    features = CountVectorizer(ngram_range=ngram_range).fit_transform(X)
    fold_scores = cross_val_score(clf, features, y, scoring=scoring, cv=cv)
    print('mean score ', fold_scores.mean())

In [12]:
# Logistic regression, bigrams only.
ngram_range_dependency(LogisticRegression(), (2, 2))

mean score  0.822422066419


In [13]:
# Logistic regression, trigrams only.
ngram_range_dependency(LogisticRegression(), (3, 3))

mean score  0.725016155547


In [14]:
# Logistic regression, unigrams through trigrams combined.
ngram_range_dependency(LogisticRegression(), (1, 3))

mean score  0.925138255865


### 8

In [15]:
# Multinomial naive Bayes, trigrams only.
ngram_range_dependency(MultinomialNB(), (3, 3))

mean score  0.378719485246


In [16]:
# Multinomial naive Bayes, bigrams only.
ngram_range_dependency(MultinomialNB(), (2, 2))

mean score  0.645501517799


In [17]:
# Multinomial naive Bayes, unigrams through trigrams combined.
ngram_range_dependency(MultinomialNB(), (1, 3))

mean score  0.888485965606


### 9

In [18]:
# TF-IDF features with the default TfidfVectorizer settings, fitted on the
# same message texts as the count features.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [8]:
# Mean 10-fold F1 of logistic regression on the raw count features, for
# comparison with the TF-IDF features.
# NOTE(review): this cell's execution count (In [8]) is out of sequence with
# the cells before it — restart the kernel and run all cells to confirm the
# reported numbers.
cross_val_score(LogisticRegression(), X_transformed, y, scoring='f1', cv=10).mean()

0.9326402983610631

In [9]:
# Same model, metric and fold count, evaluated on the TF-IDF features.
cross_val_score(LogisticRegression(), X_tfidf, y, scoring='f1', cv=10).mean()

0.85285995541724557