In [54]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 500)

In [55]:
data = pd.read_csv("./smsspamcollection/SMSSpamCollection", delimiter='\t', names=("Y", "Text"))
data.Y[data.Y == 'ham'] = 0
data.Y[data.Y == 'spam'] = 1
data.head()

Unnamed: 0,Y,Text
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


In [94]:
data.Y.value_counts()[0] * 1.0 /(data.Y.value_counts().sum())

0.86593682699210339

In [128]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer()#ngram_range=(1,2))
X = vectorizer.fit_transform(data.Text)


cls = LogisticRegression()
res = cross_val_score(cls, X, data.Y.astype(int), scoring="f1", cv=10)
print "mean_fold_res=", "%.2f"%np.mean(res), "std_fold_res=", "%.2f"%np.std(res)

mean_fold_res= 0.93 std_fold_res= 0.02


In [70]:
target_data = ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of \
3 hours talk time to use from your phone now! Subscribe6GB",
"FreeMsg: Txt: claim your reward of 3 hours talk time",
"Have you visited the last lecture on physics?",
"Have you visited the last lecture on physics? Just buy this book and you will have \
all materials! Only 99$",
"Only 99$"]
X_target = vectorizer.transform(target_data)
cls.fit(X, data.Y.astype(int))
cls.predict(X_target)

array([1, 1, 0, 0, 0])

In [123]:
print "Logistic Regression:"
for ngram_range in [(1,1), (2,2), (3,3), (1,3)]:
    vectorizer = CountVectorizer(ngram_range=ngram_range)#ngram_range=(1,2)
    X = vectorizer.fit_transform(data.Text)
    cls = LogisticRegression()
    res = cross_val_score(cls, X, data.Y.astype(int), scoring="f1", cv=10)
    print "ngram_range = ", ngram_range,  "res=", "%.2f"%np.mean(res)

Logistic Regression:
ngram_range =  (1, 1) res= 0.93
ngram_range =  (2, 2) res= 0.82
ngram_range =  (3, 3) res= 0.73
ngram_range =  (1, 3) res= 0.93


In [124]:
print "Multinomial Naive Bayes"
for ngram_range in [(1,1), (2,2), (3,3), (1,3)]:
    vectorizer = CountVectorizer(ngram_range=ngram_range)#ngram_range=(1,2)
    X = vectorizer.fit_transform(data.Text)
    cls = MultinomialNB()
    res = cross_val_score(cls, X, data.Y.astype(int), scoring="f1", cv=10)
    print "ngram_range = ", ngram_range,  "res=", "%.2f"%np.mean(res)

Multinomial Naive Bayes
ngram_range =  (1, 1) res= 0.93
ngram_range =  (2, 2) res= 0.65
ngram_range =  (3, 3) res= 0.38
ngram_range =  (1, 3) res= 0.89


Как мы видим, качество при переходе от количественных признаков к tf*idf падает:

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer
print "Logistic Regression with tf*idf features:"
for ngram_range in [(1,1), (2,2), (3,3), (1,3)]:
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)#ngram_range=(1,2)
    X = vectorizer.fit_transform(data.Text)
    cls = LogisticRegression()
    res = cross_val_score(cls, X, data.Y.astype(int), scoring="f1", cv=10)
    print "ngram_range = ", ngram_range,  "res=", "%.2f"%np.mean(res)

Logistic Regression with tf*idf features:
ngram_range =  (1, 1) res= 0.85
ngram_range =  (2, 2) res= 0.34
ngram_range =  (3, 3) res= 0.17
ngram_range =  (1, 3) res= 0.65


Дальше попробуем побороться за качество

In [126]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.Text)
for c in [50, 500, 10000, 50000]:
    cls = SVC(C=c,)
    res = cross_val_score(cls, X, data.Y.astype(int), scoring="f1", cv=10)
    print  "SVC", "c=", c, "\tres=", "%.2f"%np.mean(res)

SVC c= 50 	res= 0.90
SVC c= 500 	res= 0.94
SVC c= 10000 	res= 0.94
SVC c= 50000 	res= 0.94


In [127]:
cls = GaussianNB()
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.Text)
res = cross_val_score(cls, X.toarray(), data.Y.astype(int), scoring="f1", cv=10)
print  "GaussianNB", "\t res=", "%.2f"%np.mean(res)

GaussianNB 	 res= 0.71


In [118]:
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
stemmer = PorterStemmer()#SnowballStemmer("english")
class StemmedCountVectorizer(CountVectorizer):
    def __init__(self, stemmer):
        super(StemmedCountVectorizer, self).__init__()
        self.stemmer = stemmer

    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc:(self.stemmer.stem(w) for w in analyzer(doc))



In [132]:
def get_stemmed_score(clf, name):
    stem_vectorizer = StemmedCountVectorizer(stemmer)
    text_clf_tfidf = Pipeline([('vect', stem_vectorizer), ('tfidf', TfidfTransformer()), ('clf', clf) ])
    text_clf = Pipeline([('vect', stem_vectorizer), ('clf', SVC(kernel='linear', C=500)) ])

    print "Stemmed " + name
    score = cross_val_score(text_clf_tfidf, data.Text, data.Y.astype(int), scoring='f1', cv=10)
    print "\t", "Tf Idf:", "res=", "%.2f"%np.mean(score), "std=","%.2f"%np.std(score)
    score = cross_val_score(text_clf, data.Text, data.Y.astype(int), scoring='f1', cv=10)
    print '\t', "Count:" "res=","%.2f"%np.mean(score), "std=","%.2f"%np.std(score)

get_stemmed_score(SVC(kernel='linear', C=500), "SVC")
get_stemmed_score(LogisticRegression(C=500), "LogisticRegression")

Stemmed SVC
	Tf Idf: res= 0.95 std= 0.02
	Count:res= 0.94 std= 0.02
Stemmed LogisticRegression
	Tf Idf: res= 0.95 std= 0.02
	Count:res= 0.94 std= 0.02


Лучшее качество при оптимизации "f1"-метрики у меня получилось 0.95. Конечно, хотелось сразу попробовать SVM, он дал результат чуть лучше, потом, конечно, простемить входные данные, стало заметно получше. Стоп-слова я не стал пробовать, потому что интуитивно кажется, это тут только навредит, хотя, конечно, это не аргумент)