In [1]:
import pandas as pd
import nltk
#nltk.download()

# Dataset source: https://www.kaggle.com/uciml/sms-spam-collection-dataset
# Load the CSV, discard the three unnamed trailing columns, and give the two
# remaining columns readable names -- all in one chain so `data` is bound once.
data = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
    .rename(columns={"v1": "label", "v2": "text"})
)

# Show how many messages fall under each label.
print(data.label.value_counts())

# Numeric target column: 0 for "ham", 1 for "spam".
label_to_int = {"ham": 0, "spam": 1}
data["spam"] = data["label"].map(label_to_int)
data.head()

ham     4825
spam     747
Name: label, dtype: int64


Unnamed: 0,label,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [2]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Porter stemmer instance. It is created here but fixSentence does not call it,
# so the text it returns is NOT stemmed.
porterStemmer = nltk.PorterStemmer()


def fixSentence(sentence):
    """Lowercase a sentence, tokenize it with NLTK, and re-join the tokens.

    Args:
        sentence: The raw message text (a string).

    Returns:
        A single string made of the lowercased tokens separated by one space
        each. Punctuation is kept as whatever tokens ``nltk.word_tokenize``
        produces; no stemming or stop-word removal is performed.
    """
    lowered = sentence.lower()
    words = nltk.word_tokenize(lowered)
    # Disabled experiment: keep only the tokens that WordNet recognizes.
    # words = [word for word in words if wordnet.synsets(word)]
    return " ".join(words)

# Normalize every message by applying fixSentence to the "text" column itself.
# A row-wise DataFrame.apply(axis=1) would construct a Series per row only to
# read this one column; Series.apply yields the same values on the same index.
# NOTE: despite the column name, fixSentence only lowercases and tokenizes --
# the text stored here is not stemmed.
data["stemmed_text"] = data["text"].apply(fixSentence)
data.head()

Unnamed: 0,label,text,spam,stemmed_text
0,ham,"Go until jurong point, crazy.. Available only ...",0,"go until jurong point , crazy.. available only..."
1,ham,Ok lar... Joking wif u oni...,0,ok lar ... joking wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,u dun say so early hor ... u c already then sa...
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"nah i do n't think he goes to usf , he lives a..."


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the TF-IDF vocabulary from the normalized messages and build the
# feature matrix X. The fitted vectorizer `v` is kept so that new sentences
# can later be transformed into the same feature space.
v = TfidfVectorizer()
normalized_messages = data["stemmed_text"]
X = v.fit_transform(normalized_messages)

In [4]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score

# Naive Bayes variants to compare on the TF-IDF features.
models = [
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB()
]

# Every model is trained and scored on the dense form of X. Convert once here
# instead of calling X.toarray() twice per model inside the loop.
X_dense = X.toarray()
labels = data["spam"]

# Two hand-written probe messages (the first spam-like, the second ham-like).
# They do not depend on the model, so normalize and vectorize them once with
# the already-fitted vectorizer rather than on every iteration.
probe_sentences = [
    "win call text now please get for free",
    "hey how you doing ok",
]
probe_features = [
    v.transform([fixSentence(sentence)]).toarray()
    for sentence in probe_sentences
]

for model in models:
    # 4-fold cross-validated score of a fresh fit of this model.
    scores = cross_val_score(model, X_dense, labels, cv=4)
    print(model, "mean score", scores.mean())

    # Refit on the full dataset, then show the predicted class (1 = spam,
    # 0 = ham) for each probe message.
    model.fit(X_dense, labels)
    print("test", *(model.predict(features) for features in probe_features))

GaussianNB(priors=None) mean score 0.900035483106
test [1] [0]
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) mean score 0.956568545972
test [0] [0]
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) mean score 0.98007844666
test [0] [0]
