#### Data source: SMS Spam Collection (UCI Machine Learning Repository), https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# train_test_split has lived in sklearn.model_selection since scikit-learn
# 0.18, and the old sklearn.cross_validation module was removed in 0.20.
# Prefer the current location; fall back only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Count Vectorization

In [2]:
# Vectorizer with default settings, used for the toy example below.
cv = CountVectorizer()

In [3]:
# Learn the vocabulary from three toy sentences and count each word per sentence.
x_traincv = cv.fit_transform(
    [
        "Hi How are you How are you doing",
        "Hi what's up",
        "Wow that's awesome",
    ]
)

In [4]:
# Dense view of the toy count matrix: one row per sentence, one column per
# vocabulary word (fine to densify here because the example is tiny).
x_traincv.toarray()

array([[2, 0, 1, 1, 2, 0, 0, 0, 0, 2],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [5]:
# Vocabulary learned by the vectorizer, in column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and keep the plain-list display.
list(cv.get_feature_names_out()) if hasattr(cv, "get_feature_names_out") else cv.get_feature_names()

['are', 'awesome', 'doing', 'hi', 'how', 'that', 'up', 'what', 'wow', 'you']

## Data loading & Understanding

In [6]:
# Load the tab-separated, header-less SMS file into two named columns.
# NOTE(review): the UCI description reportedly lists 5,574 messages; with
# pandas' default quote handling, lines containing an unbalanced double quote
# may be merged -- confirm the row count and consider quoting=csv.QUOTE_NONE.
dataset = pd.read_csv(
    'data/smsspam',
    sep='\t',
    names=['Status', 'Message'],
)

In [7]:
# Peek at the first rows to check the label/message columns were parsed.
dataset.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Total number of rows (messages) loaded.
dataset.shape[0]

5572

In [9]:
# Number of messages labelled 'spam' (count the True entries of the mask).
int((dataset["Status"] == 'spam').sum())

747

In [10]:
# Number of messages labelled 'ham' (count the True entries of the mask).
int((dataset["Status"] == 'ham').sum())

4825

In [11]:
# Encode the 'ham' label as 1 (modifies the Status column in place).
dataset.loc[dataset["Status"].eq('ham'), "Status"] = 1

In [12]:
# Encode the 'spam' label as 0 (modifies the Status column in place).
dataset.loc[dataset["Status"].eq('spam'), "Status"] = 0

In [13]:
# Re-check the first rows: Status should now hold 1 (ham) / 0 (spam).
dataset.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Pull the raw message texts (features) and encoded labels (target) out of
# the frame as NumPy arrays.
X, y = dataset["Message"].values, dataset["Status"].values

In [15]:
# First two raw messages, as a sanity check of the feature array.
X[:2]

array([ 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...'], dtype=object)

## Splitting the dataset into training and testing sets

In [16]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [17]:
# First five training messages after the shuffle/split.
X_train[:5]

array([ 'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..',
       'How long has it been since you screamed, princess?',
       'Urgent! call 09066612661 from landline. Your complementary 4* Tenerife Holiday or £10,000 cash await collection SAE T&Cs PO Box 3 WA14 2PX 150ppm 18+ Sender: Hol Offer',
       'Okay. No no, just shining on. That was meant to be signing, but that sounds better.',
       'Wen ur lovable bcums angry wid u, dnt take it seriously.. Coz being angry is d most childish n true way of showing deep affection, care n luv!.. kettoda manda... Have nice day da.'], dtype=object)

## Count Vectorization

In [53]:
# Fresh vectorizer for the SMS data; rebinding `cv` drops the vocabulary that
# was fitted on the toy sentences earlier.
cv = CountVectorizer()

In [54]:
# Learn the vocabulary from the training messages only and build their
# document-term count matrix.
X_traincv = cv.fit_transform(X_train)

In [55]:
# (documents, vocabulary size) of the training matrix. Read .shape straight
# from the sparse matrix: converting to a dense array first would allocate
# the full matrix just to inspect its dimensions.
X_traincv.shape

(4457, 7762)

In [56]:
# transform (not fit_transform): the test messages are encoded with the
# vocabulary learned from the training set, so the columns line up.
X_testcv = cv.transform(X_test)

In [57]:
# (documents, vocabulary size) of the test matrix. Read .shape straight from
# the sparse matrix instead of densifying it first.
X_testcv.shape

(1115, 7762)

In [58]:
# Cast the training labels to integers before fitting
# (presumably still object dtype after the in-place recoding -- verify).
y_train = y_train.astype(int)

In [62]:
# Spot-check the first two training labels after the integer cast.
y_train[: 2]

array([1, 1])

In [60]:
# Cast the test labels to integers so they match the training labels' type.
y_test = y_test.astype(int)

# Naive Bayes

In [64]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
naive_bayes = MultinomialNB()

In [65]:
# Fit the classifier on the training count matrix and integer labels.
naive_bayes.fit(X_traincv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [66]:
# Predicted labels for the held-out messages.
# NOTE(review): y_test_pred is not used again in the cells visible here.
y_test_pred = naive_bayes.predict(X_testcv)

## Accuracy

In [67]:
# Training accuracy: mean accuracy on the same data the model was fitted on
# (an optimistic figure; compare with the test accuracy below).
naive_bayes.score(X_traincv, y_train)

0.99371774736369756

In [68]:
# Testing accuracy: mean accuracy on the held-out 20% split.
naive_bayes.score(X_testcv, y_test)

0.97937219730941705