In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['class', 'text'])

In [4]:
data.head(10)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


### Now let us split into training and testing

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.3)

### Preparing the data for training and classification

#### using the CountVectorizer and LabelBinarizer

1. CountVectorizer counts the occurance of text elements
2. Turn data elements into binary

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

In [11]:
count_vect = CountVectorizer()

In [37]:
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

In [27]:
X_train_counts

<3900x7284 sparse matrix of type '<class 'numpy.int64'>'
	with 51746 stored elements in Compressed Sparse Row format>

In [18]:
list(count_vect.vocabulary_.items())[0:20]

[('computer', 1820),
 ('gays', 2906),
 ('food', 2751),
 ('comfort', 1792),
 ('stunning', 6134),
 ('dirty', 2198),
 ('even', 2498),
 ('mode', 4302),
 ('loosing', 3972),
 ('randomly', 5235),
 ('christ', 1690),
 ('conversations', 1869),
 ('freaky', 2801),
 ('howda', 3317),
 ('69988', 553),
 ('want2come', 6931),
 ('church', 1696),
 ('videochat', 6842),
 ('shake', 5695),
 ('bags', 1122)]

In [15]:
len(count_vect.vocabulary_)

7284

In [16]:
lab_binary = LabelBinarizer()

In [17]:
y_train_binary = lab_binary.fit_transform(y_train)
y_test_binary = lab_binary.fit_transform(y_test)

### Now let's build the model

1. First use Naive Bayes model to classifer and look at the most import words

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
model_nb = MultinomialNB()

In [28]:
model_nb.fit(X_train_counts, y_train_binary.ravel())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Model has leart 7284 coefficents of words in the vocab and the contribution of each word to the classifier

In [29]:
len(model_nb.coef_[0])

7284

In [30]:
import collections
importance_word = collections.Counter()

In [31]:
for word,imp, in zip(count_vect.vocabulary_.keys(),model_nb.coef_[0]):
    importance_word[word]=imp

#### most important 12 words that help to determine if message is spam or not

In [34]:
importance_word.most_common()[0:12]

[('vco', -3.674003570663408),
 ('telphone', -4.436725694903688),
 ('xam', -4.53772890519256),
 ('iouri', -4.671950667192102),
 ('roommate', -4.820591691208391),
 ('beauty', -4.859812404361673),
 ('resolution', -4.873235424693814),
 ('750', -4.965172920019499),
 ('minmobsmorelkpobox177hp51fl', -4.995250375256777),
 ('keypad', -5.066426653724672),
 ('chat', -5.134249250063433),
 ('trial', -5.197618863996023)]

In [39]:
predict = model_nb.predict(X_test_counts)


In [40]:
from sklearn.metrics import average_precision_score

In [42]:
print('Accuracy = {:f}'.format(average_precision_score(y_test_binary,predict)))

Accuracy = 0.956741


#### The model has predicted well with a 94% Accuracy, let's now test it with a unseen custom message

In [44]:
model_nb.predict(count_vect.transform(['You are a winner, please pass on your bank account']))

array([1])

#### I will make a function to allow for testing of any future messages

In [65]:
def new_predict(text):
    prediction = model_nb.predict(count_vect.transform(text))
    if prediction == 1:
        print(prediction)
        print("This is spam")
    else:
        print(prediction)
        print("This is not spam")

In [66]:
test_text = ['Hey what are your plans for tonight']

In [67]:
new_predict(test_text)

[0]
This is not spam


In [72]:
test_text_2 = ['Win big with offer']

In [73]:
new_predict(test_text_2)

[1]
This is spam
