In [2]:
!pip install scikit-learn



In [1]:
import sklearn

In [19]:
#import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [20]:
#import data
spam_df = pd.read_csv("spam.csv")

In [21]:
#introspect data
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [22]:
#inspect data
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [23]:
#turn spam/ham into numerical data, creating a new column for "spam"
spam_df['spam'] = spam_df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [24]:
spam_df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [25]:
# create train/test split
x_train, x_test, y_train, y_test = train_test_split(spam_df.Message, spam_df.spam, test_size = 0.25)

In [26]:
x_train

4307    Awww dat is sweet! We can think of something t...
5272    Hello.How u doing?What u been up 2?When will u...
1850    You got job in wipro:)you will get every thing...
384             Hey i will be late ah... Meet you at 945+
2621                                            How come?
                              ...                        
3691                             What happen dear tell me
4046                        If You mean the website. Yes.
203     Your account has been refilled successfully by...
2184    Chinatown got porridge, claypot rice, yam cake...
3799    Feb  &lt;#&gt;  is "I LOVE U" day. Send dis to...
Name: Message, Length: 4179, dtype: object

In [27]:
x_train.describe()

count                       4179
unique                      3915
top       Sorry, I'll call later
freq                          29
Name: Message, dtype: object

In [28]:
#find word count and store data as a matrix
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [29]:
x_train_count

<4179x7445 sparse matrix of type '<class 'numpy.int64'>'
	with 55611 stored elements in Compressed Sparse Row format>

In [30]:
#matrix display
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# train model
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [37]:
#pre-test ham
email_ham = ["hey wanna meet for a cricket match?"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [38]:
#pre-test spam
email_spam = ["free sale bonus"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [41]:
# test model
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9877961234745154

In [45]:
# random test
email_ham = ["Big Earn Money Guaranteed"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([1], dtype=int64)

In [46]:
# random test
email_spam = ["sorry need time thankyou"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([0], dtype=int64)

In [48]:
# random test
email_spam = ["Can I take you out for a cup of coffee?"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([0], dtype=int64)

In [52]:
# random test
email_ham = ["guarantee: skyrocket your income with this incredible opportunity!"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([1], dtype=int64)