In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [8]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == "spam" else 0)
df.drop('Category', axis=1, inplace=True)
df

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [9]:
x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.25)

We know that our model can't read text values unless it's converted into binary form or any numeric form. We could use label encoder here but it was possible if our text size was so small and specific. Here we have to use another technique called count vextorizer.
It takes all the unique words from your features and make other feature columns by the name of these unique texts. then compare the texts with these unique tests and see how many times these words have appered in each features and then just sum them and put the sum value in those newly created feature columns

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
v = CountVectorizer()
x_train_count = v.fit_transform(x_train)
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [30]:
x_test_count = v.transform(x_test)
model.score(x_test_count, y_test)

0.9863603732950467

In [31]:
emails = [
    "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
]

emails_count = v.transform(emails)
emails_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [32]:
model.predict(emails_count)

array([0, 1], dtype=int64)

In [33]:
model.predict(x_test_count)

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

You can see it's working fine. but there is a tedious task that we have to do each time. like--- everytime we have to convert the test data into count vectorizer. to avoid this we can use sklearn pipeline library

In [34]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])


It will automatically convert the inputs into count vectorizer form

In [35]:
clf.fit(x_train, y_train)

In [36]:
clf.score(x_test, y_test)

0.9863603732950467