In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [73]:
# Load the data set, keeping only the label column and the message text.
spam_df = pd.read_csv(
    "data.csv",
    usecols=["Category", "Message"],
)

In [74]:
# Drop rows that have no message text. The result is rebound to the same
# name rather than mutating the frame with inplace=True, so the step reads
# as an explicit "new frame from old frame" transformation.
spam_df = spam_df.dropna(subset=["Message"])

In [75]:
# Convenience handles on the message text (x) and the raw string label (y).
x, y = spam_df["Message"], spam_df["Category"]

In [76]:
# Inspect the data set (the rendered frame is truncated to its first and last rows).
# NOTE(review): the recorded output contains a replacement character in one
# message, which suggests an encoding problem in the source data - confirm.
spam_df

Unnamed: 0,Message,Category
0,naturally irresistible your corporate identity...,spam
1,the stock trading gunslinger fanny is merrill...,spam
2,unbelievable new homes made easy im wanting t...,spam
3,4 color printing special request additional i...,spam
4,"do not have money , get software cds from here...",spam
...,...,...
11295,This is the 2nd time we have tried 2 contact u...,spam
11296,Will �_ b going to esplanade fr home?,ham
11297,"Pity, * was in mood for that. So...any other s...",ham
11298,The guy did some bitching but I acted like i'd...,ham


In [77]:
# Add a numeric "Spam" target column: 1 if Category is exactly "spam",
# 0 for every other value (a missing label also compares unequal and maps to 0).
# A vectorized comparison is used instead of a per-row Python lambda; the
# explicit int64 cast keeps the same integer dtype the lambda produced.
spam_df["Spam"] = spam_df["Category"].eq("spam").astype("int64")

In [78]:
# Preview the first rows to check the "Spam" flag against the "Category" label.
spam_df.head()

Unnamed: 0,Message,Category,Spam
0,naturally irresistible your corporate identity...,spam,1
1,the stock trading gunslinger fanny is merrill...,spam,1
2,unbelievable new homes made easy im wanting t...,spam,1
3,4 color printing special request additional i...,spam,1
4,"do not have money , get software cds from here...",spam,1


In [79]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df["Message"],
    spam_df["Spam"],
    test_size=0.25,
    random_state=42,
)

In [80]:
# Summarise the training messages (count, unique, most frequent value and its frequency).
# In the recorded output `unique` is lower than `count`, i.e. some messages are duplicated.
x_train.describe()

count                       8474
unique                      8205
top       Sorry, I'll call later
freq                          24
Name: Message, dtype: object

In [81]:
# Learn a bag-of-words vocabulary from the training messages and encode those
# same messages as token-count vectors that the classifier can consume.
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train)

In [82]:
# Fit a multinomial Naive Bayes classifier on the training token counts.
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [83]:
# Smoke test with an everyday message: encode it with the fitted vectorizer
# and ask the model for a label (0 = ham, 1 = spam).
email = ["Hey lets meet tomorrow"]
email_vector = cv.transform(email)
model.predict(email_vector)

array([0])

In [84]:
# Smoke test with a promotional-sounding message: encode it with the fitted
# vectorizer and ask the model for a label (0 = ham, 1 = spam).
email = ["click here to claim your reward"]
email_vector = cv.transform(email)
model.predict(email_vector)

array([1])

In [85]:
# Check the size, index range and non-null count of the training messages.
x_train.info()

<class 'pandas.core.series.Series'>
Index: 8474 entries, 2110 to 7270
Series name: Message
Non-Null Count  Dtype 
--------------  ----- 
8474 non-null   object
dtypes: object(1)
memory usage: 132.4+ KB


In [86]:
# Check the size, index range and non-null count of the held-out messages.
x_test.info()

<class 'pandas.core.series.Series'>
Index: 2825 entries, 6173 to 9729
Series name: Message
Non-Null Count  Dtype 
--------------  ----- 
2825 non-null   object
dtypes: object(1)
memory usage: 44.1+ KB


In [87]:
# Encode the held-out messages with the already-fitted vectorizer (transform
# only, no refit), then report the classifier's mean accuracy on them.
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9362831858407079