In [11]:
import numpy as np 
import pandas as pd

In [12]:
# Load only the label column (v1) and the message text column (v2);
# the file is decoded as latin-1.
df = pd.read_csv(
    'spam.csv',
    usecols=['v1', 'v2'],
    encoding='latin-1',
)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Inspect column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [14]:
# Summary statistics for the two text columns: count, number of distinct
# values, and the most frequent entry with its frequency.
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [16]:
# Give the raw v1/v2 columns descriptive names.
# Reassign the result rather than using inplace=True: in-place mutation silently
# changes a frame that earlier cells already displayed and brings no
# performance benefit.
df = df.rename(columns={'v1': 'Category', 'v2': 'Message'})

In [17]:
# Count the messages in each category to see the class balance.
messages_by_category = df.groupby('Category')['Message']
spam = messages_by_category.count()
spam

Category
ham     4825
spam     747
Name: Message, dtype: int64

In [18]:
from sklearn import preprocessing

# Encode the text labels as integers in a new 'spam' column
# (the preview shows ham encoded as 0 and spam as 1).
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['Category'])
df['spam'] = label_encoder.transform(df['Category'])
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [19]:
# Features are the raw message text; the target is the integer-encoded label.
x, y = df['Message'], df['spam']

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the classes are imbalanced (4825 ham vs 747 spam); consider
# passing stratify=y so both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=40,
)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings; it is fitted on the training messages in a later cell.
v = CountVectorizer()

In [22]:
# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term count matrix.
x_train_count = v.fit_transform(x_train.values)
# Preview the first three rows. Slice the sparse matrix *before* densifying:
# calling .toarray() on the full matrix allocates a dense
# (n_documents x vocabulary_size) array just to display three rows.
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyperparameters; trained in the next cell.
model = MultinomialNB()

In [24]:
# Train the classifier on the training count matrix and its labels.
model.fit(X=x_train_count, y=y_train)

MultinomialNB()

In [25]:
# Sanity-check the model on two hand-written messages.
casual_message = 'Hey mohan, can we get together to watch footbal game tomorrow?'
promo_message = 'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
emails = [casual_message, promo_message]

# Reuse the already-fitted vectorizer so the features match the training vocabulary.
emails_count = v.transform(emails)
model.predict(X=emails_count)

array([0, 1])

In [26]:
# Encode the held-out messages with the vocabulary learned from the training
# set (transform only, no re-fitting), then show the predicted labels.
x_test_count = v.transform(x_test)
model.predict(X=x_test_count)

array([0, 1, 0, ..., 0, 1, 0])

In [27]:
# Accuracy on the held-out set. The classes are imbalanced, so read this
# together with the confusion matrix rather than on its own.
model.score(X=x_test_count, y=y_test)

0.986244019138756

In [28]:
# Keep the held-out predictions for the confusion matrix.
pred = model.predict(X=x_test_count)

In [29]:
from sklearn.metrics import confusion_matrix, classification_report

# Rows are the true labels and columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm

array([[1451,    6],
       [  17,  198]], dtype=int64)

In [30]:
from sklearn.pipeline import Pipeline

# Bundle vectorization and classification into one estimator so raw text can
# be passed straight to fit/predict/score.
pipeline_steps = [
    ('Vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [31]:
# Fit the whole pipeline directly on the raw training messages.
clf.fit(X=x_train, y=y_train)

Pipeline(steps=[('Vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [32]:
# Accuracy of the pipeline on the raw held-out messages.
clf.score(X=x_test, y=y_test)

0.986244019138756

In [33]:
# Pipeline predictions for the held-out messages.
pred2 = clf.predict(X=x_test)

In [34]:
# Confusion matrix for the pipeline's predictions on the held-out set.
cm2 = confusion_matrix(y_true=y_test, y_pred=pred2)
cm2

array([[1451,    6],
       [  17,  198]], dtype=int64)