In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# Load only the label (v1) and message text (v2) columns; the file is decoded
# as latin-1 instead of pandas' default UTF-8.
df = pd.read_csv(
    "./spam.csv",
    encoding="latin-1",
    usecols=['v1', 'v2'],
)

In [3]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Give the raw columns descriptive names. The renamed frame is reassigned
# rather than mutated with inplace=True, which keeps the step chainable and
# follows current pandas guidance.
df = df.rename(columns={
    'v1': 'type',
    'v2': 'email'
})

In [5]:
# Confirm the columns now appear as `type` and `email`.
df.head(n=5)

Unnamed: 0,type,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Class balance: how many messages carry each label.
df['type'].value_counts()

ham     4825
spam     747
Name: type, dtype: int64

In [7]:
# Binary target: 1 for rows labelled 'spam', 0 for everything else.
# A vectorized comparison replaces the per-row Python lambda; the resulting
# column holds the same 0/1 int64 values.
df['spam'] = (df['type'] == 'spam').astype('int64')

In [8]:
# Check the new `spam` column next to the original labels.
df.head(n=5)

Unnamed: 0,type,email,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['email'],
    df['spam'],
    test_size=0.2,
    random_state=1,
)

In [10]:
# Learn the vocabulary from the training messages and encode each message as
# a vector of token counts.
vec = CountVectorizer()
train_texts = X_train.values
X_train_count = vec.fit_transform(train_texts)

# Dense view for inspection only; the model is fitted on X_train_count itself.
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# Fit a multinomial Naive Bayes classifier on the training token counts.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

In [12]:
# Two hand-written messages for a quick sanity check of the fitted model.
emails = [
    'Hey pratham, can we get together to watch football game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
]

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# features line up with the vocabulary the model was trained on.
emails_count = vec.transform(emails)
model.predict(X=emails_count)

array([0, 1])

In [13]:
# Encode the held-out messages with the training vocabulary, then report the
# classifier's score on the test split.
X_test_count = vec.transform(X_test)
model.score(X=X_test_count, y=y_test)

0.9847533632286996

In [14]:
# Bundle vectorization and classification into a single estimator so raw text
# can be passed straight to fit/predict. `Pipeline` is imported together with
# the other dependencies in the notebook's first cell.
clf = Pipeline([
    ('vectorize', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [15]:
# Fit the vectorizer and the classifier in one call, directly on raw text.
clf.fit(X=X_train, y=y_train)

In [16]:
# The pipeline vectorizes the raw strings itself before classifying them.
clf.predict(X=emails)

array([0, 1])