In [1]:
import pandas as pd
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Checking null values

In [2]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

# Replace ham and spam as 0 and 1

In [3]:
df1 = pd.get_dummies(df['Category'])
df1 = df1.replace({True:1,False:0})
df1.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [4]:
df = pd.concat([df,df1],axis='columns')
df.head()

Unnamed: 0,Category,Message,ham,spam
0,ham,"Go until jurong point, crazy.. Available only ...",1,0
1,ham,Ok lar... Joking wif u oni...,1,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,ham,U dun say so early hor... U c already then say...,1,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",1,0


In [5]:
df.drop(['Category','ham'],inplace=True,axis=1)

In [6]:
df.head()

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


# Splitting data into training and testing data

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Message'],df['spam'],test_size=0.2)

# Converting text data to sparse matrix

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()

X_train_count = v.fit_transform(X_train)

# Performing model training 

In [9]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count,y_train)
model.score(X_train_count,y_train)

0.9937177473636976

# Predicting given mail is spam or not

In [10]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

email_counts = v.transform(emails)

model.predict(email_counts)

array([0, 1], dtype=int64)

In [11]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vect',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [12]:
clf.fit(X_train,y_train)
clf.score(X_train,y_train)

0.9937177473636976