# Problem statement

Spam-- Fake messages
Ham-- Good messages

In [1]:
# Import librraies
import numpy as np
import pandas as pd

In [2]:
# load the data
data=pd.read_csv("spam.tsv",sep='\t',names=['Class','Messages'])

In [3]:
data

Unnamed: 0,Class,Messages
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!
...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...
5563,ham,Will ü b going to esplanade fr home?
5564,ham,"Pity, * was in mood for that. So...any other s..."
5565,ham,The guy did some bitching but I acted like i'd...


In [4]:
data['length']=data['Messages'].apply(len)

In [5]:
data

Unnamed: 0,Class,Messages,length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
...,...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...,160
5563,ham,Will ü b going to esplanade fr home?,36
5564,ham,"Pity, * was in mood for that. So...any other s...",57
5565,ham,The guy did some bitching but I acted like i'd...,125


In [6]:
data['Messages'][data['length']>500]

1080    For me the love should start with attraction.i...
1574    How to Make a girl Happy? It's not at all diff...
1858    The last thing i ever wanted to do was hurt yo...
2153    Sad story of a Man - Last week was my b'day. M...
2429    Indians r poor but India is not a poor country...
2844    Sad story of a Man - Last week was my b'day. M...
Name: Messages, dtype: object

In [7]:
# Find unique classes
data.Class.unique()

array(['ham', 'spam'], dtype=object)

In [8]:
# value count of classes
data.Class.value_counts()

ham     4821
spam     746
Name: Class, dtype: int64

# Text preprocessing

In [9]:
# class: ham/spam
# convert categorical classes into numerical by replacing ham as 1 and spam as 0.
# ham-->1
# spam-->0

In [10]:
data.loc[data['Class']=='ham','Class']=1
data.loc[data['Class']=='spam','Class']=0

In [11]:
data

Unnamed: 0,Class,Messages,length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
...,...,...,...
5562,0,This is the 2nd time we have tried 2 contact u...,160
5563,1,Will ü b going to esplanade fr home?,36
5564,1,"Pity, * was in mood for that. So...any other s...",57
5565,1,The guy did some bitching but I acted like i'd...,125


# Text cleaning

In [12]:
# handle punctuations
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
def remove_punct(text):
    text="".join([char for char in text if char not in string.punctuation])
    return text

In [14]:
data["text_clean"]=data["Messages"].apply(lambda x:remove_punct(x))

In [15]:
data

Unnamed: 0,Class,Messages,length,text_clean
0,1,I've been searching for the right words to tha...,196,Ive been searching for the right words to than...
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
2,1,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
3,1,Even my brother is not like to speak with me. ...,77,Even my brother is not like to speak with me T...
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36,I HAVE A DATE ON SUNDAY WITH WILL
...,...,...,...,...
5562,0,This is the 2nd time we have tried 2 contact u...,160,This is the 2nd time we have tried 2 contact u...
5563,1,Will ü b going to esplanade fr home?,36,Will ü b going to esplanade fr home
5564,1,"Pity, * was in mood for that. So...any other s...",57,Pity was in mood for that Soany other suggest...
5565,1,The guy did some bitching but I acted like i'd...,125,The guy did some bitching but I acted like id ...


In [16]:
# split data into x and y
x=data["text_clean"]
y=data["Class"]

In [17]:
# convert object data type of Class into integer
y=y.astype('int')

In [18]:
y.dtypes

dtype('int32')

In [19]:
# Split data for tetsing and training
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

In [20]:
# Apply countvectorizer method to convert categorical data into numerical by creating feature vectors 
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words="english")
x_train_cv=cv.fit_transform(x_train)

In [21]:
x_test_cv=cv.transform(x_test)

In [22]:
x_train_cv

<4453x8159 sparse matrix of type '<class 'numpy.int64'>'
	with 34532 stored elements in Compressed Sparse Row format>

# Build a model

In [23]:
from sklearn.naive_bayes import MultinomialNB
model1=MultinomialNB()
model1.fit(x_train_cv,y_train)

MultinomialNB()

In [24]:
y_pred=model1.predict(x_test_cv)

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9829443447037702

In [27]:
# Predictions
msg=input("Enter the msg")
msginput=cv.transform([msg])
predict=model1.predict(msginput)
if (predict[0]==0):
    print("Spam")
else:
    print("Ham")
    

Enter the msgHey! Good morning. You have meeting around 2pm.
Ham


In [None]:
Hey! you have won the prize worth of 50000$

Hey, Good morning. You have meeting at 2pm.

In [None]:
# TFIDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
x_tf=tf.fit_transform(x)

In [30]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_tf,y,test_size=0.20,random_state=10)

In [35]:
# Bernoulli model
from sklearn.naive_bayes import BernoulliNB
model2=BernoulliNB(alpha=0.01)

In [36]:
model2.fit(x_train,y_train)

BernoulliNB(alpha=0.01)

In [37]:
y_pred=model2.predict(x_test)

In [38]:
accuracy_score(y_test,y_pred)

0.9856373429084381

In [None]:
msg=input("Enter the msg")
msginput=tf.transform([msg])
predict=model2.predict(msginput)
if (predict[0]==0):
    print("Spam")
else:
    print("Ham")