In [None]:
# TASK : EMAIL SPAM DETECTION WITH MACHINE LEARNING 

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
d=pd.read_csv("spam.tsv",sep='\t',names=['Class','Messages'])
print(d)

     Class                                           Messages
0      ham  I've been searching for the right words to tha...
1     spam  Free entry in 2 a wkly comp to win FA Cup fina...
2      ham  Nah I don't think he goes to usf, he lives aro...
3      ham  Even my brother is not like to speak with me. ...
4      ham               I HAVE A DATE ON SUNDAY WITH WILL!!!
...    ...                                                ...
5562  spam  This is the 2nd time we have tried 2 contact u...
5563   ham               Will ü b going to esplanade fr home?
5564   ham  Pity, * was in mood for that. So...any other s...
5565   ham  The guy did some bitching but I acted like i'd...
5566   ham                         Rofl. Its true to its name

[5567 rows x 2 columns]


In [3]:
d['length']=d['Messages'].apply(len)
print(d)

     Class                                           Messages  length
0      ham  I've been searching for the right words to tha...     196
1     spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
2      ham  Nah I don't think he goes to usf, he lives aro...      61
3      ham  Even my brother is not like to speak with me. ...      77
4      ham               I HAVE A DATE ON SUNDAY WITH WILL!!!      36
...    ...                                                ...     ...
5562  spam  This is the 2nd time we have tried 2 contact u...     160
5563   ham               Will ü b going to esplanade fr home?      36
5564   ham  Pity, * was in mood for that. So...any other s...      57
5565   ham  The guy did some bitching but I acted like i'd...     125
5566   ham                         Rofl. Its true to its name      26

[5567 rows x 3 columns]


In [4]:
d['Messages'][d['length']>500]


1080    For me the love should start with attraction.i...
1574    How to Make a girl Happy? It's not at all diff...
1858    The last thing i ever wanted to do was hurt yo...
2153    Sad story of a Man - Last week was my b'day. M...
2429    Indians r poor but India is not a poor country...
2844    Sad story of a Man - Last week was my b'day. M...
Name: Messages, dtype: object

In [5]:
# unique class 
d.Class.unique()

array(['ham', 'spam'], dtype=object)

In [6]:
# value count of class 
d.Class.value_counts()

ham     4821
spam     746
Name: Class, dtype: int64

In [7]:
# Text preprocessing 

d.loc[d['Class']=='ham','Class']=1
d.loc[d['Class']=='spam','Class']=0

In [8]:
print(d)

     Class                                           Messages  length
0        1  I've been searching for the right words to tha...     196
1        0  Free entry in 2 a wkly comp to win FA Cup fina...     155
2        1  Nah I don't think he goes to usf, he lives aro...      61
3        1  Even my brother is not like to speak with me. ...      77
4        1               I HAVE A DATE ON SUNDAY WITH WILL!!!      36
...    ...                                                ...     ...
5562     0  This is the 2nd time we have tried 2 contact u...     160
5563     1               Will ü b going to esplanade fr home?      36
5564     1  Pity, * was in mood for that. So...any other s...      57
5565     1  The guy did some bitching but I acted like i'd...     125
5566     1                         Rofl. Its true to its name      26

[5567 rows x 3 columns]


In [9]:
# Text cleaning 
import string
string.punctuation 

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punct(text):
    text="".join([char for char in text if char not in string.punctuation])
    return text


In [14]:
d["text_clean"]=d["Messages"].apply(lambda x:remove_punct(x))

In [15]:
print(d)

     Class                                           Messages  length  \
0        1  I've been searching for the right words to tha...     196   
1        0  Free entry in 2 a wkly comp to win FA Cup fina...     155   
2        1  Nah I don't think he goes to usf, he lives aro...      61   
3        1  Even my brother is not like to speak with me. ...      77   
4        1               I HAVE A DATE ON SUNDAY WITH WILL!!!      36   
...    ...                                                ...     ...   
5562     0  This is the 2nd time we have tried 2 contact u...     160   
5563     1               Will ü b going to esplanade fr home?      36   
5564     1  Pity, * was in mood for that. So...any other s...      57   
5565     1  The guy did some bitching but I acted like i'd...     125   
5566     1                         Rofl. Its true to its name      26   

                                             text_clean  
0     Ive been searching for the right words to than...  
1     F

In [16]:
# split data into x and y 
x=d["text_clean"]
y=d['Class']

In [17]:
# covert object to integer 
y=y.astype('int')

In [18]:
y.dtypes

dtype('int32')

In [19]:
# split data for training and testing 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

In [20]:
# Apply countvectorizer method to convert categorical data into numerical by creating feature vectors 
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words="english")
x_train_cv=cv.fit_transform(x_train)

In [21]:
x_test_cv=cv.transform(x_test)

In [23]:
x_train_cv

<4453x8159 sparse matrix of type '<class 'numpy.int64'>'
	with 34532 stored elements in Compressed Sparse Row format>

In [25]:
# Build model 
from sklearn.naive_bayes import MultinomialNB
model1=MultinomialNB()
model1.fit(x_train_cv,y_train)

In [26]:
y_pred=model1.predict(x_test_cv)

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9829443447037702

In [29]:
# Predictions 
m=input("enter message :")
minput=cv.transform([m])
predict=model1.predict(minput)
if (predict[0]==0):
    print("spam")
else :
    print("Ham")

enter message :hi! whats up dude ??
Ham


In [None]:
# Using TFIDF 

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
x_tf=tf.fit_transform(x)

In [31]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_tf,y,test_size=0.20,random_state=10)

In [32]:
# Bernoulli model 

from sklearn.naive_bayes import BernoulliNB
model2=BernoulliNB(alpha=0.01)

In [33]:
model2.fit(x_train,y_train)

In [34]:
y_pred=model2.predict(x_test)

In [35]:
accuracy_score(y_test,y_pred)

0.9856373429084381

In [37]:
msg=input("Enter the msg")
msginput=tf.transform([msg])
predict=model2.predict(msginput)
if (predict[0]==0):
    print("Spam")
else:
    print("Ham")

Enter the msgis it a rainy day today ?
Ham
