In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import re
import nltk

In [2]:
# Load the tab-separated SMS dataset; the column names are supplied explicitly
# via `names`, so pandas does not take them from the file.
column_names = ["label", "message"]
messages = pd.read_csv('SMSSpamCollection.tsv', sep='\t', names=column_names)
messages

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
...,...,...
5563,spam,This is the 2nd time we have tried 2 contact u...
5564,ham,Will ü b going to esplanade fr home?
5565,ham,"Pity, * was in mood for that. So...any other s..."
5566,ham,The guy did some bitching but I acted like i'd...


In [3]:

# Plain-text preview of the first eight rows.
first_rows = messages.head(8)
print(first_rows)

  label                                            message
0   ham  I've been searching for the right words to tha...
1  spam  Free entry in 2 a wkly comp to win FA Cup fina...
2   ham  Nah I don't think he goes to usf, he lives aro...
3   ham  Even my brother is not like to speak with me. ...
4   ham                I HAVE A DATE ON SUNDAY WITH WILL!!
5   ham  As per your request 'Melle Melle (Oru Minnamin...
6  spam  WINNER!! As a valued network customer you have...
7  spam  Had your mobile 11 months or more? U R entitle...


In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5568 non-null   object
 1   message  5568 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB
None


In [5]:
# Summary statistics (count / unique / top / freq) for the columns.
summary_stats = messages.describe()
print(summary_stats)

       label                 message
count   5568                    5568
unique     2                    5165
top      ham  Sorry, I'll call later
freq    4822                      30


In [6]:
# Character count of every message, stored as a new 'Length' column.
message_lengths = messages['message'].map(len)
messages['Length'] = message_lengths
messages['Length']



0       196
1       155
2        61
3        77
4        35
       ... 
5563    160
5564     36
5565     57
5566    125
5567     26
Name: Length, Length: 5568, dtype: int64

In [7]:
# Preview the first eight rows again, now including the 'Length' column.
rows_with_length = messages.head(8)
print(rows_with_length)


  label                                            message  Length
0   ham  I've been searching for the right words to tha...     196
1  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
2   ham  Nah I don't think he goes to usf, he lives aro...      61
3   ham  Even my brother is not like to speak with me. ...      77
4   ham                I HAVE A DATE ON SUNDAY WITH WILL!!      35
5   ham  As per your request 'Melle Melle (Oru Minnamin...     160
6  spam  WINNER!! As a valued network customer you have...     157
7  spam  Had your mobile 11 months or more? U R entitle...     154


In [8]:
# Number of non-null entries per column for each label value.
per_label = messages.groupby('label')
label_counts = per_label.count()
print(label_counts)


       message  Length
label                 
ham       4822    4822
spam       746     746


In [9]:
# Distribution summary of the message lengths.
length_stats = messages['Length'].describe()
print(length_stats)

count    5568.000000
mean       80.487428
std        59.950961
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64


In [11]:
# Build the cleaned corpus: one normalised string per message, in row order.
ps = PorterStemmer()

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension re-reads the list and
# scans it linearly for every single token.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly instead of messages['message'][i], which is
# a label lookup and only works while the frame has a default RangeIndex.
for text in messages['message']:
    # Replace everything except ASCII letters with spaces, lower-case, and
    # split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words first, then stem the remaining tokens.
    stemmed = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stemmed))
    

In [13]:
# Bag-of-words features: fit the vectorizer on the cleaned corpus, capped at
# 2500 vocabulary entries, then materialise the counts as a dense array.
cv = CountVectorizer(max_features=2500)
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()
X


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:

# Binary target: 1 for 'spam', 0 for everything else.
# An explicit comparison avoids relying on the alphabetical column order that
# get_dummies(...).iloc[:, 1] silently depends on, and pins the dtype to uint8
# regardless of the pandas version's get_dummies default.
y = (messages['label'] == 'spam').astype('uint8').values
y


array([0, 1, 0, ..., 0, 0, 0], dtype=uint8)

In [16]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is reproducible.
split_parts = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts
X_train, X_test, y_train, y_test

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([0, 1, 0, ..., 0, 0, 0], dtype=uint8),
 array([1, 0, 0, ..., 0, 0, 0], dtype=uint8))

In [19]:
# Train a multinomial Naive Bayes classifier on the bag-of-words features,
# report accuracy on the held-out split, then classify one user-typed message.

NB = MultinomialNB()
print(NB)
spam_detect_model = NB.fit(X_train, y_train)

y_pred = spam_detect_model.predict(X_test)
print(y_pred)

# Accuracy as a percentage of correctly classified test messages.
accuracyScore = accuracy_score(y_test, y_pred) * 100
print("Prediction Accuracy :", accuracyScore)

# NOTE: input() blocks until the user types something, so this cell cannot
# complete in an unattended "Run All".
msg = input("Enter Message: ")

# The vectorizer was fitted on text that had non-letters stripped, was
# lower-cased, stop-word filtered and Porter-stemmed. The typed message must
# get the same normalisation, otherwise its words may not match the stemmed
# vocabulary and are ignored by the model.
msg_stemmer = PorterStemmer()
msg_stop_words = set(stopwords.words('english'))
msg_tokens = re.sub('[^a-zA-Z]', ' ', msg).lower().split()
msg_clean = ' '.join(
    msg_stemmer.stem(token) for token in msg_tokens if token not in msg_stop_words
)

msgInput = cv.transform([msg_clean])
print(msgInput)
predict = NB.predict(msgInput)
# Label 0 is the non-spam class, anything else is spam.
if predict[0] == 0:
    print("NotSpam")
else:
    print("spam")

MultinomialNB()
[1 1 0 ... 0 0 0]
Prediction Accuracy : 98.65350089766606
Enter Message: y

NotSpam
