In [27]:
# Importing the libraries

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re
import pandas as pd

In [28]:
# Load the tab-separated SMS file. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
messages = pd.read_csv(
    './SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)


In [29]:
# Data cleaning and preprocessing
# Fetch the NLTK resources the cleaning step relies on: the WordNet corpus
# backs WordNetLemmatizer.lemmatize, and the stopwords corpus backs
# stopwords.words('english'). Without 'wordnet' the lemmatizer raises a
# LookupError on a machine where the corpus has never been downloaded.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbantwal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Container for the cleaned messages plus the two text normalisers.
corpus = list()
wordnet = WordNetLemmatizer()
ps = PorterStemmer()


In [31]:
# Build the cleaned corpus: for each message keep letters only, lowercase it,
# drop English stopwords, lemmatize what is left and re-join into one string.
# The stopword list is loaded once into a set instead of being re-read for
# every word, and `corpus` is rebuilt here so re-running this cell does not
# append the same messages a second time.
english_stopwords = set(stopwords.words('english'))
corpus = []
for message in messages['message']:
    # Non-letters become spaces so punctuation and digits act as separators.
    review = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    review = [wordnet.lemmatize(word)
              for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))


In [32]:
# Creating the TF-IDF feature matrix (vocabulary limited to at most 2500 terms)
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so the IDF weights also see the test messages.
cv = TfidfVectorizer(max_features=2500)
tfidf_matrix = cv.fit_transform(corpus)
X = tfidf_matrix.toarray()  # dense array, one row per message

X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
# One-hot encode the labels, then keep the 'spam' indicator as the target.
label_dummies = pd.get_dummies(messages['label'])
print(label_dummies)
# Select the column by name rather than by position so the target cannot
# silently become a different class if the set or order of labels changes.
y = label_dummies['spam'].values
print(y)


        ham   spam
0      True  False
1      True  False
2     False   True
3      True  False
4      True  False
...     ...    ...
5567  False   True
5568   True  False
5569   True  False
5570   True  False
5571   True  False

[5572 rows x 2 columns]
[False False  True ... False False False]


In [34]:
# Train Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)


In [35]:
# Training model using Naive bayes classifier
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)
# Predicted labels for the held-out messages.
y_pred = spam_detect_model.predict(X_test)

In [36]:
# Confusion matrix on the held-out set
# (scikit-learn convention: rows are true classes, columns are predicted classes).
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
c_matrix


array([[954,   1],
       [ 22, 138]], dtype=int64)

In [37]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.979372197309417