In [1]:
# Load the spam/ham e-mail dataset into a DataFrame

import pandas as pd

# Relative path: the CSV has to sit in the notebook's working directory
messages = pd.read_csv('spam_ham_dataset.csv')

In [2]:
# Preview the first rows to see which columns the CSV provides
messages.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
# Drop the leftover 'Unnamed: 0' column; it is not used as a feature or label.
# errors='ignore' keeps the cell re-runnable: executing it a second time is a
# no-op instead of raising KeyError because the column is already gone.
messages = messages.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Check column dtypes and non-null counts before cleaning the text
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5171 non-null   object
 1   text       5171 non-null   object
 2   label_num  5171 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 121.3+ KB


In [5]:
# Data cleaning and preprocessing: regular expressions plus the NLTK stop-word lists
import re
import nltk
# Fetches the stop-word corpus into the local nltk_data folder (needs network on first run)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\83688\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single stemmer instance, reused for every token when the corpus is built
ps = PorterStemmer()

In [7]:
# Build the cleaned corpus for Model 1: one stemmed, stop-word-free string per e-mail.

# Build the stop-word lookup once, as a set. Calling stopwords.words('english')
# inside the comprehension produced a fresh list for every single token and
# then scanned it linearly, which dominated the run time of this cell.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of label-indexing with [i],
# so the loop does not depend on the frame having a 0..n-1 index.
for text in messages['text']:
    # Replace everything that is not an ASCII letter with a space, then
    # lower-case and split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Stop words are filtered on the raw token, before stemming.
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [8]:
# Spot-check the first stemmed document
corpus[0]

'subject enron methanol meter follow note gave monday preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [9]:
# Create the Bag-of-Words model: learn the vocabulary from the stemmed corpus
# and count term occurrences per document.
# NOTE(review): .toarray() densifies the result. With the shapes shown in this
# notebook (5171 rows x 37890 terms, int64) that is roughly 1.5 GB. MultinomialNB
# and train_test_split accept sparse matrices, so the sparse result could be
# used directly — confirm nothing downstream needs a dense array first.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split, so test documents contribute to the feature space.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [10]:
# Inspect the document-term count matrix: one row per e-mail, one column per vocabulary term
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Target vector: the numeric label (0 = ham, 1 = spam).
# Selecting the column by name rather than "whatever is last" keeps this
# correct even if columns are reordered or new ones are appended to `messages`.
y = messages['label_num']

In [12]:
# Inspect the target vector (0 = ham, 1 = spam)
y

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [14]:
# Training features: (number of training rows, vocabulary size)
X_train.shape

(4136, 37890)

In [15]:
# Training labels: the length should equal the row count of X_train
y_train.shape

(4136,)

In [16]:
# Test features: same column count as X_train
X_test.shape

(1035, 37890)

In [17]:
# Test labels: the length should equal the row count of X_test
y_test.shape

(1035,)

## Model 1
Porter stemming + Bag-of-Words (CountVectorizer) features, classified with Multinomial Naive Bayes.

In [18]:
# Fit a multinomial Naive Bayes classifier on the Bag-of-Words training split

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X=X_train, y=y_train)

In [19]:
# Predict labels for the held-out test rows
y_pred=spam_detect_model.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score , classification_report

In [21]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes and columns the predicted classes, in label order (0 = ham, 1 = spam)
print(confusion_matrix(y_test , y_pred))

[[721  11]
 [ 20 283]]


In [22]:
# Per-class precision, recall and F1 on the held-out test set
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       732
           1       0.96      0.93      0.95       303

    accuracy                           0.97      1035
   macro avg       0.97      0.96      0.96      1035
weighted avg       0.97      0.97      0.97      1035



In [23]:
# Overall fraction of test e-mails classified correctly by Model 1
accuracy_score(y_test , y_pred)

0.970048309178744

## Model 2
WordNet lemmatization + TF-IDF features, classified with Multinomial Naive Bayes.

In [24]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus, which this notebook never
# downloaded (only 'stopwords' was fetched). Without it, lemmatize() raises
# LookupError on a machine that does not already have the data.
# quiet=True keeps the cell free of download logs.
nltk.download('wordnet', quiet=True)
# Some NLTK releases also load the Open Multilingual WordNet data together
# with WordNet; fetching it is harmless where it is not needed.
nltk.download('omw-1.4', quiet=True)
lm = WordNetLemmatizer()

In [25]:
# Build the cleaned corpus for Model 2: one lemmatized, stop-word-free string
# per e-mail. This rebinds `corpus`, replacing the stemmed version.

# Build the stop-word lookup once, as a set. Calling stopwords.words('english')
# inside the comprehension produced a fresh list for every single token and
# then scanned it linearly, which dominated the run time of this cell.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of label-indexing with [i],
# so the loop does not depend on the frame having a 0..n-1 index.
for text in messages['text']:
    # Replace everything that is not an ASCII letter with a space, then
    # lower-case and split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Stop words are filtered on the raw token, before lemmatization.
    lemmas = [lm.lemmatize(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(lemmas))

In [26]:
# Spot-check the first lemmatized document
corpus[0]

'subject enron methanol meter follow note gave monday preliminary flow data provided daren please override pop daily volume presently zero reflect daily activity obtain gas control change needed asap economics purpose'

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with library defaults (no arguments overridden)
vectorizer = TfidfVectorizer()

In [28]:
# Learn the vocabulary and IDF weights from the lemmatized corpus and build the
# TF-IDF matrix. This rebinds X, replacing the Bag-of-Words matrix of Model 1.
# NOTE(review): .toarray() densifies the result. With the shapes shown in this
# notebook (5171 rows x 43162 terms, float64) that is roughly 1.8 GB. MultinomialNB
# and train_test_split accept sparse matrices, so the sparse result could be
# used directly — confirm nothing downstream needs a dense array first.
# NOTE(review): vocabulary and IDF weights are fitted on the full corpus, i.e.
# before the train/test split, so test documents influence the features.
X = vectorizer.fit_transform(corpus).toarray()

In [29]:
# Inspect the dense TF-IDF matrix (NumPy abbreviates the printout)
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
# Re-split with the TF-IDF features, using the same 20% hold-out and seed as before

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [31]:
# Training features for Model 2: (number of training rows, TF-IDF vocabulary size)
X_train.shape

(4136, 43162)

In [32]:
# Training labels: the length should equal the row count of X_train
y_train.shape

(4136,)

In [33]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training split
# (rebinds spam_detect_model, replacing the Model 1 classifier)

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X=X_train, y=y_train)

In [34]:
# Predict labels for the held-out test rows with the TF-IDF model
y_pred=spam_detect_model.predict(X_test)

In [35]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes and columns the predicted classes, in label order (0 = ham, 1 = spam)
print(confusion_matrix(y_test , y_pred))

[[732   0]
 [ 87 216]]


In [36]:
# Per-class precision, recall and F1 on the held-out test set
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94       732
           1       1.00      0.71      0.83       303

    accuracy                           0.92      1035
   macro avg       0.95      0.86      0.89      1035
weighted avg       0.92      0.92      0.91      1035



In [37]:
# Overall fraction of test e-mails classified correctly by Model 2
accuracy_score(y_test , y_pred)

0.9159420289855073