In [None]:
'''
This notebook explores using different machine learning models to detect spam email and evaluates their results.
Dataset obtained from https://www.kaggle.com/nitishabharathi/email-spam-dataset
data classified using:
1. Naive Bayes model
2. Support Vector Machine (SVM)
3. Random Forest model
Labels: (1) for spam, (0) for not spam.
'''

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Upload data
# Colab-only step: the google.colab package is only importable inside a Google
# Colab runtime. files.upload() prompts for a local file and saves it into the
# working directory; choose completeSpamAssassin.csv, which the next cell reads.
from google.colab import files
uploaded = files.upload()

Saving completeSpamAssassin.csv to completeSpamAssassin.csv


In [None]:
# Load data
# Read the uploaded CSV from the working directory into a DataFrame and preview
# its first five rows.
df = pd.read_csv('completeSpamAssassin.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [None]:
# Exploring the data: (number of rows, number of columns) of the loaded frame
df.shape

(6045, 3)

In [None]:
# More information on dataset (number of non-null rows)
# A column whose non-null count is lower than the number of entries contains
# missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6046 entries, 0 to 6045
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6046 non-null   int64 
 1   Body        6045 non-null   object
 2   Label       6046 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 188.9+ KB


In [None]:
# Discard every row that has at least one missing value (modifies df itself),
# then re-check the non-null counts per column.
df.dropna(axis=0, how='any', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6045 entries, 0 to 6045
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6045 non-null   int64 
 1   Body        6045 non-null   object
 2   Label       6045 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 188.9+ KB


In [None]:
# Fetch the NLTK stopwords corpus so that stopwords.words('english') is
# available to the text-cleaning step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def process_text(text):
  '''
  Tokenize a message into a list of cleaned words.

  1) Remove punctuation from message
  2) Remove stopwords (useless words or data)
  3) Return list of clean text words

  Args:
    text: The raw message as a string.

  Returns:
    A list of the whitespace-separated tokens of `text` after every
    `string.punctuation` character has been deleted and English stopwords
    have been dropped. Stopword matching is case-insensitive, but the tokens
    that survive keep their original casing.
  '''

  # Load the stopword list once per call and keep it in a set. Calling
  # stopwords.words('english') inside the comprehension would re-read the
  # list and scan it linearly for every single token.
  stop_words = frozenset(stopwords.words('english'))

  # Removing punctuation (1): delete all punctuation characters in one pass
  nopunc = text.translate(str.maketrans('', '', string.punctuation))

  # Removing stopwords (2)
  clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

  return clean_words

In [None]:
# Sanity check: run the cleaner on the first five message bodies only and
# display the resulting token lists.
df['Body'].head().apply(process_text)

0    [Save, 70, Life, Insurance, Spend, ToLife, Quo...
1    [1, Fight, Risk, Cancer, httpwwwadclickwspcfmo...
2    [1, Fight, Risk, Cancer, httpwwwadclickwspcfmo...
3    [Adult, Club, Offers, FREE, Membership, INSTAN...
4    [thought, might, like, 1, Slim, Guaranteed, lo...
Name: Body, dtype: object

In [None]:
# Build the bag-of-words count matrix: process_text is used as the analyzer, so
# each column corresponds to one cleaned token and each row to one message.
# NOTE(review): the vocabulary is fitted on every message in df, i.e. before the
# train/test split, so words that only occur in test messages still get columns.
# Consider fitting on the training messages only - confirm whether this matters
# for the reported scores.
msg_bow = CountVectorizer(analyzer=process_text).fit_transform(df['Body'])

In [None]:
# Hold out 20% of the messages for testing and train on the remaining 80%.
# random_state=0 fixes the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    msg_bow,
    df['Label'],
    test_size=0.2,
    random_state=0,
)
print(x_train.shape, x_test.shape, sep='\n')

(4836, 104176)
(1209, 104176)


In [None]:
# Multinomial Naive Bayes classifier fitted on the training bag-of-words counts
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_train, y_train)

# Label the held-out messages with the fitted NB model
y_pred = mnb.predict(x_test)

In [None]:
#Evaluate model
from sklearn.metrics import classification_report, accuracy_score

# Single newline-separated print: heading, per-class report, blank line,
# heading, overall accuracy of the Naive Bayes predictions.
print(
    'Classification Report:',
    classification_report(y_test, y_pred),
    '',
    'Accuracy Score:',
    accuracy_score(y_test, y_pred),
    sep='\n',
)

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       835
           1       0.89      0.98      0.93       374

    accuracy                           0.96      1209
   macro avg       0.94      0.96      0.95      1209
weighted avg       0.96      0.96      0.96      1209


Accuracy Score:
0.9561621174524401


In [None]:
# RBF-kernel support vector classifier fitted on the training counts
# (fit() returns the estimator itself, so construction and training are chained).
# NOTE(review): the recorded report for this configuration shows a spam recall
# of only 0.03 - the kernel / gamma choice is worth revisiting.
from sklearn.svm import SVC

svc = SVC(kernel='rbf', C=1.0, gamma='auto').fit(x_train, y_train)

# Label the held-out messages with the fitted SVM model
y_pred2 = svc.predict(x_test)

In [None]:
#Evaluate model
# Single newline-separated print: heading, per-class report, blank line,
# heading, overall accuracy of the SVM predictions.
print(
    'Classification Report:',
    classification_report(y_test, y_pred2),
    '',
    'Accuracy Score:',
    accuracy_score(y_test, y_pred2),
    sep='\n',
)

Classification Report:
              precision    recall  f1-score   support

           0       0.70      1.00      0.82       835
           1       1.00      0.03      0.06       374

    accuracy                           0.70      1209
   macro avg       0.85      0.52      0.44      1209
weighted avg       0.79      0.70      0.59      1209


Accuracy Score:
0.7005789909015715


In [None]:
# Create and train using random forest classifier
from sklearn.ensemble import RandomForestClassifier

# A random forest is a randomized estimator; random_state pins its randomness
# so that re-running the notebook reproduces the same model and scores.
# The value 0 matches the seed used for the train/test split.
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=0)
rfc.fit(x_train, y_train)

# Predict test data using trained RFC model
y_pred3 = rfc.predict(x_test)

In [None]:
#Evaluate model
# Single newline-separated print: heading, per-class report, blank line,
# heading, overall accuracy of the random forest predictions.
print(
    'Classification Report:',
    classification_report(y_test, y_pred3),
    '',
    'Accuracy Score:',
    accuracy_score(y_test, y_pred3),
    sep='\n',
)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.96       835
           1       0.86      0.96      0.91       374

    accuracy                           0.94      1209
   macro avg       0.92      0.95      0.93      1209
weighted avg       0.94      0.94      0.94      1209


Accuracy Score:
0.9404466501240695
