# **Email Spam Detection**

In [1]:
# Importing libraries

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Loading the data: upload emails.csv through the Colab file picker so that
# the read_csv cell below can find it in the working directory.

from google.colab import files
uploaded = files.upload()

Saving emails.csv to emails.csv


In [23]:
# Reading the uploaded CSV into a DataFrame and previewing the first rows

df = pd.read_csv('emails.csv')

df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [22]:
# Column dtypes and non-null counts.
# NOTE(review): the recorded output lists 5695 entries, which matches the
# shape after duplicate removal rather than the 5728 rows reported right
# after loading, so this cell appears to have been run out of order --
# re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5695 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5695 non-null   object
 1   spam    5695 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 133.5+ KB


In [4]:
# Shape of the dataset as (rows, columns)

df.shape

(5728, 2)

In [5]:
# Listing the column names in the dataset

df.columns

Index(['text', 'spam'], dtype='object')

In [6]:
# Removing duplicate rows. `df` is rebound to the de-duplicated result
# rather than mutated with `inplace=True`, so the loaded frame object is
# never modified behind the reader's back.

df = df.drop_duplicates()

In [7]:
# Shape after removing duplicate rows

df.shape

(5695, 2)

In [8]:
# Counting missing values in each column

df.isnull().sum()

text    0
spam    0
dtype: int64

In [9]:
# Downloading the NLTK stopwords corpus that process_text reads from

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
def process_text(text, stop_words=None):
  """Tokenize a message: strip punctuation, then drop stopwords.

  Args:
    text: Raw message string.
    stop_words: Optional iterable of lowercase words to discard. When
      omitted, NLTK's English stopword list is used.

  Returns:
    List of the remaining whitespace-separated words, original casing kept.
  """
  # Resolve the stopword collection once per call and keep it in a set.
  # Previously stopwords.words('english') was called again for every word
  # of the message and searched as a list.
  if stop_words is None:
    stop_words = stopwords.words('english')
  stop_words = frozenset(stop_words)

  #1 removing punctuation characters
  nopunc = ''.join(char for char in text if char not in string.punctuation)

  #2 remove stopwords (compared case-insensitively)
  #3 return list of clean text words
  return [word for word in nopunc.split() if word.lower() not in stop_words]

In [11]:
# Previewing the tokenization on the first five messages

df['text'].head().apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [12]:
# Converting the collection of texts to a matrix of token counts, using
# process_text as the analyzer.
# NOTE(review): the vectorizer is fit on every row of df before the
# train/test split, so words that occur only in test emails still shape the
# feature space -- consider fitting on the training portion only.

from sklearn.feature_extraction.text import CountVectorizer

messages_bow = CountVectorizer(analyzer = process_text).fit_transform(df['text'])

In [13]:
# Splitting the data: 20% is held out for testing, with a fixed random_state
# so the split is repeatable

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(messages_bow, df['spam'], test_size = 0.2, random_state = 0)

In [14]:
# Shape of messages_bow: one row per message, one column per distinct token

messages_bow.shape

(5695, 37229)

In [15]:
# Creating a multinomial Naive Bayes classifier and fitting it on the
# training split

from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(X_train, y_train)


In [16]:
# Printing the predicted training labels first, then the actual training
# labels on the following lines

print(classifier.predict(X_train), y_train, sep='\n')

[0 0 0 ... 0 0 0]
3337    0
2104    0
3905    0
461     1
314     1
       ..
4950    0
3273    0
1653    0
2611    0
2736    0
Name: spam, Length: 4556, dtype: int64


In [17]:
# Evaluating the model on the training dataset

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pred = classifier.predict(X_train)

# The doubled newline in `end` reproduces the blank line that separates the
# report from the confusion matrix.
print(classification_report(y_train, pred), end='\n\n')
print('Training Confusion Matrix: \n', confusion_matrix(y_train, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Training Confusion Matrix: 
 [[3445   12]
 [   1 1098]]


In [18]:
# Overall accuracy on the training set; `pred` holds the training-set
# predictions assigned in the evaluation cell above
print('Training Accuracy: ', accuracy_score(y_train, pred))

Training Accuracy:  0.9971466198419666


In [19]:
# Evaluating the model on the held-out testing dataset

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pred = classifier.predict(X_test)

# The doubled newline in `end` reproduces the blank line that separates the
# report from the confusion matrix.
print(classification_report(y_test, pred), end='\n\n')
print('Testing Confusion Matrix: \n', confusion_matrix(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Testing Confusion Matrix: 
 [[862   8]
 [  1 268]]


In [20]:
# Overall accuracy on the test set; `pred` holds the test-set predictions
# assigned in the evaluation cell above
print('Testing Accuracy: ', accuracy_score(y_test, pred))

Testing Accuracy:  0.9920983318700615
