# Build a spam filter using NLP and machine learning to
 # identify and filter out spam emails

# wheather the email is spam or not like 0 or 1

# Importing necessary libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Importing dataset using pandas

In [3]:
# Reading the csv file
data = pd.read_csv("emails.csv")
data

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [4]:
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
data.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


# Data Visualizations

In [6]:
data.shape

(5728, 2)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [8]:
data.describe()

Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [9]:
data.nunique()

text    5695
spam       2
dtype: int64

In [10]:
data.columns

Index(['text', 'spam'], dtype='object')

In [11]:
## checking for the null values in the dataset
data.isnull().sum()

text    0
spam    0
dtype: int64

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


In [13]:
# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keerthana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text Pre-Processing

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')

# Function to preprocess the text
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    
    # Remove punctuation and stopwords
    tokens = [word for word in tokens if word.isalpha() and word not in stopwords.words('english')]
    
    return ' '.join(tokens)

# Apply preprocessing to the 'text' column
data['text_processed'] = data['text'].apply(preprocess_text)

# Display the updated dataframe
data.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keerthana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,spam,text_processed
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,subject money get software cds software compat...


# Train Test Split

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['text_processed'], data['spam'], test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (4582,)
Testing set shape: (1146,)


# TF - Term Frequency and IDF - Inverse Document Frequency

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features based on your dataset size

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = vectorizer.transform(X_test)


# Training a model

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)


# model evaluation

In [21]:
# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9790575916230366

Confusion Matrix:
 [[853   3]
 [ 21 269]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       0.99      0.93      0.96       290

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146



# Train a Support Vector Machine (SVM) Classifier

In [22]:
from sklearn.svm import SVC

# Initialize and train the SVM classifier
svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, y_train)


# Make Predictions and Evaluate SVM

In [23]:
# Make predictions on the test set using SVM
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Evaluate the SVM model
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nSVM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nSVM Classification Report:\n", classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.9895287958115183

SVM Confusion Matrix:
 [[854   2]
 [ 10 280]]

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       856
           1       0.99      0.97      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.99      1146
weighted avg       0.99      0.99      0.99      1146



In [24]:
pip install scikit-learn tensorflow





[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip





# (Random Forest): Train a Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)


#  (Random Forest): Make Predictions and Evaluate Random Forest

In [26]:
# Make predictions on the test set using Random Forest
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate the Random Forest model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9860383944153578

Random Forest Confusion Matrix:
 [[855   1]
 [ 15 275]]

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       1.00      0.95      0.97       290

    accuracy                           0.99      1146
   macro avg       0.99      0.97      0.98      1146
weighted avg       0.99      0.99      0.99      1146



# (Decision Tree): Train a Decision Tree Classifier

In [27]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the Decision Tree classifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_tfidf, y_train)


#  (Decision Tree): Make Predictions and Evaluate Decision Tree

In [28]:
# Make predictions on the test set using Decision Tree
y_pred_dt = dt_classifier.predict(X_test_tfidf)

# Evaluate the Decision Tree model
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nDecision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nDecision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.956369982547993

Decision Tree Confusion Matrix:
 [[835  21]
 [ 29 261]]

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97       856
           1       0.93      0.90      0.91       290

    accuracy                           0.96      1146
   macro avg       0.95      0.94      0.94      1146
weighted avg       0.96      0.96      0.96      1146



# (Neural Network - MLP): Train a Neural Network (Multilayer Perceptron) Classifier

In [29]:
from sklearn.neural_network import MLPClassifier

# Initialize and train the MLP (Neural Network) classifier
mlp_classifier = MLPClassifier()
mlp_classifier.fit(X_train_tfidf, y_train)


# (Neural Network - MLP): Make Predictions and Evaluate Neural Network

In [30]:
# Make predictions on the test set using Neural Network
y_pred_mlp = mlp_classifier.predict(X_test_tfidf)

# Evaluate the Neural Network model
print("Neural Network Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("\nNeural Network Confusion Matrix:\n", confusion_matrix(y_test, y_pred_mlp))
print("\nNeural Network Classification Report:\n", classification_report(y_test, y_pred_mlp))


Neural Network Accuracy: 0.9930191972076788

Neural Network Confusion Matrix:
 [[854   2]
 [  6 284]]

Neural Network Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       856
           1       0.99      0.98      0.99       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



# Evaluates multiple classifiers, including Naive Bayes, Support Vector Machine, Random Forest, Decision Tree, and Neural Network (MLP)

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Train and evaluate Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
y_pred_nb = nb_classifier.predict(X_test_tfidf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", accuracy_nb)

# Train and evaluate Support Vector Machine
svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, y_train)
y_pred_svm = svm_classifier.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

# Train and evaluate Random Forest
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)
y_pred_rf = rf_classifier.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

# Train and evaluate Decision Tree
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_tfidf, y_train)
y_pred_dt = dt_classifier.predict(X_test_tfidf)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", accuracy_dt)

# Train and evaluate Neural Network (MLP)
mlp_classifier = MLPClassifier()
mlp_classifier.fit(X_train_tfidf, y_train)
y_pred_mlp = mlp_classifier.predict(X_test_tfidf)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print("Neural Network Accuracy:", accuracy_mlp)


Naive Bayes Accuracy: 0.9790575916230366
SVM Accuracy: 0.9895287958115183
Random Forest Accuracy: 0.9851657940663177
Decision Tree Accuracy: 0.9598603839441536
Neural Network Accuracy: 0.9921465968586387
