In [2]:
import pandas as pd
import os

# Read the email dataset CSV file
file_path = 'email_spam_dataset.csv'

messages = []

if os.path.exists(file_path):
    # Read the CSV file
    messages = pd.read_csv(file_path, sep=",", encoding="utf-8")
    
    # Print number of lines (rows)
    print(f"Number of lines in the dataset: {len(messages)}")
    # Print the first 5 rows of the dataset
    print("First 5 rows of the dataset:")   
    print(messages.head())
    # Print the last 5 rows of the dataset  
    print("Last 5 rows of the dataset:")
    print(messages.tail())
else:
    print(f"File '{file_path}' not found. Please make sure the email dataset has been generated.")

Number of lines in the dataset: 2000
First 5 rows of the dataset:
                                      subject  \
0                        Welcome to DataVault   
1  CLICK HERE to claim your $50000 cash prize   
2       Your order #ORD33665 has been shipped   
3           Invoice #INV9661 from DataSystems   
4      URGENT: Your account will be suspended   

                                                body label  
0  Team Update - July 12, 2025\n\nHi everyone,\n\...   ham  
1  Investment Opportunity - GUARANTEED returns!\n...  spam  
2  Dear Emily,\n\nYour order #ORD33665 has been p...   ham  
3  Team Update - July 07, 2025\n\nHi everyone,\n\...   ham  
4  Dear Lucky Winner,\n            \nCONGRATULATI...  spam  
Last 5 rows of the dataset:
                                          subject  \
1995  WINNER: You've been selected for $50000 USD   
1996            Invoice #INV7900 from DataSystems   
1997                     Appointment confirmation   
1998            Invoice #INV1037 

In [3]:
# Data cleaning and preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

corpus = []

for i in range(0, len(messages)):
    #print(messages['body'])
    review = re.sub('[^a-zA-Z]', ' ', messages['body'].iloc[i])
    review = messages['body'].iloc[i]
    review = review.lower()
    review = review.split()
    review = [porter.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    print(review)
    corpus.append(review)

print("Corpus Size: ", len(corpus)) 
print("First 5 entries in the corpus:")
print(corpus[1:])

team updat - juli 12, 2025 hi everyone, here' weekli team update: - project a: track, 75% complet - project b: ahead schedule, launch next week - team meeting: friday 2 pm pleas review attach document let know questions. thanks, lisa
invest opportun - guarante returns! make $209% profit 13 days! trade bot use ai guarante profits. risk free - money 100% safe! join 30953 satisfi investors. start now: http://investment-scam.com
dear emily, order #ord33665 process shipped. track packag use track number: tr360424085. expect deliveri date: june 19, 2025 questions, pleas hesit contact custom service. best regards, digitalsolut team
team updat - juli 07, 2025 hi everyone, here' weekli team update: - project a: track, 75% complet - project b: ahead schedule, launch next week - team meeting: friday 2 pm pleas review attach document let know questions. thanks, mike
dear lucki winner, congratulations! select winner intern lotteri program. $5000 usd! claim prize, pleas click link provid person deta

In [4]:
# Create bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=100, binary=True, stop_words='english')
x = cv.fit_transform(corpus).toarray()
print("Shape of the feature matrix X:", x.shape)
print(cv.vocabulary_)


Shape of the feature matrix X: (2000, 100)
{'team': 93, 'juli': 44, '2025': 2, 'hi': 36, 'project': 71, 'track': 95, 'complet': 16, 'launch': 47, 'meeting': 55, 'pleas': 65, 'review': 79, 'attach': 7, 'let': 48, 'know': 46, 'questions': 73, 'invest': 41, 'guarante': 32, 'make': 53, 'bot': 9, 'use': 97, 'risk': 80, 'free': 31, 'money': 56, '100': 1, 'join': 43, 'satisfi': 81, 'start': 91, 'http': 39, 'scam': 82, 'com': 14, 'dear': 25, 'order': 61, 'process': 68, 'shipped': 88, 'packag': 62, 'number': 58, 'expect': 29, 'deliveri': 26, 'date': 23, 'june': 45, 'hesit': 35, 'contact': 18, 'custom': 21, 'service': 86, 'best': 8, 'regards': 74, 'lucki': 52, 'winner': 99, 'congratulations': 17, 'select': 84, 'intern': 40, 'lotteri': 50, 'program': 70, 'usd': 96, 'claim': 12, 'prize': 67, 'click': 13, 'link': 49, 'provid': 72, 'person': 64, 'details': 27, 'suspicious': 92, 'lottery': 51, 'site': 89, 'offer': 59, 'expir': 30, '24': 3, 'hours': 38, 'act': 6, 'commiss': 15, 'invoic': 42, 'servic':

In [5]:
# N-Gram model
from sklearn.feature_extraction.text import CountVectorizer
ngram_vectorizer = CountVectorizer(ngram_range=(2, 3), max_features=500, binary=True)
x_ngram = ngram_vectorizer.fit_transform(corpus).toarray()
print("Shape of the N-Gram feature matrix X:", x_ngram.shape)
# Print the vocabulary for N-Grams
print(ngram_vectorizer.vocabulary_)


Shape of the N-Gram feature matrix X: (2000, 500)


In [8]:
# TF-IDF model
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english', ngram_range=(2, 3))
tfidf_x = tfidf_vectorizer.fit_transform(corpus).toarray()
print("Shape of the TF-IDF feature matrix X:", tfidf_x.shape)
print(tfidf_vectorizer.vocabulary_)
print(tfidf_x)


Shape of the TF-IDF feature matrix X: (2000, 100)
{'let know': 40, 'risk free': 81, 'start http': 89, 'scam com': 82, 'date june': 22, 'best regards': 8, 'dear lucki': 23, 'lucki winner': 48, 'winner congratulations': 95, 'congratulations select': 20, 'select winner': 83, 'winner intern': 97, 'intern lotteri': 35, 'lotteri program': 45, 'usd claim': 92, 'claim prize': 12, 'prize pleas': 70, 'pleas click': 64, 'click link': 16, 'link provid': 42, 'provid person': 72, 'person details': 60, 'details click': 25, 'click http': 14, 'http suspicious': 33, 'suspicious lottery': 90, 'lottery site': 46, 'site com': 87, 'com offer': 18, 'offer expir': 54, 'expir 24': 27, '24 hours': 3, 'hours act': 31, 'act best': 5, 'regards intern': 74, 'lotteri commiss': 44, 'dear lucki winner': 24, 'lucki winner congratulations': 49, 'winner congratulations select': 96, 'congratulations select winner': 21, 'select winner intern': 84, 'winner intern lotteri': 98, 'intern lotteri program': 37, 'usd claim prize'