In [1]:
import pandas as pd
# Dataset: https://www.kaggle.com/datasets/jackksoncsie/spam-email-dataset
df = pd.read_csv('emails.csv')

# Keep `df` as the raw frame; all cleaning happens on this copy.
new_df = df.copy()
new_df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [2]:
# Summarise the raw frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [3]:
import string
import nltk
import re
# Fetch NLTK's stop-word corpus, which Parser reads via nltk.corpus.stopwords.
nltk.download('stopwords')

class Parser():
    """Normalise raw e-mail text into a space-separated string of stemmed tokens.

    Attributes:
        stemmer: NLTK Porter stemmer applied to every token that is kept.
        stopwords: English stop-word list from NLTK; matching tokens are dropped.
        punctuation: Characters from ``string.punctuation`` that are stripped
            from the text before tokenizing.
    """

    # Header labels removed from the text. The leading ``\b`` keeps the match
    # from firing inside ordinary words followed by a colon ("are :", "here :").
    _META_DATA_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (r'\bsubject\s*:', r'\bto\s*:', r'\bcc\s*:', r'\bre\s*:')
    )

    def __init__(self):
        self.stemmer = nltk.PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.punctuation = list(string.punctuation)

    def tokenize(self, text):
        """Clean ``text`` and return its stemmed, stop-word-free tokens.

        Steps, in order: lowercase; drop header labels (subject/to/cc/re);
        drop e-mail addresses, ``http...`` links and digit runs; strip
        punctuation; split on any whitespace; drop stop words; stem.

        Args:
            text: Raw e-mail text.

        Returns:
            The surviving stemmed tokens joined by single spaces.
        """
        text = text.lower()

        # Text is already lowercased, so the patterns need no IGNORECASE flag.
        for pattern in self._META_DATA_PATTERNS:
            text = pattern.sub('', text)

        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'\d+', '', text)
        for char in self.punctuation:
            text = text.replace(char, '')

        # A set gives constant-time membership tests for the per-token filter.
        stopwords = set(self.stopwords)

        # str.split() with no argument splits on every whitespace run (spaces,
        # newlines, tabs) and never yields empty strings.
        tokens = [self.stemmer.stem(word) for word in text.split() if word not in stopwords]

        return ' '.join(tokens)

[nltk_data] Downloading package stopwords to C:\Users\Thinh
[nltk_data]     Vo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Sanity check: print the first raw e-mail followed by its tokenized form.
first_email = df.iloc[0, 0]
print(first_email)
print(Parser().tokenize(first_email))

Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  mar

In [5]:
# Always derive the cleaned text from the untouched raw frame, so re-running
# this cell does not tokenize text that has already been tokenized.
new_df['text'] = df['text'].apply(Parser().tokenize)
new_df.head(5)

Unnamed: 0,text,spam
0,natur irresist corpor ident lt realli hard rec...,1
1,stock trade gunsling fanni merril muzo colza a...,1
2,unbeliev new home made easi im want show homeo...,1
3,color print special request addit inform click...,1
4,money get softwar cd softwar compat great grow...,1


In [6]:
from sklearn.model_selection import train_test_split

def train_valid_test_split(df, rstate = 1, shuffle = True, stratify = None):
    """Split ``df`` into 60% train, 20% validation and 20% test partitions.

    The frame is first split 60/40, then the 40% holdout is split in half.

    Args:
        df: DataFrame to split.
        rstate: Seed passed as ``random_state`` to both splits.
        shuffle: Passed as ``shuffle`` to both splits.
        stratify: Optional column name. When given, both splits are
            stratified on that column.

    Returns:
        Tuple ``(train_set, valid_set, test_set)``.
    """
    strat = df[stratify] if stratify is not None else None
    train_set, holdout_set = train_test_split(df, test_size = 0.4, random_state = rstate, shuffle = shuffle, stratify = strat)

    # The stratification labels must come from the frame being split; using
    # the full frame's column here would not match the holdout's length.
    strat = holdout_set[stratify] if stratify is not None else None
    valid_set, test_set = train_test_split(holdout_set, test_size = 0.5, random_state = rstate, shuffle = shuffle, stratify = strat)

    return (train_set, valid_set, test_set)

# Default call: shuffled 60/20/20 split with rstate=1 and no stratification.
train_set, valid_set, test_set = train_valid_test_split(new_df)

In [7]:

from sklearn.feature_extraction.text import CountVectorizer

bagOfwords = CountVectorizer()

# Fit the vocabulary on the training texts only, then reuse that same fitted
# vectorizer for the validation and test texts.
x_train = bagOfwords.fit_transform(train_set['text'])
labels = train_set['spam'].to_list()  # training targets taken from the 'spam' column

x_valid = bagOfwords.transform(valid_set['text'])
x_test = bagOfwords.transform(test_set['text'])

In [8]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# Fit each Naive Bayes variant on the training matrix and report its accuracy
# on the validation split. The loop order keeps the final `model` and
# `y_pred_valid` bound to the BernoulliNB run, as before.
y_valid = valid_set['spam'].to_list()
for model in (MultinomialNB(), BernoulliNB()):
    model.fit(x_train, labels)
    y_pred_valid = model.predict(x_valid)
    print(f"Accuracy score of {type(model).__name__} with valid_set: {accuracy_score(y_valid, y_pred_valid)*100:.2f} %")

Accuracy score of MultinomialNB with valid_set: 98.87 %
Accuracy score of BernoulliNB with valid_set: 96.60 %


In [9]:
# Refit each Naive Bayes variant on the training matrix and report its accuracy
# on the test split. The loop order keeps the final `model` and `y_pred_test`
# bound to the BernoulliNB run, as before.
y_test = test_set['spam'].to_list()
for model in (MultinomialNB(), BernoulliNB()):
    model.fit(x_train, labels)
    y_pred_test = model.predict(x_test)
    print(f"Accuracy score of {type(model).__name__} with test_set: {accuracy_score(y_test, y_pred_test)*100:.2f} %")

Accuracy score of MultinomialNB with test_set: 99.21 %
Accuracy score of BernoulliNB with test_set: 96.42 %
