In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import email
import email.policy
import os
import random
from bs4 import BeautifulSoup
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# Root folder of the corpus; later cells read its 'spam' and 'ham' subfolders.
# The trailing slash matters wherever paths are built by plain string concatenation.
base_dir = '/content/drive/MyDrive/project_data/'

In [None]:
# Name of the subfolder inspected by the directory-listing check that follows.
label = 'spam'

In [None]:
# Sanity check: count the entries in the selected subfolder.
files = os.listdir(base_dir + label)
len(files)

501

In [None]:
# File names (not full paths) found in each class folder.
# os.listdir() returns entries in arbitrary order, so sort them to make every
# run process the files in the same order.
spam_email_dir = sorted(os.listdir(base_dir + 'spam'))
ham_email_dir = sorted(os.listdir(base_dir + 'ham'))

In [None]:
def load_email(is_spam, filename, data_dir=None):
    """Parse one raw e-mail file into a message object.

    Args:
        is_spam: Truthy to read from the ``spam`` subfolder, falsy for ``ham``.
        filename: Name of the file inside that subfolder.
        data_dir: Folder containing the ``spam``/``ham`` subfolders. Defaults
            to the notebook-level ``base_dir``.

    Returns:
        The message parsed by ``BytesParser`` with ``email.policy.default``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Imported explicitly so the function does not rely on some other package
    # having loaded the ``email.parser`` submodule as a side effect.
    from email.parser import BytesParser

    root = base_dir if data_dir is None else data_dir
    subfolder = 'spam' if is_spam else 'ham'
    # os.path.join works whether or not the root ends with a path separator.
    with open(os.path.join(root, subfolder, filename), 'rb') as f:
        return BytesParser(policy=email.policy.default).parse(f)

In [None]:
# Parse every listed file into a message object, one list per class.
spam_emails = [load_email(True, name) for name in spam_email_dir]
ham_emails = [load_email(False, name) for name in ham_email_dir]

In [None]:
# Show which header names each spam message carries.
for mail in spam_emails:
    print(mail.keys())

In [None]:
def process_email(emails, label, data_dictionary, default_topic=None):
    """Append the text parts of ``emails`` to ``data_dictionary``.

    Multipart messages are walked recursively. Every leaf part whose
    Content-Type header mentions ``html`` or ``plain`` adds one row
    (topic, content, label). Other parts, parts without that header, HTML
    without a ``<body>``, and parts whose content cannot be extracted are
    skipped.

    Args:
        emails: Iterable of parsed messages (or message parts).
        label: Value appended to ``data_dictionary['label']`` for each row.
        data_dictionary: Dict holding lists under the keys 'topic', 'content'
            and 'label'; it is extended in place.
        default_topic: Subject used for parts that have no Subject header.

    Returns:
        None.
    """
    for mail in emails:
        payload = mail.get_payload()
        if isinstance(payload, list):
            # Sub-parts inherit this message's subject; when it has none,
            # keep passing down the subject inherited from further up.
            subject = mail['Subject']
            inherited = default_topic if subject is None else subject
            process_email(payload, label, data_dictionary, default_topic=inherited)
            continue

        # Header lookup on the message is case-insensitive, so a header
        # spelled "Content-type" is found as well.
        content_type = mail['Content-Type']
        if content_type is None:
            continue
        content_type = content_type.lower()

        try:
            topic = mail['Subject']
            if topic is None:
                topic = default_topic
            if 'html' in content_type:
                # No parser is pinned here, matching the existing call.
                body = BeautifulSoup(mail.get_content()).body
                if body is None:
                    continue
                content = body.text
            elif 'plain' in content_type:
                content = mail.get_content()
            else:
                continue
        except Exception:
            # Best effort: a part that cannot be decoded or parsed is left
            # out, but KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

        # Append all three fields together so the lists stay the same length.
        data_dictionary['topic'].append(topic)
        data_dictionary['content'].append(content)
        data_dictionary['label'].append(label)

In [None]:
# Collect (topic, content, label) rows for both classes: 1 = spam, 0 = ham.
data_dictionary = {'topic': [], 'content': [], 'label': []}
process_email(spam_emails, 1, data_dictionary)
process_email(ham_emails, 0, data_dictionary)
# Drop rows with a missing field, then shuffle. The original index labels are
# kept (rows are looked up by label further down), and the shuffle is seeded so
# the permutation is reproducible.
df = pd.DataFrame(data_dictionary).dropna().sample(frac=1, random_state=42)

In [None]:
# Preview the first ten rows of the shuffled frame.
df.head(10)

Unnamed: 0,topic,content,label
157,Re: Fw: User Name & Password to Membership To ...,##############################################...,1
471,You can gain from lowest interest rates in 30 ...,Opportunity is knocking. Why?\n\nBecause mortg...,1
79,Life Insurance Quotes Without the Hassle... ...,\n\nSave up to\n\n75% on your Term Life\nInsur...,1
1549,RE: A moment of silence for the First Amendmen...,> The problem is that politics have gotten so ...,0
1692,Re: [SAtalk] SA In The News,"On Tue, 20 Aug 2002, Matthew Cline wrote:\n\n>...",0
1075,'Nasty party' warning to Tories,"URL: http://www.newsisfree.com/click/-2,865570...",0
1883,Re: Mplayer,"Matthias Saou wrote:\n\n>Once upon a time, Roi...",0
474,Congratulations on Your 6 New Signups,We guarantee you signups before you ever pay\n...,1
916,"Toddler falls from a first-storey window, save...","URL: http://www.newsisfree.com/click/-1,839867...",0
2256,Re: Java is for kiddies,Reza B'Far (eBuilt) wrote:\n> problems.... Why...,0


In [None]:
import nltk
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()
nltk.download('stopwords')  # fetch the stop-word corpus read just below
from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of English stop words.
stopwords = set(stopwords.words('english'))
def preprocess_text(content, stop_words=None, stem=None):
    """Normalize raw e-mail text into a space-separated string of stemmed tokens.

    Steps: lowercase, strip ``<...>`` markup, strip http(s) URLs, replace every
    run of non-alphanumeric characters with a space, drop purely numeric
    tokens and stop words, then stem each remaining token.

    Args:
        content: Text to clean. ``None`` or an empty string yields ``''``.
        stop_words: Container of lowercase words to discard. Defaults to the
            notebook-level ``stopwords`` set.
        stem: Callable mapping one token to its stem. Defaults to
            ``porter_stemmer.stem``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not content:
        return ''
    if stop_words is None:
        stop_words = stopwords
    if stem is None:
        stem = porter_stemmer.stem

    content = content.lower()                          # lowercase everything
    # Markup goes first so a URL inside an attribute disappears with its tag.
    content = re.sub(r'<.*?>', '', content)            # remove <...> markup
    content = re.sub(r'https?://\S+', ' ', content)    # remove http/https links anywhere in the text
    content = re.sub(r'[^a-zA-Z0-9]+', ' ', content)   # punctuation, newlines, symbols -> space

    # Work token by token: substring replacement misses adjacent or leading/
    # trailing stop words and numbers, and stemming the whole string would
    # treat it as a single word.
    tokens = [
        token
        for token in content.split()
        if not token.isdigit() and token not in stop_words
    ]
    return ' '.join(stem(token) for token in tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Clean subject and body together and store the result as a new column.
topic_and_contents = [
    preprocess_text(subject + " " + body)
    for subject, body in zip(df["topic"], df["content"])
]
df["topic_content"] = topic_and_contents

In [None]:
# Preview again, now including the cleaned topic_content column.
df.head(10)

Unnamed: 0,topic,content,label,topic_content
157,Re: Fw: User Name & Password to Membership To ...,##############################################...,1,re fw user name password membership sites zzzz...
471,You can gain from lowest interest rates in 30 ...,Opportunity is knocking. Why?\n\nBecause mortg...,1,you gain lowest interest rates years opportuni...
79,Life Insurance Quotes Without the Hassle... ...,\n\nSave up to\n\n75% on your Term Life\nInsur...,1,life insurance quotes without hassle jhiwns sa...
1549,RE: A moment of silence for the First Amendmen...,> The problem is that politics have gotten so ...,0,re moment silence first amendment fwd problem ...
1692,Re: [SAtalk] SA In The News,"On Tue, 20 Aug 2002, Matthew Cline wrote:\n\n>...",0,re satalk sa news tue aug matthew cline wrote ...
1075,'Nasty party' warning to Tories,"URL: http://www.newsisfree.com/click/-2,865570...",0,nasty party warning tories url http www newsi...
1883,Re: Mplayer,"Matthias Saou wrote:\n\n>Once upon a time, Roi...",0,re mplayer matthias saou wrote upon time roi w...
474,Congratulations on Your 6 New Signups,We guarantee you signups before you ever pay\n...,1,congratulations new signups guarantee signups ...
916,"Toddler falls from a first-storey window, save...","URL: http://www.newsisfree.com/click/-1,839867...",0,toddler falls first storey window saved injury...
2256,Re: Java is for kiddies,Reza B'Far (eBuilt) wrote:\n> problems.... Why...,0,re java kiddies reza b far ebuilt wrote proble...


In [None]:
# Subject of the row whose index label is 27.
df.loc[27, 'topic']

'[ILUG] IMPORTANT.'

In [None]:
# Raw body of the row whose index label is 27.
df.loc[27, 'content']

"\n>>From the desk of: DR. SAMUEL EBOKA.\nTel No: Your Intl. Access Code + 873762692484\nFax No: Your Intl. Access Code + 873762692485\nemail : samueleboka2@email.com\nLagos, Nigeria.\nDear Sir,\n\nIMPORTANT.\n\nAfter due deliberation with my colleagues, We have decided to forward to\nyou this business proposal. We want a reliable \nperson who could assist us in the transfer the sum of Twenty Million, Five\nHundred Thousand United States Dollars ( \n$20,500,000 ). Via International Bank Draft Cashable in any First World\nCountries.\n\nThis fund resulted from an over-invoiced bill from contracts awarded by us\nunder the budget allocation to our Ministry. This \nbill has been approved for payment by the other concerned Ministries. The\ncontract has since been executed, \ncommissioned and the contractor was paid the actual cost of the contract.\nWe are left with the balance US$20.5M as part \nof the over-invoiced amount which we have deliberated over estimated for\nour own use. But under 

In [None]:
# Cleaned text of the row whose index label is 27.
df.loc[27, 'topic_content']

' ilug important desk dr samuel eboka tel intl access code fax intl access code email samueleboka2 email com lagos nigeria dear sir important due deliberation colleagues decided forward business proposal want reliable person could assist us transfer sum twenty million five hundred thousand united states dollars 500 via international bank draft cashable first world countries fund resulted invoiced bill contracts awarded us budget allocation ministry bill approved payment concerned ministries contract since executed commissioned contractor paid actual cost contract left balance us 5m part invoiced amount deliberated estimated use protocol division civil servants forbidden operate foreign accounts soliciting assistance manner regard may want know make less curious got address adverts business directory chief accountant internal auditor contract award committee cac nigerian national petroleum corporation nnpc transaction much free sorts risks trouble government n n p c officials involved d

In [None]:
# Bag-of-words counts. fit_transform returns a SciPy sparse matrix, which
# train_test_split and MultinomialNB accept directly, so it is kept sparse
# instead of being expanded to a dense array and copied row by row.
# NOTE(review): the vocabulary is learned from all rows before the train/test
# split, so test-set words influence the features; consider fitting the
# vectorizer on the training portion only.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df["topic_content"])

X = x             # Encoded e-mails, one sparse row per message
Y = df['label']

In [None]:
# Display the label Series (1 = spam, 0 = ham).
Y

157     1
471     1
79      1
1549    0
1692    0
       ..
611     0
797     0
837     0
156     1
2108    0
Name: label, Length: 2658, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed fixes which rows those are.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print("Number of emails in traning: ", len(y_train))
print("Number of emails in testing: ", len(y_test))

Number of emails in traning:  2126
Number of emails in testing:  532


In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the training split.
clf_NB = MultinomialNB()
clf_NB.fit(X_train, y_train)

MultinomialNB()

In [None]:
# Predict the held-out split and report accuracy as a percentage.
y_pred = clf_NB.predict(X_test)
test_accuracy = 100 * accuracy_score(y_test, y_pred)
print("Accuracy in testing dataset:", test_accuracy)

Accuracy in testing dataset: 99.06015037593986


In [None]:
from sklearn.metrics import confusion_matrix
# Rows are true labels and columns are predicted labels, ordered 0 (ham) then 1 (spam).
confusion_matrix(y_test, y_pred)

array([[434,   3],
       [  2,  93]])

In [None]:
import numpy as np
# Classify one e-mail typed by the user. Dedicated names are used so this demo
# does not overwrite the feature matrix `X` or the test predictions `y_pred`
# that the training and evaluation cells rely on when re-run.
user_text = preprocess_text(input("input content: "))
user_docs = [user_text]
print(user_docs)
# transform() returns a sparse matrix, which predict() accepts as is.
user_features = vectorizer.transform(user_docs)
user_pred = clf_NB.predict(user_features)
print(user_pred[0])  # 1 = spam, 0 = ham

input content: New Promotional Announcement  	 Hi Le Xuan, an announcement has been made from Learn Tech, instructor of Adobe Photoshop 2022 Photo Editing.  Hello everyone,  Make your New Year awesome with these amazing deals on my courses.  Learn a new skill today.  Get Lifetime Access to my Udemy courses from $9.99 and up!  Simply click on the links below of the courses you want to enroll in.  For a limited time! Only 3 Days Left!    TOP RATED COURSES  Web Development Ultimate Course - $9.99  Front-End Web Development Ultimate Course 2022 - $12.99  Python 3 Ultimate Course - $9.99  GitLab - $11.99  Visual Studio Code Ultimate Course - $11.99  Bootstrap 4 Ultimate Course - $9.99  JavaScript Ultimate Course - $9.99  CSS3 Ultimate Course - $9.99  HTML5 Ultimate Course - $9.99  Adobe After Effects 2022 Ultimate Course - $9.99  Adobe Animate 2022 Ultimate Course - $9.99  Adobe XD 2022 Ultimate Course - $9.99  Adobe Audition 2022 Ultimate Course - $9.99  Adobe Creative Cloud 2022 Ultimate 