In [11]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
from statistics import mean
from shutil import move
import json
import os
import joblib
import nltk
nltk.download('punkt')

In [12]:
input_emails = pd.read_csv("./data/spam_ham_dataset.csv")
print(input_emails.head())

     id label                                               text  label_num
0   605   ham  Subject: enron methanol ; meter # : 988291\r\n...          0
1  2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...          0
2  3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...          0
3  4685  spam  Subject: photoshop , windows , office . cheap ...          1
4  2030   ham  Subject: re : indian springs\r\nthis deal is t...          0


In [20]:
def extract_feature(input_emails):
    stopwords_ = stopwords.words("english")
    porterStemmer = PorterStemmer()
    count_words = {}
    for i in range(len(input_emails)):
        email = input_emails[i]
        words = [w for w in word_tokenize(email[2][9:]) if not w in stopwords_]
        words = [porterStemmer.stem(w) for w in words]

        for word in words:
            if word.isalpha() and len(word) > 1:
                if count_words.get(word):
                    count_words[word] += 1
                else:
                    count_words[word] = 1
   
    set_words =  [key for key in count_words.keys() if count_words[key] > 1]
    with open("./data/set_words.txt", "w") as file:
        json.dump(set_words,file)
    print("Number of Feature: ",len(set_words))
extract_feature(input_emails.to_numpy())

Number of Feature:  17393


In [19]:
def create_matrixs(input_emails):
    matrixs = []
    labels = []
    set_words = []
    stopwords_ = stopwords.words("english")
    porterStemmer = PorterStemmer()
    with open("./data/set_words.txt") as file:
        set_words = json.load(file)
    for i in range(len(input_emails)):
        email = input_emails[i]
        words = [w for w in word_tokenize(email[2][9:]) if not w in stopwords_]
        vector = [0 for i in range(len(set_words))]

        for w in words:
            w = porterStemmer.stem(w)
            if w in set_words:
                vector[set_words.index(w)] += 1
        matrixs.append(vector)
        labels.append(email[-1])
    return matrixs, labels

matrixs, labels = create_matrixs(input_emails.to_numpy())



In [15]:
def prepare_email(email):
    matrixs = []
    set_words = []
    stopwords_ = stopwords.words("english")
    porterStemmer = PorterStemmer()
    with open("./data/set_words.txt") as file:
        set_words = json.load(file) 
    words = [w for w in word_tokenize(email[9:]) if not w in stopwords_]
    vector = [0 for i in range(len(set_words))]
    for w in words:
        w = porterStemmer.stem(w)
        if w in set_words:
            vector[set_words.index(w)] += 1
    matrixs.append(vector)
    return matrixs

In [25]:
def train_model_and_save():
    matrixs = []
    labels = []
    if not os.path.exists("./data/matrixs.npy") or not os.path.exists("./data/labels.npy"):
        input_emails = pd.read_csv("./data/spam_ham_dataset.csv").to_numpy()
        matrixs, labels = create_matrixs(input_emails)

        np.save('./data/matrixs.npy', matrixs)
        np.save('./data/labels.npy', labels)
    else:
        matrixs = np.load("./data/matrixs.npy")
        labels = np.load("./data/labels.npy")
    
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    lst_accu_stratified = []
    multinomialNB = MultinomialNB()
    max = []
    x = 0
    for train_index, test_index in skf.split(matrixs, labels):
        x_train, x_test = matrixs[train_index], matrixs[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        multinomialNB.fit(x_train, y_train)
        sccore = multinomialNB.score(x_test, y_test)
        if x == 0 or x < sccore:
            max = []
            x = sccore
            max.append(x_train)
            max.append(y_train)
            max.append(x_test)
            max.append(y_test)
        lst_accu_stratified.append(sccore)
    print(lst_accu_stratified)
    print("Max score: ",mean(lst_accu_stratified))
    print("Min score: ",mean(lst_accu_stratified))
    print("Mean score: ",mean(lst_accu_stratified))
    multinomialNB = multinomialNB.fit(max[0], max[1])
    joblib.dump(multinomialNB, f"./data/model_detect_spam_email_NB.joblib")

train_model_and_save()

[0.9671814671814671, 0.9748549323017408, 0.9690522243713733, 0.97678916827853, 0.9671179883945842, 0.9729206963249516, 0.9613152804642167, 0.9787234042553191, 0.9806576402321083, 0.9729206963249516]
Max score:  0.9721533498129242
Min score:  0.9721533498129242
Mean score:  0.9721533498129242


In [24]:
def main():
    email_number = -1
    model = joblib.load("./data/model_detect_spam_email_NB.joblib")
    while True:
        list_files = os.listdir("./data/test_email/")
        for i in range(len(list_files)):
            print(f"{i+1}. {list_files[i]}")
        print("0. Exist")
        print("Enter you email number: ")
        email_number = int(input())
        if email_number == 0:
            break
        email = ""
        with  open(f"./data/test_email/{list_files[email_number-1]}") as file:
            email = file.read()
        matrixs = prepare_email(email)
        y_pred = model.predict(matrixs)
        print("===================")
        print(f"Email : {email}")
        print("-------------------")
        if y_pred[0] == 0:
            print("\tHam email")
        else:
            print("\tSpam email")
        print("===================")


main()

1. email_1.txt
2. email_2.txt
3. email_3.txt
4. email_4.txt
5. email_5.txt
6. email_6.txt
0. Exist
Enter you email number: 
Email : Subject: re : indian springs
this deal is to book the teco pvr revenue . it is my understanding that teco
just sends us a check , i haven ' t received an answer as to whether there is a
predermined price associated with this deal or if teco just lets us know what
we are giving . i can continue to chase this deal down if you need .
-------------------
	Ham email
1. email_2.txt
2. email_3.txt
3. email_4.txt
4. email_5.txt
5. email_6.txt
0. Exist
Enter you email number: 
Email : Subject: spring savings certificate - take 30 % off
save 30 % when you use our customer appreciation spring savings
certificate at foot locker , lady foot locker , kids foot locker and at
our online stores !
welcome to our customer appreciation spring savings certificate !
use the special certificate below and receive 30 % off your purchases either in our stores or online . hurry ! th