In [None]:
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
from statistics import mean
from shutil import move
import numpy as np
import pandas as pd
import json
import os
import joblib
import nltk
# Fetch the NLTK resources the functions below rely on: the 'punkt' tokenizer
# models (used by word_tokenize) and the 'stopwords' corpus (used by
# stopwords.words("english")). Without the latter, a fresh environment raises
# LookupError on the first stopword lookup.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Read the labelled email dataset and print three quick overviews of it:
# the first rows, the overall describe() summary, and the same summary
# computed separately for each value of the "label" column.
input_emails = pd.read_csv("./data/spam_ham_dataset.csv")

first_rows = input_emails.head()
print(first_rows)

overall_summary = input_emails.describe()
print(overall_summary)

per_label_summary = input_emails.groupby("label").describe()
print(per_label_summary)

In [None]:
def extract_feature(input_emails):
    """Build the vocabulary file that the vectorising functions read.

    For every row, the text in ``row[2]`` (with its first 9 characters
    dropped) is tokenised, filtered against the English stopword list and
    Porter-stemmed. Stems that are purely alphabetic, longer than one
    character and counted more than once over all rows are written as a JSON
    list to ``./data/set_words.txt``.

    Args:
        input_emails: Indexable sequence of rows; ``row[2]`` must be the email
            text. The 9-character skip presumably removes a leading
            "Subject: " prefix -- TODO confirm against the dataset.

    Returns:
        None. The vocabulary is written to disk and its size is printed.
    """
    # A set makes each per-token stopword test O(1) instead of a list scan.
    stopwords_ = set(stopwords.words("english"))
    porterStemmer = PorterStemmer()
    count_words = {}
    for i in range(len(input_emails)):
        email = input_emails[i]
        for token in word_tokenize(email[2][9:]):
            # Stopwords are removed before stemming, on the raw token.
            if token in stopwords_:
                continue
            word = porterStemmer.stem(token)
            if word.isalpha() and len(word) > 1:
                count_words[word] = count_words.get(word, 0) + 1

    # Keep stems seen at least twice; dict iteration preserves first-seen order.
    set_words = [word for word, count in count_words.items() if count > 1]
    with open("./data/set_words.txt", "w") as file:
        json.dump(set_words, file)
    print("Number of Feature: ", len(set_words))

In [None]:
def create_matrixs(input_emails):
    """Convert every email row into a bag-of-words count vector.

    The vocabulary is read from ``./data/set_words.txt`` (a JSON list of
    stems). Each row's text (``row[2]`` with its first 9 characters dropped)
    is tokenised, filtered against the English stopword list and
    Porter-stemmed; every stem found in the vocabulary increments the
    matching slot of that row's vector.

    Args:
        input_emails: Indexable sequence of rows; ``row[2]`` is the email text
            and ``row[-1]`` is used as the label.

    Returns:
        tuple: ``(matrixs, labels)`` where ``matrixs`` is a list holding one
        count vector (list of ints, one slot per vocabulary word) per row and
        ``labels`` is the list of each row's last element.

    Raises:
        FileNotFoundError: If ``./data/set_words.txt`` does not exist.
    """
    stopwords_ = set(stopwords.words("english"))
    porterStemmer = PorterStemmer()
    with open("./data/set_words.txt") as file:
        set_words = json.load(file)

    # Map each word to its column once, replacing the per-token
    # `in list` + `list.index` linear scans. setdefault keeps the first
    # position of a repeated word, exactly as list.index would.
    word_index = {}
    for position, word in enumerate(set_words):
        word_index.setdefault(word, position)

    matrixs = []
    labels = []
    for i in range(len(input_emails)):
        email = input_emails[i]
        vector = [0] * len(set_words)
        for token in word_tokenize(email[2][9:]):
            if token in stopwords_:
                continue
            column = word_index.get(porterStemmer.stem(token))
            if column is not None:
                vector[column] += 1
        matrixs.append(vector)
        labels.append(email[-1])
    return matrixs, labels

In [None]:
def prepare_email(email):
    """Vectorise a single raw email for prediction.

    The vocabulary is read from ``./data/set_words.txt``. The email text
    (with its first 9 characters dropped) is tokenised, filtered against the
    English stopword list and Porter-stemmed; every stem found in the
    vocabulary increments the matching slot of the count vector.

    Args:
        email: The raw email text as a string. The 9-character skip
            presumably removes a leading "Subject: " prefix -- TODO confirm
            that the test emails start with it.

    Returns:
        list: A one-element list containing the count vector, i.e. a single
        sample in the 2-D layout that ``model.predict`` is called with.

    Raises:
        FileNotFoundError: If ``./data/set_words.txt`` does not exist.
    """
    stopwords_ = set(stopwords.words("english"))
    porterStemmer = PorterStemmer()
    with open("./data/set_words.txt") as file:
        set_words = json.load(file)

    # Map each word to its column once, replacing the per-token
    # `in list` + `list.index` linear scans. setdefault keeps the first
    # position of a repeated word, exactly as list.index would.
    word_index = {}
    for position, word in enumerate(set_words):
        word_index.setdefault(word, position)

    vector = [0] * len(set_words)
    for token in word_tokenize(email[9:]):
        if token in stopwords_:
            continue
        column = word_index.get(porterStemmer.stem(token))
        if column is not None:
            vector[column] += 1
    return [vector]

In [None]:
def train_model_and_save(input_emails):
    """Train a Multinomial Naive Bayes spam classifier and save it to disk.

    The feature matrix and labels are loaded from ``./data/matrixs.npy`` and
    ``./data/labels.npy`` when both exist; otherwise they are built from
    ``input_emails`` with ``create_matrixs`` and cached to those files. The
    classifier is scored with 5-fold stratified cross-validation, the fold
    scores and their mean are printed, and the model is then refit on the
    training split of the best-scoring fold and written to
    ``./data/model_detect_spam_email_NB.joblib``.

    Args:
        input_emails: The dataset rows, either as a NumPy array or as a
            DataFrame (converted with ``to_numpy()``). If ``None``, the rows
            are read from ``./data/spam_ham_dataset.csv``. Only used when the
            cached feature files are missing.

    Returns:
        None.
    """
    matrix_path = "./data/matrixs.npy"
    labels_path = "./data/labels.npy"

    if os.path.exists(matrix_path) and os.path.exists(labels_path):
        matrixs = np.load(matrix_path)
        labels = np.load(labels_path)
    else:
        if input_emails is None:
            input_emails = pd.read_csv("./data/spam_ham_dataset.csv")
        if isinstance(input_emails, pd.DataFrame):
            input_emails = input_emails.to_numpy()
        # create_matrixs needs the vocabulary file; build it if nothing has yet.
        if not os.path.exists("./data/set_words.txt"):
            extract_feature(input_emails)
        matrixs, labels = create_matrixs(input_emails)
        # create_matrixs returns plain lists, which cannot be indexed with the
        # index arrays produced by StratifiedKFold.split below.
        matrixs = np.asarray(matrixs)
        labels = np.asarray(labels)
        np.save(matrix_path, matrixs)
        np.save(labels_path, labels)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    multinomialNB = MultinomialNB()
    lst_accu_stratified = []
    best_score = None
    best_train_index = None
    for train_index, test_index in skf.split(matrixs, labels):
        multinomialNB.fit(matrixs[train_index], labels[train_index])
        score = multinomialNB.score(matrixs[test_index], labels[test_index])
        lst_accu_stratified.append(score)
        # Remember which training split produced the highest held-out accuracy.
        if best_score is None or score > best_score:
            best_score = score
            best_train_index = train_index
    print(lst_accu_stratified)
    print("Mean score: ", mean(lst_accu_stratified))

    # The estimator currently holds the last fold's fit; refit it on the
    # best fold's training split before saving.
    multinomialNB.fit(matrixs[best_train_index], labels[best_train_index])
    joblib.dump(multinomialNB, "./data/model_detect_spam_email_NB.joblib")


In [None]:
def main():
    """Run the interactive spam/ham classifier over the sample emails.

    If no saved model exists at ``./data/model_detect_spam_email_NB.joblib``,
    the dataset is read from ``./data/spam_ham_dataset.csv`` and a model is
    trained and saved first. The function then loops: it lists the files in
    ``./data/test_email/``, asks for a number, classifies the chosen file and
    prints the result. Entering ``0`` ends the loop; anything that is not a
    listed number prints a message and shows the menu again.

    Returns:
        None.
    """
    model_path = "./data/model_detect_spam_email_NB.joblib"
    test_dir = "./data/test_email/"

    if not os.path.exists(model_path):
        input_emails = pd.read_csv("./data/spam_ham_dataset.csv")
        print("Start training model detect email....")
        train_model_and_save(input_emails.to_numpy())
        print("Done....")

    # NOTE: joblib.load unpickles the file -- only load model files produced
    # by this notebook or another trusted source.
    model = joblib.load(model_path)
    while True:
        # os.listdir returns entries in arbitrary order; sort so the menu
        # numbering is stable from one iteration to the next.
        list_files = sorted(os.listdir(test_dir))
        for position, file_name in enumerate(list_files, start=1):
            print(f"{position}. {file_name}")
        print("0. Exit")
        print("Enter your email number: ")
        choice = input()
        try:
            email_number = int(choice)
        except ValueError:
            print(f"Invalid choice: {choice!r}. Please enter a number.")
            continue
        print(email_number)
        if email_number == 0:
            break
        # Reject out-of-range values explicitly; a negative number would
        # otherwise silently index from the end of the list.
        if not 1 <= email_number <= len(list_files):
            print(f"Invalid choice: {email_number}. "
                  f"Please enter a number between 0 and {len(list_files)}.")
            continue

        selected_file = list_files[email_number - 1]
        with open(f"{test_dir}{selected_file}") as file:
            email = file.read()
        matrixs = prepare_email(email)
        y_pred = model.predict(matrixs)
        print("===================")
        print(f"Input : {selected_file}")
        # A prediction of 0 is reported as ham, anything else as spam.
        if y_pred[0] == 0:
            print("\tResult: Ham email")
        else:
            print("\tResult: Spam email")
        print("===================")


# Start the interactive classifier; main() trains and saves a model first
# when no saved model file exists, then loops until 0 is entered.
main()