# Preprocessing data
# Create 3 files : hardHam_clean, easyHam_clean, spam_clean

In [1]:
import regex as re
import os
from text_preprocessing import preprocess_text
from text_preprocessing import (
    to_lower,
    remove_email, 
    remove_url, 
    remove_punctuation, 
    lemmatize_word, 
    remove_number, 
    stem_word, 
    remove_stopword,
    remove_whitespace
)
# email.parser is used for files structured like email/HTTP-style messages (header lines followed by a body)
from email.parser import Parser

def read_files_name(folderPath):
    """Collect the path of every file found under ``folderPath``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined onto the directory it was found in.

    Args:
        folderPath: Root directory to scan.

    Returns:
        list: One path string per file, in the order ``os.walk`` yields
        them. Empty when the directory holds no files or does not exist.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folderPath)
        for name in names
    ]

def read_forders_Path():
    """Return the file lists of the three raw corpora under ``dataset``.

    The sub-directory paths are built with ``os.path.join`` instead of
    backslash string literals: ``'\\e'``, ``'\\h'`` and ``'\\s'`` are invalid
    escape sequences (a warning today, an error in future Python versions)
    and a hard-coded backslash only works as a separator on Windows.

    Returns:
        tuple: ``(easyHam, hardHam, spam)``, each the list of file paths
        returned by ``read_files_name`` for ``dataset/easy_ham``,
        ``dataset/hard_ham`` and ``dataset/spam`` respectively.
    """
    easyHam = read_files_name(os.path.join('dataset', 'easy_ham'))
    hardHam = read_files_name(os.path.join('dataset', 'hard_ham'))
    spam = read_files_name(os.path.join('dataset', 'spam'))
    return easyHam, hardHam, spam


def strip_url(mess):
    """Replace every URL in ``mess`` with a single space.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.

    Args:
        mess: Text to clean.

    Returns:
        str: ``mess`` with each URL substituted by ``' '``.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return url_pattern.sub(' ', mess)


def strip_html(mess):
    """Blank out HTML tags and ``&nbsp;`` entities in ``mess``.

    Tags are matched non-greedily from ``<`` to the nearest ``>`` on the same
    line (``.`` does not match newlines here), so a tag split across lines is
    left untouched.

    Args:
        mess: Text that may contain HTML markup.

    Returns:
        str: ``mess`` with every matched tag and every ``&nbsp;`` replaced by
        a single space.
    """
    without_tags = re.sub('<.*?>', ' ', mess)
    # '&nbsp;' contains no regex metacharacters, so a plain replace is equivalent.
    return without_tags.replace('&nbsp;', ' ')



def strip_email_header(message):
    """Reduce a raw email to its subject line followed by its body.

    The text is parsed with ``email.parser.Parser``; every header except
    ``Subject`` is dropped.

    Args:
        message: Full raw email text (headers and body).

    Returns:
        str: The subject (``''`` when the header is absent), a newline, then
        the body text produced by ``get_email_body``.
    """
    parsed = Parser().parsestr(message)
    pieces = (parsed.get('subject', ''), get_email_body(parsed))
    return '\n'.join(pieces)

def get_email_body(message):
    """Return the textual body of a parsed email, flattening multipart mail.

    Args:
        message: An ``email.message.Message`` (or compatible) object.

    Returns:
        str: For a multipart message, the bodies of all sub-parts (resolved
        recursively) joined with newlines; for a single-part message, its
        payload string. An empty string is returned when the payload is
        neither a list nor a string (e.g. a message with no payload), so
        callers that join the result never receive ``None``.

    Note:
        The payload is taken as-is from ``get_payload()``; any
        Content-Transfer-Encoding (base64, quoted-printable) is not decoded.
    """
    payloads = message.get_payload()
    if isinstance(payloads, list):
        # Multipart: each element is itself a message, so recurse into it.
        return '\n'.join(get_email_body(part) for part in payloads)
    if isinstance(payloads, str):
        return payloads
    return ''


def preprocessing_data(mess):
    """Run one raw email through the full cleaning pipeline.

    The steps below are handed, in order, to ``preprocess_text`` from the
    ``text_preprocessing`` package: keep only subject + body, lowercase,
    drop e-mail addresses, drop URLs (library helper, then the local regex),
    drop HTML markup, drop numbers, drop punctuation, drop stop words and
    finally lemmatize.

    Args:
        mess: Raw email text, headers included.

    Returns:
        Whatever ``preprocess_text`` returns for this pipeline; ``handler``
        writes it straight to a text file, so presumably a string (confirm
        against the text_preprocessing documentation).
    """
    pipeline = [
        strip_email_header,
        to_lower,
        remove_email,
        remove_url,
        strip_url,
        strip_html,
        remove_number,
        remove_punctuation,
        remove_stopword,
        lemmatize_word,
    ]
    return preprocess_text(mess, pipeline)

def _clean_folder(sourcePaths, outputPath):
    """Preprocess every email in ``sourcePaths`` and append one line each to ``outputPath``.

    Args:
        sourcePaths: Iterable of paths to raw email files.
        outputPath: Text file that receives one cleaned email per line.

    Returns:
        int: Number of source files skipped because reading, preprocessing
        or writing them raised an exception.
    """
    failures = 0
    # NOTE: append mode is kept from the original code, so running the cleaning
    # pass twice duplicates every line; delete the output file before a re-run.
    with open(outputPath, 'a', encoding='utf-8') as out:
        for path in sourcePaths:
            try:
                with open(path, 'r', encoding='latin') as fi:
                    cleaned = preprocessing_data(fi.read())
                out.write(cleaned + '\n')
            except Exception as e:
                # Deliberately best-effort: one bad file must not abort the
                # whole corpus, so report it and carry on with the next one.
                print(path + "Exception :" + str(e))
                failures += 1
    return failures


def handler():
    """Clean the three raw corpora into ``dataset/*_clean.txt`` files.

    Reads the file lists from ``read_forders_Path`` and writes
    ``easyHam_clean.txt``, ``hardHam_clean.txt`` and ``spam_clean.txt`` inside
    the ``dataset`` directory, one preprocessed email per line. Output paths
    are built with ``os.path.join`` because the former backslash literals
    relied on invalid escape sequences and only worked on Windows.

    Prints the number of skipped files when there were any, then
    ``End Game``.
    """
    easyHam, hardHam, spam = read_forders_Path()
    jobs = (
        (easyHam, 'easyHam_clean.txt'),
        (hardHam, 'hardHam_clean.txt'),
        (spam, 'spam_clean.txt'),
    )
    count = 0
    for sourcePaths, outputName in jobs:
        count += _clean_folder(sourcePaths, os.path.join('dataset', outputName))
    if count:
        print("Skipped files: " + str(count))
    print("End Game\n")
# Run the cleaning pass only when this code is executed as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    handler()


End Game



# Thống kê tần suất xuất hiện và tính xác suất trước

In [2]:
# Viết 1 hàm: input 1 file, output 1 mảng kiểu map (từ, fre)
from collections import Counter, defaultdict
from random import shuffle
import pandas as pd
from math import log

In [3]:
def load_data(fileName):
    """Read a UTF-8 text file from the ``dataset`` directory as a list of lines.

    Args:
        fileName: Name of the file relative to ``dataset``. Leading path
            separators are ignored so the name is always resolved inside
            ``dataset`` rather than being treated as an absolute path.

    Returns:
        list: The file's lines as returned by ``readlines()`` (each line
        keeps its trailing newline).
    """
    # os.path.join replaces the hard-coded 'dataset\\' prefix, which is only a
    # directory separator on Windows.
    path = os.path.join('dataset', fileName.lstrip('/\\'))
    with open(path, 'r', encoding='utf-8') as fi:
        return fi.readlines()

In [4]:
def word_freq(data_train):
    """Count how often each word occurs across a collection of emails.

    Emails are split on any run of whitespace. Splitting on a single space
    (the previous behaviour) produced empty-string "words" wherever two
    spaces met and left the line's trailing newline glued to its last word,
    so ``"offer"`` and ``"offer\\n"`` were counted as different tokens.

    Args:
        data_train: Iterable of email strings.

    Returns:
        collections.Counter: Mapping of word to total number of occurrences.
    """
    result = Counter()
    for email in data_train:
        result.update(email.split())
    return result

In [5]:
def readData():
    """Load the three cleaned corpora produced by the preprocessing step.

    ``load_data`` already prefixes the ``dataset`` directory, so only bare
    file names are passed. Two of the three names previously carried an
    extra ``/dataset/`` prefix, which pointed at a different location than
    the one the spam file is read from.

    Returns:
        tuple: ``(easyHam, hardHam, spam)``, each a list of lines (one
        cleaned email per line).
    """
    easyHam = load_data('easyHam_clean.txt')
    hardHam = load_data('hardHam_clean.txt')
    spam = load_data('spam_clean.txt')
    return easyHam, hardHam, spam

In [6]:
# Load the three corpora returned by readData(): easy ham, hard ham and spam.
easyHam,hardHam,spam=readData()

In [7]:
# Concatenate all emails into one list: easy ham first, then hard ham, then spam.
datas=easyHam+hardHam+spam

In [8]:
# Label list in the same order as the concatenation above: 0 for both ham sets, 1 for spam.
labels = [0] * len(easyHam)+[0]*len(hardHam)+[1]*len(spam)

In [9]:
def shuffle_data(mail_by_label, ratio):
    """Randomly split the emails of a single label into train and test parts.

    A shuffled copy is cut at ``int(len(mail_by_label) * ratio)``; the input
    list itself is left untouched. The cut position and the total number of
    emails are printed.

    Args:
        mail_by_label: List of emails that all share one label.
        ratio: Fraction of the emails that goes to the training part.

    Returns:
        tuple: ``(train_set, test_set)`` as two lists.
    """
    shuffled = mail_by_label.copy()
    shuffle(shuffled)
    cut = int(len(mail_by_label) * ratio)
    print (cut, len(shuffled))
    train_part = shuffled[:cut]
    test_part = shuffled[cut:]
    return train_part, test_part

In [10]:
def split_data(mails, labels, ratio):
    """Split a labelled corpus into train and test sets, label by label.

    Emails are first grouped by label (``zip`` pairs each mail with its
    label, ``defaultdict(list)`` collects mails under their label). Each
    distinct label (``set(labels)``) is then split independently by
    ``shuffle_data`` so that both sets keep the given ratio for every label.

    Args:
        mails: List of emails.
        labels: List of labels, parallel to ``mails``.
        ratio: Fraction of each label's emails that goes to the train set.

    Returns:
        tuple: ``(data_train, data_test, label_train, label_test)``; the
        data and label lists of each set are index-aligned.
    """
    grouped = defaultdict(list)
    for mail, label in zip(mails, labels):
        grouped[label].append(mail)

    data_train, data_test = [], []
    label_train, label_test = [], []
    for label in set(labels):
        train_part, test_part = shuffle_data(grouped[label], ratio)
        data_train.extend(train_part)
        label_train.extend([label] * len(train_part))
        data_test.extend(test_part)
        label_test.extend([label] * len(test_part))
    return data_train, data_test, label_train, label_test

In [12]:
# Split the corpus with ratio 0.5 (half of each label goes to the training set),
# then print the first training label.
data_train,data_test,label_train,label_test=split_data(datas,labels,0.5)
print(label_train[0])

3475 6951
1896 3793
0


In [13]:
def split_by_label(mails, labels):
    """Group emails by their class label.

    Args:
        mails: List of emails.
        labels: List of labels, parallel to ``mails``.

    Returns:
        collections.defaultdict: Maps each label to the list of its emails,
        in input order; looking up an unseen label yields an empty list.
    """
    grouped = defaultdict(list)
    for label, mail in zip(labels, mails):
        grouped[label].append(mail)
    return grouped

def create_Dir():
    """Compute per-class word frequencies and email counts for the training set.

    Reads the module-level ``data_train`` and ``label_train``; label ``1`` is
    treated as spam and label ``0`` as ham.

    Returns:
        tuple: ``(spam_freq, ham_freq, total_spam_emails, total_ham_emails)``
        where the first two are the ``word_freq`` results for each class and
        the last two are the number of training emails in each class.
    """
    by_label = split_by_label(data_train, label_train)
    spam_mails = by_label[1]
    ham_mails = by_label[0]
    spam_freq = word_freq(spam_mails)
    ham_freq = word_freq(ham_mails)
    return spam_freq, ham_freq, len(spam_mails), len(ham_mails)
def cal_prob():
    """Estimate the naive Bayes parameters and save them as CSV files.

    Using the statistics from ``create_Dir``, computes:

    * the class priors: emails of the class / all training emails;
    * for every word of the vocabulary (words seen in either class), the
      add-one smoothed probability
      ``(count in class + 1) / (total words in class + vocabulary size)``.

    Writes ``df_ham_probs.csv`` and ``df_spam_probs.csv`` (columns ``words``,
    ``values``) and ``df_pre_probs.csv`` (columns ``spam_prob``,
    ``ham_prob``) to the current directory. Returns ``None``.
    """
    word_in_spam_freq, word_in_ham_freq, total_spam_emails, total_ham_emails = create_Dir()

    # Vocabulary: every word seen in either class.
    dictionary = set(word_in_spam_freq.keys()).union(set(word_in_ham_freq.keys()))
    vocabulary_size = len(dictionary)

    # Smoothed denominators: total word occurrences in the class + |vocabulary|.
    spam_denominator = sum(word_in_spam_freq.values()) + vocabulary_size
    ham_denominator = sum(word_in_ham_freq.values()) + vocabulary_size

    # Class priors P(c).
    total_emails = total_spam_emails + total_ham_emails
    spam_prob = total_spam_emails / total_emails
    ham_prob = total_ham_emails / total_emails

    # Conditional probabilities P(word | c); a Counter lookup yields 0 for a
    # word the class never saw, so the +1 keeps every probability positive.
    word_in_spam_probs, word_in_ham_probs = Counter(), Counter()
    for word in dictionary:
        word_in_spam_probs[word] = (word_in_spam_freq[word] + 1) / spam_denominator
        word_in_ham_probs[word] = (word_in_ham_freq[word] + 1) / ham_denominator

    df_ham_probs = pd.DataFrame(
        {'words': word_in_ham_probs.keys(), 'values': word_in_ham_probs.values()}
    )
    df_spam_probs = pd.DataFrame(
        {'words': word_in_spam_probs.keys(), 'values': word_in_spam_probs.values()}
    )
    df_ham_probs.to_csv('df_ham_probs.csv', index=False, header=True)
    df_spam_probs.to_csv('df_spam_probs.csv', index=False, header=True)

    df_pre_prob = pd.DataFrame({'spam_prob': [spam_prob], 'ham_prob': [ham_prob]})
    df_pre_prob.to_csv('df_pre_probs.csv', index=False, header=True)
    
# Train now: compute the probabilities and write the three model CSV files.
cal_prob()


KeyboardInterrupt: 

In [None]:
def load_model():
    """Load the naive Bayes parameters from the CSV files in the current directory.

    Reads ``df_ham_probs.csv`` and ``df_spam_probs.csv`` (columns ``words``,
    ``values``) and ``df_pre_probs.csv`` (columns ``spam_prob``,
    ``ham_prob``).

    The word files are read with ``keep_default_na=False`` and the ``words``
    column forced to ``str``: with pandas' defaults, vocabulary entries such
    as "null", "nan", "na" or the empty string are parsed as NaN, which
    collapses them into one unusable NaN key and loses their probabilities.

    Returns:
        tuple: ``(word_in_ham_probs, word_in_spam_probs, spam_pre_prob,
        ham_pre_prob)``: two ``dict`` objects mapping word to probability,
        followed by the two class priors as floats.
    """
    word_csv_options = {'dtype': {'words': str}, 'keep_default_na': False}
    data_ham_probs = pd.read_csv("df_ham_probs.csv", **word_csv_options)
    data_spam_probs = pd.read_csv("df_spam_probs.csv", **word_csv_options)
    data_pre_prob = pd.read_csv("df_pre_probs.csv")

    # Turn the two-column tables back into word -> probability dictionaries.
    word_in_ham_probs = data_ham_probs.set_index('words')['values'].to_dict()
    word_in_spam_probs = data_spam_probs.set_index('words')['values'].to_dict()

    # The priors file holds a single row.
    spam_pre_prob = float(data_pre_prob['spam_prob'].iloc[0])
    ham_pre_prob = float(data_pre_prob['ham_prob'].iloc[0])
    return word_in_ham_probs, word_in_spam_probs, spam_pre_prob, ham_pre_prob
def classify(mail, word_in_ham_probs, word_in_spam_probs, spam_pre_prob, ham_pre_prob):
    """Classify one email as spam (1) or ham (0) with naive Bayes.

    Scores are accumulated in log space: the sum of ``log P(word | class)``
    over the email's words plus ``log P(class)``. A word missing from a
    class's dictionary contributes nothing to that class's score.

    The email is split on any run of whitespace, so the trailing newline
    kept by ``readlines()`` no longer sticks to the last word and repeated
    spaces no longer produce empty tokens.

    Args:
        mail: Preprocessed email text.
        word_in_ham_probs: Mapping word -> P(word | ham).
        word_in_spam_probs: Mapping word -> P(word | spam).
        spam_pre_prob: Prior probability of spam.
        ham_pre_prob: Prior probability of ham.

    Returns:
        int: ``1`` when the spam score is strictly greater than the ham
        score, otherwise ``0``.
    """
    prob_spam = prob_ham = 0.0
    for mail_word in mail.split():
        ham_word_prob = word_in_ham_probs.get(mail_word)
        if ham_word_prob is not None:
            prob_ham += log(ham_word_prob)
        spam_word_prob = word_in_spam_probs.get(mail_word)
        if spam_word_prob is not None:
            prob_spam += log(spam_word_prob)

    # log(0) raises ValueError; a class with zero prior can never win, which
    # is exactly what a score of -inf expresses.
    prob_ham += log(ham_pre_prob) if ham_pre_prob > 0 else float('-inf')
    prob_spam += log(spam_pre_prob) if spam_pre_prob > 0 else float('-inf')

    return 1 if prob_spam > prob_ham else 0
    

In [None]:
# Evaluate the saved model on the held-out test set: count the emails whose
# predicted label equals the true label, then print that count followed by
# the test-set and training-set sizes.
correct = 0
word_in_ham_probs, word_in_spam_probs, spam_pre_prob, ham_pre_prob = load_model()
for test_mail, expected_label in zip(data_test, label_test):
    predicted_label = classify(
        test_mail, word_in_ham_probs, word_in_spam_probs, spam_pre_prob, ham_pre_prob
    )
    if predicted_label == expected_label:
        correct += 1
print( correct)
print(len(data_test))
print(len(data_train))