# Preprocessing data
# Create 3 files : hardHam_clean, easyHam_clean, spam_clean

In [1]:
import regex as re
import os
from text_preprocessing import preprocess_text
from text_preprocessing import (
    to_lower,
    remove_email, 
    remove_url, 
    remove_punctuation, 
    lemmatize_word, 
    remove_number, 
    stem_word, 
    remove_stopword,
    remove_whitespace
)
# email.parser is used for files structured like email / HTTP-style messages
from email.parser import Parser

def read_files_name(folderPath):
    """Collect the path of every file found under ``folderPath``.

    The tree is traversed with ``os.walk``, so files in nested
    sub-directories are included as well.

    Args:
        folderPath: Root directory to scan.

    Returns:
        A list of file paths (directory joined with file name), in the
        order ``os.walk`` yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(folderPath)
        for name in names
    ]

def read_forders_Path():
    """Return the file lists of the three raw corpora.

    Returns:
        A tuple ``(easyHam, hardHam, spam)``; each element is the list of
        file paths found under ``dataset/easy_ham``, ``dataset/hard_ham``
        and ``dataset/spam`` respectively.
    """
    # Paths are built with os.path.join: the previous 'dataset\easy_ham'
    # style literals relied on invalid escape sequences (\e, \h, \s) and
    # only resolved to real directories on Windows.
    easyHam = read_files_name(os.path.join('dataset', 'easy_ham'))
    hardHam = read_files_name(os.path.join('dataset', 'hard_ham'))
    spam = read_files_name(os.path.join('dataset', 'spam'))
    return easyHam, hardHam, spam


def strip_url(mess):
    """Replace every URL in ``mess`` with a single space.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return url_pattern.sub(' ', mess)


def strip_html(mess):
    """Replace HTML tags and ``&nbsp;`` entities in ``mess`` with spaces.

    A tag is the shortest ``<...>`` run that sits on a single line; tags
    that span a line break are left untouched.
    """
    # Tags first, then the non-breaking-space entity.
    without_tags = re.sub('<.*?>', ' ', mess)
    return without_tags.replace('&nbsp;', ' ')



def strip_email_header(message):
    """Reduce a raw email to its subject line followed by its body.

    Args:
        message: Full text of the email, headers included.

    Returns:
        The ``subject`` header (empty string when absent) and the body
        text from ``get_email_body``, joined by a newline.
    """
    parsed = Parser().parsestr(message)
    parts = [parsed.get('subject', ''), get_email_body(parsed)]
    return '\n'.join(parts)

def get_email_body(message):
    """Return the text body of a parsed email message.

    Multipart messages are flattened recursively: the bodies of all parts
    are joined with newlines.

    Args:
        message: A parsed message object exposing ``get_payload()``.

    Returns:
        The body as a string. An empty string is returned when the payload
        is neither a list of parts nor a string (e.g. a message without
        any payload), so callers can always join the result with other
        strings.
    """
    payload = message.get_payload()
    if isinstance(payload, list):
        return '\n'.join(get_email_body(part) for part in payload)
    if isinstance(payload, str):
        return payload
    # Previously this fell through and returned None, which made the
    # '\n'.join(...) in the caller raise TypeError.
    return ''


def preprocessing_data(mess):
    """Run one raw email through the text-cleaning pipeline.

    The steps are handed to ``preprocess_text`` in the order listed below:
    header stripping, lower-casing, e-mail address / URL / HTML removal,
    number and punctuation removal, stop-word removal and lemmatisation.

    Args:
        mess: Raw text of a single email.

    Returns:
        Whatever ``preprocess_text`` returns for the pipeline (the caller
        writes it to a text file, so presumably a string - confirm against
        the text_preprocessing documentation).
    """
    pipeline = [
        strip_email_header,
        to_lower,
        remove_email,
        remove_url,
        strip_url,
        strip_html,
        remove_number,
        remove_punctuation,
        remove_stopword,
        lemmatize_word,
    ]
    return preprocess_text(mess, pipeline)

def _clean_corpus(paths, out_path):
    """Preprocess each file in ``paths`` and append the result to ``out_path``.

    One line is written per successfully processed file. A file that fails
    to read, preprocess or write is reported on stdout and skipped.
    """
    # 'a+' keeps the original append semantics: re-running adds to the
    # existing output file instead of truncating it.
    with open(out_path, 'a+', encoding='utf-8') as out:
        for path in paths:
            try:
                with open(path, 'r', encoding='latin') as fi:
                    cleaned = preprocessing_data(fi.read())
                out.write(cleaned)
                out.write('\n')
            except Exception as e:
                # Best effort: report the offending file and keep going.
                print(path + "Exception :" + str(e))


def handler():
    """Build the three cleaned corpus files from the raw dataset folders.

    Reads the easy-ham, hard-ham and spam file lists, preprocesses every
    file and appends the results to ``dataset/easyHam_clean.txt``,
    ``dataset/hardHam_clean.txt`` and ``dataset/spam_clean.txt``.
    """
    easyHam, hardHam, spam = read_forders_Path()
    outputs = (
        (easyHam, 'easyHam_clean.txt'),
        (hardHam, 'hardHam_clean.txt'),
        (spam, 'spam_clean.txt'),
    )
    for paths, out_name in outputs:
        # os.path.join replaces the Windows-only 'dataset\...' literals.
        _clean_corpus(paths, os.path.join('dataset', out_name))
    print("End Game\n")
# Run the whole preprocessing step when this code is executed directly.
if __name__ == "__main__":
    handler()


End Game



# Thống kê tần suất xuất hiện và tính xác suất trước

In [37]:
# Write a function: input is one file, output is a map-like collection of (word, frequency)
from collections import Counter, defaultdict
from random import shuffle

In [38]:
def load_data(fileName):
    """Read a file from the ``dataset`` directory as a list of lines.

    Args:
        fileName: Name of the file inside ``dataset``.

    Returns:
        The lines of the file as returned by ``readlines()`` (line endings
        are kept).
    """
    # os.path.join instead of 'dataset\\' + fileName, which only formed a
    # valid path on Windows.
    with open(os.path.join('dataset', fileName), 'r', encoding='utf-8') as fi:
        return fi.readlines()

In [39]:
def word_freq(fileName):
    """Count how often each word occurs in a cleaned corpus file.

    Args:
        fileName: Name of the file inside ``dataset`` (passed to
            ``load_data``).

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    counts = Counter()
    for line in load_data(fileName):
        # split() rather than split(' '): lines keep their trailing '\n',
        # so the last word of each line used to be counted as 'word\n',
        # and repeated spaces produced empty-string tokens.
        counts.update(line.split())
    return counts

In [None]:
def create_Dir():
    """Build the dictionary (vocabulary) - not implemented yet.

    The original definition had no body, which is a syntax error. The
    intended behaviour is not shown anywhere in this file, so the stub
    fails loudly instead of silently doing nothing.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("create_Dir is not implemented yet")

In [40]:
def readData():
    """Load the three cleaned corpus files.

    Returns:
        A tuple ``(easyHam, hardHam, spam)``; each element is the list of
        lines ``load_data`` returns for the matching ``*_clean.txt`` file.
    """
    file_names = ('easyHam_clean.txt', 'hardHam_clean.txt', 'spam_clean.txt')
    easyHam, hardHam, spam = [load_data(name) for name in file_names]
    return easyHam, hardHam, spam

In [41]:
# Load the cleaned corpora (one list of lines per *_clean.txt file).
easyHam,hardHam,spam=readData()

In [42]:
# Only easy ham and spam go into the data set; hardHam is not used here.
datas=easyHam+spam

In [43]:
# Labels aligned with datas: 0 for each easy-ham line, 1 for each spam line.
labels = [0] * len(easyHam)+[1]*len(spam)

In [116]:
def shuffle_data(mail_by_label,ratio):
    """Randomly split the mails of one label into a train and a test part.

    The input list is not modified; a shuffled copy is split instead.
    The split index and the total number of mails are printed.

    Args:
        mail_by_label: List of mails that share one label.
        ratio: Fraction of the mails that goes into the train part.

    Returns:
        A tuple ``(train, test)``; ``train`` holds the first
        ``int(len(mail_by_label) * ratio)`` shuffled mails, ``test`` the
        rest.
    """
    shuffled = mail_by_label.copy()
    shuffle(shuffled)
    cut = int(len(mail_by_label) * ratio)
    print (cut, len(shuffled))
    train_part = shuffled[:cut]
    test_part = shuffled[cut:]
    return train_part, test_part

In [137]:
def split_data(mails,labels,ratio):
    """Split mails into train and test sets, label by label.

    Mails are first grouped by label (``zip`` pairs every mail with its
    label, a ``defaultdict(list)`` collects the mails of each label).
    Every group is then split separately with ``shuffle_data``, so each
    label contributes the same ``ratio`` of its mails to the train set.

    Args:
        mails: The mail texts.
        labels: One label per mail, same length and order as ``mails``.
        ratio: Fraction of each label's mails that goes into the train set.

    Returns:
        A tuple ``(data_train, data_test, label_train, label_test)``.
        Within each list the entries are grouped by label, in the
        iteration order of ``set(labels)``.
    """
    mails_by_label = defaultdict(list)
    for mail, label in zip(mails, labels):
        mails_by_label[label].append(mail)

    data_train, data_test = [], []
    label_train, label_test = [], []
    for label in set(labels):
        train_part, test_part = shuffle_data(mails_by_label[label], ratio)
        data_train += train_part
        data_test += test_part
        label_train += [label] * len(train_part)
        label_test += [label] * len(test_part)
    return data_train, data_test, label_train, label_test

In [138]:
# Split easy ham + spam into train and test sets, half of each label per set.
data_train,data_test,lable_train,lable_test=split_data(datas,labels,0.5)


3225 6451
1896 3793


In [139]:
# Debug check: print one "a" per training example labelled 0 (easy ham).
for i in lable_train:
    if (i==0):
        print("a")

a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
