In [None]:
import os
import pandas as pd 
import string
import re
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn

#%% Functions

# Function to get the email body
def get_email_body(Data_Directory, Label_Directory):
    """Read every e-mail body below ``Data_Directory`` and attach its label.

    Each file is decoded as cp437 and matched against a pattern that captures
    everything after the first header line followed by a blank line, up to the
    last newline of the file, as the body.

    Parameters
    ----------
    Data_Directory : path of the directory tree that holds the e-mail files.
    Label_Directory : path of the label file, read with ``pd.read_csv`` and
        no header row. The text before the first space of column 0 becomes
        the ``Status`` value.

    Returns
    -------
    pandas.DataFrame with the columns ``"Email Content"`` and ``"Status"``.

    NOTE(review): bodies and labels are paired by row position only. This
    assumes ``os.walk`` visits the files in the same order as the label file
    lists them, and that every file matches the pattern (a file without a
    match contributes no row and shifts every later label) - confirm against
    the dataset.
    """
    # Collect the path of every file below the data directory.
    file_paths = []
    for dirpath, _dirnames, filenames in tqdm(os.walk(Data_Directory)):
        file_paths.extend(os.path.join(dirpath, name) for name in filenames)

    # Compiled once instead of once per file.
    body_pattern = re.compile(r'(?P<header>[^\n\n]*)\n\n(?P<body>[\s\S]*)\n')

    # Created before (not inside) the directory walk, so it exists even when
    # the walk yields nothing.
    data = []
    for path in tqdm(file_paths, leave=False):
        with open(path, "r", encoding="cp437") as myfile:
            content = myfile.read()
        data.extend(match.group('body') for match in body_pattern.finditer(content))

    # Name the column at construction time rather than patching the column
    # index's underlying array afterwards.
    df1 = pd.DataFrame(data, columns=["Email Content"])

    labels = pd.read_csv(Label_Directory, header=None)
    HamOrSpam = labels[0].str.split(" ", n=1, expand=True)
    df1['Status'] = HamOrSpam[0]
    return df1

# Function to get Train and Test Data
def data_set (data, size):
    """Shuffle ``data`` and cut it into a leading and a trailing part.

    ``size`` is the fraction of rows that goes into the first (training)
    part; the remaining rows form the second (test) part.

    Returns the tuple ``(train_set, test_set)``.
    """
    shuffled = data.sample(frac=1)
    cut = int(size * len(shuffled))
    return shuffled[:cut], shuffled[cut:]

# Function to get Spam and Ham
def spam_ham (dataset,status):
    """Return the rows of ``dataset`` whose ``Status`` is one class.

    Rows labelled ``'spam'`` are returned when ``status`` equals ``True``;
    rows labelled ``'ham'`` are returned for any other value.
    """
    # The comparison with True (rather than plain truthiness) is deliberate:
    # only values equal to True select the spam rows.
    wanted = 'spam' if status == True else 'ham'
    return dataset.loc[dataset['Status'] == wanted]

# Function to get Vocabulary Training
def vocabulary (data):
    """Tokenise the training e-mails and count how often each word occurs.

    Every text in ``data`` has all characters other than letters and
    whitespace replaced by spaces, is lower-cased and split on whitespace.
    One-letter tokens are discarded.

    Returns
    -------
    (BoW_Count, BoW)
        ``BoW`` is a one-column DataFrame (column ``0``) with one row per
        token occurrence; an e-mail without tokens contributes a single NaN
        row. ``BoW_Count`` lists each distinct token in column ``0`` with its
        number of occurrences in ``size``, most frequent first.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape.
    non_letters = re.compile(r'[^a-z\s]+', flags=re.IGNORECASE)

    words = []
    for text in tqdm(data):
        tokens = non_letters.sub(' ', text).lower().split()
        # Filter one-letter tokens by length. A regex that also consumes the
        # separating space skips every second letter in runs like "b e f".
        words.append([token for token in tokens if len(token) > 1])

    Word_List = pd.Series(words).explode()
    BoW = pd.DataFrame.from_dict(Word_List)
    BoW_Count = (
        BoW.groupby(BoW.columns.tolist(), as_index=False)
        .size()
        .sort_values(by='size', ascending=False)
    )
    return BoW_Count, BoW

# Function to get Vocabulary Testing
def test_vocabulary(testdata):
    """Tokenise the test e-mails.

    Every text in ``testdata`` has all characters other than letters and
    whitespace replaced by spaces, is lower-cased and split on whitespace.
    One-letter tokens are discarded.

    Returns a list with one list of tokens per e-mail, in input order.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape.
    non_letters = re.compile(r'[^a-z\s]+', flags=re.IGNORECASE)

    words = []
    for text in tqdm(testdata):
        tokens = non_letters.sub(' ', text).lower().split()
        # Filter one-letter tokens by length. A regex that also consumes the
        # separating space skips every second letter in runs like "b e f".
        words.append([token for token in tokens if len(token) > 1])
    return words

# Get Conditional Probabilities
def conditional_probability(BOWTrainSet, Train_Vocabulary, smoothing):
    """Estimate the smoothed likelihood of every vocabulary word for one class.

    For each word ``w`` of the vocabulary the estimate is::

        (count(w in class) + smoothing) / (total class words + smoothing * |V|)

    where ``|V|`` is the size of the vocabulary.

    Parameters
    ----------
    BOWTrainSet : collection of the distinct words that form the vocabulary.
    Train_Vocabulary : iterable of the word occurrences of one class; entries
        that are not part of the vocabulary are ignored.
    smoothing : additive smoothing constant applied to every word.

    Returns
    -------
    pandas.DataFrame with one row per vocabulary word and the columns
    ``0`` (word), ``size`` (occurrences in the class, 0 for words the class
    never used), ``Add1`` (smoothed count; the name is kept for callers) and
    ``CondProb``. Words seen in the class come first, most frequent first.
    """
    from collections import Counter

    # Use the vocabulary that was passed in, not a same-named module global.
    vocab = set(BOWTrainSet)

    # Occurrences of each vocabulary word inside this class.
    counts = Counter(word for word in tqdm(Train_Vocabulary) if word in vocab)

    # Most frequent first; ties and the unseen words are ordered by spelling
    # so the resulting table is deterministic.
    seen = sorted(counts.items(), key=lambda item: (-item[1], str(item[0])))
    unseen = sorted(vocab - counts.keys(), key=str)

    BOW = pd.DataFrame({
        0: [word for word, _ in seen] + unseen,
        'size': [count for _, count in seen] + [0] * len(unseen),
    })

    # The same constant is added to seen and unseen words alike.
    BOW['Add1'] = BOW['size'] + smoothing

    TotalWords = sum(counts.values())
    # Normalise over the whole vocabulary so the estimates sum to one.
    BOW['CondProb'] = BOW['Add1'] / (TotalWords + smoothing * len(vocab))

    return BOW

#Classify Email
def classify_email(Probability_SpamEmail,Probability_HamEmail):
    """Pick the class with the larger score.

    Returns ``'ham'`` only when the ham score is strictly greater than the
    spam score; otherwise (including ties) returns ``'spam'``.
    """
    if Probability_HamEmail > Probability_SpamEmail:
        return 'ham'
    return 'spam'



In [None]:
#%%
Data_Directory = 'C:/Spam_Filter/trec06p-cs280/trec06p-cs280/data'
Label_Directory = 'C:/Spam_Filter/trec06p-cs280/trec06p-cs280/labels'

# Load the labelled e-mails, then split them into training and test data.
data = get_email_body(Data_Directory, Label_Directory)

# Split once and unpack both parts. data_set() reshuffles on every call, so
# taking the train part from one call and the test part from another would
# let the same e-mail land in both sets.
train_set, test_set = data_set(data, 0.7)

#%%
# Training data

# Per-class subsets of the training partition.
train_spam_set = spam_ham(train_set, True)
train_ham_set = spam_ham(train_set, False)

# Word tables of the spam e-mails, the ham e-mails and the whole partition.
Train_Spam_Vocabulary = vocabulary(train_spam_set['Email Content'])
Train_Ham_Vocabulary = vocabulary(train_ham_set['Email Content'])
Train_Set_Vocabulary = vocabulary(train_set['Email Content'])

# Number of e-mails per class.
Train_TotalSpam, Train_TotalHam = len(train_spam_set), len(train_ham_set)
Train_SetTotal = Train_TotalSpam + Train_TotalHam

#%%
# Class priors: the share of each class among the training e-mails.
PriorProb_Spam = Train_TotalSpam / Train_SetTotal
PriorProb_Ham = Train_TotalHam / Train_SetTotal
print( PriorProb_Spam, PriorProb_Ham)
#%% Training

# Distinct words of the whole training partition.
BoWTrainSet = set(Train_Set_Vocabulary[0][0])

# Per-class word likelihoods with a smoothing constant of 1.
BOW_Spam = conditional_probability(BoWTrainSet, Train_Spam_Vocabulary[1][0], 1)
BOW_Ham = conditional_probability(BoWTrainSet, Train_Ham_Vocabulary[1][0], 1)

# Word column and likelihood column of each class, kept side by side.
SpamCount = dict(words=BOW_Spam[0], values=BOW_Spam['CondProb'])
HamCount = dict(words=BOW_Ham[0], values=BOW_Ham['CondProb'])

#%% Testing
Test_Vocabulary = test_vocabulary(test_set['Email Content'])

classify, ham, spam = [], [], []

# word -> likelihood lookup tables for each class
SpamCountDict = dict(zip(SpamCount['words'], SpamCount['values']))
HamCountDict = dict(zip(HamCount['words'], HamCount['values']))

for email in tqdm(Test_Vocabulary):
    # A word missing from a table gets the neutral factor 1 (log 1 == 0),
    # so it does not change the score.
    SP = [SpamCountDict.get(i, 1) for i in email]
    HP = [HamCountDict.get(i, 1) for i in email]

    # Work in log space: add the log-likelihoods and the log-prior.
    LogSpam = np.log(SP)
    LogHam = np.log(HP)
    Probability_SpamEmail = sum(LogSpam) + np.log(PriorProb_Spam)
    Probability_HamEmail = sum(LogHam) + np.log(PriorProb_Ham)

    spam.append(Probability_SpamEmail)
    ham.append(Probability_HamEmail)
    classify.append(classify_email(Probability_SpamEmail, Probability_HamEmail))

# Store the scores and the predicted class next to the true label.
test_set['Spam Probability'] = spam
test_set['Ham Probability'] = ham
test_set['Classification'] = classify

#%% Precision & Recall

# Confusion-matrix cells, counted by summing boolean masks. This avoids
# looking a count up by position in a label-indexed Series, which newer
# pandas versions no longer support.
TN = ((test_set['Status'] == 'ham') & (test_set['Classification'] == 'ham')).sum()
TP = ((test_set['Status'] == 'spam') & (test_set['Classification'] == 'spam')).sum()
FP = ((test_set['Status'] == 'ham') & (test_set['Classification'] == 'spam')).sum()
FN = ((test_set['Status'] == 'spam') & (test_set['Classification'] == 'ham')).sum()

# Precision: share of predicted spam that really is spam.
# Recall: share of real spam that was predicted as spam.
Precision = TP/(TP+FP)
Recall = TP/(TP+FN)
print(Precision)
print(Recall)

#%% Save dataset to file
# Write the scored test set and both per-class word tables as CSV files.
for frame, filename in (
    (test_set, 'normal_test_set.csv'),
    (BOW_Spam, 'normal_spam_words.csv'),
    (BOW_Ham, 'normal_ham_words.csv'),
):
    frame.to_csv(filename)

#%% Lambda Smoothing Function

def lambda_smoothing(BoWTrainSet, Train_Spam_Vocabulary,Train_Ham_Vocabulary, Test_Vocabulary, lambda_smoothing):
    """Train and evaluate the classifier for one smoothing constant.

    Parameters
    ----------
    BoWTrainSet : collection of the distinct vocabulary words.
    Train_Spam_Vocabulary, Train_Ham_Vocabulary : per-class results of the
        vocabulary builder; element ``[1][0]`` (the word occurrences) is used.
    Test_Vocabulary : list with one list of tokens per test e-mail, in the
        row order of the module-level ``test_set``.
    lambda_smoothing : smoothing constant handed to
        ``conditional_probability`` (inside this body the name refers to the
        number, not to this function).

    Returns
    -------
    (Precision, Recall) of the spam class.

    The class priors ``PriorProb_Spam`` / ``PriorProb_Ham`` and the true
    labels ``test_set['Status']`` are read from module level. ``test_set``
    itself is not modified, so results stored in it by the caller survive a
    sweep over several smoothing values.
    """
    BOW_Spam = conditional_probability(BoWTrainSet, Train_Spam_Vocabulary[1][0], lambda_smoothing)
    BOW_Ham = conditional_probability(BoWTrainSet, Train_Ham_Vocabulary[1][0], lambda_smoothing)

    # word -> likelihood lookup tables for each class
    SpamCountDict = dict(zip(BOW_Spam[0], BOW_Spam['CondProb']))
    HamCountDict = dict(zip(BOW_Ham[0], BOW_Ham['CondProb']))

    # The priors do not change between e-mails; take their logs once.
    log_prior_spam = np.log(PriorProb_Spam)
    log_prior_ham = np.log(PriorProb_Ham)

    classify = []
    for email in tqdm(Test_Vocabulary):
        # A word missing from a table gets the neutral factor 1 (log 1 == 0).
        LogSpam = np.log([SpamCountDict.get(word, 1) for word in email])
        LogHam = np.log([HamCountDict.get(word, 1) for word in email])

        Probability_SpamEmail = sum(LogSpam) + log_prior_spam
        Probability_HamEmail = sum(LogHam) + log_prior_ham
        classify.append(classify_email(Probability_SpamEmail, Probability_HamEmail))

    # Compare the predictions with the true labels row by row, without
    # writing them into the shared test_set.
    actual = test_set['Status']
    predicted = pd.Series(classify, index=test_set.index)

    TP = ((actual == 'spam') & (predicted == 'spam')).sum()
    FP = ((actual == 'ham') & (predicted == 'spam')).sum()
    FN = ((actual == 'spam') & (predicted == 'ham')).sum()

    Precision = TP/(TP+FP)
    Recall = TP/(TP+FN)

    return Precision,Recall

#%% Precision and Recall using Different Lambda Smoothing

# Smoothing constants to compare, with the scores obtained for each.
smoothing_value = [2,1,0.5,0.1,0.005]
precision = []
recall = []

for i in smoothing_value:
    result = lambda_smoothing(BoWTrainSet, Train_Spam_Vocabulary,Train_Ham_Vocabulary, Test_Vocabulary,i)
    precision.append(result[0])
    recall.append(result[1])

#%% Plot

# Precision and recall as a function of the smoothing constant.
plt.plot(smoothing_value, precision, label='Precision',  marker='o')
plt.plot(smoothing_value, recall, label='Recall', marker='x')  # legend label spelled correctly
plt.legend(loc='center right')
plt.title('Precision and Recall of Different Lambda Smoothing')
plt.ylabel('Precision and Recall')
plt.xlabel('Lambda Smoothing')
plt.show()



In [None]:
#%% Improving Classifier

# Words in the first 200 rows of each word table (the tables list the most
# frequent words first).
Words200Spam = BOW_Spam.head(200)[0].to_list()
Words200Ham = BOW_Ham.head(200)[0].to_list()
Train200 = Train_Set_Vocabulary[0].head(200)[0].to_list()

# Function to get Vocabulary Training
def vocabulary_improve (data, Words200):
    """Tokenise the training e-mails, skipping an exclusion list of words.

    Every text in ``data`` has all characters other than letters and
    whitespace replaced by spaces, is lower-cased and split on whitespace.
    One-letter tokens and tokens listed in ``Words200`` are discarded.

    Parameters
    ----------
    data : iterable of e-mail texts.
    Words200 : iterable of lower-case words to leave out.

    Returns
    -------
    (BoW_Count, BoW)
        ``BoW`` is a one-column DataFrame (column ``0``) with one row per
        kept token occurrence; an e-mail without kept tokens contributes a
        single NaN row. ``BoW_Count`` lists each distinct token in column
        ``0`` with its number of occurrences in ``size``, most frequent first.
    """
    excluded = set(Words200)
    # Raw string: "\s" inside a normal string literal is an invalid escape.
    non_letters = re.compile(r'[^a-z\s]+', flags=re.IGNORECASE)

    words = []
    for text in tqdm(data):
        tokens = non_letters.sub(' ', text).lower().split()
        # Filter token by token. Substituting on the joined text with the
        # non-raw replacement '\2' inserts the control character \x02, which
        # glues the neighbouring words into one token; it also misses
        # excluded words that are adjacent or at either end of the text.
        words.append(
            [token for token in tokens if len(token) > 1 and token not in excluded]
        )

    Word_List = pd.Series(words).explode()
    BoW = pd.DataFrame.from_dict(Word_List)
    BoW_Count = (
        BoW.groupby(BoW.columns.tolist(), as_index=False)
        .size()
        .sort_values(by='size', ascending=False)
    )
    return BoW_Count, BoW

# Function to get Vocabulary Testing
def test_vocabulary_improve(testdata, Words200):
    """Tokenise the test e-mails, skipping an exclusion list of words.

    Every text in ``testdata`` has all characters other than letters and
    whitespace replaced by spaces, is lower-cased and split on whitespace.
    One-letter tokens and tokens listed in ``Words200`` are discarded.

    Parameters
    ----------
    testdata : iterable of e-mail texts.
    Words200 : iterable of lower-case words to leave out.

    Returns a list with one list of kept tokens per e-mail, in input order.
    """
    excluded = set(Words200)
    # Raw string: "\s" inside a normal string literal is an invalid escape.
    non_letters = re.compile(r'[^a-z\s]+', flags=re.IGNORECASE)

    words = []
    for text in tqdm(testdata):
        tokens = non_letters.sub(' ', text).lower().split()
        # Filter token by token. Substituting on the joined text with the
        # non-raw replacement '\2' inserts the control character \x02, which
        # glues the neighbouring words into one token; it also misses
        # excluded words that are adjacent or at either end of the text.
        words.append(
            [token for token in tokens if len(token) > 1 and token not in excluded]
        )
    return words

#%%
# Training data (improved classifier)

# Per-class subsets of the training partition.
train_spam_set = spam_ham(train_set, True)
train_ham_set = spam_ham(train_set, False)

# Word tables built without the respective top-200 words.
Train_Spam_Vocabulary = vocabulary_improve(train_spam_set['Email Content'], Words200Spam)
Train_Ham_Vocabulary = vocabulary_improve(train_ham_set['Email Content'], Words200Ham)
Train_Set_Vocabulary = vocabulary_improve(train_set['Email Content'], Train200)

# Number of e-mails per class.
Train_TotalSpam, Train_TotalHam = len(train_spam_set), len(train_ham_set)
Train_SetTotal = Train_TotalSpam + Train_TotalHam

#%%
# Class priors: the share of each class among the training e-mails.
PriorProb_Spam = Train_TotalSpam / Train_SetTotal
PriorProb_Ham = Train_TotalHam / Train_SetTotal
print( PriorProb_Spam, PriorProb_Ham)
#%% Training

# Distinct words of the whole training partition.
BoWTrainSet = set(Train_Set_Vocabulary[0][0])

# Per-class word likelihoods with a smoothing constant of 1.
BOW_Spam = conditional_probability(BoWTrainSet, Train_Spam_Vocabulary[1][0], 1)
BOW_Ham = conditional_probability(BoWTrainSet, Train_Ham_Vocabulary[1][0], 1)

# Word column and likelihood column of each class, kept side by side.
SpamCount = dict(words=BOW_Spam[0], values=BOW_Spam['CondProb'])
HamCount = dict(words=BOW_Ham[0], values=BOW_Ham['CondProb'])

#%% Testing
# NOTE(review): the test e-mails are tokenised with test_vocabulary, not with
# test_vocabulary_improve - confirm that this is intended.
Test_Vocabulary = test_vocabulary(test_set['Email Content'])

classify, ham, spam = [], [], []

# word -> likelihood lookup tables for each class
SpamCountDict = dict(zip(SpamCount['words'], SpamCount['values']))
HamCountDict = dict(zip(HamCount['words'], HamCount['values']))

for email in tqdm(Test_Vocabulary):
    # A word missing from a table gets the neutral factor 1 (log 1 == 0),
    # so it does not change the score.
    SP = [SpamCountDict.get(i, 1) for i in email]
    HP = [HamCountDict.get(i, 1) for i in email]

    # Work in log space: add the log-likelihoods and the log-prior.
    LogSpam = np.log(SP)
    LogHam = np.log(HP)
    Probability_SpamEmail = sum(LogSpam) + np.log(PriorProb_Spam)
    Probability_HamEmail = sum(LogHam) + np.log(PriorProb_Ham)

    spam.append(Probability_SpamEmail)
    ham.append(Probability_HamEmail)
    classify.append(classify_email(Probability_SpamEmail, Probability_HamEmail))

# Store the scores and the predicted class next to the true label.
test_set['Spam Probability'] = spam
test_set['Ham Probability'] = ham
test_set['Classification'] = classify

#%% Precision & Recall

# Confusion-matrix cells, counted by summing boolean masks. This avoids
# looking a count up by position in a label-indexed Series, which newer
# pandas versions no longer support.
TN = ((test_set['Status'] == 'ham') & (test_set['Classification'] == 'ham')).sum()
TP = ((test_set['Status'] == 'spam') & (test_set['Classification'] == 'spam')).sum()
FP = ((test_set['Status'] == 'ham') & (test_set['Classification'] == 'spam')).sum()
FN = ((test_set['Status'] == 'spam') & (test_set['Classification'] == 'ham')).sum()

# Precision: share of predicted spam that really is spam.
# Recall: share of real spam that was predicted as spam.
Precision = TP/(TP+FP)
Recall = TP/(TP+FN)
print(Precision)
print(Recall)

#%% Precision and Recall using Different Lambda Smoothing

# Smoothing constants to compare, with the scores obtained for each.
smoothing_value = [2,1,0.5,0.1,0.005]
precision = []
recall = []

for i in smoothing_value:
    result = lambda_smoothing(BoWTrainSet, Train_Spam_Vocabulary,Train_Ham_Vocabulary, Test_Vocabulary,i)
    precision.append(result[0])
    recall.append(result[1])

#%% Plot

# Precision and recall as a function of the smoothing constant.
plt.plot(smoothing_value, precision, label='Precision',  marker='o')
plt.plot(smoothing_value, recall, label='Recall', marker='x')  # legend label spelled correctly
plt.legend(loc='center right')
plt.title('Precision and Recall of Improved Classifier')
plt.ylabel('Precision and Recall')
plt.xlabel('Lambda Smoothing')
plt.show()

#%%Save Improved Dataset to File
# Write the scored test set, both per-class word tables and the training
# partition as CSV files.
for frame, filename in (
    (test_set, 'improved_test_set.csv'),
    (BOW_Spam, 'improved_spam_words.csv'),
    (BOW_Ham, 'improved_ham_words.csv'),
    (train_set, 'trainset.csv'),
):
    frame.to_csv(filename)
