# Import Dataset

Data was obtained from https://github.com/mwiechmann/enron_spam_data. It is in the format of a .csv file. The dataset contains a total of 17.171 spam and 16.545 non-spam ("ham") e-mail messages (33.716 e-mails total).

In [51]:
import pandas as pd

df = pd.read_csv('enron_spam_data.csv')
display(df)


Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


# Cleaning

In [56]:
import re

def regexClean(message_content):
    # convert everything to string
    # getting a "passing a float" error otherwise
    message_content = str(message_content) 
    
    # remove all URLs
    message_content = re.sub(r'(http|https|ftp)://[a-zA-Z0-9\\./]+', ' ', str(message_content)) 

    # remove all emails
    message_content = re.sub(r'[A-Za-z0-9._\\-]+@[A-Za-z0-9-]*\\.[a-z]{2,3}','',str(message_content))

    # remove all tags
    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    message_content = re.sub(r'<[^<]+?>', '', str(message_content)) 

    # remove newline
    message_content = message_content.replace('\n', ' ')

    # convert all chars to lowercase
    message_content = message_content.lower()
    return message_content

# make a copy of the df 
df_with_punct_numb_removed = df.copy();

df['Cleaned_Message'] = df['Message'].apply(regexClean)

print(df[['Message', 'Cleaned_Message']].head())
df_with_punct_numb_removed.head()

                                             Message  \
0                                                NaN   
1  gary , production from the high island larger ...   
2             - calpine daily gas nomination 1 . doc   
3  fyi - see note below - already done .\nstella\...   
4  fyi .\n- - - - - - - - - - - - - - - - - - - -...   

                                     Cleaned_Message  
0                                                nan  
1  gary , production from the high island larger ...  
2             - calpine daily gas nomination 1 . doc  
3  fyi - see note below - already done . stella -...  
4  fyi . - - - - - - - - - - - - - - - - - - - - ...  


Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Cleaned_Message
0,0,christmas tree farm pictures,,ham,1999-12-10,
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13,"gary , production from the high island larger ..."
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14,- calpine daily gas nomination 1 . doc
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14,fyi - see note below - already done . stella -...
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14,fyi . - - - - - - - - - - - - - - - - - - - - ...


In [54]:
from string import punctuation

def punct_removal(message_content):
    message_content = str(message_content)
    
    # https://regexr.com/
    
    # remove digits
    message_content = re.sub(r'\d+', ' ', message_content)

    # remove all non-alphabetic characters
    message_content = message_content.replace(r'[^a-zA-Z]', '') 

    #remove punctuation
    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    message_content = message_content.translate(str.maketrans('', '', punctuation))
    
    return message_content

df_with_punct_numb_removed['Cleaned_Message'] = df_with_punct_numb_removed['Message'].apply(punct_removal)

display(df_with_punct_numb_removed)

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Cleaned_Message
0,0,christmas tree farm pictures,,ham,1999-12-10,
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13,gary production from the high island larger b...
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14,calpine daily gas nomination doc
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14,fyi see note below already done \nstella\n ...
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14,fyi \n forwarded by lauri...
...,...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29,hello welcome to gigapharm onlinne shop \npre...
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29,i got it earlier than expected and it was wrap...
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30,are you ready to rock on let the man in you r...
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30,learn how to last times longer in\nbed ...
