#### Importing relevant packages

In [5]:
import os
import mailbox
import pandas as pd
from pyspark.sql import SparkSession

#### Parsing the nazario phishing corpus dataset
We'll be using pySpark to speed up the process.


In [6]:
# We'll be using this function to parse an email 
# into a dict (mail header fields + mail body).
def parse_mbox_email(email):
    email_dict = {}
    email_dict['body'] = email.get_payload()
    
    for field in email.keys():
        value = email.get(field)
        email_dict[field] = value
    return email_dict

In [106]:
# Parsing the files files.
directory_path = 'datasets/nazario-phishing-corpus/'
mbox_files = [directory_path + f for f in  os.listdir(directory_path) if not f.startswith('.')]
spark = SparkSession.builder.appName("MboxParser").getOrCreate()
rdd = spark.sparkContext.parallelize(mbox_files).flatMap(lambda file: mailbox.mbox(file))
emails = rdd.map(parse_mbox_email).collect()
df = pd.DataFrame(emails)


                                                                                

#### Detecting redundant columns

In [68]:
import Levenshtein # You'll need to install this package using pip or conda

# Define a list of words to cluster
words = df.columns

# Define a threshold edit distance for clustering
threshold = 2

# Create an empty dictionary to hold clusters
clusters = {}

# Iterate over each word in the list
for word in words:

    # Create an empty list to hold matching clusters
    matches = []

    # Iterate over each cluster in the dictionary
    for cluster in clusters:
        # Check if the word is within the threshold edit distance of any word in the cluster
        for member in clusters[cluster]:
            distance = Levenshtein.distance(word, member)
            if distance <= threshold:
                matches.append(cluster)
                break
            
    # If the word matches an existing cluster, add it to that cluster
    if len(matches) > 0:
        clusters[matches[0]].append(word)

    # If the word doesn't match any existing clusters, create a new cluster
    else:
        clusters[word] = [word]
        
clusters_list = []
# Print the resulting clusters
for cluster in clusters:
    if len(clusters[cluster]) > 1:
        print(cluster, clusters[cluster])
        clusters_list.append(clusters[cluster])


From ['From', 'from']
Subject ['Subject', 'subject']
Message-ID ['Message-ID', 'Message-Id', 'XMessage-Id', 'XMessage-ID', 'X-Message-Id', 'X-Message-ID', 'Message-id']
X-IMAP ['X-IMAP', 'X-IP', 'X-UID', 'X-CID', 'X-UMS', 'X-ID', 'X-CSC', 'X-CHA', 'X-TLS', 'X-TMN', 'X-SID', 'X-Uri', 'X-dc', 'X-LCID', 'X-JID', 'X-Org']
Status ['Status', 'X-Status']
Return-Path ['Return-Path', 'X-Return-Path']
X-Original-To ['X-Original-To', 'X-Original-IP']
Delivered-To ['Delivered-To', 'X-Delivered-To']
Received ['Received', 'X-Received', 'Reveived']
Reply-To ['Reply-To', 'Reply-to']
To ['To', 'Cc', 'Bcc', 'CC']
MIME-Version ['MIME-Version', 'MIME-version']
Content-Type ['Content-Type', 'Content-type', 'content-type']
X-Priority ['X-Priority', 'Priority']
X-Mailer ['X-Mailer', 'X-Mailster', 'X-mailer']
X-MSMail-Priority ['X-MSMail-Priority', 'X-MSmail-Priority', 'X-Msmail-Priority']
X-Accept-Language ['X-Accept-Language', 'Accept-Language']
X-Sender ['X-Sender', 'Sender', 'X-SenderId', 'X-Sendera', 'X-

In [11]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# for cluster in clusters_list:
#     sns.heatmap(df[cluster].isnull(), cmap='viridis')
#     plt.show()


In [107]:
df2 = df[['from', 'From']]
print(df2.isnull().sum())

# Mergin From & from 
df2['from'] = df2.apply(lambda row: row['From'] if pd.isnull(row['from']) else row['from'], axis=1)
# Drop the "From" column
df2 = df2.drop('From', axis=1)
print(df2.isnull().sum())


from    7172
From       5
dtype: int64
from    4
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['from'] = df2.apply(lambda row: row['From'] if pd.isnull(row['from']) else row['from'], axis=1)


In [15]:

# mbox = mailbox.mbox("datasets/nazario-phishing-corpus/20051114.mbox")
# for m in mbox:
#     body = m.get_payload()
#     print(body)

In [58]:
empty_cols = [col for col in df.columns if df[col].isnull().sum() >7172]
len(empty_cols)


899

In [92]:
df3 = df[['Message-ID', 'Message-Id', 'Message-id']]
df3['Message-ID'] = df3.apply(lambda row: row['Message-Id'] if pd.isnull(row['Message-ID']) else row['Message-ID'], axis=1)
df3['Message-ID'] = df3.apply(lambda row: row['Message-id'] if pd.isnull(row['Message-ID']) else row['Message-ID'], axis=1)
df3 = df3.drop('Message-Id', axis=1)
df3 = df3.drop('Message-id', axis=1)

df3.isnull().sum()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['Message-ID'] = df3.apply(lambda row: row['Message-Id'] if pd.isnull(row['Message-ID']) else row['Message-ID'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['Message-ID'] = df3.apply(lambda row: row['Message-id'] if pd.isnull(row['Message-ID']) else row['Message-ID'], axis=1)


Message-ID    327
dtype: int64

In [95]:
cols = ['Subject', 'subject']
df4 = df[cols]

df[cols].isnull().sum()

Subject      51
subject    7172
dtype: int64

In [98]:
df4 = df[['Subject', 'subject']]
df4['subject'] = df4.apply(lambda row: row['Subject'] if pd.isnull(row['subject']) else row['subject'], axis=1)
df4 = df4.drop('Subject', axis=1)
df4.isnull().sum()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['subject'] = df4.apply(lambda row: row['Subject'] if pd.isnull(row['subject']) else row['subject'], axis=1)


subject    50
dtype: int64