In [1]:
# Setting up (Imports, creating combined dataframe)

import math
import pandas as pd
import numpy as np

# Every corpus is read with identical options: UTF-8 decoding, and malformed
# CSV rows are skipped rather than aborting the whole load.
_READ_OPTS = dict(encoding="utf-8", on_bad_lines="skip")

CEAS_08 = pd.read_csv("datasets/CEAS_08.csv", **_READ_OPTS)
ENRON = pd.read_csv("datasets/Enron.csv", **_READ_OPTS)
LING = pd.read_csv("datasets/Ling.csv", **_READ_OPTS)
NAZARIO = pd.read_csv("datasets/Nazario.csv", **_READ_OPTS)
NAZARIO_5 = pd.read_csv("datasets/Nazario_5.csv", **_READ_OPTS)
NIGERIAN_FRAUD = pd.read_csv("datasets/Nigerian_Fraud.csv", **_READ_OPTS)
NIGERIAN_5 = pd.read_csv("datasets/Nigerian_5.csv", **_READ_OPTS)
SPAMASSASSIN = pd.read_csv("datasets/SpamAssasin.csv", **_READ_OPTS)

# Stack all corpora row-wise into one frame with a fresh 0..n-1 index.
dfs = [
    CEAS_08,
    ENRON,
    LING,
    NAZARIO,
    NAZARIO_5,
    NIGERIAN_FRAUD,
    NIGERIAN_5,
    SPAMASSASSIN,
]
df = pd.concat(dfs, ignore_index=True)

print("Cleaning Data")

# Sender: report how many values are missing, then substitute a sentinel token.
sender_gaps = df['sender'].isna().sum()
print(f"Filling {sender_gaps} empty senders")
df['sender'] = df['sender'].fillna("[NO_SENDER]")

print("Removing receiver (not useful for classification)")
df = df.drop(columns='receiver')

# Date: parse to timezone-aware UTC timestamps; anything unparseable becomes NaT.
# NOTE(review): depending on the pandas version, format inference may coerce
# validly-formatted but differently-styled dates to NaT -- confirm that the
# missing count below is not inflated by that.
parsed_dates = pd.to_datetime(df['date'], errors="coerce", utc=True)
df['date'] = parsed_dates
date_missing = df['date'].isna().sum()
print(f"Filling {date_missing} dates")
if date_missing:
    # Impute every missing/unparseable date with the median of the valid ones.
    median_date = df['date'].median()
    print(median_date)
    df['date'] = df['date'].fillna(median_date)

# Rows where both subject and body are missing have no text content at all,
# so they are removed before the remaining gaps are filled with sentinels.
both_missing = df['subject'].isnull() & df['body'].isnull()
n_both_missing = both_missing.sum()
print(f"Dropping {n_both_missing} rows with both subject and body missing")
if n_both_missing > 0:
    print("Dropping rows with no text content at all...")
    # .copy() detaches the filtered frame from the original so that the column
    # assignments that follow operate on an independent DataFrame instead of
    # triggering SettingWithCopyWarning on a slice.
    df = df[~both_missing].copy()

# After the drop, at most one of subject/body is missing per row; fill each
# with its own placeholder token so the two cases stay distinguishable.
print(f"Filling {df['subject'].isnull().sum()} empty subjects")
df['subject'] = df['subject'].fillna('[NO_SUBJECT]')

print(f"Filling {df['body'].isnull().sum()} empty bodies")
df['body'] = df['body'].fillna('[NO_BODY]')

# Collapse `urls` into a binary flag: 0 = no URLs, 1 = at least one URL.
# A row counts as "no URLs" when the value is missing, the literal string
# '[]', or the number 0; anything else is treated as "has URLs".
# Missing values must be handled explicitly: NaN is equal to neither '[]' nor
# 0, so without the isnull() term every missing entry would be flagged as 1.
# NOTE(review): missing is treated as "no URLs" to match the log message
# below; if those rows can contain links in `body`, consider deriving the
# flag from the body text instead.
urls_missing = df['urls'].isnull()
print(f"Filling {urls_missing.sum()} empty URLs")
no_urls = urls_missing | df['urls'].eq('[]') | df['urls'].eq(0)
df['urls'] = (~no_urls).astype("int64")


Cleaning Data
Filling 33297 empty senders
Removing receiver (not useful for classification)
Filling 36283 dates
2008-08-06 13:28:32+00:00
Dropping 0 rows with both subject and body missing
Filling 503 empty subjects
Filling 1 empty bodies
Filling 32626 empty URLs
