In [5]:
import io
import os
import re
import tarfile
from pathlib import Path

import chardet
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

23/06/14 21:57:14 WARN Utils: Your hostname, mr.local resolves to a loopback address: 127.0.0.1; using 192.168.15.9 instead (on interface en0)
23/06/14 21:57:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/14 21:57:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
APACHE_SPAM_ASSASSIN = "https://spamassassin.apache.org/old/publiccorpus"
DATA_FOLDER = '../data'

In [9]:
def pull_data():
    response = requests.get(APACHE_SPAM_ASSASSIN)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    file_paths = [link.get('href') for link in soup.find_all('a')]
    file_paths = [path for path in file_paths if path.split('.')[-1] == 'bz2']

    available_data = set(os.listdir(DATA_FOLDER))
    file_paths = [path for path in file_paths if path not in available_data and path != 'corpus.parquet']

    if len(file_paths) == 0:
        print('No data to pull')

    for file_path in file_paths:
        print(f'Pulling {file_path}')
        response = requests.get(f"{APACHE_SPAM_ASSASSIN}/{file_path}")
        response.raise_for_status()

        file_object = io.BytesIO(response.content)
        tar = tarfile.open(fileobj=file_object, mode="r:bz2")

        extract_dir = Path(DATA_FOLDER)
        extract_path = extract_dir.joinpath(Path(file_path))

        extract_dir.mkdir(exist_ok=True)
        tar.extractall(extract_path)

        tar.close()

    size = get_directory_size("../data")
    print(f"Data directory size: {size} bytes")


def get_directory_size(directory):
    total = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                total += os.path.getsize(fp)

    return total

In [10]:
pull_data()

No data to pull
Data directory size: 121604826 bytes


In [11]:
def parse_data():
    successful_files = []
    failed_files = []
    corpus_df = pd.DataFrame(columns=['date', 'difficulty', 'class', 'collection', 'body'])
    for dirpath, dirnames, filenames in os.walk(DATA_FOLDER):
        print(dirpath)
        for filename in filenames:
            corpus_path = os.path.join(dirpath, filename)
            corpus_df, successful_files, failed_files = incorporate(df, corpus_path, dirpath, successful_files,
                                                                    failed_files)
    return corpus_df, successful_files, failed_files


def incorporate(df, corpus_path, dirpath, successful_files, failed_files):
    rawdata = open(corpus_path, 'rb').read()
    result = chardet.detect(rawdata)
    encoding = result['encoding']

    with open(corpus_path, 'r', encoding=encoding) as f:
        [(date, *ids)] = re.findall(r'([\d]{8})_([a-z]*)_?([a-z]*)_?([\d]*)\.tar\.bz2.*',
                                    dirpath)
        [difficulty, cls, collection] = parse_ids(ids)
        try:
            body = f.read()
            row = pd.DataFrame({'date': date,
                                'difficulty': difficulty,
                                'class': cls,
                                'collection': collection,
                                'body': body}, index=[0, 1, 2, 3, 4])
            df = pd.concat([df, row], ignore_index=True)
            successful_files.append(corpus_path)
        except Exception as E:
            print(E)
            failed_files.append(corpus_path)
    return df, successful_files, failed_files


def parse_ids(ids):
    cls, difficulty, collection = None, None, None
    match ids:
        case [cls, '', '']:
            difficulty = None
            collection = None
        case [difficulty, cls, '']:
            collection = None
        case [difficulty, cls, collection]:
            pass
        case _:
            pass

    return [difficulty, cls, collection]


def save_as_parquet(df):
    spark_df = spark.createDataFrame(df)
    spark_df.write.parquet(f'{DATA_FOLDER}/corpus.parquet')

In [12]:
if 'corpus.parquet' in set(os.listdir(DATA_FOLDER)):
    df = spark.read.parquet(f'{DATA_FOLDER}/corpus.parquet')
    corpus_df = df.toPandas()
else:
    corpus_df, successful_files, failed_files = parse_data()
    save_as_parquet(corpus_df)

                                                                                

In [17]:
corpus_df

Unnamed: 0,date,difficulty,class,collection,body
0,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
1,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
2,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
3,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
4,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
...,...,...,...,...,...
53650,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...
53651,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...
53652,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...
53653,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...
