# Prepare Dataset
## Task 1: Index entire dataset (only consider eml files)

Prepare two files: train.csv (70%), test.csv (30%) - ensure proportionate representation
- Save files to repo: data/full-dataset
- Structure of each file:
  - path (/data/dataset/...); target_1 (binary label); target_2 (category); target_3 (subcategory)

In [None]:
import pandas as pd
import pytest
from pathlib import Path
from sklearn.model_selection import train_test_split

In [None]:
# Mapping of dataset subfolder name -> (subcategory, category, binary label).
# The tuple elements are written to the target_3, target_2 and target_1
# columns respectively.
# NOTE(review): some keys use an en dash ("–") instead of a hyphen ("-");
# confirm they match the folder names on disk, because a key that matches no
# folder contributes no rows and raises no error.
# NOTE(review): "outbound" is the subcategory of both "Phishing_-_Outbound"
# and "Spam_–_Outbound", so target_3 on its own does not identify the
# source folder.
label_map = {
    # Malicious
    "CEO_Fraud_-_Gift_Cards": ("gift_cards", "ceo_fraud", "malicious"),
    "CEO_Fraud_-_Payroll_Update": ("payroll_update", "ceo_fraud", "malicious"),
    "CEO_Fraud_-_Wire_Transfers": ("wire_transfers", "ceo_fraud", "malicious"),
    "Phishing_-_3rd_Party": ("third_party", "phishing", "malicious"),
    "Phishing_-_Outbound": ("outbound", "phishing", "malicious"),
    "Phishing_–_UBC": ("ubc", "phishing", "malicious"),
    "Phishing_UBC_-_Outbound": ("ubc_outbound", "phishing", "malicious"),
    "Self-Phishing": ("self_phishing", "phishing", "malicious"),
    "Spearphishing": ("spearphishing", "phishing", "malicious"),
    "Reply_Chain_Attack": ("reply-chain-attack", "reply-chain-attack", "malicious"),

    # Benign
    "Legitimate_Email_Confirmed": ("legitimate_email_confirmed", "legitimate", "benign"),
    "Spam_-_False_Positives": ("spam_false_positive", "legitimate", "benign"),
    "Spam_–_Inbound": ("inbound", "spam", "benign"),
    "Spam_–_Outbound": ("outbound", "spam", "benign"),
}

dataset_root = Path("/data/dataset")

# Collect all .eml file entries
rows = []
for subfolder, (subcategory, category, binary_label) in label_map.items():
    # rglob yields entries in filesystem order, which differs between
    # machines; sorting keeps the row order -- and therefore the seeded
    # splits built from it -- reproducible.
    eml_files = sorted((dataset_root / subfolder).rglob("*.eml"))
    for eml in eml_files:
        rel_path = eml.relative_to("/")
        rows.append({
            "path": f"/{rel_path.as_posix()}",
            "target_1": binary_label,
            "target_2": category,
            "target_3": subcategory
        })

# Build full DataFrame. Explicit columns keep the schema stable even when no
# .eml files were found (otherwise the frame would have no columns at all).
df = pd.DataFrame(rows, columns=["path", "target_1", "target_2", "target_3"])

In [None]:
# Preview the first rows of the indexed dataset
df.head()

Split the data into 70% train and 30% test

In [None]:
# Stratified 70/30 split.
# Stratify on the combined (target_1, target_2, target_3) label rather than
# target_3 alone: subcategory names are not unique across categories (e.g.
# "outbound" is used under both phishing and spam), so target_3-only strata
# would not guarantee that the binary and category proportions are preserved.
strata = df["target_1"] + "|" + df["target_2"] + "|" + df["target_3"]
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    stratify=strata,
    random_state=42,  # fixed seed so the split is repeatable
)

# Create output folder
output_dir = Path("./data/full-dataset")
output_dir.mkdir(parents=True, exist_ok=True)

# Save to CSV (index dropped so the files contain only path + target columns)
train_df.to_csv(output_dir / "train.csv", index=False)
test_df.to_csv(output_dir / "test.csv", index=False)

## Task 2: Downsample train.csv

Prepare two files: sample-small.csv (100-200 samples), sample-large.csv (2000-3000 samples)
- Save files to repo: data/sampled-dataset

In [None]:
# Downsample train_df with stratification on the combined
# (target_1, target_2, target_3) label. target_3 alone is ambiguous
# ("outbound" is a subcategory of both phishing and spam), so stratifying on
# it alone would not keep the binary/category proportions in the samples.
train_strata = (
    train_df["target_1"] + "|" + train_df["target_2"] + "|" + train_df["target_3"]
)

# Sample small (200 rows); the discarded remainder is not needed
sample_small, _ = train_test_split(
    train_df,
    train_size=200,
    stratify=train_strata,
    random_state=42
)

# Sample large (3000 rows)
sample_large, _ = train_test_split(
    train_df,
    train_size=3000,
    stratify=train_strata,
    random_state=42
)

In [None]:
# Number of rows in the small sample
len(sample_small)

In [None]:
# Number of rows in the large sample
len(sample_large)

In [None]:
# Destination folder for the downsampled CSVs (created if missing)
output_dir = Path("./data/sampled-dataset")
output_dir.mkdir(parents=True, exist_ok=True)

# Write each sample without the DataFrame index
for filename, frame in (
    ("sample-small.csv", sample_small),
    ("sample-large.csv", sample_large),
):
    frame.to_csv(output_dir / filename, index=False)

## Test Suite

- Ensure the class proportions of the train/test split follow the class proportions of the original dataset
- Ensure the class proportions of the downsampled sets follow the class proportions of the original dataset

In [None]:
def test_train_test_split_proportions():
    """Assert the train and test splits keep the full dataset's class mix.

    For each of target_1, target_2 and target_3, the normalized class
    frequencies of ``train_df`` and ``test_df`` must equal those of ``df``
    to within an absolute tolerance of 0.01. A class that is absent from a
    split counts as a proportion of 0.
    """
    for target in ('target_1', 'target_2', 'target_3'):
        # Per-class share of rows in the full dataset and in each split
        full_share = df[target].value_counts(normalize=True).sort_index()
        train_share = train_df[target].value_counts(normalize=True).sort_index()
        test_share = test_df[target].value_counts(normalize=True).sort_index()

        # Re-index all three onto one common label set so the arrays line up
        labels = full_share.index.union(train_share.index).union(test_share.index)
        expected = full_share.reindex(labels, fill_value=0).values
        train_actual = train_share.reindex(labels, fill_value=0).values
        test_actual = test_share.reindex(labels, fill_value=0).values

        # Element-wise approximate comparison of the proportion vectors
        assert train_actual == pytest.approx(expected, abs=0.01), f"Train {target} proportions do not match original {target} proportions"
        assert test_actual == pytest.approx(expected, abs=0.01), f"Test {target} proportions do not match original {target} proportions"

In [None]:
def test_downsample_proportions():
    """Assert both downsampled sets keep the full dataset's class mix.

    For each of target_1, target_2 and target_3, the normalized class
    frequencies of ``sample_small`` and ``sample_large`` must equal those of
    ``df`` to within an absolute tolerance of 0.01. A class that is absent
    from a sample counts as a proportion of 0.
    """
    for target in ('target_1', 'target_2', 'target_3'):
        # Per-class share of rows in the full dataset and in each sample
        full_share = df[target].value_counts(normalize=True).sort_index()
        small_share = sample_small[target].value_counts(normalize=True).sort_index()
        large_share = sample_large[target].value_counts(normalize=True).sort_index()

        # Re-index all three onto one common label set so the arrays line up
        labels = full_share.index.union(small_share.index).union(large_share.index)
        expected = full_share.reindex(labels, fill_value=0).values
        small_actual = small_share.reindex(labels, fill_value=0).values
        large_actual = large_share.reindex(labels, fill_value=0).values

        # Element-wise approximate comparison of the proportion vectors
        assert small_actual == pytest.approx(expected, abs=0.01), f"Small sample {target} proportions do not match original {target} proportions"
        assert large_actual == pytest.approx(expected, abs=0.01), f"Large sample {target} proportions do not match original {target} proportions"

In [None]:
# Run both proportion checks inline; each raises AssertionError on the
# first target whose proportions fall outside the tolerance.
test_train_test_split_proportions()
test_downsample_proportions()
