In [None]:
import pandas as pd

import sys
import os
# Make the directory two levels above the working directory importable so the
# `src.*` imports in later cells resolve (presumably the project root — confirm
# against where this notebook lives). The membership guard keeps the cell
# idempotent: re-running it does not append the same entry again.
_project_root = os.path.abspath("../../")
if _project_root not in sys.path:
    sys.path.append(_project_root)

Run the following command in a terminal from the project root directory:

```bash
python scripts/build_original_df.py --dataset sample-small
```

In [None]:
# Load the raw `sample-small` dataset (presumably the file produced by
# `scripts/build_original_df.py --dataset sample-small` — confirm).
# The path is built from the directory two levels above the working directory,
# the same location this notebook adds to sys.path for the `src` imports
# (assumed to be the project root), rather than from a user-specific absolute
# path, so the notebook runs from any checkout.
raw_sample_path = os.path.join(
    os.path.abspath("../../"),
    "data", "sampled-dataset", "raw", "sample-small.parquet",
)

# Keep only the first 30 rows; chaining the slice onto the read makes the cell
# a single assignment with no self-reassignment.
original_df = pd.read_parquet(raw_sample_path).iloc[:30]

original_df

In [None]:
# Load the processed `sample-small` dataset.
# The path is built from the directory two levels above the working directory,
# the same location this notebook adds to sys.path for the `src` imports
# (assumed to be the project root), rather than from a user-specific absolute
# path, so the notebook runs from any checkout.
processed_sample_path = os.path.join(
    os.path.abspath("../../"),
    "data", "sampled-dataset", "processed", "sample-small.parquet",
)
input_df_classical = pd.read_parquet(processed_sample_path)

input_df_classical

In [None]:
from src.extract_header_features import (
    has_dmarc_authentication, get_dkim_result,
    get_spf_result, get_dmarc_result,
    dkim_domain_matches_sender, has_attachment,
    number_of_received, to_from_match,
    spf_email_matches_sender
)

from src.extract_text_features import (
    non_ascii_present, hidden_text_present, html_parsing_error,
    word_count, readable_proportion, whitespace_ratio,
    alphabet_proportion, check_grammar, english_french_proportion
)

# Columns produced by the extractors imported from src.extract_header_features.
header_feature_columns = {
    "dmarc_authentication_present": has_dmarc_authentication(
        original_df['Authentication-Results']
    ),
    "dkim_result": get_dkim_result(
        original_df['Authentication-Results']
    ),
    "spf_result": get_spf_result(
        original_df['received-spf']
    ),
    "dmarc_result": get_dmarc_result(
        original_df['Authentication-Results']
    ),
    "dkim_sender_domains_match": dkim_domain_matches_sender(
        original_df['DKIM-Signature'], original_df['From_email_domain']
    ),
    "attachments_present": has_attachment(
        original_df['attachment_types']
    ),
    "routing_length": number_of_received(
        original_df['Received']
    ),
    "to_from_addresses_match": to_from_match(
        original_df['From_email'], original_df['To_email']
    ),
    "sender_email_spf_match": spf_email_matches_sender(
        original_df['received-spf'], original_df['From_email']
    ),
}

# Columns produced by the extractors imported from src.extract_text_features.
text_feature_columns = {
    "non_ascii_present": non_ascii_present(
        original_df['text_clean']
    ),
    "hidden_text_present": hidden_text_present(
        original_df['text_html']
    ),
    "html_parsing_error": html_parsing_error(
        original_df['text_html']
    ),
    "word_count": word_count(
        original_df['text_clean']
    ),
    "readable_proportion": readable_proportion(
        original_df['text_clean'], original_df['text_html']
    ),
    "whitespace_ratio": whitespace_ratio(
        original_df['text_plain']
    ),
    "alphabet_proportion": alphabet_proportion(
        original_df['text_clean']
    ),
    "grammar_error_rate": check_grammar(
        original_df['text_plain'], original_df['Content-Language']
    ),
    "english_french_proportion": english_french_proportion(
        original_df['text_plain']
    ),
}

# Header columns first, then text columns: the merged mapping keeps the same
# column order as a single dict literal would.
features_df = pd.DataFrame({**header_feature_columns, **text_feature_columns})

features_df

In [None]:
import pandera as pa

# Expected dtype and value constraints for every column of `features_df`.
schema_features_df = pa.DataFrameSchema(
    {
        # Columns built with the src.extract_header_features extractors.
        "dmarc_authentication_present": pa.Column(bool),
        "dkim_result": pa.Column(str),
        "spf_result": pa.Column(str),
        "dmarc_result": pa.Column(str),
        "dkim_sender_domains_match": pa.Column(bool),
        "attachments_present": pa.Column(bool),
        "routing_length": pa.Column(int),
        "to_from_addresses_match": pa.Column(bool),
        "sender_email_spf_match": pa.Column(bool),

        # Columns built with the src.extract_text_features extractors.
        "non_ascii_present": pa.Column(bool),
        "hidden_text_present": pa.Column(bool),
        # Only the three codes -1, 0 and 1 are accepted.
        "html_parsing_error": pa.Column(int, pa.Check.isin([-1, 0, 1])),
        # Built-in `ge` check rather than a lambda: same constraint (>= 0), but
        # it is reported by name in validation errors and can be serialized.
        "word_count": pa.Column(int, pa.Check.ge(0)),
        # Every ratio/proportion column must lie in the closed interval [0, 1].
        "readable_proportion": pa.Column(float, pa.Check.in_range(0, 1)),
        "whitespace_ratio": pa.Column(float, pa.Check.in_range(0, 1)),
        "alphabet_proportion": pa.Column(float, pa.Check.in_range(0, 1)),
        "grammar_error_rate": pa.Column(float, pa.Check.in_range(0, 1)),
        "english_french_proportion": pa.Column(float, pa.Check.in_range(0, 1)),

    }
)

# Raises a pandera schema error if any column violates its constraints;
# otherwise the validated frame is returned and shown as the cell output.
schema_features_df.validate(features_df)

In [None]:
from src.extract_url_features import (
    get_url_count, has_accessible_url, has_redirected_url,
    has_ip_url, has_http_only, has_at_symbol,
    has_port_number, has_long_url, has_multiple_subdomains
)


# Every URL extractor is applied to the same `urls` column, so pair each output
# column name with its extractor and build the frame in one pass. The list
# order fixes both the call order and the resulting column order.
url_feature_extractors = [
    ("url_count", get_url_count),
    ("all_urls_accessible", has_accessible_url),
    ("urls_redirected", has_redirected_url),
    ("ip_addr_urls", has_ip_url),
    ("http_urls_present", has_http_only),
    ("url_at_symbol", has_at_symbol),
    ("url_port_number", has_port_number),
    ("any_long_urls", has_long_url),
    ("url_multiple_subdomains", has_multiple_subdomains),
]

features_df_2 = pd.DataFrame({
    column_name: extractor(original_df['urls'])
    for column_name, extractor in url_feature_extractors
})

features_df_2