In [None]:
import pandas as pd
from email.parser import BytesParser
from bs4 import BeautifulSoup

import sys
import os
# Make the current user's workspace importable so the `git_repo` package
# imported right below can be resolved; this must run before that import.
# NOTE(review): os.environ.get returns None when USER is unset, which would
# append the literal path '/data/workspace/None' — confirm USER is always set.
username = os.environ.get('USER')
sys.path.append(f'/data/workspace/{username}')

import git_repo.scripts.extract_text_features as etf

In [None]:
def remove_caution_label(text):
    """
    Remove every '[CAUTION: Non-UBC Email]' label from a string.

    The label is removed wherever it occurs in the text. When at least one
    label was removed, leading whitespace of the result is stripped as well.
    Text that does not contain the label is returned unchanged, including
    any leading whitespace it may have.

    Parameters
    ----------
    text : str
        The input text that may contain the caution label.

    Returns
    -------
    str
        The input text with all caution labels removed.

    Examples
    --------
    >>> remove_caution_label('[CAUTION: Non-UBC Email] This is a phishing email.')
    'This is a phishing email.'
    >>> remove_caution_label('Normal email without prefix')
    'Normal email without prefix'
    >>> remove_caution_label('This is an email with [CAUTION: Non-UBC Email] in the middle.')
    'This is an email with  in the middle.'
    """
    label = '[CAUTION: Non-UBC Email]'

    # Guard on presence so label-free text keeps its leading whitespace.
    if label not in text:
        return text

    # Removing a leading label leaves the separator space behind; strip it.
    return text.replace(label, '').lstrip()


In [None]:
# Index of the sampled emails; its `path` column lists the raw message files to parse.
SAMPLE_CSV = '/data/workspace/danishki/git_repo/data/sampled-dataset/sample-small.csv'


def _decode_text(raw, charset):
    """Decode a text payload using its declared charset, without ever raising.

    Falls back to UTF-8 when no charset is declared or the declared one is
    unknown, and replaces undecodable bytes instead of discarding the body.
    """
    if raw is None:
        return ''
    try:
        return raw.decode(charset or 'utf-8', errors='replace')
    except (LookupError, ValueError):
        # Unknown or malformed charset label in the message headers.
        return raw.decode('utf-8', errors='replace')


email_list = pd.read_csv(SAMPLE_CSV)
paths = email_list.path

emails = []      # parsed message objects, one per path
payloads = []    # {content type: decoded payload bytes}, one dict per message
text_html = []   # raw text/html payload, or None when the message has no HTML part
text_plain = []  # message body as str ('' when no usable body was found)
text_clean = []  # body with whitespace collapsed and the caution label removed

for path in paths:
    with open(path, 'rb') as fp:
        msg = BytesParser().parse(fp)
    emails.append(msg)

    payload = {}
    plain_charset = None
    for part in msg.walk():
        part_type = part.get_content_type()
        # A later part with the same content type overwrites an earlier one.
        payload[part_type] = part.get_payload(decode=True)
        if part_type == 'text/plain':
            plain_charset = part.get_content_charset()
    payloads.append(payload)

    html = payload.get('text/html')
    text_html.append(html)

    if 'text/plain' in payload:
        # Honour the declared charset; strict UTF-8 decoding used to blank
        # every message written in another encoding.
        body = _decode_text(payload['text/plain'], plain_charset)
    elif html is not None:
        try:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            body = BeautifulSoup(html, 'html.parser').get_text()
        except Exception as exc:
            # Best effort: keep processing, but make the failure visible.
            print(f'Could not extract text from HTML in {path}: {exc!r}')
            body = ''
    else:
        body = ''
    text_plain.append(body)

    # `body` is always a str here, so the cleaning step cannot fail.
    text_clean.append(remove_caution_label(' '.join(body.split())))

In [None]:
# Assemble one row per email, indexed by the path of its source file.
data_df = pd.DataFrame(
    dict(
        path=paths,
        email=emails,
        payload=payloads,
        text_html=text_html,
        text_plain=text_plain,
        text_clean=text_clean,
    )
).set_index('path')

data_df

#### Check for presence of non-ASCII characters

In [None]:
# Non-ASCII check on the cleaned text; the logic lives in the project feature module.
non_ascii_results = etf.non_ascii_present(data_df['text_clean'])

non_ascii_results

#### Detect presence of hidden text

In [None]:
# Hidden-text detection works on the raw HTML payload column.
hidden_text_results = etf.is_hidden_text_present(data_df['text_html'])

hidden_text_results

#### Check if there are errors when parsing HTML caused by invalid HTML tags

In [None]:
# HTML parsing-error check, run on the raw HTML payload column.
parsing_error_results = etf.html_parsing_error(data_df['text_html'])

parsing_error_results

#### Get word count

In [None]:
# Word count is taken from the cleaned text column.
word_counts = etf.word_count(data_df['text_clean'])

word_counts

#### Get proportion of readable text

$$
\text{Proportion of readable text} = \frac{\text{No. of chars in }\texttt{text\_clean}}{\text{No. of chars in }\texttt{text\_html}}
$$

In [None]:
# Inputs follow the formula above: cleaned text first, raw HTML second.
readable_proportion_results = etf.readable_proportion(
    data_df['text_clean'],
    data_df['text_html'],
)

readable_proportion_results

#### Get whitespace ratio of `text_plain`

The whitespace ratio is a measurement of how much of the text consists of whitespace characters, which can be an indicator of formatting or structural characteristics of the email content. This metric is calculated using the following formula:

$$
\text{Whitespace ratio} = \frac{\text{Number of whitespace characters in text}}{\text{Total number of characters in text}}
$$

Where whitespace characters include spaces, tabs, newlines, and other non-visible formatting characters.

In [None]:
# Uses `text_plain` (not `text_clean`): the cleaned text has its whitespace collapsed.
whitespace_ratio_results = etf.whitespace_ratio(data_df['text_plain'])

whitespace_ratio_results

#### Get proportion of alphabetical characters in `text_clean`

In [None]:
# Alphabetical-character proportion, computed on the cleaned text column.
prop_alphabets_results = etf.alphabet_proportion(data_df['text_clean'])

prop_alphabets_results

#### Get number of grammatical errors

NOTE: This requires the local LanguageTool server to be running (currently only installed at `/data/workspace/danishki`).

```bash
cd /data/workspace/danishki/LanguageTool-6.7-SNAPSHOT
java -cp languagetool-server.jar org.languagetool.server.HTTPServer --config server.properties --port 8081 --allow-origin
```

In [None]:
# `data_df` has no 'Content-Language' column, so indexing it raised KeyError.
# Read that header from each parsed message in the `email` column instead.
# NOTE(review): messages without the header yield None — confirm that
# etf.check_grammar tolerates missing languages.
content_language = (
    data_df['email']
    .map(lambda msg: msg.get('Content-Language'))
    .rename('Content-Language')
)

check_grammar_results = etf.check_grammar(data_df.text_plain, content_language)

check_grammar_results

#### Named entity recognition

In [None]:
# Named entity recognition is run on the cleaned text column.
get_named_entities_results = etf.named_entities(data_df['text_clean'])

get_named_entities_results

#### Get proportion of English/French text

In [None]:
# English/French proportion is computed on the cleaned text column.
text_language_results = etf.english_french_proportion(data_df['text_clean'])

text_language_results