# Parsing scans example

This notebook demonstrates the parsing tools provided by this package and is not intended to prescribe a specific parsing workflow.


### Load scans dataframe with RAW column

In [1]:
import pandas as pd

SCANS_EXAMPLES_PATH = "scans_example.csv"

# Read the example CSV using its first column as the index, then keep only
# the "RAW" column as a Series.
scans_df = pd.read_csv(SCANS_EXAMPLES_PATH, index_col=0)
scans_raws = scans_df["RAW"]
scans_raws

0    {\n  "children": [],\n  "children_finished_at"...
1    {\n  "children": [],\n  "children_finished_at"...
2    {\n  "children": [],\n  "children_finished_at"...
3    {\n  "children": [],\n  "children_finished_at"...
4    {\n  "children": [],\n  "children_finished_at"...
5    {\n  "children": [],\n  "children_finished_at"...
6    {\n  "children": [],\n  "children_finished_at"...
7    {\n  "children": [],\n  "children_finished_at"...
8    {\n  "children": [],\n  "children_finished_at"...
9    {\n  "attachments_names": [\n    "audio.mp3"\n...
Name: RAW, dtype: object

### Load Attributes, Evidences, and Decisions from Scans
We will also add the evidences and decisions as columns to the features DataFrame.

Note:
     There are no decisions in the given scans; therefore, we will ignore this column in the subsequent steps.

In [2]:
import DetectionUtilities.feature_extractor.scans_raw_parser as raw_parser

# Attributes requested from the parser for every raw scan.
ATTRIBUTES_TO_EXTRACT = [
    "subject",
    "cc_addresses",
    "delivery_time",
    "main_address_domain_spf_verdict",
]

features = raw_parser.extract_attributes_df(
    scans_raws,
    valid_attributes=ATTRIBUTES_TO_EXTRACT,
)

# Evidence and decision names are extracted from the same raw scans and
# attached to the features DataFrame as extra columns.
evidences = raw_parser.extract_evidences_names(scans_raws)
decisions = raw_parser.extract_decisions_names(scans_raws)

features["evidences"] = evidences
features["decisions"] = decisions
features

Unnamed: 0,cc_addresses,delivery_time,main_address_domain_spf_verdict,subject,evidences,decisions
0,"Jon DiVaio <Jon.DiVaio@redbull.com>,\r\n Joe D...",2024-05-16T17:44:28.834063,PASS,EXT: Re: Red Bull Cliff Diving Project Check in,"[skip_scan_duplicated_emails, skip_scan_duplic...",
1,,2024-05-20T12:15:08.044878,PASS,EXT: Approval required for Sanam Chawla's basket,[],
2,,2024-05-20T13:55:31.693020,PASS,Completed: TPD__242668-RED BULL ENERGY DRINK 2...,[],
3,,2024-05-20T13:57:28.002516,PASS,Completed: TPD__242668-RED BULL ENERGY DRINK 2...,[],
4,"Cedric De Sousa <Cedric.DESOUSA@redbull.com>,\...",2024-05-14T05:47:40.247231,PASS,EXT: RE: Pickings & Commandes Lagny - Point 13...,[],
5,,2024-05-20T09:05:05.352768,PASS,EXT: Current mileage needed for vehicle 34FKN481,[skip_scan_sender_clean],
6,,2024-05-18T16:05:40.119104,PASS,EXT: Accepterad: Ella Naturhistoriska,[],
7,,2024-05-17T01:15:47.364627,PASS,Executive Flight Book: SWE_MPU_CRE,[],
8,,2024-05-17T01:15:47.364627,PASS,Executive Flight Book: SWE_MPU_CRE,[],
9,,2024-05-17T10:44:10.828816,"('NONE', 'NONE')",Voice Mail (12 seconds),"[new_sender_vector, organization_sender_counte...",


## Encoding Demonstration:

### Encoding using metadata:
Encode the features dataframe using a metadata dataframe.

#### The metadata:

In [3]:
# Feature column -> comma-separated encoder specification for that column.
encoders_by_column = {
    "cc_addresses": "LengthOfList",
    "delivery_time": "DayOfWeek, Hour",
    "subject": "IsEmpty, HasLabels,HasEmoji,LengthOfString",
    "main_address_domain_spf_verdict": "CategoryOneHot(categories=[“ERROR”,”FAIL”,”PASS”])",
    "evidences": "CategoryOneHot",
}

# One metadata row per feature column, in the order declared above.
encoding_metadata = pd.DataFrame(
    list(encoders_by_column.items()),
    columns=["column_name", "encoders"],
)
encoding_metadata

Unnamed: 0,column_name,encoders
0,cc_addresses,LengthOfList
1,delivery_time,"DayOfWeek, Hour"
2,subject,"IsEmpty, HasLabels,HasEmoji,LengthOfString"
3,main_address_domain_spf_verdict,"CategoryOneHot(categories=[“ERROR”,”FAIL”,”PAS..."
4,evidences,CategoryOneHot


#### Create encoders from metadata


In [4]:
from DetectionUtilities.feature_extractor.encoders.encoder_factory import (
    build_encoders_dict_from_metadata,
    run_encoders_from_metadata,
)

# Build the encoder objects for every column listed in the metadata
col2encoders = build_encoders_dict_from_metadata(
    encoding_metadata,
    "column_name",
    "encoders",
)
col2encoders

{'cc_addresses': [LengthOfList(mean=0, std=1)],
 'delivery_time': [DayOfWeek, Hour],
 'subject': [IsEmpty(empty_value_regex=^[\s]*$),
  HasLabels(additional_regex=[]),
  HasEmoji,
  LengthOfString(mean=0, std=1)],
 'main_address_domain_spf_verdict': [CategoryOneHot(categories=['ERROR', 'FAIL', 'PASS'],invalid_category_repre=other,valid_category_regex=None)],
 'evidences': [CategoryOneHot(categories=None,invalid_category_repre=other,valid_category_regex=None)]}

#### Encode features

In [5]:
# Run the encoders described in the metadata on the features DataFrame,
# without normalization.
encoded_data = run_encoders_from_metadata(
    features,
    encoding_metadata,
    "column_name",
    "encoders",
)
encoded_data



Unnamed: 0,"cc_addresses|LengthOfList(mean=0, std=1)",delivery_time|DayOfWeek|sin,delivery_time|DayOfWeek|cos,delivery_time|Hour|sin,delivery_time|Hour|cos,"main_address_domain_spf_verdict|CategoryOneHot(categories=['ERROR', 'FAIL', 'PASS'],invalid_category_repre=other,valid_category_regex=None)|PASS","main_address_domain_spf_verdict|CategoryOneHot(categories=['ERROR', 'FAIL', 'PASS'],invalid_category_repre=other,valid_category_regex=None)|other",subject|IsEmpty(empty_value_regex=^[\s]*$),subject|HasLabels(additional_regex=[]),subject|HasEmoji,"subject|LengthOfString(mean=0, std=1)","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|new_sender_vector","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|organization_sender_counter","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|other","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|skip_scan_duplicated_emails","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), 
np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|skip_scan_sender_clean","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|spf_return_path_address_internal_fail",decisions
0,22,0.433884,-0.900969,-0.9659258,-0.258819,1,0,False,True,False,47,0,0,0,1,0,0,
1,1,0.0,1.0,1.224647e-16,-1.0,1,0,False,True,False,48,0,0,1,0,0,0,
2,1,0.0,1.0,-0.258819,-0.965926,1,0,False,True,False,52,0,0,1,0,0,0,
3,1,0.0,1.0,-0.258819,-0.965926,1,0,False,True,False,52,0,0,1,0,0,0,
4,13,0.781831,0.62349,0.9659258,0.258819,1,0,False,True,False,55,0,0,1,0,0,0,
5,1,0.0,1.0,0.7071068,-0.707107,1,0,False,True,False,48,0,0,0,0,1,0,
6,1,-0.974928,-0.222521,-0.8660254,-0.5,1,0,False,True,False,38,0,0,1,0,0,0,
7,1,-0.433884,-0.900969,0.258819,0.965926,1,0,False,False,False,34,0,0,1,0,0,0,
8,1,-0.433884,-0.900969,0.258819,0.965926,1,0,False,False,False,34,0,0,1,0,0,0,
9,1,-0.433884,-0.900969,0.5,-0.866025,0,1,False,False,False,23,1,1,0,0,0,1,


#### Encode and Normalize the data 
By specifying `fit_and_normalize=True`, each column will also be normalized according to its mean and standard deviation values (if the corresponding encoder allows it).

Note that the normalization parameters in the column names have been updated and should be used for normalization during inference.

In [6]:
# Same encoding run as before, this time with fit_and_normalize enabled.
encoded_data = run_encoders_from_metadata(
    features,
    encoding_metadata,
    "column_name",
    "encoders",
    fit_and_normalize=True,
)
encoded_data



Unnamed: 0,"cc_addresses|LengthOfList(mean=4.3, std=7.273238618387272)",delivery_time|DayOfWeek|sin,delivery_time|DayOfWeek|cos,delivery_time|Hour|sin,delivery_time|Hour|cos,"main_address_domain_spf_verdict|CategoryOneHot(categories=['ERROR', 'FAIL', 'PASS'],invalid_category_repre=other,valid_category_regex=None)|PASS","main_address_domain_spf_verdict|CategoryOneHot(categories=['ERROR', 'FAIL', 'PASS'],invalid_category_repre=other,valid_category_regex=None)|other",subject|IsEmpty(empty_value_regex=^[\s]*$),subject|HasLabels(additional_regex=[]),subject|HasEmoji,"subject|LengthOfString(mean=43.1, std=10.32203683603408)","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|new_sender_vector","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|organization_sender_counter","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|other","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|skip_scan_duplicated_emails","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), 
np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|skip_scan_sender_clean","evidences|CategoryOneHot(categories=[np.str_('new_sender_vector'), np.str_('organization_sender_counter'), np.str_('skip_scan_duplicated_emails'), np.str_('skip_scan_sender_clean'), np.str_('spf_return_path_address_internal_fail')],invalid_category_repre=other,valid_category_regex=None)|spf_return_path_address_internal_fail",decisions
0,2.433579,0.433884,-0.900969,-0.9659258,-0.258819,1.0,0.0,0.0,1.0,0.0,0.377832,0.0,0.0,0.0,1.0,0.0,0.0,
1,-0.453718,0.0,1.0,1.224647e-16,-1.0,1.0,0.0,0.0,1.0,0.0,0.474713,0.0,0.0,1.0,0.0,0.0,0.0,
2,-0.453718,0.0,1.0,-0.258819,-0.965926,1.0,0.0,0.0,1.0,0.0,0.862233,0.0,0.0,1.0,0.0,0.0,0.0,
3,-0.453718,0.0,1.0,-0.258819,-0.965926,1.0,0.0,0.0,1.0,0.0,0.862233,0.0,0.0,1.0,0.0,0.0,0.0,
4,1.196166,0.781831,0.62349,0.9659258,0.258819,1.0,0.0,0.0,1.0,0.0,1.152873,0.0,0.0,1.0,0.0,0.0,0.0,
5,-0.453718,0.0,1.0,0.7071068,-0.707107,1.0,0.0,0.0,1.0,0.0,0.474713,0.0,0.0,0.0,0.0,1.0,0.0,
6,-0.453718,-0.974928,-0.222521,-0.8660254,-0.5,1.0,0.0,0.0,1.0,0.0,-0.494089,0.0,0.0,1.0,0.0,0.0,0.0,
7,-0.453718,-0.433884,-0.900969,0.258819,0.965926,1.0,0.0,0.0,0.0,0.0,-0.881609,0.0,0.0,1.0,0.0,0.0,0.0,
8,-0.453718,-0.433884,-0.900969,0.258819,0.965926,1.0,0.0,0.0,0.0,0.0,-0.881609,0.0,0.0,1.0,0.0,0.0,0.0,
9,-0.453718,-0.433884,-0.900969,0.5,-0.866025,0.0,1.0,0.0,0.0,0.0,-1.94729,1.0,1.0,0.0,0.0,0.0,1.0,


### Encoding using encoders:
Now we generate encoders from their string representation and apply them manually to the `subject` column.

In [13]:
from DetectionUtilities.feature_extractor.encoders.encoder_factory import get_encoder_from_str

# Comma-separated encoder specification, parsed into encoder objects below.
encoders_str = "IsEmpty, HasLabels,HasEmoji,LengthOfString"

encoders = get_encoder_from_str(encoders_str)
subject_col = features[["subject"]]

# Keep the raw subject column first so the encoded columns can be read
# side by side with the text they were derived from.
result = [subject_col]
for encoder in encoders:
    encoder.fit(features["subject"])
    encoded_df = encoder(features["subject"])
    # Prefix every encoded column with the name of its source column.
    renamed_cols = {col: "subject|" + col for col in encoded_df.columns}
    # Reassign instead of renaming in place, so the object returned by the
    # encoder is never mutated.
    encoded_df = encoded_df.rename(columns=renamed_cols)
    # NOTE(review): fit() above and fit_and_normalize() here may both fit the
    # encoder - confirm whether the explicit fit() call is still required.
    encoded_df = encoder.fit_and_normalize(encoded_df)
    result.append(encoded_df)

pd.concat(result, axis=1)


Unnamed: 0,subject,subject|IsEmpty(empty_value_regex=^[\s]*$),subject|HasLabels(additional_regex=[]),subject|HasEmoji,"subject|LengthOfString(mean=43.1, std=10.32203683603408)"
0,EXT: Re: Red Bull Cliff Diving Project Check in,0.0,1.0,0.0,0.377832
1,EXT: Approval required for Sanam Chawla's basket,0.0,1.0,0.0,0.474713
2,Completed: TPD__242668-RED BULL ENERGY DRINK 2...,0.0,1.0,0.0,0.862233
3,Completed: TPD__242668-RED BULL ENERGY DRINK 2...,0.0,1.0,0.0,0.862233
4,EXT: RE: Pickings & Commandes Lagny - Point 13...,0.0,1.0,0.0,1.152873
5,EXT: Current mileage needed for vehicle 34FKN481,0.0,1.0,0.0,0.474713
6,EXT: Accepterad: Ella Naturhistoriska,0.0,1.0,0.0,-0.494089
7,Executive Flight Book: SWE_MPU_CRE,0.0,0.0,0.0,-0.881609
8,Executive Flight Book: SWE_MPU_CRE,0.0,0.0,0.0,-0.881609
9,Voice Mail (12 seconds),0.0,0.0,0.0,-1.94729
