In [None]:
import sys
import os 
# Make the `src` directory that sits two levels above the current working
# directory importable. The path is resolved relative to the working directory,
# so the kernel must be started from the directory this layout assumes.
sys.path.append(os.path.join(os.path.abspath("../../"), "src"))

from hashlib import sha1
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC


import altair as alt
# Use the VegaFusion data transformer for every chart in this notebook.
# NOTE(review): presumably enabled so charts built on the full training frame
# are not rejected by Altair's default row limit -- confirm.
alt.data_transformers.enable('vegafusion')

# NOTE(review): presumably resolved through the `src` directory appended to
# sys.path at the top of this cell -- confirm the module lives there.
from extract_text_keywords import preprocess_text

In [None]:
# Locations of the raw and processed training splits.
raw_train_path = '/data/workspace/dataset/full-dataset/raw/train.parquet'
processed_train_path = '/data/workspace/dataset/full-dataset/processed/train.parquet'

# `original_df` holds the raw split (source of the target columns),
# `input_df` holds the processed split that the rest of the notebook works on.
original_df = pd.read_parquet(raw_train_path)
input_df = pd.read_parquet(processed_train_path)

# Disabled: text features (subject / body keywords). Kept for reference.
# input_df = input_df.join(original_df[['Subject', 'text_preprocessed']])
# input_df['subject_preprocessed'] = preprocess_text(input_df['Subject'].fillna(""))

# input_df['text_preprocessed'] = input_df['text_preprocessed'].fillna("")
# input_df['subject_preprocessed'] = input_df['subject_preprocessed'].fillna("")

# Attach the target columns from the raw split; the join aligns rows on the index.
target_cols = ['target_1', 'target_3']
input_df = input_df.join(original_df[target_cols])

### Drop self-phishing samples

In [None]:
# Keep every row whose `target_3` value is anything other than 'self_phishing'.
is_not_self_phishing = input_df['target_3'].ne('self_phishing')
input_df = input_df.loc[is_not_self_phishing]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(input_df, test_size=0.3, random_state=42)

# Both target columns were joined onto the end of `input_df`, so slicing off only
# the last column (`iloc[:, :-1]`) removed `target_3` but left `target_1` -- the
# label being predicted -- inside the feature matrix. Drop the target columns by
# name so the features never contain the label, whatever the column order is.
label_cols = ['target_1', 'target_3']

X_train = train_df.drop(columns=label_cols)
y_train = train_df['target_1']
X_test = test_df.drop(columns=label_cols)
y_test = test_df['target_1']

In [None]:
# Show which columns of the training split have a numeric dtype.
train_df.select_dtypes(include='number').columns

## EDA

## Code below was run with self-phishing included

In [None]:
# Every numeric column of the training split gets its own density plot.
quantitative_cols = train_df.select_dtypes(include='number').columns.tolist()

# One chart per column: a density estimate computed separately for each
# `target_1` value, drawn as semi-transparent areas with stacking disabled.
charts = [
    alt.Chart(train_df)
    .transform_density(
        numeric_col,
        groupby=['target_1'],
        as_=[numeric_col, 'density'],
    )
    .mark_area(opacity=0.4)
    .encode(
        x=numeric_col,
        y=alt.Y('density:Q').stack(None),
        color=alt.Color(
            'target_1:N',
            scale=alt.Scale(range=['#1f77b4', '#ff7f0e']),
        ),
    )
    .properties(height=300, width=300)
    .interactive()
    for numeric_col in quantitative_cols
]

# Lay the charts out side by side and display the result.
final_chart_quant = alt.hconcat(*charts)
final_chart_quant


## Code below was run with self-phishing excluded

In [None]:
# Numeric columns of the training split; each one is plotted below.
quantitative_cols = train_df.select_dtypes(include='number').columns.tolist()

# Build one per-class density chart for each numeric column. The areas are
# semi-transparent and stacking is disabled, so the classes are drawn over
# one another rather than on top of each other.
charts = [
    alt.Chart(train_df)
    .transform_density(
        numeric_col,
        groupby=['target_1'],
        as_=[numeric_col, 'density'],
    )
    .mark_area(opacity=0.4)
    .encode(
        x=numeric_col,
        y=alt.Y('density:Q').stack(None),
        color=alt.Color(
            'target_1:N',
            scale=alt.Scale(range=['#1f77b4', '#ff7f0e']),
        ),
    )
    .properties(height=300, width=300)
    .interactive()
    for numeric_col in quantitative_cols
]

# Concatenate horizontally and display.
final_chart_quant = alt.hconcat(*charts)
final_chart_quant

## Code below was run with self-phishing included

In [None]:
# Columns plotted as categories in the per-class proportion charts.
categorical_cols = [
    'dmarc_authentication_present',
    'dkim_result',
    'spf_result',
    'dmarc_result',
    'dkim_sender_domains_match',
    'attachments_present',
    'routing_length',
    'to_from_addresses_match',
    'sender_email_spf_match',
    'non_ascii_present',
    'hidden_text_present',
    'html_parsing_error',
    'all_urls_accessible',
    'urls_redirected',
    'ip_addr_urls',
    'http_urls_present',
    'url_at_symbol',
    'url_port_number',
    'any_long_urls',
    'url_multiple_subdomains',
]

# For each column: count rows per (category, class), divide each count by the
# total of its class so proportions are normalised within a class, then draw
# the bars in one facet per class.
charts = [
    alt.Chart(train_df)
    .transform_aggregate(count='count()', groupby=[cat_col, 'target_1'])
    .transform_joinaggregate(total='sum(count)', groupby=['target_1'])
    .transform_calculate(proportion='datum.count / datum.total')
    .mark_bar()
    .encode(
        x=alt.X(f'{cat_col}:N', title=cat_col),
        y=alt.Y('proportion:Q', axis=alt.Axis(format='%')),
        color=alt.Color(
            'target_1:N',
            scale=alt.Scale(range=['#1f77b4', '#ff7f0e']),
        ),
        column=alt.Column('target_1:N', title='Class'),
    )
    .properties(width=150, height=300)
    for cat_col in categorical_cols
]

# Lay the faceted charts out side by side and display the result.
final_chart_categorical = alt.hconcat(*charts)
final_chart_categorical

## Code below was run with self-phishing excluded

In [None]:
# Per-class proportion charts for every column in `categorical_cols`.
# Counts are taken per (category, class) pair and divided by the class total,
# so each class is normalised on its own; bars are faceted by class.
charts = [
    alt.Chart(train_df)
    .transform_aggregate(count='count()', groupby=[cat_col, 'target_1'])
    .transform_joinaggregate(total='sum(count)', groupby=['target_1'])
    .transform_calculate(proportion='datum.count / datum.total')
    .mark_bar()
    .encode(
        x=alt.X(f'{cat_col}:N', title=cat_col),
        y=alt.Y('proportion:Q', axis=alt.Axis(format='%')),
        color=alt.Color(
            'target_1:N',
            scale=alt.Scale(range=['#1f77b4', '#ff7f0e']),
        ),
        column=alt.Column('target_1:N', title='Class'),
    )
    .properties(width=150, height=300)
    for cat_col in categorical_cols
]

# Concatenate horizontally and display.
final_chart_categorical = alt.hconcat(*charts)
final_chart_categorical