# PREPROCESS

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
DATA_DIR = Path('../data')
COLLECTED_DIR = DATA_DIR / 'collected' / 'csv_scraped'
PROCESSED_DIR = DATA_DIR / 'processed'
PHISH_DATA = COLLECTED_DIR / 'phished.csv'
LEGIT_DATA = COLLECTED_DIR / 'valid.csv'

In [3]:
df_phish = pd.read_csv(PHISH_DATA)
df_valid = pd.read_csv(LEGIT_DATA)

# Label

In [4]:
df_phish['result'] = -1
df_valid['result'] = 1

# Stats

In [5]:
from tabulate import tabulate

def create_table(data, headers):
    print(tabulate(data, headers=headers, tablefmt="fancy_grid"))

In [6]:
def statistics(df):
    metrics = []
    data = []
    for col in df.columns:
        if col in {'id', 'timestamp', 'open_rank_domain', 'page_rank_integer', 'page_rank_decimal', 'website_url', 'sim_lev', 'sim_fuzz', 'num_of_subdomains', 'len_of_subdomains', 'result'}:
            continue
        
        value_counts = df[col].value_counts().reindex([-1, 0, 1], fill_value=0)
        phishing = value_counts.get(-1, 0)
        suspicious = value_counts.get(0, 0)
        legitimate = value_counts.get(1, 0)
        
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df['result'] = pd.to_numeric(df['result'], errors='coerce')
        
        TP = ((df[col] == -1) & (df['result'] == -1)).sum()
        FP = (((df[col] == -1) | (df[col] == 0)) & (df['result'] == 1)).sum()
        TN = ((df[col] == 1) & (df['result'] == 1)).sum()
        FN = (((df[col] == 1) | (df[col] == 0)) & (df['result'] == -1)).sum()
        
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        accuracy = (TP + TN) / (TP + FP + TN + FN) if (TP + FP + TN + FN) > 0 else 0
        metrics.append({
            'Feature': col,
            'Precision': precision,
            'Recall': recall,
            'Accuracy': accuracy
        })
        data.append([col, phishing, suspicious, legitimate, precision, recall, accuracy])


    headers = ["Feature","Phishing (-1)", "Suspicious (0)","Legitimate (1)", "Precision (%)", "Recall (%)", "Accuracy (%)"]
    create_table(data, headers)

## Phished Stats

In [7]:
statistics(df_phish)

╒════════════════════════════╤═════════════════╤══════════════════╤══════════════════╤═════════════════╤══════════════╤════════════════╕
│ Feature                    │   Phishing (-1) │   Suspicious (0) │   Legitimate (1) │   Precision (%) │   Recall (%) │   Accuracy (%) │
╞════════════════════════════╪═════════════════╪══════════════════╪══════════════════╪═════════════════╪══════════════╪════════════════╡
│ having_ip_address          │               0 │                0 │             2110 │               0 │   0          │     0          │
├────────────────────────────┼─────────────────┼──────────────────┼──────────────────┼─────────────────┼──────────────┼────────────────┤
│ url_length                 │             170 │              183 │             1757 │               1 │   0.0805687  │     0.0805687  │
├────────────────────────────┼─────────────────┼──────────────────┼──────────────────┼─────────────────┼──────────────┼────────────────┤
│ shortining_service         │           

# Valid Stats

In [8]:
statistics(df_valid)

╒════════════════════════════╤═════════════════╤══════════════════╤══════════════════╤═════════════════╤══════════════╤════════════════╕
│ Feature                    │   Phishing (-1) │   Suspicious (0) │   Legitimate (1) │   Precision (%) │   Recall (%) │   Accuracy (%) │
╞════════════════════════════╪═════════════════╪══════════════════╪══════════════════╪═════════════════╪══════════════╪════════════════╡
│ having_ip_address          │               0 │                0 │             2038 │               0 │            0 │     1          │
├────────────────────────────┼─────────────────┼──────────────────┼──────────────────┼─────────────────┼──────────────┼────────────────┤
│ url_length                 │             788 │              578 │              672 │               0 │            0 │     0.329735   │
├────────────────────────────┼─────────────────┼──────────────────┼──────────────────┼─────────────────┼──────────────┼────────────────┤
│ shortining_service         │           

# Combine and drop duplicates

In [9]:
df_combined = pd.concat([df_phish, df_valid], ignore_index=True)
df_combined.drop_duplicates(subset='website_url', inplace=True)

In [10]:
num_rows = df_combined.shape[0]
print(f"Number of rows in df_combined: {num_rows}")

Number of rows in df_combined: 4072


## Combined Stats

In [11]:
statistics(df_combined)

╒════════════════════════════╤═════════════════╤══════════════════╤══════════════════╤═════════════════╤══════════════╤════════════════╕
│ Feature                    │   Phishing (-1) │   Suspicious (0) │   Legitimate (1) │   Precision (%) │   Recall (%) │   Accuracy (%) │
╞════════════════════════════╪═════════════════╪══════════════════╪══════════════════╪═════════════════╪══════════════╪════════════════╡
│ having_ip_address          │               0 │                0 │             4072 │       0         │   0          │      0.481827  │
├────────────────────────────┼─────────────────┼──────────────────┼──────────────────┼─────────────────┼──────────────┼────────────────┤
│ url_length                 │             927 │              742 │             2403 │       0.114401  │   0.0805687  │      0.200393  │
├────────────────────────────┼─────────────────┼──────────────────┼──────────────────┼─────────────────┼──────────────┼────────────────┤
│ shortining_service         │           

# Feature Selection

## Removing REDUNDANT and USELESS FEATURES

In [12]:
# id,timestamp,open_rank_domain,page_rank_integer,page_rank_decimal,website_url,having_ip_address,url_length,shortining_service,having_at_symbol,double_slash_redirecting,prefix_suffix,having_sub_domain,sslfinal_state,domain_registration_length,favicon,port,https_token,request_url,url_of_anchor,links_in_tags,sfh,submitting_to_email,abnormal_url,redirect,on_mouseover,rightclick,popupwindow,iframe,age_of_domain,dnsrecord,web_traffic,page_rank,google_index,links_pointing_to_page,statistical_report,has_numbers,special_characters,lev,sim_lev,fuzzy,sim_fuzz,num_sub,num_of_subdomains,len_sub,len_of_subdomains,result
drop_useless_features = {
    'id',                   # NOT NEEDED
    'timestamp',            # NOT NEEDED
    'open_rank_domain',     # NOT NEEDED
    'page_rank_integer',    # NOT NEEDED
    'website_url',          # NOT NEEDED
    'sslfinal_state',       # BAD
    'having_ip_address',    # USELESS
    'port',                 # USELESS
    'https_token',          # Not useful
    'special_characters',   # Not useful
}

df_combined.drop(columns=drop_useless_features, inplace=True)

In [13]:
print(df_combined.dtypes)

page_rank_decimal             float64
url_length                      int64
shortining_service              int64
having_at_symbol                int64
double_slash_redirecting        int64
prefix_suffix                   int64
having_sub_domain               int64
domain_registration_length      int64
favicon                         int64
request_url                     int64
url_of_anchor                   int64
links_in_tags                   int64
sfh                             int64
submitting_to_email             int64
abnormal_url                    int64
redirect                        int64
on_mouseover                    int64
rightclick                      int64
popupwindow                     int64
iframe                          int64
age_of_domain                   int64
dnsrecord                       int64
web_traffic                     int64
page_rank                       int64
google_index                    int64
links_pointing_to_page          int64
statistical_

In [14]:
X = df_combined.drop(columns=['result'])
y = df_combined['result']

In [15]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

selector = SelectKBest(score_func=mutual_info_classif, k='all')  # Use 'all' to compute scores for all features
X_new = selector.fit_transform(X, y)

# Get feature scores
feature_scores = selector.scores_

In [16]:
print("Feature scores:")
for i, score in enumerate(feature_scores):
    print(f"{X.columns[i]}: {score}")

Feature scores:
page_rank_decimal: 0.3890339329261614
url_length: 0.13992117390304148
shortining_service: 0.0069628365184704055
having_at_symbol: 0.002977554554012052
double_slash_redirecting: 0.02125197024432035
prefix_suffix: 0.0488609696138802
having_sub_domain: 0.0053850984870496
domain_registration_length: 0.0002869823451139286
favicon: 0.07576661495825987
request_url: 0.025671322968740418
url_of_anchor: 0.03133517740203162
links_in_tags: 0.038651661114835756
sfh: 0.024796916713026196
submitting_to_email: 0.07065011087124984
abnormal_url: 0.018877278172037437
redirect: 0.017594905777776226
on_mouseover: 0.0007056779955174619
rightclick: 0.0002057288284837977
popupwindow: 0.01049510591398084
iframe: 0.1068036688333942
age_of_domain: 0.0
dnsrecord: 0.005050528577135616
web_traffic: 0.30813817517877373
page_rank: 0.17345946235401155
google_index: 0.19988085389693788
links_pointing_to_page: 0.20782006811147036
statistical_report: 0.0029672366688133778
has_numbers: 0.03147900762600542


In [19]:
import json

threshold = 0.01

# Filter out good features (those with scores above the threshold)
good_features = [feature for feature, score in zip(X.columns, feature_scores) if score > threshold]
bad_features = [feature for feature, score in zip(X.columns, feature_scores) if score <= threshold]

df_combined.drop(columns=bad_features, inplace=True)
df_combined = df_combined.sample(frac=1).reset_index(drop=True)
df_combined.to_csv(PROCESSED_DIR / 'final.csv', index=False)

# Save the good features to a JSON file
good_features_filename = '../configs/features/final_features.json'

with open(good_features_filename, 'w') as f:
    json.dump(good_features, f, indent=4)

# Output the list of good features
print("List of Good Features:", good_features)

List of Good Features: ['page_rank_decimal', 'url_length', 'double_slash_redirecting', 'prefix_suffix', 'favicon', 'request_url', 'url_of_anchor', 'links_in_tags', 'sfh', 'submitting_to_email', 'abnormal_url', 'redirect', 'popupwindow', 'iframe', 'web_traffic', 'page_rank', 'google_index', 'links_pointing_to_page', 'has_numbers', 'lev', 'sim_lev', 'fuzzy', 'sim_fuzz', 'num_of_subdomains', 'len_sub', 'len_of_subdomains']
