# PREPROCESS

In [1]:
import pandas as pd
from pathlib import Path
import dns.resolver
import requests

In [2]:
# Root of the data tree (relative path, resolved against the working directory).
DATA_DIR = Path('../data')

# One sub-folder per data source, plus one for processed output.
PHISH_DIR = DATA_DIR.joinpath('phishtank')
LEGIT_DIR = DATA_DIR.joinpath('common_crawl')
PROCESSED_DIR = DATA_DIR.joinpath('processed')

# Collected CSV file for each source.
PHISH_DATA = PHISH_DIR.joinpath('collected_data.csv')
LEGIT_DATA = LEGIT_DIR.joinpath('collected_data.csv')

In [5]:
# Load the collected CSV for each source into its own DataFrame.
df_phish = pd.read_csv(filepath_or_buffer=PHISH_DATA)
df_valid = pd.read_csv(filepath_or_buffer=LEGIT_DATA)

In [3]:
# Load the previously processed phishing data set.
df_processed = pd.read_csv(PROCESSED_DIR.joinpath('phish_data.csv'))

# Reapplying Rules
Some address-based indicators were updated after the data was collected, so their rules are reapplied here.

In [4]:
from urllib.parse import urlparse, urljoin
import whois
import tldextract

# Stats

In [7]:
from tabulate import tabulate

def create_table(data, headers):
    """Print ``data`` as a table with the given column ``headers``.

    The table is rendered by ``tabulate`` using the "fancy_grid" format and
    written to standard output; nothing is returned.
    """
    rendered = tabulate(data, headers=headers, tablefmt="fancy_grid")
    print(rendered)

In [8]:
def statistics(df):
    """Print a per-feature table of value counts and classification metrics.

    Every column except ``result`` is treated as a feature whose values are
    -1 (phishing), 0 (suspicious) or 1 (legitimate).  The ``result`` column
    is the label the features are scored against (-1 or 1).  For each
    feature the table shows how often each value occurs, plus precision,
    recall and accuracy expressed as percentages.

    A suspicious (0) feature value never counts as correct: it is a false
    positive when the label is 1 and a false negative when the label is -1.

    Values are converted with ``pd.to_numeric(errors='coerce')`` on local
    copies, so non-numeric entries become NaN for the purpose of the
    statistics while the caller's DataFrame is left untouched.

    Args:
        df: DataFrame containing a ``result`` column and feature columns.

    Raises:
        KeyError: if ``df`` has no ``result`` column.
    """
    # The label column is the same for every feature, so convert it once.
    labels = pd.to_numeric(df['result'], errors='coerce')
    is_phish = labels == -1
    is_legit = labels == 1

    data = []
    for col in df.columns:
        if col == 'result':
            continue

        # Convert before counting so string-encoded values ("-1") are counted
        # too; work on a copy rather than writing back into ``df``.
        values = pd.to_numeric(df[col], errors='coerce')

        value_counts = values.value_counts().reindex([-1, 0, 1], fill_value=0)
        phishing = value_counts.get(-1, 0)
        suspicious = value_counts.get(0, 0)
        legitimate = value_counts.get(1, 0)

        says_phish = values == -1
        says_unsure = values == 0
        says_legit = values == 1

        tp = (says_phish & is_phish).sum()
        fp = ((says_phish | says_unsure) & is_legit).sum()
        tn = (says_legit & is_legit).sum()
        fn = ((says_legit | says_unsure) & is_phish).sum()
        total = tp + fp + tn + fn

        # Headers are labelled "(%)", so scale the ratios to percentages.
        precision = 100 * tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = 100 * tp / (tp + fn) if (tp + fn) > 0 else 0
        accuracy = 100 * (tp + tn) / total if total > 0 else 0

        data.append([col, phishing, suspicious, legitimate, precision, recall, accuracy])

    headers = ["Feature","Phishing (-1)", "Suspicious (0)","Legitimate (1)", "Precision (%)", "Recall (%)", "Accuracy (%)"]
    create_table(data, headers)

# Combine

In [None]:
# Stack the phishing and legitimate frames into one, renumbering the index.
df_combined = pd.concat([df_phish, df_valid], ignore_index=True)
# Drop the raw URL before computing statistics: it is not a -1/0/1 feature,
# so it would only add a meaningless all-zero row to the table.
df_combined.drop(columns=['website_url'], inplace=True)
statistics(df_combined)

# Feature Selection

In [None]:
# Columns excluded during feature selection, with the reason noted for each.
drop_features = {
    'having_ip_address',  # marked as useless
    'port',               # marked as useless
    'sslfinal_state',     # marked as bad
    'website_url',        # not needed
}