# PREPROCESS

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Root of the data tree, resolved relative to the notebook's working directory.
DATA_DIR = Path('..') / 'data'

# One input folder per data source, plus the folder the processed CSVs go to.
PHISH_DIR = DATA_DIR.joinpath('phishtank')
LEGIT_DIR = DATA_DIR.joinpath('common_crawl')
PROCESSED_DIR = DATA_DIR.joinpath('processed')

# Both sources store their collected samples under the same file name.
PHISH_DATA = PHISH_DIR.joinpath('collected_data.csv')
LEGIT_DATA = LEGIT_DIR.joinpath('collected_data.csv')

In [None]:
# Load the collected samples for both sources: the phishtank file first,
# then the common_crawl file.
df_phish, df_valid = (
    pd.read_csv(csv_path) for csv_path in (PHISH_DATA, LEGIT_DATA)
)

# Reapplying Rules
Some address-based indicators were updated after the data was collected, so they are recomputed here.

In [None]:
from urllib.parse import urlparse, urljoin
import whois

In [None]:
# Collected from Cloudflare's Phishing Database.
# The TLD list is assembled from three groups, in this order:
#   1. cheap and open TLDs
#   2. Freenom TLDs (free domains)
#   3. geographic and niche TLDs less commonly used for legitimate purposes
# NOTE(review): ".xyz70" does not look like a real TLD -- confirm against the source list.
top_phishing_domains = (
    [".xyz", ".xyz70", ".top", ".club", ".online", ".shop", ".site", ".vip", ".buzz"]
    + [".tk", ".ml", ".ga", ".cf", ".gq"]
    + [".ly", ".to", ".ru", ".cn", ".su"]
)

# IP addresses reported as frequent phishing hosts (same source as above).
top_phishing_ips = [
    "156.146.62.218", "212.102.57.68", "138.199.18.156", "199.167.138.22", "178.159.37.4",
    "178.159.37.17", "185.190.42.200", "178.159.37.34", "89.234.157.254", "190.2.131.167",
    "185.236.200.42", "62.122.184.194", "196.196.53.142", "178.159.37.11", "195.176.3.23",
    "94.230.208.147", "35.0.127.52", "93.157.254.39", "178.159.37.55", "31.173.87.149",
    "118.107.16.194", "45.43.63.15", "122.230.47.69", "185.228.234.120", "185.247.118.151",
    "107.172.143.65", "194.37.82.149", "103.240.252.87", "77.222.46.175", "131.108.17.87",
    "93.190.10.18", "103.18.103.50", "103.18.103.5", "165.154.184.8", "193.233.237.13",
    "212.230.134.27", "192.92.97.185", "190.247.243.99", "216.117.133.168", "123.190.180.241",
    "103.102.177.230", "57.128.225.168", "181.229.154.222", "209.85.214.193", "103.25.90.29",
]

def get_whois(url):
    """Look up the WHOIS record for ``url`` on a best-effort basis.

    Returns:
        Whatever ``whois.whois(url)`` returns, or ``False`` when the lookup
        raises any ``Exception``.
    """
    try:
        record = whois.whois(url)
    except Exception:
        # Failed lookups are reported as False rather than propagated.
        record = False
    return record


def is_statistical_report(url, domain_info, phishing_tlds=None, phishing_ips=None):
    """Flag a URL whose TLD or first resolved IP is on a known-phishing list.

    The host is taken from the parsed URL (port and credentials stripped,
    lower-cased) and compared against the TLD list as a *suffix*. A plain
    substring test would flag legitimate hosts such as ``www.cnn.com``
    (contains ``.cn``) or ``www.toyota.com`` (contains ``.to``).

    Args:
        url: URL to classify.
        domain_info: WHOIS record exposing ``.get('ips')``. Anything without a
            ``get`` method (for example the ``False`` that ``get_whois``
            returns on failure) is treated as "no IP information".
        phishing_tlds: Optional iterable of TLDs (with or without the leading
            dot). Defaults to the module-level ``top_phishing_domains``.
        phishing_ips: Optional container of IP strings. Defaults to the
            module-level ``top_phishing_ips``.

    Returns:
        ``-1`` if the URL is flagged as phishing, ``1`` otherwise.
    """
    if phishing_tlds is None:
        phishing_tlds = top_phishing_domains
    if phishing_ips is None:
        phishing_ips = top_phishing_ips

    # ``hostname`` is None when the URL has no network location (e.g. no scheme).
    host = (urlparse(url).hostname or "").rstrip(".")
    suffixes = tuple(
        "." + tld.lower().lstrip(".") for tld in phishing_tlds if tld.strip(".")
    )
    if host and host.endswith(suffixes):
        return -1

    ips = domain_info.get('ips') if hasattr(domain_info, 'get') else None
    if isinstance(ips, str):
        # NOTE(review): assumes a lone string is a single IP -- confirm the WHOIS schema.
        ips = [ips]
    # Only the first reported IP is checked, as before.
    if ips and ips[0] in phishing_ips:
        return -1

    return 1

# Registered domains of well-known URL shortening services.
url_shortening_services = [
    "tinyurl.com", "bit.ly", "t.co", "goo.gl", "is.gd", "buff.ly",
    "adf.ly", "ow.ly", "bit.do", "cutt.ly", "shorte.st", "clck.ru",
    "tiny.cc", "tr.im", "x.co", "soo.gd", "s2r.co", "bl.ink", "mcaf.ee",
    "urlz.fr", "shorturl.at"
]


def is_shortening_service(url):
    """Determine whether the URL is hosted on a known URL shortening service.

    The parsed hostname (lower-cased, without port or credentials) must either
    equal a listed service domain or be a subdomain of one. A plain substring
    test would flag ordinary sites: ``microsoft.com`` and ``reddit.com`` both
    contain ``t.co``, and ``netflix.com`` contains ``x.co``.

    Args:
        url: URL to classify.

    Returns:
        ``-1`` if the host belongs to a shortening service, ``1`` otherwise
        (including URLs that have no parsable host).
    """
    host = (urlparse(url).hostname or "").rstrip(".")
    if not host:
        return 1

    for service in url_shortening_services:
        if host == service or host.endswith("." + service):
            return -1
    return 1

In [None]:
# Re-apply the updated rules to both frames and attach the ground-truth label
# (-1 = phishing, 1 = legitimate). Each frame is modified in place.
# The column key 'shortining_service' keeps its existing spelling on purpose.
for frame, label in ((df_phish, -1), (df_valid, 1)):
    # Keep one row per URL before running the rules.
    frame.drop_duplicates(subset=['website_url'], inplace=True)

    urls = frame['website_url']
    # One WHOIS lookup is performed per URL here.
    frame['statistical_report'] = urls.apply(
        lambda link: is_statistical_report(link, get_whois(link))
    )
    frame['shortining_service'] = urls.apply(is_shortening_service)
    frame['result'] = label

In [None]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Persist both processed frames without the pandas index column.
df_phish.to_csv(PROCESSED_DIR / 'phish_data.csv', index=False)
df_valid.to_csv(PROCESSED_DIR / 'valid_data.csv', index=False)

# Stats

In [None]:
from tabulate import tabulate

def create_table(data, headers):
    """Print ``data`` as a ``fancy_grid`` table under the given column headers.

    Args:
        data: Rows to render, passed straight to ``tabulate``.
        headers: Column headers, passed as ``tabulate``'s ``headers`` argument.
    """
    rendered = tabulate(data, headers=headers, tablefmt="fancy_grid")
    print(rendered)

In [None]:
def statistics(df):
    """Print a per-feature table of value counts and agreement with the label.

    Every column except ``'result'`` is treated as an indicator encoding
    ``-1`` (phishing), ``0`` (suspicious) or ``1`` (legitimate). ``'result'``
    is the ground truth (``-1`` phishing, ``1`` legitimate). A suspicious
    (``0``) vote always counts as an error: a false positive on a legitimate
    row and a false negative on a phishing row.

    Values are coerced with ``pd.to_numeric(errors='coerce')`` on local
    copies *before* counting, so string-typed indicators are counted correctly
    and ``df`` itself is never modified. Non-numeric columns (for example
    URLs) yield all-zero rows. Precision, recall and accuracy are reported as
    percentages, matching the "(%)" column headers.

    Args:
        df: DataFrame with the indicator columns and a ``'result'`` column.

    Returns:
        None. The table is printed through ``create_table``.
    """
    # The label column is loop-invariant: coerce it once, without touching df.
    truth = pd.to_numeric(df['result'], errors='coerce')
    is_phish = truth == -1
    is_legit = truth == 1

    data = []
    for col in df.columns:
        if col == 'result':
            continue

        # Local numeric view of the feature; the caller's column is left as is.
        values = pd.to_numeric(df[col], errors='coerce')

        value_counts = values.value_counts().reindex([-1, 0, 1], fill_value=0)
        phishing = int(value_counts.loc[-1])
        suspicious = int(value_counts.loc[0])
        legitimate = int(value_counts.loc[1])

        says_phish = values == -1
        says_unsure = values == 0
        says_legit = values == 1

        TP = int((says_phish & is_phish).sum())
        FP = int(((says_phish | says_unsure) & is_legit).sum())
        TN = int((says_legit & is_legit).sum())
        FN = int(((says_legit | says_unsure) & is_phish).sum())
        total = TP + FP + TN + FN

        # Percentages, with 0 when the denominator is empty.
        precision = 100 * TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = 100 * TP / (TP + FN) if (TP + FN) > 0 else 0
        accuracy = 100 * (TP + TN) / total if total > 0 else 0

        data.append([col, phishing, suspicious, legitimate, precision, recall, accuracy])

    headers = ["Feature","Phishing (-1)", "Suspicious (0)","Legitimate (1)", "Precision (%)", "Recall (%)", "Accuracy (%)"]
    create_table(data, headers)

# Combine

# Feature Selection

In [None]:
# Column names collected for removal (presumably dropped before modelling --
# the use site is not visible here). Trailing notes are the author's verdicts.
drop_features = {
    "website_url",        # not needed
    "sslfinal_state",     # judged a bad feature
    "having_ip_address",  # judged useless
    "port",               # judged useless
}