# INDICATORS
- A list of all the indicators and their implementations

## Required Libraries

In [None]:
from urllib.parse import urlparse
import re
import whois
import ssl
import socket
from datetime import datetime
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
import tldextract
import logging
import time
import dns.resolver
from dotenv import load_dotenv

import coloredlogs

load_dotenv(dotenv_path='./.env')

# Logging

In [None]:
def setup_logging():
    """Install coloredlogs at INFO level with a timestamped message format."""
    coloredlogs.install(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level='INFO',
    )


# Configure logging as soon as this cell runs.
setup_logging()

# Sample DATA for Testing

In [None]:
# Placeholder path: point this at the CSV part of the sample data.
csv_file_path = 'path_to_your_csv_file.csv'
df_csv = pd.read_csv(csv_file_path)

# Placeholder path: point this at the JSON part of the sample data.
json_file_path = 'path_to_your_json_file.json'
with open(json_file_path, 'r') as f:
    json_data = json.load(f)

df_json = pd.DataFrame(json_data)

# Inner-join both sources on 'id'; validate='one_to_one' makes pandas raise
# when 'id' is not unique on either side.
merged_df = pd.merge(df_csv, df_json, on='id', how='inner', validate='one_to_one')

# Pull the 'html' column out as a plain Python list.
html_features = merged_df['html'].tolist()

print(merged_df)
print(html_features)

# Description
## 1: Legitimate
## 0: Suspicious
## -1: Phishing

## HELPERS

In [5]:
def core_domain(url):
    """Return the ``domain.suffix`` part of ``url`` as split by tldextract."""
    parts = tldextract.extract(url)
    return ".".join((parts.domain, parts.suffix))

def domain_name(url):
    """Return only the ``domain`` field that tldextract extracts from ``url``."""
    return tldextract.extract(url).domain

def lower_case(url):
    """Return ``url`` with every character converted to lowercase."""
    lowered = url.lower()
    return lowered

## HAS IP ADDRESS
#### Value: Essentially useless
Almost no phishing URLs use raw IP addresses anymore.

In [None]:
## RULE: Using the IP Address
## STATUS: FINISHED
# Dotted-quad host such as 192.168.0.1 (the octet range is deliberately not checked).
_IPV4_HOST_RE = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
# Dotted host whose four parts are hex literals, e.g. 0xC0.0xA8.0x00.0x01.
_HEX_IPV4_HOST_RE = re.compile(r'^(?:0x[0-9A-Fa-f]{1,2}\.){3}0x[0-9A-Fa-f]{1,2}$')


def is_having_ip(url):
    """Determine whether the URL's host is an IP address instead of a name.

    Args:
        url: URL string to inspect.

    Returns:
        -1 (phishing) when the host is a dotted IPv4 address, a dotted
        hexadecimal IPv4 address or an IPv6 literal; 1 (legitimate) otherwise,
        including when the URL has no host or cannot be parsed.
    """
    import ipaddress  # local import: not part of the file's import cell

    try:
        hostname = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # Unparseable input is scored as legitimate, as before.
        return 1

    # A missing host must yield the integer score 1, not False (which equals 0).
    if not hostname or not isinstance(hostname, str):
        return 1

    if _IPV4_HOST_RE.match(hostname) or _HEX_IPV4_HOST_RE.match(hostname):
        return -1

    # urlparse strips the brackets from IPv6 literals, so "::1" style hosts
    # reach this point and are recognised by the ipaddress module.
    try:
        ipaddress.ip_address(hostname)
    except ValueError:
        return 1
    return -1

## URL LONG
#### Value: OK
While many phishing URLs are much longer than average, the same can be said for legitimate websites (e.g. Google search URLs).

In [None]:
## RULE: Long URL to Hide the Suspicious Part
## STATUS: FINISHED
def is_url_long(url):
    """Score the URL by its character count.

    Returns 1 (legitimate) below 54 characters, 0 (suspicious) from 54 to 75
    characters inclusive, and -1 (phishing) above 75 characters.
    """
    n_chars = len(url)

    if n_chars > 75:
        return -1  # Phishing
    if n_chars >= 54:
        return 0  # Suspicious
    return 1  # Legitimate

## SHORTENING SERVICES
#### Value: Good
Many phishing websites utilize shortening services to hide.
However, this is not a clear indicator. But it does indicate the need to look at other features

In [None]:
## RULE: Using URL Shortening Services "TinyURL"
## STATUS: FINISHED
## List of URL shortening services
url_shortening_services = [
    "tinyurl.com", "bit.ly", "t.co", "goo.gl", "is.gd", "buff.ly",
    "adf.ly", "ow.ly", "bit.do", "cutt.ly", "shorte.st", "clck.ru",
    "tiny.cc", "tr.im", "x.co", "soo.gd", "s2r.co", "bl.ink", "mcaf.ee",
    "urlz.fr", "shorturl.at",
]


def is_shortening_service(url):
    """Return -1 when the URL's core domain is a listed shortener, else 1."""
    return -1 if core_domain(url) in url_shortening_services else 1

## HAVING @ SYMBOLS

In [None]:
## RULE: URL's having "@" Symbol
## STATUS: FINISHED
def is_having_at_symbol(url):
    """Return -1 when an '@' appears anywhere in the URL, otherwise 1."""
    has_at = '@' in url
    return -1 if has_at else 1

## DOUBLE SLASH
- account for both http and https
#### Value: OK
This technique is simply not very common anymore, and such URLs are sometimes blocked.

In [None]:
## RULE: Redirecting using "//"
## STATUS: FINISHED
def is_double(url):
    """Flag a '//' that appears after the scheme separator.

    Returns -1 when the last '//' in an http/https URL lies beyond the
    per-scheme cutoff, and 1 otherwise (including for any other scheme).
    """
    # Per-scheme cutoff: a '//' found past this index is treated as a redirect.
    cutoff_by_scheme = {"http": 6, "https": 7}

    cutoff = cutoff_by_scheme.get(urlparse(url).scheme)
    if cutoff is None:
        return 1

    return -1 if url.rfind("//") > cutoff else 1

##  Prefix Suffix
#### Value: Good
Many phishing URLs use - to obfuscate domains

In [None]:
## RULE: Adding Prefix or Suffix Separated by (-) to the Domain
## STATUS: FINISHED
def is_prefix_suffix(url):
    """Return -1 when the URL's network location contains a hyphen, else 1."""
    netloc = urlparse(url).netloc
    return 1 if '-' not in netloc else -1

## Multiple SUBDOMAINS
#### Value: Good

In [None]:
## RULE: Sub Domain and Multi Sub Domains
## STATUS: FINISHED
def is_having_sub_domain(url):
    """Score a URL by how many dot-separated labels its subdomain has.

    Returns 1 for zero or one label, 0 for exactly two, and -1 for more.
    """
    subdomain = tldextract.extract(url).subdomain
    if not subdomain:
        return 1  # Legitimate

    depth = subdomain.count('.') + 1
    if depth == 1:
        return 1  # Legitimate
    if depth == 2:
        return 0  # Suspicious
    return -1

## SSL Trusted
#### Value: BAD - OUTDATED

In [None]:
## RULE: HTTPS (Hyper Text Transfer Protocol with Secure Sockets Layer)
## STATUS: FINISHED
## List of trusted Certificate Authorities
TRUSTED_ISSUER_KEYWORDS = {
    "Amazon", "CNNIC", "Comodo", "DigiCert", "Doster", "Entrust",
    "GeoTrust", "GlobalSign", "GoDaddy", "Let's Encrypt",
    "Network Solutions", "QuoVadis", "Sectigo", "StartCom", "SwissSign",
    "Symantec", "Thawte", "Trustwave", "VeriSign", "Verisign", "WoSign",
    "WR2",
}


def is_trusted_issuer(issuer_common_name):
    """Return True when any trusted keyword is a substring of the issuer name.

    The comparison is case-sensitive.
    """
    for keyword in TRUSTED_ISSUER_KEYWORDS:
        if keyword in issuer_common_name:
            return True
    return False
def is_https(url, timeout=5.0):
    """Score the site's TLS certificate by issuer and age.

    Opens a TLS connection to the URL's host using the default (verifying)
    SSL context. Port 443 is used unless an ``https`` URL names another port.

    Args:
        url: Full URL, or a bare host/path without a scheme.
        timeout: Seconds to wait on the socket before giving up, so a
            non-responsive host cannot hang the caller.

    Returns:
        1 when the issuer matches a trusted keyword and the certificate was
        issued at least one year ago; 0 when the issuer is not recognised or
        the certificate is younger than one year; -1 when no host can be
        determined, the connection or certificate verification fails, or the
        certificate lacks the expected fields.
    """
    try:
        # Parse instead of string-stripping so that a port or userinfo in the
        # URL does not end up inside the host name. Scheme-less input such as
        # "example.com/path" is still accepted.
        parsed = urlparse(url if "://" in url else f"//{url}")
        hostname = parsed.hostname
        if not hostname:
            return -1
        port = parsed.port if parsed.scheme == "https" and parsed.port else 443

        context = ssl.create_default_context()
        with socket.create_connection((hostname, port), timeout=timeout) as sock:
            with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                cert = ssock.getpeercert()

        # Guard before touching cert fields; an empty certificate is a failure.
        if not cert:
            return -1

        # 'issuer' is a sequence of RDNs, each a sequence of (key, value) pairs.
        issuer = dict(pair for rdn in cert['issuer'] for pair in rdn)
        # Many CAs carry their brand in organizationName rather than commonName.
        issuer_names = (
            issuer.get('organizationName', ''),
            issuer.get('commonName', ''),
        )
        if not any(is_trusted_issuer(name) for name in issuer_names):
            return 0

        # cert_time_to_seconds interprets the GMT timestamp correctly, which
        # avoids comparing a GMT value against naive local time.
        issued_at = ssl.cert_time_to_seconds(cert['notBefore'])
        age_in_years = (time.time() - issued_at) / (365.25 * 24 * 3600)
        return 1 if age_in_years >= 1 else 0

    except (OSError, ValueError, KeyError, TypeError):
        # Network, TLS, parsing and missing-field errors all count as phishing.
        return -1

## Domain Registration Length
#### Value: Good

In [None]:
## RULE: Domain Registration Length
## STATUS: FINISHED
def is_domain_registration_length(url):
    """Score the domain by the span between its WHOIS creation and expiry.

    NOTE(review): this file defines this function twice; when both cells run,
    the later definition is the one in effect.

    Args:
        url: Value passed straight to ``whois.whois``.

    Returns:
        1 when expiration minus creation is at least one year (365 days),
        -1 when it is shorter or either date is missing, and 0 when the
        lookup or the date arithmetic raises.
    """
    from datetime import timezone  # local import: the file only imports datetime

    def _normalize(value):
        # A date field may be a list; the first entry is used.
        if isinstance(value, list):
            value = value[0]
        # Convert aware datetimes to naive UTC so aware and naive values can be
        # subtracted (naive values are presumed to be UTC already - confirm).
        if isinstance(value, datetime) and value.tzinfo is not None:
            value = value.astimezone(timezone.utc).replace(tzinfo=None)
        return value

    try:
        record = whois.whois(url)
        creation_date = _normalize(record.creation_date)
        expiration_date = _normalize(record.expiration_date)

        if not (creation_date and expiration_date):
            return -1

        registration_length = (expiration_date - creation_date).days / 365
        return -1 if registration_length < 1 else 1
    except Exception:
        # Keep the best-effort contract (0 = unknown) but record what failed
        # and for which input, including the traceback.
        logging.exception("Error in domain registration length for %r", url)
        return 0

## Domain Registration LENGTH

In [None]:
## RULE: Domain Registration Length
## STATUS: FINISHED
def is_domain_registration_length(url):
    """Score the domain by the span between its WHOIS creation and expiry.

    NOTE(review): this file defines this function twice; when both cells run,
    the later definition is the one in effect.

    Args:
        url: Value passed straight to ``whois.whois``.

    Returns:
        1 when expiration minus creation is at least one year (365 days),
        -1 when it is shorter or either date is missing, and 0 when the
        lookup or the date arithmetic raises.
    """
    from datetime import timezone  # local import: the file only imports datetime

    def _normalize(value):
        # A date field may be a list; the first entry is used.
        if isinstance(value, list):
            value = value[0]
        # Convert aware datetimes to naive UTC so aware and naive values can be
        # subtracted (naive values are presumed to be UTC already - confirm).
        if isinstance(value, datetime) and value.tzinfo is not None:
            value = value.astimezone(timezone.utc).replace(tzinfo=None)
        return value

    try:
        record = whois.whois(url)
        creation_date = _normalize(record.creation_date)
        expiration_date = _normalize(record.expiration_date)

        if not (creation_date and expiration_date):
            return -1

        registration_length = (expiration_date - creation_date).days / 365
        return -1 if registration_length < 1 else 1
    except Exception:
        # Keep the best-effort contract (0 = unknown) but record what failed
        # and for which input, including the traceback.
        logging.exception("Error in domain registration length for %r", url)
        return 0

## Favicon
#### Value: OK

In [None]:
## RULE
## STATUS: FINISHED
def is_favicon(url, soup):
    """Check whether the page's favicon is served from the page's own host.

    Args:
        url: Page URL; relative favicon links are resolved against it.
        soup: Parsed document exposing ``find`` (presumably a BeautifulSoup
            object - confirm against the caller).

    Returns:
        1 when no favicon link with an href is found or when its network
        location equals the page's; -1 when the favicon is on another host.
    """
    page_host = urlparse(url).netloc

    def _mentions_icon(rel_value):
        return rel_value and 'icon' in rel_value.lower()

    icon_tag = soup.find("link", rel=_mentions_icon)
    href = icon_tag.get("href") if icon_tag else None
    if not href:
        return 1

    icon_host = urlparse(urljoin(url, href)).netloc
    return 1 if icon_host == page_host else -1

## PORT
#### Value: Essentially useless
No occurrences of alternate port numbers were observed.

In [None]:
## RULE: Using Non-Standard Port
## STATUS: FINISHED
## List of preferred ports
preferred_ports = [80, 443]
## List of non-preferred ports
non_preferred_ports = [21, 22, 23, 445, 1433, 1521, 3306, 3389]


def is_port(url):
    """Return 1 when the URL's effective port is a preferred one, else -1.

    When the URL names no port, http implies 80 and https implies 443; any
    other scheme without an explicit port is scored 1.
    """
    parts = urlparse(url)
    port = parts.port

    if port is None:
        port = {'http': 80, 'https': 443}.get(parts.scheme)
        if port is None:
            return 1

    # Ports on the non-preferred list and unlisted ports score the same (-1).
    return 1 if port in preferred_ports else -1

## HTTPS Token
#### Value: OK

In [None]:
## RULE: HTTP and HTTPS Tokens
## STATUS: FINISHED
def is_https_token(url):
    """Return -1 when the text "https" occurs in the URL's netloc, else 1."""
    # Only the network location is inspected, never the scheme or the path.
    netloc = urlparse(url).netloc
    return -1 if "https" in netloc else 1

## NEW FEATURES
- Detection of numbers in domain (0,1)
- Special Characters (0,1)
- Homoglyphs (1,0,-1)

## Brand Impersonation Metrics


In [None]:
import idna

# 1. Presence of Numbers in the Domain
def has_numbers_in_domain(url: str) -> int:
    """Score the URL by whether its host name contains a digit.

    Only the host name is inspected, so digits in an explicit port
    (``example.com:8080``) or in userinfo are not counted.

    Args:
        url: URL string to inspect.

    Returns:
        0 when the host contains at least one digit, 1 otherwise (including
        when the URL has no host).
    """
    host = urlparse(url).hostname or ""
    return 0 if re.search(r'\d', host) else 1

# 2. Presence of Special Characters in the Domain
# Characters treated as "special" when they appear in the network location.
_SPECIAL_DOMAIN_CHARS = frozenset("!#$%&'()*+,/:;<=>?@[\\]^`{|}~")


def has_special_characters_in_domain(url: str) -> int:
    """Score the URL by whether its network location has special characters.

    A trailing ``:port`` is removed first so that the port separator alone
    does not count as a special character. Userinfo is kept, so an '@' in
    the network location is still flagged.

    Args:
        url: URL string to inspect.

    Returns:
        0 when a special character is present, 1 otherwise.
    """
    netloc = urlparse(url).netloc
    # Drop only a trailing ":<digits>" port suffix; everything else is scanned.
    host_part = re.sub(r':\d*$', '', netloc)
    if any(char in _SPECIAL_DOMAIN_CHARS for char in host_part):
        return 0
    return 1

# 3. Presence of IDN Spoofing in the Domain
HOMOGLYPHS = {
    'a': ['α', 'а'],  # Latin 'a' vs Greek 'alpha' and Cyrillic 'a'
    'o': ['ο', 'օ'],  # Latin 'o' vs Greek 'omicron' and Armenian 'o'
    'e': ['е'],       # Latin 'e' vs Cyrillic 'e'
    'i': ['і', '١'],   # Latin 'i' vs Cyrillic 'і' and Arabic digit '1'
    'l': ['ӏ', '١'],   # Latin 'l' vs Cyrillic 'ӏ' and Arabic digit '1'
    'u': ['υ'],       # Latin 'u' vs Greek 'upsilon'
    'c': ['с'],       # Latin 'c' vs Cyrillic 'с'
    'n': ['п'],       # Latin 'n' vs Cyrillic 'п'
}


## NOT USED: NOT ENOUGH DATA TO BE EFFECTIVE
def homoglyph(url):
    """Score the URL's network location for look-alike (homoglyph) characters.

    Returns 1 when the netloc is pure ASCII (or an error occurs), -1 when it
    is non-ASCII and contains a character listed in HOMOGLYPHS, and 0 when it
    is non-ASCII without any listed character.
    """
    try:
        host = urlparse(url).netloc
        if host.isascii():
            return 1

        # Flatten every look-alike glyph into one lookup set.
        lookalikes = {glyph for glyphs in HOMOGLYPHS.values() for glyph in glyphs}
        return -1 if any(ch in lookalikes for ch in host) else 0
    except Exception as e:
        print(f"Error parsing URL: {e}")
        return 1

In [14]:
# Lowercase brand names that the brand-impersonation checks compare URL
# domains and subdomains against.
# NOTE(review): the same list is defined again in a later cell of this file.
KNOWN_BRANDS_DOMAINS = [
    "microsoft",
    "apple",
    "google",
    "facebook",
    "whatsapp",
    "amazon",
    "alibaba",
    "adobe",
    "twitter",
    "adidas",
    "netflix",
    "paypal",
    "bankofamerica",
    "chase",
    "wellsfargo",
    "linkedin",
    "ebay",
    "instagram",
    "zoom",
    "dropbox",
    "youtube",
    "airbnb",
    "spotify",
    "appleid"
]

import Levenshtein

def extract_domain_and_subdomains(url):
    """
    Split a URL with tldextract and return ``(domain, subdomains)``.

    ``subdomains`` is the extracted subdomain split on '.', or an empty list
    when the URL has no subdomain. The suffix (TLD) is not returned.
    """
    parts = tldextract.extract(url)
    subdomain_labels = []
    if parts.subdomain:
        subdomain_labels = parts.subdomain.split('.')
    return parts.domain, subdomain_labels

def overlapping_substrings(string, n):
    """
    Return every length-``n`` window of ``string`` (stride 1), left to right.

    When ``string`` is shorter than ``n`` the whole string is returned as the
    single entry, so for ``n >= 1`` the result is never empty and callers can
    safely take ``min()`` over it. Each window appears once; the last window
    is no longer appended a second time.

    Args:
        string: Text to slice.
        n: Window length.

    Returns:
        List of substrings.
    """
    if len(string) < n:
        return [string]
    return [string[i:i + n] for i in range(len(string) - n + 1)]

def check_brands(url, brand_name="microsoft"):
    """
    Measure how closely any part of the URL's host resembles ``brand_name``.

    The subdomain labels and the domain are concatenated (dots removed), cut
    into windows of the brand's length, and the smallest Levenshtein distance
    between a window and the brand is divided by the brand's length.
    Lower values mean a closer match; 0.0 is an exact occurrence.
    """
    domain, subdomains = extract_domain_and_subdomains(url)
    haystack = (''.join(subdomains) + domain).replace('.', '')

    window = len(brand_name)
    best = min(
        Levenshtein.distance(chunk, brand_name)
        for chunk in overlapping_substrings(haystack, window)
    )
    return best / window

def is_brand_impersonation_lev(url):
    """
    Score the URL against every known brand using ``check_brands``.

    Returns -1 as soon as a brand's normalized distance is below 0.2, 0 when
    at least one brand is below 0.5, and 1 otherwise.
    """
    verdict = 1
    for brand in KNOWN_BRANDS_DOMAINS:
        score = check_brands(url, brand)
        if score < 0.2:
            return -1
        if score < 0.5:
            verdict = 0
    return verdict

# Demo: hyphens are removed first, so "demo-apple" is scored as "demoapple".
url = "http://demo-apple.serveirc.com/"
url = url.replace("-", "")
result = is_brand_impersonation_lev(url)
print(result)

-1


In [None]:
from fuzzywuzzy import fuzz
import tldextract

# Lowercase brand names that the fuzzy impersonation check compares URL
# domains and subdomains against.
# NOTE(review): the same list is also defined in an earlier cell of this file.
KNOWN_BRANDS_DOMAINS = [
    "microsoft",
    "apple",
    "google",
    "facebook",
    "whatsapp",
    "amazon",
    "alibaba",
    "adobe",
    "twitter",
    "adidas",
    "netflix",
    "paypal",
    "bankofamerica",
    "chase",
    "wellsfargo",
    "linkedin",
    "ebay",
    "instagram",
    "zoom",
    "dropbox",
    "youtube",
    "airbnb",
    "spotify",
    "appleid"
]
        
def is_brand_impersonation_fuzzy(url):
    """
    Score the URL's domain and subdomain labels against every known brand.

    Each label is compared with each brand via ``fuzz.ratio``. Returns -1 as
    soon as a ratio exceeds 70, 0 when at least one ratio exceeds 50, and 1
    otherwise.
    """
    domain, subdomains = extract_domain_and_subdomains(url)

    # Domain against every brand first, then each brand against every subdomain.
    pairs = [(domain, brand) for brand in KNOWN_BRANDS_DOMAINS]
    pairs += [(sub, brand) for brand in KNOWN_BRANDS_DOMAINS for sub in subdomains]

    verdict = 1
    for label, brand in pairs:
        score = fuzz.ratio(label, brand)
        if score > 70:
            return -1
        if score > 50:
            verdict = 0
    return verdict

# Example usage
# Demo: hyphens are removed first, so "demo-apple" is scored as "demoapple".
url = "http://demo-apple.serveirc.com/"  # Example of a phishing domain
url = url.replace('-', '')
result = is_brand_impersonation_fuzzy(url)
print(result)


-1


In [16]:
import time

def measure_execution_time(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return ``(result, elapsed_seconds)``.

    Elapsed time is measured with ``time.perf_counter``.
    """
    started = time.perf_counter()
    outcome = func(*args, **kwargs)
    return outcome, time.perf_counter() - started

# Time both brand-impersonation checks on the same hyphen-stripped URL.
url = "http://demo-apple.serveirc.com/"
url = url.replace("-", "")
result, exec_time = measure_execution_time(is_brand_impersonation_fuzzy, url)
print(f"Fuzzy Result: {result}, Execution Time: {exec_time} seconds")
result, exec_time = measure_execution_time(is_brand_impersonation_lev, url)
print(f"Lev Result: {result}, Execution Time: {exec_time} seconds")

Fuzzy Result: -1, Execution Time: 0.00030440000409726053 seconds
Lev Result: -1, Execution Time: 0.00013220000255387276 seconds
