In [5]:
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse
import re
import whois
import ssl
import socket
from datetime import datetime
import tldextract
from urllib.parse import urlparse, urljoin
import dns.resolver

In [6]:
DATA_DIR = Path('../data')
PHISH_DIR = DATA_DIR / 'phishtank'
LEGIT_DIR = DATA_DIR / 'common_crawl'
PROCESSED_DIR = DATA_DIR / 'processed'
PHISH_DATA = PHISH_DIR / 'collected_data.csv'
LEGIT_DATA = LEGIT_DIR / 'collected_data.csv'

# ADDRESS BAR FEATURES

In [16]:
top_phishing_tlds = [
    # Cheap and Open TLDs
    ".xyz", ".top", ".club", ".online", ".shop", ".site", ".vip", ".buzz",

    # Freenom TLDs (Free Domains)
    ".tk", ".ml", ".ga", ".cf", ".gq",

    # Geographic and Niche TLDs less commonly used for legitimate purposes
    ".ly", ".to", ".ru", ".cn", ".su"
]

def is_statistical_report(url):
    """Determines if the URL has a suspicious statistical report based on phishing domains or IPs."""
    ext = tldextract.extract(url)
    
    if ext.suffix in top_phishing_tlds:
        return -1  # Phishing
    
    return 1 

def core_domain(url):
    """Normalize the URL by extracting only the core domain using tldextract."""
    extracted = tldextract.extract(url)
    core_domain = f"{extracted.domain}.{extracted.suffix}"
    return core_domain

def domain_name(url):
    """Normalize the URL by extracting only the domain name using tldextract."""
    extracted = tldextract.extract(url)
    return extracted.domain

def lower_case(url):
    """Normalize the URL by converting it to lowercase."""
    return url.lower()

def is_having_ip(url):
    """Determines if the URL has an IP address."""
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.lower().split(':')[0]  # Remove port if present
    
    ipv4_pattern = r"^(\d{1,3}\.){3}\d{1,3}$"
    hex_ip_pattern = r"^0x[0-9a-fA-F]{1,2}(\.0x[0-9a-fA-F]{1,2}){3}$"

    if re.match(ipv4_pattern, domain):
        return -1
    
    elif re.match(hex_ip_pattern, domain):
        hex_parts = domain.split('.')
        decimal_parts = []
        for part in hex_parts:
            decimal_parts.append(str(int(part, 16)))
        return -1
    
    return 1

def is_url_long(url):
    """Determines if the URL length is suspicious or phishing based on length."""
    url_length = len(url)
    
    if url_length < 54:
        return 1  # Legitimate
    elif 54 <= url_length <= 75:
        return 0  # Suspicious
    else:
        return -1  # Phishing
    
url_shortening_services = [
    "tinyurl.com", "bit.ly", "t.co", "goo.gl", "is.gd", "buff.ly",
    "adf.ly", "ow.ly", "bit.do", "cutt.ly", "shorte.st", "clck.ru",
    "tiny.cc", "tr.im", "x.co", "soo.gd", "s2r.co", "bl.ink", "mcaf.ee",
    "urlz.fr", "shorturl.at"
]
def is_shortening_service(url):
    """Determines if the URL uses a URL shortening service."""
    core = core_domain(url)
    if core in url_shortening_services:
        return -1
    return 1

def is_double(url):
    """Determines if the URL redirects using '//'."""
    parsed_url = urlparse(url)
    if parsed_url.scheme == "http":
        limit_position = 6
    elif parsed_url.scheme == "https":
        limit_position = 7
    else:
        return 1

    last_occurrence_index = url.rfind("//")

    if last_occurrence_index > limit_position:
        return -1
    else:
        return 1
    
    
    
def is_having_sub_domain(url):
    """Classifies a URL based on the number of subdomains."""
    ext = tldextract.extract(url)
    subdomain = ext.subdomain
    num_subdomains = len(subdomain.split('.')) if subdomain else 0
    
    if num_subdomains == 0:
        return 1  # Legitimate
    elif num_subdomains == 1:
        return 1  # Legitimate
    elif num_subdomains == 2:
        return 0  # Suspicious
    else:
        return -1

def is_dns_record(url, timeout=5):
    """Check if the domain or subdomain has DNS records."""
    ext = tldextract.extract(url)
    domain = f"{ext.domain}.{ext.suffix}"
    
    resolver = dns.resolver.Resolver()
    resolver.timeout = timeout
    resolver.lifetime = timeout  # Set a timeout for the entire resolution process
    
    try:
        a_records = resolver.resolve(domain, 'A')
        if a_records:
            print(f"A records: {a_records}")
            return 1
    except dns.resolver.NoAnswer:
        print("No A records")
    except dns.resolver.NXDOMAIN:
        print("NXDOMAIN")
        return -1
    except dns.exception.Timeout:
        print("DNS query timed out for A records")
        return -1
    except dns.resolver.NoNameservers:
        print("No nameservers available to resolve the domain")
        return -1
    
    try:
        aaaa_records = resolver.resolve(domain, 'AAAA')
        if aaaa_records:
            print(f"AAAA records: {aaaa_records}")
            return 1
    except dns.resolver.NoAnswer:
        print("No AAAA records")
    except dns.resolver.NXDOMAIN:
        print("NXDOMAIN")
        return -1
    except dns.exception.Timeout:
        print("DNS query timed out for AAAA records")
        return -1
    except dns.resolver.NoNameservers:
        print("No nameservers available to resolve the domain")
        return -1
    
    return -1

def is_abnormal_url(url):
    """Determines if the URL is abnormal."""
    ext = tldextract.extract(url)
    host_name = ext.domain + '.' + ext.suffix
    w = whois.whois(url)
    if w and 'domain_name' in w:
        domain_names = w['domain_name']
        print(f"Domain names: {domain_names}")
        if isinstance(domain_names, list):
            for domain in domain_names:
                if host_name.lower() == domain.lower():
                    return 1
        elif isinstance(domain_names, str):
            if host_name.lower() == domain_names.lower():
                return 1
    return -1

In [8]:
df_phish = pd.read_csv(PHISH_DATA)

for index, row in df_phish.iterrows():
    if df_phish.loc[index, 'shortining_service'] != is_shortening_service(df_phish.loc[index, 'website_url']):
        print(f"Shortening service mismatch at index {index}.")
        print(f"{df_phish.loc[index,'website_url']}, {df_phish.loc[index, 'shortining_service']}, {is_shortening_service(df_phish.loc[index, 'website_url'])}")

Shortening service mismatch at index 118.
https://urlz.fr/t2Je, 1, -1
Shortening service mismatch at index 119.
https://urlz.fr/t2HL, 1, -1
Shortening service mismatch at index 125.
https://urlz.fr/t31X, 1, -1
Shortening service mismatch at index 126.
https://urlz.fr/t2Je, 1, -1
Shortening service mismatch at index 127.
https://urlz.fr/t2HL, 1, -1
Shortening service mismatch at index 130.
https://urlz.fr/t30u, 1, -1
Shortening service mismatch at index 132.
https://urlz.fr/t2Z6, 1, -1
Shortening service mismatch at index 133.
https://urlz.fr/t2U4, 1, -1
Shortening service mismatch at index 135.
https://urlz.fr/t2DP, 1, -1
Shortening service mismatch at index 157.
https://urlz.fr/t2Je, 1, -1
Shortening service mismatch at index 158.
https://urlz.fr/t2HL, 1, -1
Shortening service mismatch at index 446.
https://urlz.fr/t1Vh, 1, -1
Shortening service mismatch at index 448.
https://urlz.fr/t2Nq, 1, -1
Shortening service mismatch at index 507.
https://shorturl.at/AINF5?JHQ=rbtFZaq, 1, -1
Sho

In [10]:
for index, row in df_phish.iterrows():
    if df_phish.loc[index, 'url_length'] != is_url_long(df_phish.loc[index, 'website_url']):
        print(f"Double slash mismatch at index {index}.")
        print(f"{df_phish.loc[index,'website_url']}, {df_phish.loc[index, 'url_length']}, {is_url_long(df_phish.loc[index, 'website_url'])}")

Double slash mismatch at index 3.
https://web13319.cweb06.gamingcontrol.de/ch/chpost/N/swiss_de/, -1, 0
Double slash mismatch at index 30.
https://compliance-page-issu.d13z0rbu5zhsxy.amplifyapp.com/, -1, 0
Double slash mismatch at index 31.
https://compliance-page-issu.d13z0rbu5zhsxy.amplifyapp.com/, -1, 0
Double slash mismatch at index 38.
https://microsoftoffice365onedrivedocument.weebly.com/, -1, 0
Double slash mismatch at index 67.
https://intermediate-peppermint-blackbird.glitch.me/hostlink.html, -1, 0
Double slash mismatch at index 87.
https://4215465tygfgarcfdens655665df56568767.weeblysite.com/, -1, 0
Double slash mismatch at index 93.
https://05456et3454svdf04b8fd5ab27b1be5e7b51d9d8146566.weeblysite.com/, -1, 0
Double slash mismatch at index 148.
https://swisspass-ch-remov.codeanyapp.com/aa/d/sign.php, -1, 0
Double slash mismatch at index 149.
https://pub-c75b5893124e4c2cb9233f0097f1d214.r2.dev/freshdocu%25%5E.html, -1, 0
Double slash mismatch at index 150.
https://pub-b4b4dc12

In [11]:
for index, row in df_phish.iterrows():
    if df_phish.loc[index, 'having_ip_address'] != is_having_ip(df_phish.loc[index, 'website_url']):
        print(f"Having IP mismatch at index {index}.")
        print(f"{df_phish.loc[index,'website_url']}, {df_phish.loc[index, 'having_ip_address']}, {is_having_ip(df_phish.loc[index, 'website_url'])}")

In [12]:
for index, row in df_phish.iterrows():
    if df_phish.loc[index, 'double_slash_redirecting'] != is_double(df_phish.loc[index, 'website_url']):
        print(f"Double slash mismatch at index {index}.")
        print(f"{df_phish.loc[index,'website_url']}, {df_phish.loc[index, 'double_slash_redirecting']}, {is_double(df_phish.loc[index, 'website_url'])}")

In [None]:
for index, row in df_phish.iterrows():
    if df_phish.loc[index, 'having_sub_domain'] != is_having_sub_domain(df_phish.loc[index, 'website_url']):
        print(f"Subdomain mismatch at index {index}.")
        print(f"{df_phish.loc[index,'website_url']}, {df_phish.loc[index, 'having_sub_domain']}, {is_having_sub_domain(df_phish.loc[index, 'website_url'])}")

In [15]:
df_valid = pd.read_csv(LEGIT_DATA)
# for index, row in df_phish.iterrows():
#     if df_phish.loc[index, 'abnormal_url'] != is_abnormal_url(df_phish.loc[index, 'website_url']):
#         print(f"Abnormal URL mismatch at index {index}.")
#         print(f"{df_phish.loc[index,'website_url']}, {df_phish.loc[index, 'abnormal_url']}, {is_abnormal_url(df_phish.loc[index, 'website_url'])}")

for index, row in df_valid.iterrows():
    if df_valid.loc[index, 'abnormal_url'] != is_abnormal_url(df_valid.loc[index, 'website_url']):
        print(f"Abnormal URL mismatch at index {index}.")
        print(f"{df_valid.loc[index,'website_url']}, {df_valid.loc[index, 'abnormal_url']}, {is_abnormal_url(df_valid.loc[index, 'website_url'])}")

Domain names: HOLLOWAYSOFLUDLOW.COM
Domain names: airccse.org
Domain names: ['LACARTOONS.COM', 'lacartoons.com']
Domain names: ['PUERTOVALLARTAWALKINGTOURS.COM', 'puertovallartawalkingtours.com']
Domain names: HOLMI.RU
Domain names: partytwinkle.com.au
Domain names: ['northernstar.nyc', 'NORTHERNSTAR.NYC']
Domain names: ILLINOIS.EDU
Domain names: myjobvacancies.co.ke
Domain names: ['JAMARANEMA.COM', 'jamaranema.com']
Domain names: ['ARMANI.COM', 'armani.com']


KeyboardInterrupt: 

In [20]:
df_valid = pd.read_csv(LEGIT_DATA)

In [22]:
# for index, row in df_phish.iterrows():
#     if df_phish.loc[index, 'having_sub_domain'] != is_having_sub_domain(df_phish.loc[index, 'website_url']):
#         print(f"Subdomain mismatch at index {index}.")
#         print(f"{df_phish.loc[index,'website_url']}, {df_phish.loc[index, 'having_sub_domain']}, {is_having_sub_domain(df_phish.loc[index, 'website_url'])}")

df_phish['website_url'].apply(is_dns_record)

No A records
No AAAA records
A records: <dns.resolver.Answer object at 0x000001D2D0162410>
A records: <dns.resolver.Answer object at 0x000001D2D04FDE10>
No A records
No AAAA records
No A records
No AAAA records
A records: <dns.resolver.Answer object at 0x000001D2D04FFF10>
A records: <dns.resolver.Answer object at 0x000001D2D04FE210>
A records: <dns.resolver.Answer object at 0x000001D2D048EF10>
A records: <dns.resolver.Answer object at 0x000001D2D04FCE10>
A records: <dns.resolver.Answer object at 0x000001D2D04FE750>
A records: <dns.resolver.Answer object at 0x000001D2D0529990>
A records: <dns.resolver.Answer object at 0x000001D2D052B9D0>
A records: <dns.resolver.Answer object at 0x000001D2D0301150>
A records: <dns.resolver.Answer object at 0x000001D2D048EF50>
A records: <dns.resolver.Answer object at 0x000001D2D052B190>
A records: <dns.resolver.Answer object at 0x000001D2D0529BD0>
A records: <dns.resolver.Answer object at 0x000001D2D052B410>
A records: <dns.resolver.Answer object at 0x0

0      -1
1       1
2       1
3      -1
4      -1
       ..
2473    1
2474    1
2475    1
2476    1
2477    1
Name: website_url, Length: 2478, dtype: int64

In [14]:
df_phish['url_length'] = df_phish['website_url'].apply(is_url_long)
df_phish['shortining_service'] = df_phish['website_url'].apply(is_shortening_service)