In [1]:
from urllib.parse import urlparse
import re
import whois
import datetime



In [16]:
def extract_features(url):
    # Parse the URL
    parsed_url = urlparse(url)

    # Extract domain
    domain = parsed_url.netloc
    
    # Extract path
    path = parsed_url.path
    
    # Extract length of URL
    url_length = len(url)
    
    # Count special characters
    special_chars_count = len(re.findall(r'[-_&]', url))
    
    # Check if IP address is present
    has_ip_address = bool(re.match(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', domain))
    
    # Extract top-level domain (TLD)
    tld = domain.split('.')[-1]
    
    # Extract subdomains
    subdomains_count = len(domain.split('.')) - 2
    
    # Check if URL uses HTTPS
    uses_https = parsed_url.scheme == 'https'
    
    # Extract keywords
    keywords = ['login', 'bank', 'paypal']  # Add more keywords as needed
    keyword_presence = [1 if keyword in url.lower() else 0 for keyword in keywords]
    
    # Check if URL redirects
    redirects = 1 if parsed_url.query else 0
        
    # Check for suspicious characters
    suspicious_chars_count = len(re.findall(r'[!@#$%^&*()_+={}\[\]:;"\'<>,.?/|\\]', url))
    
    # Check URL syntax
    url_syntax_validity = 1 if re.match(r'^https?://', url) else 0
    
    # Fetch WHOIS information
    try:
        domain_whois = whois.whois(domain)
        print(f"domain_whois: {domain_whois}")
        registration_length = (domain_whois.expiration_date - domain_whois.creation_date).days
    except:
        registration_length = -1  # Indicate failure to retrieve WHOIS information
    
    # Presence of JavaScript in URL
    javascript_presence = 1 if '.js' in url else 0
    
    # Additional features can be added based on specific requirements
    
    # Put all extracted features into a dictionary
    features = {
        'domain': domain,
        'path': path,
        'url_length': url_length,
        'special_chars_count': special_chars_count,
        'has_ip_address': has_ip_address,
        'tld': tld,
        'subdomains_count': subdomains_count,
        'uses_https': uses_https,
        'keyword_presence': keyword_presence,
        'redirects': redirects,
        'suspicious_chars_count': suspicious_chars_count,
        'url_syntax_validity': url_syntax_validity,
        'registration_length': registration_length,
        'javascript_presence': javascript_presence
    }
    
    return features



In [27]:
# Example usage
from pprint import pprint
url = "https://pypi.org/project/python-whois/"
# url = "http://58.65.202.98:11500/api/generate"
url_features = extract_features(url)
pprint(url_features)

pypi.org
domain_whois: {
  "domain_name": "pypi.org",
  "registrar": "Gandi SAS",
  "whois_server": "http://whois.gandi.net",
  "referral_url": null,
  "updated_date": "2023-06-25 20:20:44",
  "creation_date": "2015-07-24 15:13:23",
  "expiration_date": "2032-07-24 15:13:23",
  "name_servers": [
    "ns-1264.awsdns-30.org",
    "ns-1702.awsdns-20.co.uk",
    "ns-897.awsdns-48.net",
    "ns-96.awsdns-12.com"
  ],
  "status": "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
  "emails": "abuse@support.gandi.net",
  "dnssec": "signedDelegation",
  "name": "REDACTED FOR PRIVACY",
  "org": "Python Software Foundation",
  "address": "REDACTED FOR PRIVACY",
  "city": "REDACTED FOR PRIVACY",
  "state": "OR",
  "registrant_postal_code": "REDACTED FOR PRIVACY",
  "country": "US"
}
{'domain': 'pypi.org',
 'has_ip_address': False,
 'javascript_presence': 0,
 'keyword_presence': [0, 0, 0],
 'path': '/project/python-whois/',
 'redirects': 0,
 'registration_length': 6210,
 's