In [41]:
# Import required libraries
import joblib
from urllib.parse import urlparse, parse_qs

In [42]:
# Load the trained URL classifier from disk; predict_spam() reads the global `model`.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code, so 'url_model.pkl' must come from a trusted source.
# The path is relative, so it is resolved against the kernel's working directory.
model_path = 'url_model.pkl'
model = joblib.load(model_path)

In [43]:
# Feature extraction function with high-impact features only
def extract_url_features(url):
    """Turn a URL string into a fixed-order list of 18 integer features.

    The returned list is positional. Its order is:
        url_length, num_dots,
        contains_free, contains_click, contains_offer, contains_account,
        contains_auth, contains_login, contains_brand,
        domain_length, subdomain_length, suspicious_tld,
        has_redirect, suspicious_subdomain, num_redirects,
        path_length, query_length, num_query_params

    NOTE(review): this order presumably has to match the feature order the
    model was trained with -- confirm before reordering anything here.

    Args:
        url: The URL to analyse, as a string.

    Returns:
        list[int]: The 18 feature values in the order listed above.
    """
    pieces = urlparse(url)
    host = pieces.netloc
    labels = host.split('.')
    lowered = url.lower()

    # Keyword flags are matched against the lower-cased full URL.
    keyword_flags = [
        int(word in lowered)
        for word in ('free', 'click', 'offer', 'account', 'auth', 'login')
    ]
    brand_flag = int(
        any(name in lowered for name in ('paypal', 'google', 'amazon', 'facebook'))
    )

    # Host-derived features. The host labels come from netloc as-is, so they
    # keep their original case and any ':port' suffix on the last label.
    label_count = len(labels)
    domain_length = len(labels[-2]) if label_count > 1 else 0
    subdomain_length = len(labels[0]) if label_count > 2 else 0
    risky_tlds = ('top', 'xyz', 'click', 'club', 'biz', 'info', 'work', 'zip', 'mobi')
    suspicious_tld = int(labels[-1] in risky_tlds)

    # Redirect markers are only recognised as the first query parameter
    # (they are matched with the leading '?'), and case-sensitively.
    has_redirect = int(
        any(marker in url for marker in ('?q=', '?url=', '?redirect='))
    )
    suspicious_subdomain = int(
        any(token in host for token in ('auth', 'login', 'secure'))
    )
    # Occurrences of 'http' beyond the first; this is -1 when the URL
    # contains no lower-case 'http' at all (e.g. a scheme-less URL).
    num_redirects = url.count('http') - 1

    query_string = pieces.query
    return [
        len(url),
        url.count('.'),
        *keyword_flags,
        brand_flag,
        domain_length,
        subdomain_length,
        suspicious_tld,
        has_redirect,
        suspicious_subdomain,
        num_redirects,
        len(pieces.path),
        len(query_string),
        len(parse_qs(query_string)),
    ]

In [44]:
# Function to predict if a URL is spam or not
def predict_spam(url):
    """Classify a URL with the loaded model and describe the result as text.

    The URL is converted to a feature vector by extract_url_features() and
    passed as a single-row batch to the global ``model``'s predict_proba().
    NOTE(review): column 1 of the result is treated as the spam class and
    column 0 as not-spam -- confirm this against ``model.classes_``.

    Args:
        url: The URL to classify, as a string.

    Returns:
        str: A sentence naming the more probable class and its probability
        as a percentage with two decimals. When spam is not strictly more
        probable, the "Not Spam" sentence is returned.
    """
    feature_vector = extract_url_features(url)

    # predict_proba takes a 2-D batch; wrap the single sample and unwrap row 0.
    class_probs = model.predict_proba([feature_vector])[0]
    spam_pct, not_spam_pct = class_probs[1] * 100, class_probs[0] * 100

    if spam_pct > not_spam_pct:
        return f'This URL is {spam_pct:.2f}% likely to be Spam.'
    return f'This URL is {not_spam_pct:.2f}% likely to be Not Spam.'

In [52]:
# Demo entry point: classify one sample URL and print the resulting sentence.
# The guard lets the same code run as a script; in this notebook it also passes,
# as the recorded output below the cell shows.
if __name__ == "__main__":
    # Example URL to check (assigned at module level, so `url` becomes a global)
    url = 'https://www.google.com'
    
    # predict_spam() returns a formatted string; print it as the cell output
    print(predict_spam(url))

This URL is 99.94% likely to be Not Spam.
