In [49]:
# Import necessary libraries
from urllib.parse import urlparse, parse_qs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib
from xgboost import XGBClassifier
from tqdm import tqdm

In [50]:
# Updated feature extraction function with high-impact features only
def extract_url_features(url):
    parsed_url = urlparse(url)
    domain_parts = parsed_url.netloc.split('.')
    query_params = parse_qs(parsed_url.query)
    
    features = {
        # Basic URL features
        'url_length': len(url),
        'num_dots': url.count('.'),
        
        # Keyword-based features
        'contains_free': int('free' in url.lower()),
        'contains_click': int('click' in url.lower()),
        'contains_offer': int('offer' in url.lower()),
        'contains_account': int('account' in url.lower()),
        'contains_auth': int('auth' in url.lower()),
        'contains_login': int('login' in url.lower()),
        'contains_brand': int(any(brand in url.lower() for brand in ['paypal', 'google', 'amazon', 'facebook'])),
        
        # Domain and subdomain features
        'domain_length': len(domain_parts[-2]) if len(domain_parts) > 1 else 0,
        'subdomain_length': len(domain_parts[0]) if len(domain_parts) > 2 else 0,
        'suspicious_tld': int(domain_parts[-1] in ['top', 'xyz', 'click', 'club', 'biz', 'info', 'work', 'zip', 'mobi']),
        
        # Redirect and suspicious path features
        'has_redirect': int('?q=' in url or '?url=' in url or '?redirect=' in url),
        'suspicious_subdomain': int(any(keyword in parsed_url.netloc for keyword in ['auth', 'login', 'secure'])),
        'num_redirects': url.count('http') - 1,
        
        # Path and query features
        'path_length': len(parsed_url.path),
        'query_length': len(parsed_url.query),
        'num_query_params': len(query_params),
    }
    return list(features.values())


In [51]:
# Load dataset
df = pd.read_csv('data/spam_data.csv')

# Extract features from each URL
url_features = [extract_url_features(url) for url in tqdm(df['text'], desc="Extracting URL Features")]

# Create DataFrame with extracted features
feature_columns = [
    'url_length', 'num_dots', 
    'contains_free', 'contains_click', 'contains_offer',
    'contains_account', 'contains_auth', 'contains_login', 'contains_brand',
    'domain_length', 'subdomain_length', 'suspicious_tld',
    'has_redirect', 'suspicious_subdomain', 'num_redirects',
    'path_length', 'query_length', 'num_query_params'
]
url_features_df = pd.DataFrame(url_features, columns=feature_columns)

# Add target variable
url_features_df['is_spam'] = df['is_spam']

# Split data into training and testing sets (stratified)
X = url_features_df.drop('is_spam', axis=1)
y = url_features_df['is_spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


Extracting URL Features: 100%|██████████| 450176/450176 [00:11<00:00, 38288.51it/s]


In [52]:
# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 7, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5],
}

In [None]:
# Initialize and run RandomizedSearchCV
randomized_search = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42, n_jobs=-1),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# Fit model to training data
randomized_search.fit(X_train, y_train)

# Retrieve the best model
best_model = randomized_search.best_estimator_

Fitting RandomizedSearchCV:   0%|          | 0/1 [00:14<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate model performance
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9798
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     69148
           1       0.97      0.94      0.96     20888

    accuracy                           0.98     90036
   macro avg       0.98      0.96      0.97     90036
weighted avg       0.98      0.98      0.98     90036



In [None]:
# Save the trained model to disk
joblib.dump(best_model, 'url_model.pkl')

# Print the best hyperparameters
print("Best Hyperparameters:", randomized_search.best_params_)

Best Hyperparameters: {'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0.5, 'n_estimators': 200, 'max_depth': 15, 'learning_rate': 0.2, 'gamma': 0.2, 'colsample_bytree': 0.8}


In [None]:
# Get feature importance from the trained model
importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame({'feature': feature_columns, 'importance': importances})

# Print sorted feature importance
print(feature_importance_df.sort_values(by='importance', ascending=False))

                 feature  importance
10      subdomain_length    0.564239
7         contains_login    0.257179
14         num_redirects    0.058406
5       contains_account    0.036978
8         contains_brand    0.013070
11        suspicious_tld    0.012429
1               num_dots    0.008158
17      num_query_params    0.005926
3         contains_click    0.005753
6          contains_auth    0.005353
9          domain_length    0.004758
15           path_length    0.004712
12          has_redirect    0.004676
2          contains_free    0.004250
0             url_length    0.004008
16          query_length    0.003956
13  suspicious_subdomain    0.003307
4         contains_offer    0.002843
