In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/new_data_urls.csv')
print(f"Original dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

Original dataset shape: (822010, 2)

Column names: ['url', 'status']

First few rows:
                                       url  status
0  0000111servicehelpdesk.godaddysites.com       0
1     000011accesswebform.godaddysites.com       0
2                             00003.online       0
3      0009servicedeskowa.godaddysites.com       0
4                     000n38p.wcomhost.com       0


In [3]:
df.isnull().sum()

url       0
status    0
dtype: int64

In [4]:
df['status'].value_counts()

status
1    427028
0    394982
Name: count, dtype: int64

In [5]:
df['status'].value_counts(normalize=True) * 100

status
1    51.949246
0    48.050754
Name: proportion, dtype: float64

In [6]:
df['url'].duplicated().sum()

np.int64(13968)

In [7]:
df.drop_duplicates(subset=['url'], inplace=True)
df.shape

(808042, 2)

In [8]:
df_sample = df.sample(n=100000, random_state=42)
print(f"\nSampled dataset shape: {df_sample.shape}")
print(f"Sampled class distribution:\n{df_sample['status'].value_counts()}")
print(f"Sampled class distribution:\n{df['status'].value_counts(normalize=True) * 100}")


Sampled dataset shape: (100000, 2)
Sampled class distribution:
status
1    52690
0    47310
Name: count, dtype: int64
Sampled class distribution:
status
1    52.847253
0    47.152747
Name: proportion, dtype: float64


In [9]:

import numpy as np

def extract_url_features(url):
    features = {}
    
    # Basic length features
    features['url_length'] = len(url)
    
    # Extract domain
    url_parts = url.split('/')
    if url.startswith(('http://', 'https://')):
        domain = url_parts[2] if len(url_parts) > 2 else ''
    else:
        domain = url_parts[0] if len(url_parts) > 0 else ''
    
    features['domain_length'] = len(domain)
    
    # Character composition
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['num_underscores'] = url.count('_')
    features['num_slashes'] = url.count('/')
    features['num_question_marks'] = url.count('?')
    features['num_equals'] = url.count('=')
    features['num_at_symbols'] = url.count('@')
    features['num_ampersands'] = url.count('&')
    features['num_digits'] = sum(c.isdigit() for c in url)
    features['num_special_chars'] = sum(not c.isalnum() for c in url)
    features['num_percent'] = url.count('%')
    sensitive_keywords = ['login', 'secure', 'account', 'bank', 'paypal', 'update', 'verify', 'password', 'sign', 'free']
    features['num_sensitive'] = sum(1 for kw in sensitive_keywords if kw in url.lower())
    
    # Protocol and structure
    features['has_https'] = int(url.startswith('https://'))
    features['has_http'] = int(url.startswith('http://'))
    features['has_www'] = int('www.' in url)
    features['has_ip_address'] = int(any(char.isdigit() for char in domain))
    
    # Suspicious patterns
    features['has_double_slash'] = int('//' in url[8:]) if url.startswith(('http://', 'https://')) else 0
    features['num_subdomains'] = domain.count('.') - 1 if domain else 0
    letters = sum(c.isalpha() for c in url)
    features['letter_ratio'] = letters / max(len(url), 1)
    
    # Entropy (measure of randomness)
    from collections import Counter
    if len(url) > 0:
        prob = [float(url.count(c)) / len(url) for c in dict(Counter(url))]
        features['entropy'] = -sum([p * np.log2(p) for p in prob])
    else:
        features['entropy'] = 0
    
    # Path features
    if url.startswith(('http://', 'https://')):
        path = '/'.join(url_parts[3:]) if len(url_parts) > 3 else ''
    else:
        path = '/'.join(url_parts[1:]) if len(url_parts) > 1 else ''
    
    features['path_length'] = len(path)
    features['num_path_tokens'] = len(path.split('/')) if path else 0
    
    # Ratio features
    features['digit_ratio'] = features['num_digits'] / len(url) if len(url) > 0 else 0
    features['special_char_ratio'] = features['num_special_chars'] / len(url) if len(url) > 0 else 0
    
    return features

print("Extracting features from URLs...")
features_list = []
for idx, url in enumerate(df_sample['url']):
    if idx % 10000 == 0:
        print(f"  Processed {idx}/{len(df_sample)} URLs...")
    features_list.append(extract_url_features(url))

#feature DataFrame
X = pd.DataFrame(features_list)
y = df_sample['status'].values

print(f"\nFeature shape: {X.shape}")
print(f"\nExtracted features:")
print(X.columns.tolist())
print(f"\nFeature statistics:")
print(X.describe())

# Check NaN/infinite values
print(f"\nNaN values in features: {X.isnull().sum().sum()}")
print(f"Infinite values in features: {np.isinf(X).sum().sum()}")

Extracting features from URLs...
  Processed 0/100000 URLs...
  Processed 10000/100000 URLs...
  Processed 20000/100000 URLs...
  Processed 30000/100000 URLs...
  Processed 40000/100000 URLs...
  Processed 50000/100000 URLs...
  Processed 60000/100000 URLs...
  Processed 70000/100000 URLs...
  Processed 80000/100000 URLs...
  Processed 90000/100000 URLs...

Feature shape: (100000, 26)

Extracted features:
['url_length', 'domain_length', 'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes', 'num_question_marks', 'num_equals', 'num_at_symbols', 'num_ampersands', 'num_digits', 'num_special_chars', 'num_percent', 'num_sensitive', 'has_https', 'has_http', 'has_www', 'has_ip_address', 'has_double_slash', 'num_subdomains', 'letter_ratio', 'entropy', 'path_length', 'num_path_tokens', 'digit_ratio', 'special_char_ratio']

Feature statistics:
          url_length  domain_length      num_dots    num_hyphens  \
count  100000.000000   100000.00000  100000.00000  100000.000000   
mean       4

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"\nTraining set class distribution:")
print(pd.Series(y_train).value_counts())
print(f"\nTest set class distribution:")
print(pd.Series(y_test).value_counts())

Training set shape: (80000, 26)
Test set shape: (20000, 26)

Training set class distribution:
1    42152
0    37848
Name: count, dtype: int64

Test set class distribution:
1    10538
0     9462
Name: count, dtype: int64


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Mean of scaled training features: {X_train_scaled.mean():.6f}")
print(f"Std of scaled training features: {X_train_scaled.std():.6f}")

Mean of scaled training features: -0.000000
Std of scaled training features: 1.000000


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 5],
    'max_features': ['sqrt', 'log2']
}

rf_base = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
rf_grid = GridSearchCV(rf_base, param_grid, cv=cv, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_scaled, y_train)
rf_model = rf_grid.best_estimator_
print("Best RF Params:", rf_grid.best_params_)

print("Standard Random Forest training completed")
print("\nStandard Random Forest CV Scores:")
rf_cv_scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=cv, scoring='f1', n_jobs=-1)
print(f"  F1 Scores: {rf_cv_scores}")
print(f"  Mean F1: {rf_cv_scores.mean():.4f}")

Best RF Params: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Standard Random Forest training completed

Standard Random Forest CV Scores:
  F1 Scores: [0.93093826 0.93131092 0.92640092 0.9290136  0.92942666]
  Mean F1: 0.9294


In [14]:
from sklearn.metrics import (classification_report, confusion_matrix, roc_curve, 
                             auc, precision_recall_curve, average_precision_score,
                             roc_auc_score, f1_score, precision_score, recall_score)

In [15]:
def calculate_metrics(y_true, y_pred, y_pred_proba, model_name):
    metrics = {
        'Model': model_name,
        'Accuracy': (y_pred == y_true).mean(),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_pred_proba),
        'AP-Score': average_precision_score(y_true, y_pred_proba)
    }
    return metrics

In [16]:
rf_pred = rf_model.predict(X_test_scaled)
rf_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1]
rf_metrics = calculate_metrics(y_test, rf_pred, rf_pred_proba, 'Random Forest')
rf_metrics

{'Model': 'Random Forest',
 'Accuracy': np.float64(0.92285),
 'Precision': 0.905143680749482,
 'Recall': 0.9535016132093377,
 'F1-Score': 0.9286935625491012,
 'ROC-AUC': 0.9760281800053345,
 'AP-Score': 0.9752226242836735}

In [17]:
from imblearn.ensemble import BalancedRandomForestClassifier
print("\nTuning Balanced Random Forest...")
brf_base = BalancedRandomForestClassifier(random_state=42, n_jobs=-1, sampling_strategy='auto', replacement=True)
brf_grid = GridSearchCV(brf_base, param_grid, cv=cv, scoring='f1', n_jobs=-1)
brf_grid.fit(X_train_scaled, y_train)
brf_model = brf_grid.best_estimator_
print("Best BRF Params:", brf_grid.best_params_)

print("\nBalanced Random Forest CV Scores:")
brf_cv_scores = cross_val_score(brf_model, X_train_scaled, y_train, cv=cv, scoring='f1', n_jobs=-1)
print(f"  F1 Scores: {brf_cv_scores}")
print(f"  Mean F1: {brf_cv_scores.mean():.4f}")


Tuning Balanced Random Forest...
Best BRF Params: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}

Balanced Random Forest CV Scores:
  F1 Scores: [0.92805544 0.92887176 0.92641653 0.92829128 0.92759364]
  Mean F1: 0.9278


In [18]:
brf_pred = brf_model.predict(X_test_scaled)
brf_pred_proba = brf_model.predict_proba(X_test_scaled)[:, 1]
brf_metrics = calculate_metrics(y_test, brf_pred, brf_pred_proba, 'Balanced RF')
brf_metrics


{'Model': 'Balanced RF',
 'Accuracy': np.float64(0.9218),
 'Precision': 0.9130915116921378,
 'Recall': 0.9411653065097741,
 'F1-Score': 0.9269158878504673,
 'ROC-AUC': 0.9753031213666085,
 'AP-Score': 0.974285801312025}

In [19]:
metrics_df = pd.DataFrame([rf_metrics, brf_metrics])
print("\nModel Comparison (Test Set):")
print(metrics_df.to_string(index=False))

# Detailed classification reports
print("\n" + "=" * 80)
print("STANDARD RANDOM FOREST - Classification Report")
print("=" * 80)
print(classification_report(y_test, rf_pred, 
                          target_names=['Phishing (0)', 'Legitimate (1)']))

print("\n" + "=" * 80)
print("BALANCED RANDOM FOREST - Classification Report")
print("=" * 80)
print(classification_report(y_test, brf_pred, 
                          target_names=['Phishing (0)', 'Legitimate (1)']))


Model Comparison (Test Set):
        Model  Accuracy  Precision   Recall  F1-Score  ROC-AUC  AP-Score
Random Forest   0.92285   0.905144 0.953502  0.928694 0.976028  0.975223
  Balanced RF   0.92180   0.913092 0.941165  0.926916 0.975303  0.974286

STANDARD RANDOM FOREST - Classification Report
                precision    recall  f1-score   support

  Phishing (0)       0.94      0.89      0.92      9462
Legitimate (1)       0.91      0.95      0.93     10538

      accuracy                           0.92     20000
     macro avg       0.93      0.92      0.92     20000
  weighted avg       0.92      0.92      0.92     20000


BALANCED RANDOM FOREST - Classification Report
                precision    recall  f1-score   support

  Phishing (0)       0.93      0.90      0.92      9462
Legitimate (1)       0.91      0.94      0.93     10538

      accuracy                           0.92     20000
     macro avg       0.92      0.92      0.92     20000
  weighted avg       0.92      0.9

In [20]:
import xgboost as xgb
from xgboost import XGBClassifier


scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\nClass imbalance ratio: {scale_pos_weight:.2f}")
print(f"This will be used for scale_pos_weight parameter\n")

print("Tuning XGBoost with GridSearchCV...")

# XGBoost parameter grid
xgb_param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}

# Reduced grid for faster computation
xgb_param_grid_reduced = {
    'n_estimators': [200, 300],
    'max_depth': [6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8],
    'min_child_weight': [1, 3],
    'gamma': [0, 0.1]
}

# XGBoost base model
xgb_base = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,  # Handle imbalance
    eval_metric='logloss',
    use_label_encoder=False
)

# GridSearch
xgb_grid = GridSearchCV(
    estimator=xgb_base,
    param_grid=xgb_param_grid_reduced,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

print("Training XGBoost (this may take 10-20 minutes)...")
xgb_grid.fit(X_train_scaled, y_train)

xgb_model = xgb_grid.best_estimator_

print("\n XGBoost training completed!")
print(f"Best Parameters: {xgb_grid.best_params_}")
print(f"Best CV F1-Score: {xgb_grid.best_score_:.4f}")


Class imbalance ratio: 0.90
This will be used for scale_pos_weight parameter

Tuning XGBoost with GridSearchCV...
Training XGBoost (this may take 10-20 minutes)...
Fitting 3 folds for each of 64 candidates, totalling 192 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 XGBoost training completed!
Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 300, 'subsample': 0.9}
Best CV F1-Score: 0.9321


In [21]:
xgb_model_final = xgb_grid


print("\n" + "=" * 80)
print("[EVALUATION] XGBOOST PERFORMANCE")
print("=" * 80)

# Predictions
xgb_pred = xgb_model_final.predict(X_test_scaled)
xgb_pred_proba = xgb_model_final.predict_proba(X_test_scaled)[:, 1]

# Calculate metrics
xgb_metrics = calculate_metrics(y_test, xgb_pred, xgb_pred_proba, 'XGBoost')

print("\nXGBoost Metrics:")
for key, value in xgb_metrics.items():
    if key != 'Model':
        print(f"  {key}: {value:.4f}")

# Cross-validation scores
print("\n" + "-" * 80)
print("XGBoost Cross-Validation Scores:")
print("-" * 80)
xgb_cv_scores = cross_val_score(
    xgb_model_final, X_train_scaled, y_train, 
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='f1', n_jobs=-1
)
print(f"F1 Scores: {xgb_cv_scores}")
print(f"Mean F1: {xgb_cv_scores.mean():.4f} ")

# Detailed classification report
print("\n" + "=" * 80)
print("XGBOOST - Classification Report")
print("=" * 80)
print(classification_report(y_test, xgb_pred, 
                          target_names=['Phishing (0)', 'Legitimate (1)']))

# Confusion Matrix
print("\nConfusion Matrix:")
cm_xgb = confusion_matrix(y_test, xgb_pred)
print(cm_xgb)
print(f"\nTrue Negatives (Phishing correctly identified): {cm_xgb[0,0]}")
print(f"False Positives (Legitimate marked as Phishing): {cm_xgb[0,1]}")
print(f"False Negatives (Phishing marked as Legitimate): {cm_xgb[1,0]}")
print(f"True Positives (Legitimate correctly identified): {cm_xgb[1,1]}")



[EVALUATION] XGBOOST PERFORMANCE

XGBoost Metrics:
  Accuracy: 0.9283
  Precision: 0.9195
  Recall: 0.9468
  F1-Score: 0.9329
  ROC-AUC: 0.9788
  AP-Score: 0.9786

--------------------------------------------------------------------------------
XGBoost Cross-Validation Scores:
--------------------------------------------------------------------------------
F1 Scores: [0.93305219 0.93338773 0.93110866 0.93158048 0.93270572]
Mean F1: 0.9324 

XGBOOST - Classification Report
                precision    recall  f1-score   support

  Phishing (0)       0.94      0.91      0.92      9462
Legitimate (1)       0.92      0.95      0.93     10538

      accuracy                           0.93     20000
     macro avg       0.93      0.93      0.93     20000
  weighted avg       0.93      0.93      0.93     20000


Confusion Matrix:
[[8588  874]
 [ 561 9977]]

True Negatives (Phishing correctly identified): 8588
False Positives (Legitimate marked as Phishing): 874
False Negatives (Phishing mark

In [22]:
# ========== COMPREHENSIVE MODEL COMPARISON ==========
print("\n" + "=" * 80)
print(" FINAL MODEL COMPARISON - ALL CLASSIFIERS")
print("=" * 80)

# Update comparison dataframe
all_models_comparison = pd.DataFrame([
    rf_metrics, 
    brf_metrics,
    xgb_metrics
])

print("\nComplete Performance Comparison:")
print(all_models_comparison.to_string(index=False))

# Find best model
best_idx = all_models_comparison['F1-Score'].idxmax()
best_model_name = all_models_comparison.loc[best_idx, 'Model']
best_accuracy = all_models_comparison.loc[best_idx, 'Accuracy']
best_f1 = all_models_comparison.loc[best_idx, 'F1-Score']

print("\n" + "=" * 80)
print(f"BEST PERFORMING MODEL: {best_model_name}")
print("=" * 80)
print(f"  • Accuracy:  {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"  • Precision: {all_models_comparison.loc[best_idx, 'Precision']:.4f}")
print(f"  • Recall:    {all_models_comparison.loc[best_idx, 'Recall']:.4f}")
print(f"  • F1-Score:  {best_f1:.4f}")
print(f"  • ROC-AUC:   {all_models_comparison.loc[best_idx, 'ROC-AUC']:.4f}")
print(f"  • AP-Score:  {all_models_comparison.loc[best_idx, 'AP-Score']:.4f}")


 FINAL MODEL COMPARISON - ALL CLASSIFIERS

Complete Performance Comparison:
        Model  Accuracy  Precision   Recall  F1-Score  ROC-AUC  AP-Score
Random Forest   0.92285   0.905144 0.953502  0.928694 0.976028  0.975223
  Balanced RF   0.92180   0.913092 0.941165  0.926916 0.975303  0.974286
      XGBoost   0.92825   0.919454 0.946764  0.932909 0.978774  0.978607

BEST PERFORMING MODEL: XGBoost
  • Accuracy:  0.9283 (92.83%)
  • Precision: 0.9195
  • Recall:    0.9468
  • F1-Score:  0.9329
  • ROC-AUC:   0.9788
  • AP-Score:  0.9786
