# URLTitleMatchScore Regression Notebook

This notebook shows how to predict `URLTitleMatchScore` using other features, with proper training/testing split and leak prevention.

# Imports

In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


## Load and Prepare Data

In [2]:

# Location of the zipped dataset on the UCI archive
url = "https://archive.ics.uci.edu/static/public/967/phiusiil+phishing+url+dataset.zip"

# Read the CSV straight from the URL and discard the "FILENAME" column,
# which is not used anywhere in this notebook.
df = pd.read_csv(url, encoding="utf-8").drop(columns=["FILENAME"])
print("\nShape of data:", df.shape)
df.head()







Shape of data: (235795, 55)


Unnamed: 0,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,0.061933,...,0,0,1,34,20,28,119,0,124,1
1,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,0.050207,...,0,0,1,50,9,8,39,0,217,1
2,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,0.064129,...,0,0,1,10,2,7,42,2,5,1
3,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,0.057606,...,1,1,1,3,27,15,22,1,31,1
4,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,0.059441,...,1,0,1,244,15,34,72,1,85,1


## Show correlation

In [3]:
# Correlation is only defined for numeric data, so restrict to numeric dtypes
numeric_df = df.select_dtypes(include=[np.number])

# Pairwise correlation of every numeric column with the target, strongest first
target_corr = numeric_df.corr()['URLTitleMatchScore']

print("Correlation with URLTitleMatchScore:")
print(target_corr.sort_values(ascending=False))



Correlation with URLTitleMatchScore:
URLTitleMatchScore            1.000000
DomainTitleMatchScore         0.961008
CharContinuationRate          0.568560
URLSimilarityIndex            0.543969
label                         0.539419
HasSocialNet                  0.425394
HasTitle                      0.417060
HasCopyrightInfo              0.413144
HasDescription                0.398488
URLCharProb                   0.331182
HasSubmitButton               0.302893
IsResponsive                  0.297164
TLDLegitimateProb             0.296253
HasHiddenFields               0.265098
HasFavicon                    0.245452
Robots                        0.222834
IsHTTPS                       0.220749
Pay                           0.203851
NoOfJS                        0.199003
TLDLength                     0.169064
NoOfSelfRef                   0.161625
NoOfImage                     0.150345
LineOfCode                    0.146272
NoOfiFrame                    0.142916
NoOfExternalRef            

## Removing Leakage, High Correlation, and Irrelevant Features


In [4]:
# Columns that would leak information about URLTitleMatchScore into the features.
# 'Title' (the text the score is derived from) is not listed because it is not
# a numeric column, so select_dtypes has already excluded it.
leak_columns = [
    'DomainTitleMatchScore',  # Extremely high correlation (0.96) with the target; direct overlap
    'label'                   # Phishing would cause leakage as it was the original target.
]

# Rebuild the numeric frame from df instead of dropping from numeric_df in place.
# The in-place form raised KeyError when this cell was run a second time,
# because the columns were already gone; this form is safe to re-run.
numeric_df = df.select_dtypes(include=[np.number]).drop(columns=leak_columns)

print("Remaining columns after dropping potential leaks:")
print(numeric_df.columns.tolist())

Remaining columns after dropping potential leaks:
['URLLength', 'DomainLength', 'IsDomainIP', 'URLSimilarityIndex', 'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'HasTitle', 'URLTitleMatchScore', 'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef']


## Prepare DF and Split into Test/Train

In [5]:
# Target vector and predictor matrix
y = numeric_df['URLTitleMatchScore']
X = numeric_df.drop(columns=['URLTitleMatchScore'])

# Defensive re-check: remove the known leak columns if any are still present
leak_columns = ['DomainTitleMatchScore', 'label']
X = X.drop(columns=leak_columns, errors='ignore')

print("Remaining features:", X.columns.tolist())

# Hold out 20% of the rows for the final test; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

Remaining features: ['URLLength', 'DomainLength', 'IsDomainIP', 'URLSimilarityIndex', 'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'HasTitle', 'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef']
Training samples: 188636
Testing samples: 47159


## Scale Features

In [6]:

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so the test split never influences them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Cross-Validation and Evaluation Metrics

We use **10-Fold Cross-Validation** (`KFold`) to get a reliable estimate of generalization performance.
Each fold trains on 90% of the training data and tests on 10%.

**Why 10 folds?**
- Good balance between bias and variance.
- Works well for medium-sized datasets.
- Uses as much data as possible for training each round.

**Evaluation metrics:**
- **MAE (Mean Absolute Error):** Average absolute difference between predicted and actual values.
- **RMSE (Root Mean Squared Error):** Penalizes larger errors more strongly.
- **R² (R-Squared):** Proportion of variance explained by the model.

Combining these metrics gives a better sense of how well the model performs.


In [7]:
# Shuffled 10-fold splitter; the fixed seed keeps the fold assignment repeatable
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Display name -> scikit-learn scorer string. The two error scorers carry a
# "neg_" prefix, so their values are negated again when results are summarised.
scoring = dict(
    MAE='neg_mean_absolute_error',
    RMSE='neg_root_mean_squared_error',
    R2='r2',
)

# Random Forest Regression

In [None]:

# Depth-limited forest of 100 trees, using all available cores
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)

# 10-fold cross-validation on the training split
rf_scores = cross_validate(rf, X_train_scaled, y_train, cv=kf, scoring=scoring)

# Average each metric over the folds; the error metrics come back negated,
# so flip their sign to report them as positive errors.
rf_results = {
    metric: sign * rf_scores['test_' + metric].mean()
    for metric, sign in (('MAE', -1), ('RMSE', -1), ('R2', 1))
}

print(rf_results)


# Random Forest: Actual vs Predicted

In [None]:

# Fit on the whole scaled training split and predict the held-out test split
rf.fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)

# Scatter of predictions against ground truth; points on the dashed red
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6)
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--')
ax.set_xlabel('Actual URLTitleMatchScore')
ax.set_ylabel('Predicted URLTitleMatchScore')
ax.set_title('Random Forest: Actual vs Predicted')
plt.show()

# Pair every feature name with the importance reported by the fitted forest
feature_names = X_train.columns
importances = rf.feature_importances_

# Most important features first
feat_imp = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(feat_imp)

# K-Nearest Neighbors Regression

In [None]:

knn = KNeighborsRegressor(n_neighbors=5)

# KNN is distance-based, so its predictions depend directly on feature scaling.
# Putting the scaler and the model in one pipeline makes cross_validate re-fit
# the scaler on the training part of every fold, so the validation fold never
# contributes to the mean/std used for scaling. (Fitting the scaler on the full
# training split beforehand lets every validation fold influence those statistics.)
knn_pipeline = make_pipeline(StandardScaler(), knn)

# Pass the unscaled training data; scaling now happens inside each fold.
knn_scores = cross_validate(knn_pipeline, X_train, y_train, cv=kf, scoring=scoring)

# Average over folds; the error scorers are negated, so flip them back.
knn_results = {
    'MAE': -knn_scores['test_MAE'].mean(),
    'RMSE': -knn_scores['test_RMSE'].mean(),
    'R2': knn_scores['test_R2'].mean()
}

print(knn_results)


# Support Vector Regression

In [None]:

# Kernel SVR fit time grows faster than quadratically with the number of
# samples, so 10-fold CV on the full training split (~190k rows) is not
# practical. Evaluate on a fixed, seeded random subsample instead.
# NOTE: the SVR scores therefore describe a model trained on far fewer rows
# than the other models; keep that in mind when comparing.
SVR_MAX_SAMPLES = 20000
svr_rng = np.random.default_rng(42)
svr_rows = svr_rng.choice(
    len(X_train),
    size=min(SVR_MAX_SAMPLES, len(X_train)),
    replace=False,
)
X_train_svr = X_train.iloc[svr_rows]
y_train_svr = y_train.iloc[svr_rows]

svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Scale inside the pipeline so each fold's scaler is fitted on that fold's
# training part only (no validation data in the scaling statistics).
svr_pipeline = make_pipeline(StandardScaler(), svr)

# Folds are independent, so run them in parallel to cut wall-clock time.
svr_scores = cross_validate(
    svr_pipeline,
    X_train_svr,
    y_train_svr,
    cv=kf,
    scoring=scoring,
    n_jobs=-1,
)

# Average over folds; the error scorers are negated, so flip them back.
svr_results = {
    'MAE': -svr_scores['test_MAE'].mean(),
    'RMSE': -svr_scores['test_RMSE'].mean(),
    'R2': svr_scores['test_R2'].mean()
}

print(svr_results)


# Model Comparison

In [None]:

# One column per model, one row per metric
comparison = pd.DataFrame(
    {'Random Forest': rf_results, 'KNN': knn_results, 'SVR': svr_results}
)

print(comparison)

# Transpose so each model becomes a group of bars, with one bar per metric
comparison.transpose().plot.bar(figsize=(10,6), title='Regression Model Comparison')


# Model Analysis and Real-World Use

## Which Features Matter Most?
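Random Forest can show which features are most useful for predicting the score. We drop `DomainTitleMatchScore` and the phishing `label` column, and `Title` is excluded along with the other non-numeric columns, so that leaked information does not artificially inflate the results.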

## Why Do These Features Matter?
Features that measure similarity are helpful because phishing URLs often try to look like real ones.

## Usefulness for Interested Parties
This kind of model could help email security providers, browsers, or companies detect phishing attempts early and block suspicious links.

## Measuring and Improving the Model
- Test how many phishing URLs are blocked vs missed.
- Retrain with updated data regularly as attackers change strategies.
- Add more data like page content or SSL info for better predictions.

## Deployment
Run the model as an API or integrate it into a security pipeline. Monitor its performance and update as needed.
