In [2]:

# Importing required libraries
import re
import pandas as pd
import numpy as np
from urllib.parse import urlparse
def has_obfuscation(url):
    patterns = [
        r"%[0-9A-Fa-f]{2}",  # URL encoding
        r"@",  # '@' symbol for phishing
        r"[0-9]{1,3}(\.[0-9]{1,3}){3}",  # IP address instead of domain
    ]
    return 1 if any(re.search(pattern, url) for pattern in patterns) else 0


# Function to extract components from a URL to better training of model

# Here the returned data is of int orr float, as RandomForest Trains on this type of values
def extract_features(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    query = parsed_url.query

    return {
        'url_length': len(url),
        'digits_ratio': (sum(c.isdigit() for c in url))/len(url),
        'special_chars_ratio': (sum(c in "/-_.?=&" for c in url))/len(url),
        'contains_ip': 1 if re.match(r'http[s]?://\d+\.\d+\.\d+\.\d+', url) else 0,
        'num_subdomains': parsed_url.netloc.count('.'),
        'path_length': len(path),
        'num_params': query.count('&') + query.count('?'),
        'has_executable': 1 if path.endswith(('.exe', '.sh', '.php')) else 0,
        'has_obfuscation': has_obfuscation(url),
        'is_https': 1 if url.startswith('https') else 0
    }

# Data Preprocessing
# Getting training data from the file containing all malicious urls
with open("/content/urlhaus.abuse.ch.txt", "r") as f:
    urls = [line.strip() for line in f if not line.startswith("#")]

data = pd.DataFrame([extract_features(url) for url in urls])
data['label'] = [1] * len(urls)

df_new = pd.read_csv("/content/PhiUSIIL_Phishing_URL_Dataset.csv")

if "URL" in df_new.columns and "label" in df_new.columns:
    df_new = df_new[["URL", "label"]].rename(columns={"URL": "url"})
else:
    raise ValueError("CSV must contain 'URL' and 'label' columns.")
df_new_features = df_new["url"].apply(lambda x: extract_features(x)).apply(pd.Series)

df_new_final = pd.concat([df_new, df_new_features], axis=1)

df_final = pd.concat([data, df_new_final]).drop_duplicates(subset=["url"]).reset_index(drop=True)

df_final =  df_final.drop('url',axis=1)
print(df_final.head())

   url_length  digits_ratio  special_chars_ratio  contains_ip  num_subdomains  \
0        27.0      0.518519             0.222222          1.0             3.0   
1        32.0      0.000000             0.125000          0.0             2.0   
2        24.0      0.000000             0.208333          0.0             2.0   
3        30.0      0.000000             0.166667          0.0             3.0   
4        27.0      0.000000             0.148148          0.0             2.0   

   path_length  num_params  has_executable  has_obfuscation  is_https  label  
0          2.0         0.0             0.0              1.0       0.0      1  
1          0.0         0.0             0.0              0.0       1.0      1  
2          0.0         0.0             0.0              0.0       1.0      1  
3          0.0         0.0             0.0              0.0       1.0      1  
4          0.0         0.0             0.0              0.0       1.0      1  


In [3]:
df_final.tail()

Unnamed: 0,url_length,digits_ratio,special_chars_ratio,contains_ip,num_subdomains,path_length,num_params,has_executable,has_obfuscation,is_https,label
235366,30.0,0.0,0.133333,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1
235367,29.0,0.0,0.172414,0.0,3.0,0.0,0.0,0.0,0.0,1.0,1
235368,31.0,0.0,0.129032,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1
235369,56.0,0.053571,0.142857,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0
235370,34.0,0.0,0.117647,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1


In [4]:
df_final = df_final.assign(label=df_final['label'].fillna(0))


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Splitting dataset
X = df_final.drop(columns=['label'])
y = df_final['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training RandomForest Model
rf = RandomForestClassifier(n_estimators=100, random_state=112)
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


Accuracy: 0.9958364312267658


In [6]:
# More datasets to train on
phis = pd.read_csv('/content/phishing-urls.csv')
phis['label'] = [1] * len(phis['Domain'])
phis['url'] = phis['Domain'] + phis['Path']
phis_df = phis[['url','label']]
phis_df.head()


Unnamed: 0,url,label
0,asesoresvelfit.com/media/datacredito.co/,1
1,caixa.com.br.fgtsagendesaqueconta.com/consulta...,1
2,hissoulreason.com/js/homepage/home/,1
3,unauthorizd.newebpage.com/webapps/66fbf/,1
4,133.130.103.10/23/,1


In [7]:
leg = pd.read_csv('/content/legitimate-urls.csv')
leg['label'] = [0] * len(leg['Domain'])
leg['url'] = leg['Domain'] + leg['Path']
leg_df = leg[['url','label']]
leg_df.head()

Unnamed: 0,url,label
0,www.liquidgeneration.com/,0
1,www.onlineanime.org/,0
2,www.ceres.dti.ne.jp/~nekoi/senno/senfirst.html,0
3,www.galeon.com/kmh/,0
4,www.fanworkrecs.com/,0


In [9]:
df_combined = pd.concat([phis_df, leg_df], ignore_index=True)
df_shuffled = df_combined
# Checking value of label column
df_shuffled['label']


Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1
...,...
2010,0
2011,0
2012,0
2013,0


In [10]:
import pandas as pd
import re
from urllib.parse import urlparse

def has_obfuscation(url):
    return 1 if '%' in url or '@' in url else 0

def extract_features(url):
    if not isinstance(url, str) or len(url) == 0:
        return {
            'url_length': 0,
            'digits_ratio': 0,
            'special_chars_ratio': 0,
            'contains_ip': 0,
            'num_subdomains': 0,
            'path_length': 0,
            'num_params': 0,
            'has_executable': 0,
            'has_obfuscation': 0,
            'is_https': 0
        }

    parsed_url = urlparse(url)
    path = parsed_url.path
    query = parsed_url.query

    url_length = len(url)

    return {
        'url_length': url_length,
        'digits_ratio': sum(c.isdigit() for c in url) / url_length if url_length > 0 else 0,
        'special_chars_ratio': sum(c in "/-_.?=&" for c in url) / url_length if url_length > 0 else 0,
        'contains_ip': 1 if re.match(r'http[s]?://\d+\.\d+\.\d+\.\d+', url) else 0,
        'num_subdomains': parsed_url.netloc.count('.'),
        'path_length': len(path),
        'num_params': query.count('&') + query.count('?'),
        'has_executable': 1 if path.endswith(('.exe', '.sh', '.php')) else 0,
        'has_obfuscation': has_obfuscation(url),
        'is_https': 1 if url.startswith('https') else 0
    }

# Ensure 'url' column has no NaN values
df_shuffled["url"] = df_shuffled["url"].fillna("")

df_last = df_shuffled["url"].apply(lambda x: extract_features(x)).apply(pd.Series)
df_last['label'] = df_shuffled['label']


In [11]:
df_last.tail()

Unnamed: 0,url_length,digits_ratio,special_chars_ratio,contains_ip,num_subdomains,path_length,num_params,has_executable,has_obfuscation,is_https,label
2010,16.0,0.0,0.1875,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0
2011,18.0,0.0,0.166667,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0
2012,24.0,0.0,0.125,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0
2013,41.0,0.0,0.121951,0.0,0.0,41.0,0.0,0.0,0.0,0.0,0
2014,32.0,0.0,0.1875,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0


In [12]:
from sklearn.metrics import accuracy_score

# Making dataset to train from dataframe
X_val = df_last.drop(columns=["label"], errors="ignore")
y_val_true = df_last["label"] if "label" in df_last.columns else None

# Predict using the trained model
y_val_pred = rf.predict(X_val)

# If labels exist, evaluate accuracy
if y_val_true is not None:
    accuracy = accuracy_score(y_val_true, y_val_pred) * 100
    print(f"Validation Accuracy: {accuracy} %")
else:
    print("Predictions on new dataset:")
    print(y_val_pred)


Validation Accuracy: 50.47146401985112%


In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (accuracy_score, precision_score,
                            recall_score, f1_score, confusion_matrix,
                            classification_report)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 1. Data Preprocessing
X = df_final.drop(columns=['label'])
y = df_final['label']

# Split dataset with stratification to ensure model doesnt remember the order
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


rf = RandomForestClassifier(random_state=112, n_jobs=-1)

# Paramters , enough to train with less computational power
param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'class_weight': [None, 'balanced']
}


random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    scoring='f1',
    random_state=112
)
random_search.fit(X_train, y_train)

best_rf = random_search.best_estimator_
print(f"Best parameters: {random_search.best_params_}")

# Test set evaluation
y_pred = best_rf.predict(X_test)

# Calculating metrics
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

print("\n")
print("Essential Classification Report: ")
print(classification_report(y_test, y_pred, digits=4))


# Showing the most important feature of an url to classify
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_rf.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head(5))



Best parameters: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': 10, 'class_weight': 'balanced'}

Test Set Performance:
Accuracy: 0.9956
Precision: 0.9930
Recall: 0.9994
F1 Score: 0.9962

Essential Classification Report:
              precision    recall  f1-score   support

           0     0.9992    0.9905    0.9949     20104
           1     0.9930    0.9994    0.9962     26971

    accuracy                         0.9956     47075
   macro avg     0.9961    0.9950    0.9955     47075
weighted avg     0.9956    0.9956    0.9956     47075


Top 5 Most Important Features:
          Feature  Importance
9        is_https    0.442863
5     path_length    0.329646
1    digits_ratio    0.111995
0      url_length    0.071065
4  num_subdomains    0.026806
