In [1]:

# === STEP 1: Import Libraries ===

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.sparse import hstack
import joblib

print("Libraries Imported Successfully!")


Libraries Imported Successfully!


In [2]:

# === STEP 2: Load Dataset (Limit to 100k Rows) ===

df = pd.read_csv("/content/malicious_phish.csv").sample(n=100000, random_state=42)

print("Dataset Loaded Successfully!")
print("Dataset Shape:", df.shape)
print("\nType Distribution:\n", df['type'].value_counts())
df.head()


Dataset Loaded Successfully!
Dataset Shape: (100000, 2)

Type Distribution:
 type
benign        65966
defacement    14690
phishing      14317
malware        5027
Name: count, dtype: int64


Unnamed: 0,url,type
536448,http://37.49.226.178/deusbins/deus.sh4,malware
40630,medical-dictionary.thefreedictionary.com/Galt+...,benign
630496,www.jscape.com/sshfactory/,phishing
426724,http://www.wsnc.org.au/component/jcalpro/view/983,defacement
184034,virtualtourist.com/travel/North_America/Canada...,benign


In [3]:

# === STEP 3: Convert to Binary Classification ===

df['label'] = df['type'].apply(lambda x: 0 if x == 'benign' else 1)
print("\nLabel Distribution:\n", df['label'].value_counts())



Label Distribution:
 label
0    65966
1    34034
Name: count, dtype: int64


In [4]:

# === STEP 4: Clean and Preprocess URLs ===

def clean_url(url):
    url = str(url).lower()
    url = re.sub(r'https?://', '', url)  # remove http/https
    url = re.sub(r'www\\.', '', url)     # remove www
    url = url.strip().strip('/')
    return url

df['clean_url'] = df['url'].apply(clean_url)

print("URLs Cleaned Successfully!")
df[['url', 'clean_url']].head(5)


URLs Cleaned Successfully!


Unnamed: 0,url,clean_url
536448,http://37.49.226.178/deusbins/deus.sh4,37.49.226.178/deusbins/deus.sh4
40630,medical-dictionary.thefreedictionary.com/Galt+...,medical-dictionary.thefreedictionary.com/galt+...
630496,www.jscape.com/sshfactory/,www.jscape.com/sshfactory
426724,http://www.wsnc.org.au/component/jcalpro/view/983,www.wsnc.org.au/component/jcalpro/view/983
184034,virtualtourist.com/travel/North_America/Canada...,virtualtourist.com/travel/north_america/canada...


In [5]:

# === STEP 5: Extract Additional URL Features ===

def extract_features(url):
    return {
        "url_length": len(url),
        "count_digits": sum(c.isdigit() for c in url),
        "count_dots": url.count('.'),
        "count_hyphens": url.count('-'),
        "count_at": url.count('@'),
        "count_question": url.count('?'),
        "count_equals": url.count('='),
        "has_ip": 1 if re.search(r'\\b\\d{1,3}(?:\\.\\d{1,3}){3}\\b', url) else 0,
        "has_suspicious_word": 1 if any(w in url for w in
                                        ['login','verify','update','free','click','secure',
                                         'account','bank','signin','confirm','password']) else 0
    }

feature_df = df['clean_url'].apply(extract_features).apply(pd.Series)
print("Additional Features Extracted Successfully!")
feature_df.head()


Additional Features Extracted Successfully!


Unnamed: 0,url_length,count_digits,count_dots,count_hyphens,count_at,count_question,count_equals,has_ip,has_suspicious_word
536448,31,11,4,0,0,0,0,0,0
40630,54,0,2,1,0,0,0,0,1
630496,25,0,2,0,0,0,0,0,0
426724,42,3,3,0,0,0,0,0,0
184034,121,7,2,5,0,0,0,0,0


In [6]:

# === STEP 6: Combine TF-IDF with Additional Features ===

tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['clean_url'])

# Combine numerical features with TF-IDF
X_combined = hstack([X_tfidf, feature_df.values])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
print("Features Combined and Data Split Successfully!")


Features Combined and Data Split Successfully!


In [7]:

# === STEP 7: Train & Evaluate Model ===

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("\nModel Trained Successfully!")
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save Model and TF-IDF
joblib.dump(model, "spam_url_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("\nModel and Vectorizer Saved Successfully!")



Model Trained Successfully!

Accuracy: 0.92395

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94     13268
           1       0.92      0.85      0.88      6732

    accuracy                           0.92     20000
   macro avg       0.92      0.90      0.91     20000
weighted avg       0.92      0.92      0.92     20000


Confusion Matrix:
 [[12777   491]
 [ 1030  5702]]

Model and Vectorizer Saved Successfully!


In [8]:

# === STEP 8: Test on Custom URLs ===

test_urls = [
    "https://google.com",
    "http://free-gift.ru/login",
    "https://secure-paypal-login.xyz",
    "http://update-account-info.net",
    "https://pes.edu",
    "http:/google.com"
]

# Clean and extract new features
test_clean = [clean_url(u) for u in test_urls]
test_features = [extract_features(u) for u in test_clean]
test_feature_df = pd.DataFrame(test_features)

# TF-IDF transform
tfidf_loaded = joblib.load("tfidf_vectorizer.pkl")
test_tfidf = tfidf_loaded.transform(test_clean)

# Combine TF-IDF + numeric features
test_combined = hstack([test_tfidf, test_feature_df.values])

# Predict
loaded_model = joblib.load("spam_url_model.pkl")
preds = loaded_model.predict(test_combined)

for url, pred in zip(test_urls, preds):
    print(f"{url} → {'SPAM' if pred == 1 else 'SAFE'}")


https://google.com → SAFE
http://free-gift.ru/login → SPAM
https://secure-paypal-login.xyz → SPAM
http://update-account-info.net → SPAM
https://pes.edu → SAFE
http:/google.com → SAFE
