In [1]:
# === STEP 1: Import Libraries ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [2]:
# ============================================================
# === STEP 2: Load Dataset ===
# ============================================================
# Prerequisite: 'malicious_phish.csv' must already be uploaded to the Colab
# files area (/content) before this cell is executed.
dataset_path = "/content/malicious_phish.csv"
df = pd.read_csv(dataset_path)

# Quick sanity summary: frame dimensions, then the row count per 'type' value.
print("✅ Dataset Loaded Successfully!")
print("Dataset Shape:", df.shape)
type_counts = df['type'].value_counts()
print("\nType distribution:\n", type_counts)

✅ Dataset Loaded Successfully!
Dataset Shape: (651191, 2)

Type distribution:
 type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [3]:
# ============================================================
# === STEP 3: Convert to Binary Classification (Spam vs Non-Spam) ===
# ============================================================

# Collapse the 'type' column into a binary target stored in df['label']:
#   0 -> type == 'benign' (non-spam)
#   1 -> any other type   (spam)
# A vectorized comparison is used instead of a per-row Python lambda via
# .apply(); it yields the same 0/1 integers without calling Python once per row.
# As with the lambda, a missing 'type' value compares unequal to 'benign' and
# is therefore labelled 1.
df['label'] = df['type'].ne('benign').astype('int64')
print("\nLabel Distribution:\n", df['label'].value_counts())



Label Distribution:
 label
0    428103
1    223088
Name: count, dtype: int64


In [4]:
# ============================================================
# === STEP 4: Balance Dataset ===
# ============================================================

# Partition the rows by binary label (1 = spam, 0 = benign).
label_column = df['label']
spam_df = df[label_column == 1]
benign_df = df[label_column == 0]

# Downsample each class to the size of the smaller one so both end up equal.
# spam_df / benign_df are rebound to the sampled frames from here on.
min_size = min(len(spam_df), len(benign_df))
spam_df, benign_df = (
    class_frame.sample(min_size, random_state=42)
    for class_frame in (spam_df, benign_df)
)

# Stack the two classes, then shuffle every row (frac=1 keeps all of them).
balanced_df = (
    pd.concat([spam_df, benign_df])
    .sample(frac=1, random_state=42)
)

print("\nBalanced Label Distribution:\n", balanced_df['label'].value_counts())
print("✅ Dataset Balanced Successfully!")


Balanced Label Distribution:
 label
1    223088
0    223088
Name: count, dtype: int64
✅ Dataset Balanced Successfully!


In [5]:
# ============================================================
# === STEP 5: Prepare Features and Labels ===
# ============================================================
# Features are the raw URL strings; the target is the binary label column.
X = balanced_df['url']
y = balanced_df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both partitions; the seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train_raw, X_test_raw, y_train, y_test = split_parts

print("\nTraining samples:", len(X_train_raw))
print("Testing samples:", len(X_test_raw))


Training samples: 356940
Testing samples: 89236


In [6]:
# ============================================================
# === STEP 6: Text Vectorization using TF-IDF ===
# ============================================================
# The vocabulary is capped at 3000 features and is learned from the training
# URLs only; the test URLs are transformed with that already-fitted vectorizer.
tfidf_options = {"max_features": 3000}
vectorizer = TfidfVectorizer(**tfidf_options)

X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

print("\n✅ TF-IDF Vectorization Complete")
print("Feature Matrix Shape:", X_train.shape)


✅ TF-IDF Vectorization Complete
Feature Matrix Shape: (356940, 3000)


In [11]:
# ============================================================
# === STEP 7: Train Models (Optimized for 100k URLs) ===
# ============================================================
# Builds a reduced, class-balanced working set, vectorizes it with TF-IDF,
# trains an SVM and a Random Forest, evaluates both on the held-out split and
# saves the fitted artifacts. SVC, RandomForestClassifier, the metric helpers
# and joblib are all imported in the STEP 1 cell, so they are not re-imported
# in the middle of this cell.

# Settings for the reduced experiment, named once instead of repeated as literals.
SMALL_SAMPLES_PER_CLASS = 50000   # 50k spam + 50k benign -> 100k URLs
SMALL_MAX_FEATURES = 1500         # reduced TF-IDF vocabulary for speed
SMALL_TEST_SIZE = 0.2             # fraction of the 100k URLs held out for testing
SMALL_RANDOM_STATE = 42           # one seed for sampling, shuffling, splitting and both models

# --- Reduce dataset size to 100k (50k spam + 50k benign) ---
spam_df_small = spam_df.sample(SMALL_SAMPLES_PER_CLASS, random_state=SMALL_RANDOM_STATE)
benign_df_small = benign_df.sample(SMALL_SAMPLES_PER_CLASS, random_state=SMALL_RANDOM_STATE)
balanced_small_df = pd.concat([spam_df_small, benign_df_small]).sample(
    frac=1, random_state=SMALL_RANDOM_STATE
)

# Prepare features and labels
X_small = balanced_small_df['url']
y_small = balanced_small_df['label']

# Split into train/test (stratified so both partitions keep the 50/50 label mix)
X_train_raw_small, X_test_raw_small, y_train_small, y_test_small = train_test_split(
    X_small,
    y_small,
    test_size=SMALL_TEST_SIZE,
    random_state=SMALL_RANDOM_STATE,
    stratify=y_small,
)

# TF-IDF Vectorization: fit on the training URLs only, then reuse the fitted
# vocabulary to transform the test URLs.
vectorizer_small = TfidfVectorizer(max_features=SMALL_MAX_FEATURES)
X_train_small = vectorizer_small.fit_transform(X_train_raw_small)
X_test_small = vectorizer_small.transform(X_test_raw_small)

print("\n✅ TF-IDF Vectorization Complete (Optimized)")

# --- Support Vector Machine ---
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples and suggests LinearSVC for large
# datasets. Switching estimators would change the fitted model and the reported
# metrics, so the original estimator is kept here.
print("\n🎯 Training Support Vector Machine (SVM)...")
svm_model = SVC(kernel='linear', C=1, random_state=SMALL_RANDOM_STATE)
svm_model.fit(X_train_small, y_train_small)
print("✅ SVM Model Trained Successfully!")

# --- Random Forest ---
print("\n🎯 Training Random Forest on 100k URLs...")
rf_model = RandomForestClassifier(
    n_estimators=100, random_state=SMALL_RANDOM_STATE, n_jobs=-1
)
rf_model.fit(X_train_small, y_train_small)
print("✅ Random Forest Model Trained Successfully!")

# --- Evaluate Models ---
print("\n--- Model Evaluation ---")

# Each entry: (name used in the accuracy/report headings,
#              name used in the confusion-matrix heading, fitted model).
small_eval_results = {}
for report_name, matrix_name, fitted_model in (
    ("SVM", "SVM", svm_model),
    ("Random Forest", "RF", rf_model),
):
    predictions = fitted_model.predict(X_test_small)
    accuracy = accuracy_score(y_test_small, predictions)
    print(f"\n{report_name} Accuracy: {accuracy:.4f}")
    print(f"\n--- {report_name} Classification Report ---")
    print(classification_report(y_test_small, predictions))
    matrix = confusion_matrix(y_test_small, predictions)
    print(f"\n{matrix_name} Confusion Matrix:\n", matrix)
    small_eval_results[matrix_name] = (predictions, accuracy, matrix)

# Expose the per-model results under the variable names this cell has always defined.
y_pred_svm, acc_svm, cm_svm = small_eval_results["SVM"]
y_pred_rf, acc_rf, cm_rf = small_eval_results["RF"]

# --- Save Models & Vectorizer ---
# The vectorizer is saved alongside the models: new URLs must be transformed
# with this same fitted vocabulary before either model can score them.
joblib.dump(svm_model, "/content/svm_model.pkl")
joblib.dump(rf_model, "/content/rf_model.pkl")
joblib.dump(vectorizer_small, "/content/vectorizer.pkl")
print("\n✅ Models and vectorizer saved successfully!")



✅ TF-IDF Vectorization Complete (Optimized)

🎯 Training Support Vector Machine (SVM)...
✅ SVM Model Trained Successfully!

🎯 Training Random Forest on 100k URLs...
✅ Random Forest Model Trained Successfully!

--- Model Evaluation ---

SVM Accuracy: 0.9195

--- SVM Classification Report ---
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     10000
           1       0.95      0.88      0.92     10000

    accuracy                           0.92     20000
   macro avg       0.92      0.92      0.92     20000
weighted avg       0.92      0.92      0.92     20000


SVM Confusion Matrix:
 [[9549  451]
 [1159 8841]]

Random Forest Accuracy: 0.9375

--- Random Forest Classification Report ---
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     10000
           1       0.97      0.91      0.94     10000

    accuracy                           0.94     20000
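   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000


RF Confusion Matrix:
 [[9676  324]
 [ 926 9074]]

✅ Models and vectorizer saved successfully!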

In [12]:
# ============================================================
# === STEP 8: Model Evaluation ===
# ============================================================
# accuracy_score, classification_report and confusion_matrix are already
# imported in the STEP 1 cell, so they are not imported again here.


def report_model(report_name, matrix_name, model, X_eval, y_eval):
    """Print accuracy, classification report and confusion matrix for one model.

    Args:
        report_name: Label used in the accuracy and classification-report headings.
        matrix_name: Label used in the confusion-matrix heading.
        model: Fitted estimator exposing ``predict``.
        X_eval: Feature matrix passed to ``model.predict``.
        y_eval: True labels compared against the predictions.

    Returns:
        Tuple ``(predictions, accuracy, matrix)`` holding the predicted labels,
        the accuracy score and the confusion matrix.
    """
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    print(f"\n{report_name} Accuracy: {accuracy:.4f}")
    print(f"\n--- {report_name} Classification Report ---")
    print(classification_report(y_eval, predictions))
    matrix = confusion_matrix(y_eval, predictions)
    print(f"\n{matrix_name} Confusion Matrix:\n", matrix)
    return predictions, accuracy, matrix


print("\n--- Model Evaluation ---")

# --- SVM Evaluation ---
y_pred_svm, acc_svm, cm_svm = report_model(
    "SVM", "SVM", svm_model, X_test_small, y_test_small
)

# --- Random Forest Evaluation ---
# The confusion-matrix heading uses the short name "RF", as before.
y_pred_rf, acc_rf, cm_rf = report_model(
    "Random Forest", "RF", rf_model, X_test_small, y_test_small
)


--- Model Evaluation ---

SVM Accuracy: 0.9195

--- SVM Classification Report ---
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     10000
           1       0.95      0.88      0.92     10000

    accuracy                           0.92     20000
   macro avg       0.92      0.92      0.92     20000
weighted avg       0.92      0.92      0.92     20000


SVM Confusion Matrix:
 [[9549  451]
 [1159 8841]]

Random Forest Accuracy: 0.9375

--- Random Forest Classification Report ---
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     10000
           1       0.97      0.91      0.94     10000

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000


RF Confusion Matrix:
 [[9676  324]
 [ 926 9074]]


In [13]:
import joblib

# Persist both fitted models and the fitted TF-IDF vectorizer. Each pair is
# (object to serialize, destination path).
artifacts_to_save = (
    (svm_model, "/content/svm_model.pkl"),
    (rf_model, "/content/rf_model.pkl"),
    (vectorizer_small, "/content/vectorizer.pkl"),
)
for artifact, destination in artifacts_to_save:
    joblib.dump(artifact, destination)

# Confirmation banner followed by the list of written file names.
print("\n✅ Models and vectorizer saved successfully!")
print("Files saved as:")
for listed_name in (" - svm_model.pkl", " - rf_model.pkl", " - vectorizer.pkl"):
    print(listed_name)


✅ Models and vectorizer saved successfully!
Files saved as:
 - svm_model.pkl
 - rf_model.pkl
 - vectorizer.pkl


In [14]:
# Smoke test: score one hand-written URL with both trained models.
sample_url = "http://login-verify-secure-paypal.com"

# The URL is wrapped in a list because the vectorizer transforms a collection
# of documents; the same fitted vectorizer used for training is reused here.
sample_vec = vectorizer_small.transform([sample_url])

# predict() returns one label per input row, so take the first (only) element.
pred_svm, pred_rf = (
    classifier.predict(sample_vec)[0] for classifier in (svm_model, rf_model)
)

# A truthy label (1) is reported as SPAM, a falsy label (0) as SAFE.
svm_verdict = 'SPAM' if pred_svm else 'SAFE'
rf_verdict = 'SPAM' if pred_rf else 'SAFE'

print("\n--- Sample URL Test ---")
print("URL:", sample_url)
print(f"SVM Prediction: {svm_verdict}")
print(f"RF Prediction: {rf_verdict}")


--- Sample URL Test ---
URL: http://login-verify-secure-paypal.com
SVM Prediction: SPAM
RF Prediction: SPAM
