In [None]:
# === STEP 1: Import Libraries ===
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [None]:
# ============================================================
# === STEP 2: Load Dataset ===
# ============================================================
# Prerequisite: 'malicious_phish.csv' must already be uploaded to the Colab
# file area, otherwise the read below raises FileNotFoundError.
dataset_path = "/content/malicious_phish.csv"
df = pd.read_csv(dataset_path)

# Quick sanity report: overall size and how many rows each URL type has.
type_counts = df['type'].value_counts()
print("✅ Dataset Loaded Successfully!")
print("Dataset Shape:", df.shape)
print("\nType distribution:\n", type_counts)

✅ Dataset Loaded Successfully!
Dataset Shape: (651191, 2)

Type distribution:
 type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [None]:
# ============================================================
# === STEP 3: Convert to Binary Classification (Spam vs Non-Spam) ===
# ============================================================

# Collapse the multi-class 'type' column into a binary target:
#   0 -> 'benign' (non-spam), 1 -> every other type (spam).
# The comparison is vectorised over the whole column instead of calling a
# Python lambda once per row; the resulting values are the same, and the
# dtype is pinned to int64 so it does not depend on the platform default.
df['label'] = df['type'].ne('benign').astype('int64')
print("\nLabel Distribution:\n", df['label'].value_counts())



Label Distribution:
 label
0    428103
1    223088
Name: count, dtype: int64


In [None]:
# ============================================================
# === STEP 4: Balance Dataset ===
# ============================================================

# Partition the rows by binary label.
spam_df = df.loc[df['label'] == 1]
benign_df = df.loc[df['label'] == 0]

# Downsample both partitions to the size of the smaller one so the two
# classes end up with the same number of rows (seeded for repeatability).
min_size = min(map(len, (spam_df, benign_df)))
spam_df, benign_df = (
    subset.sample(n=min_size, random_state=42)
    for subset in (spam_df, benign_df)
)

# Stack the two samples and shuffle the combined rows.
balanced_df = pd.concat([spam_df, benign_df]).sample(frac=1, random_state=42)

balanced_counts = balanced_df['label'].value_counts()
print("\nBalanced Label Distribution:\n", balanced_counts)
print("✅ Dataset Balanced Successfully!")


Balanced Label Distribution:
 label
1    223088
0    223088
Name: count, dtype: int64
✅ Dataset Balanced Successfully!


In [None]:
# ============================================================
# === STEP 5: Prepare Features and Labels ===
# ============================================================
# Raw URL strings are the model input; the binary label is the target.
X, y = balanced_df['url'], balanced_df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train_raw, X_test_raw, y_train, y_test = split_parts

print("\nTraining samples:", len(X_train_raw))
print("Testing samples:", len(X_test_raw))


Training samples: 356940
Testing samples: 89236


In [None]:
# ============================================================
# === STEP 6: Text Vectorization using TF-IDF ===
# ============================================================
# Vocabulary is capped at 3000 terms. The vectorizer is fitted on the
# training URLs only and then reused, unchanged, to transform the test URLs.
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

print(
    "\n✅ TF-IDF Vectorization Complete",
    f"Feature Matrix Shape: {X_train.shape}",
    sep="\n",
)


✅ TF-IDF Vectorization Complete
Feature Matrix Shape: (356940, 3000)


In [None]:
# ============================================================
# === STEP 7: Train Models ===
# ============================================================

# --- Support Vector Machine ---
# SVC(kernel='linear') is backed by libsvm, whose fit time grows at least
# quadratically with the number of training samples; scikit-learn's docs
# recommend LinearSVC for large datasets such as this one (hundreds of
# thousands of rows). LinearSVC trains a linear SVM via liblinear and keeps
# the same fit / predict / decision_function API, so downstream cells and
# the saved 'svm_model.pkl' are used the same way.
# NOTE: LinearSVC is not numerically identical to SVC(kernel='linear')
# (different solver and default loss), so metrics may differ slightly.
print("\n🎯 Training Support Vector Machine (SVM)...")
svm_model = LinearSVC(C=1, random_state=42)
svm_model.fit(X_train, y_train)
print("✅ SVM Model Trained Successfully!")

# --- Random Forest ---
# n_jobs=-1 builds the trees on all available cores. With a fixed
# random_state this only affects training time, not the fitted forest.
print("\n🎯 Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
print("✅ Random Forest Model Trained Successfully!")


🎯 Training Support Vector Machine (SVM)...
✅ SVM Model Trained Successfully!

🎯 Training Random Forest...
✅ Random Forest Model Trained Successfully!


In [None]:
# ============================================================
# === STEP 8: Model Evaluation ===
# ============================================================
print("\n--- Model Evaluation ---")

# Predict the held-out test matrix with each fitted classifier.
y_pred_svm, y_pred_rf = (clf.predict(X_test) for clf in (svm_model, rf_model))

# Overall accuracy per model.
acc_svm, acc_rf = (
    accuracy_score(y_test, preds) for preds in (y_pred_svm, y_pred_rf)
)
print(f"\nSVM Accuracy: {acc_svm:.4f}")
print(f"Random Forest Accuracy: {acc_rf:.4f}")

# Per-class precision / recall / F1 for each model.
for model_name, preds in (("SVM", y_pred_svm), ("Random Forest", y_pred_rf)):
    print(f"\n--- {model_name} Classification Report ---")
    print(classification_report(y_test, preds))

# Confusion matrices, printed as raw arrays (no plot is drawn here).
cm_svm, cm_rf = (
    confusion_matrix(y_test, preds) for preds in (y_pred_svm, y_pred_rf)
)
print("\nSVM Confusion Matrix:\n", cm_svm)
print("\nRF Confusion Matrix:\n", cm_rf)


--- Model Evaluation ---

SVM Accuracy: 0.9301
Random Forest Accuracy: 0.9472

--- SVM Classification Report ---
              precision    recall  f1-score   support

           0       0.90      0.96      0.93     44618
           1       0.96      0.90      0.93     44618

    accuracy                           0.93     89236
   macro avg       0.93      0.93      0.93     89236
weighted avg       0.93      0.93      0.93     89236


--- Random Forest Classification Report ---
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     44618
           1       0.97      0.92      0.95     44618

    accuracy                           0.95     89236
   macro avg       0.95      0.95      0.95     89236
weighted avg       0.95      0.95      0.95     89236


SVM Confusion Matrix:
 [[42948  1670]
 [ 4571 40047]]

RF Confusion Matrix:
 [[43526  1092]
 [ 3618 41000]]


In [None]:
# ============================================================
# === STEP 9: Save Trained Models ===
# ============================================================
# Persist both classifiers together with the fitted vectorizer: the
# vectorizer is needed to turn new URLs into the features the models expect.
for artifact, destination in (
    (svm_model, "/content/svm_model.pkl"),
    (rf_model, "/content/rf_model.pkl"),
    (vectorizer, "/content/vectorizer.pkl"),
):
    joblib.dump(artifact, destination)

print("\n✅ Models Saved Successfully!")
print(
    "Files saved as:",
    " - svm_model.pkl",
    " - rf_model.pkl",
    " - vectorizer.pkl",
    sep="\n",
)


✅ Models Saved Successfully!
Files saved as:
 - svm_model.pkl
 - rf_model.pkl
 - vectorizer.pkl


In [None]:
# ============================================================
# === STEP 10: Test with a Sample URL ===
# ============================================================
# Smoke test: push one hand-written URL through the fitted vectorizer and
# ask both classifiers for a prediction.
sample_url = "http://login-verify-secure-paypal.com"
sample_vec = vectorizer.transform([sample_url])

# predict() returns one entry per input row; only one row was supplied.
pred_svm, pred_rf = (
    clf.predict(sample_vec)[0] for clf in (svm_model, rf_model)
)

# Index 0 (falsy prediction) -> SAFE, index 1 (truthy prediction) -> SPAM.
verdicts = ("SAFE", "SPAM")

print("\n--- Sample URL Test ---")
print("URL:", sample_url)
print(f"SVM Prediction: {verdicts[bool(pred_svm)]}")
print(f"RF Prediction: {verdicts[bool(pred_rf)]}")

print("\n🎉 All Steps Completed Successfully!")



--- Sample URL Test ---
URL: http://login-verify-secure-paypal.com
SVM Prediction: SPAM
RF Prediction: SPAM

🎉 All Steps Completed Successfully!
