In [1]:
# ============================================================
# === STEP 1: Import Libraries ===
# ============================================================

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.sparse import hstack
import joblib

print("✅ Libraries Imported Successfully!")


✅ Libraries Imported Successfully!


In [2]:
# ============================================================
# === STEP 2: Load Dataset (Limit to 100k Rows) ===
# ============================================================

df = pd.read_csv("/content/malicious_phish.csv").sample(n=100000, random_state=42)

print("✅ Dataset Loaded Successfully!")
print("Dataset Shape:", df.shape)
print("\nType Distribution:\n", df['type'].value_counts())
df.head()


✅ Dataset Loaded Successfully!
Dataset Shape: (100000, 2)

Type Distribution:
 type
benign        65966
defacement    14690
phishing      14317
malware        5027
Name: count, dtype: int64


Unnamed: 0,url,type
536448,http://37.49.226.178/deusbins/deus.sh4,malware
40630,medical-dictionary.thefreedictionary.com/Galt+...,benign
630496,www.jscape.com/sshfactory/,phishing
426724,http://www.wsnc.org.au/component/jcalpro/view/983,defacement
184034,virtualtourist.com/travel/North_America/Canada...,benign


In [3]:
# ============================================================
# === STEP 3: Convert to Binary Classification ===
# ============================================================

df['label'] = df['type'].apply(lambda x: 0 if x == 'benign' else 1)
print("\nLabel Distribution:\n", df['label'].value_counts())



Label Distribution:
 label
0    65966
1    34034
Name: count, dtype: int64


In [4]:
# ============================================================
# === STEP 4: Clean and Preprocess URLs ===
# ============================================================

def clean_url(url):
    url = str(url).lower()
    url = re.sub(r'https?://', '', url)  # remove http/https
    url = re.sub(r'www\\.', '', url)     # remove www
    url = url.strip().strip('/')
    return url

df['clean_url'] = df['url'].apply(clean_url)

print("✅ URLs Cleaned Successfully!")
df[['url', 'clean_url']].head(5)


✅ URLs Cleaned Successfully!


Unnamed: 0,url,clean_url
536448,http://37.49.226.178/deusbins/deus.sh4,37.49.226.178/deusbins/deus.sh4
40630,medical-dictionary.thefreedictionary.com/Galt+...,medical-dictionary.thefreedictionary.com/galt+...
630496,www.jscape.com/sshfactory/,www.jscape.com/sshfactory
426724,http://www.wsnc.org.au/component/jcalpro/view/983,www.wsnc.org.au/component/jcalpro/view/983
184034,virtualtourist.com/travel/North_America/Canada...,virtualtourist.com/travel/north_america/canada...


In [5]:
# ============================================================
# === STEP 5: Extract Additional URL Features ===
# ============================================================

def extract_features(url):
    return {
        "url_length": len(url),
        "count_digits": sum(c.isdigit() for c in url),
        "count_dots": url.count('.'),
        "count_hyphens": url.count('-'),
        "count_at": url.count('@'),
        "count_question": url.count('?'),
        "count_equals": url.count('='),
        "has_ip": 1 if re.search(r'\\b\\d{1,3}(?:\\.\\d{1,3}){3}\\b', url) else 0,
        "has_suspicious_word": 1 if any(w in url for w in
                                        ['login','verify','update','free','click','secure',
                                         'account','bank','signin','confirm','password']) else 0
    }

feature_df = df['clean_url'].apply(extract_features).apply(pd.Series)
print("✅ Additional Features Extracted Successfully!")
feature_df.head()


✅ Additional Features Extracted Successfully!


Unnamed: 0,url_length,count_digits,count_dots,count_hyphens,count_at,count_question,count_equals,has_ip,has_suspicious_word
536448,31,11,4,0,0,0,0,0,0
40630,54,0,2,1,0,0,0,0,1
630496,25,0,2,0,0,0,0,0,0
426724,42,3,3,0,0,0,0,0,0
184034,121,7,2,5,0,0,0,0,0


In [6]:
# ============================================================
# === STEP 6: Combine TF-IDF with Additional Features ===
# ============================================================

tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['clean_url'])

# Combine numerical features with TF-IDF
X_combined = hstack([X_tfidf, feature_df.values])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
print("✅ Features Combined and Data Split Successfully!")


✅ Features Combined and Data Split Successfully!


In [7]:
# ============================================================
# === STEP 7: Train & Evaluate Model ===
# ============================================================

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("\n✅ Model Trained Successfully!")
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save Model and TF-IDF
joblib.dump(model, "spam_url_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("\n💾 Model and Vectorizer Saved Successfully!")



✅ Model Trained Successfully!

Accuracy: 0.92395

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94     13268
           1       0.92      0.85      0.88      6732

    accuracy                           0.92     20000
   macro avg       0.92      0.90      0.91     20000
weighted avg       0.92      0.92      0.92     20000


Confusion Matrix:
 [[12777   491]
 [ 1030  5702]]

💾 Model and Vectorizer Saved Successfully!


In [None]:
# ============================================================
# 🚀 Spam URL Detection - Streamlit Web App (ENHANCED)
# ============================================================

import streamlit as st
import pandas as pd
import joblib
import re
from datetime import datetime
# CRITICAL: Re-added hstack for combining TF-IDF and numeric features, as used in enhanced_model.ipynb
from scipy.sparse import hstack 

# ============================================================
# Load Models and Vectorizer
# ============================================================
# IMPORTANT: Ensure these files are in the same directory (tfidf_vectorizer.pkl, spam_url_model.pkl)
try:
    # Loading the Vectorizer and the final combined-feature Random Forest model 
    # as saved in the enhanced_model.ipynb (Steps 8 & 9).
    model = joblib.load("spam_url_model.pkl") 
    vectorizer = joblib.load("tfidf_vectorizer.pkl")
except FileNotFoundError:
    st.error("Model files (spam_url_model.pkl, tfidf_vectorizer.pkl) not found. Please ensure they are in the correct directory.")
    st.stop()


# ============================================================
# Feature Preprocessing (Enhanced)
# ============================================================
def clean_url(url):
    """
    Cleans the URL. Implemented based on STEP 4 of enhanced_model.ipynb:
    removes http/https and www., converts to lowercase, and strips trailing '/'.
    """
    url = str(url).lower()
    url = re.sub(r'https?://', '', url)  # remove http/https
    url = re.sub(r'www\.?', '', url)     # remove www (fixed regex for consistency)
    url = url.strip().strip('/')
    return url

# CRITICAL: Re-introducing the feature extraction logic based on STEP 5 of enhanced_model.ipynb
def extract_features(url):
    """
    Extracts additional numeric features from the URL string.
    """
    return {
        "url_length": len(url),
        "count_digits": sum(c.isdigit() for c in url),
        "count_dots": url.count('.'),
        "count_hyphens": url.count('-'),
        "count_at": url.count('@'),
        "count_question": url.count('?'),
        "count_equals": url.count('='),
        "has_ip": 1 if re.search(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', url) else 0,
        "has_suspicious_word": 1 if any(w in url for w in 
            ['login','verify','update','free','click','secure',
            'account','bank','signin','confirm','password']) else 0
    }

# ============================================================
# Prediction Function
# ============================================================
def predict_url(url):
    """
    Predicts if a URL is SPAM (1) or SAFE (0) using the combined-feature model.
    """
    
    # 1. Clean the URL (Text Feature)
    clean = clean_url(url)
    
    # 2. Extract Numeric Features
    features_dict = extract_features(clean)
    # Convert features to a DataFrame and then to a numpy array for hstack
    feature_df = pd.DataFrame([features_dict])
    numeric_features = feature_df.values
    
    # 3. Apply TF-IDF Vectorization (Sparse Matrix)
    tfidf_vector = vectorizer.transform([clean]) 
    
    # 4. Combine TF-IDF and Numeric Features (CRITICAL FIX based on .ipynb Step 7)
    combined_features = hstack([tfidf_vector, numeric_features])

    # 5. Predict
    pred = model.predict(combined_features)[0]
    
    # Label mapping: 1 = malicious (SPAM), 0 = benign (SAFE)
    return "🚨 SPAM" if pred == 1 else "✅ SAFE"

# ============================================================
# Streamlit UI Setup
# ============================================================

st.set_page_config(page_title="Spam URL Detector", page_icon="🔒", layout="centered")

st.markdown("<h1 style='text-align:center; color:#2C3E50;'>🔍 Spam URL Detection System (Enhanced Model)</h1>", unsafe_allow_html=True)
st.write("")
st.markdown("<p style='text-align:center; font-size:16px;'>Check if a URL is safe or potentially malicious using a combined text and feature-engineered model.</p>", unsafe_allow_html=True)

# URL Input
url_input = st.text_input("Enter a URL to analyze:")

# CRITICAL CHANGE: Model selection removed as only one combined model was trained/saved in the notebook.

# Predict Button
if st.button("🔍 Analyze"):
    if not url_input.strip():
        st.warning("Please enter a valid URL.")
    else:
        url_to_analyze = url_input

        with st.spinner("Analyzing... Please wait..."):
            # Call prediction function without model_choice argument
            result = predict_url(url_to_analyze)
        
        # Displaying the result
        if "SPAM" in result:
            st.error(result)
        else:
            st.success(result)

        # Save logs
        log_entry = pd.DataFrame({
            "Timestamp": [datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
            "URL": [url_to_analyze],
            "Model": ["Combined Random Forest"], # Fixed model name
            "Prediction": [result]
        })
        
        # Robust logging implementation
        try:
            # Read existing logs and append without header
            logs_df = pd.read_csv("logs.csv")
            log_entry.to_csv("logs.csv", mode='a', header=False, index=False)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # If file doesn't exist or is empty, write with header
            log_entry.to_csv("logs.csv", index=False)


# ============================================================
# Display Logs
# ============================================================
if st.checkbox("📜 Show Prediction History"):
    try:
        logs = pd.read_csv("logs.csv")
        st.dataframe(logs, use_container_width=True)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        st.info("No logs yet. Start predicting!")

# ============================================================
# Footer
# ============================================================
st.markdown("---")
st.markdown("<p style='text-align:center; color:gray;'>Spam URL Detection using Combined RF Model</p>", unsafe_allow_html=True)

https://google.com → ✅ SAFE
http://free-gift.ru/login → 🚨 SPAM
https://secure-paypal-login.xyz → 🚨 SPAM
http://update-account-info.net → 🚨 SPAM
https://pes.edu → ✅ SAFE
http:/google.com → ✅ SAFE
