In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
import re
import math
from urllib.parse import urlparse


In [2]:
data = pd.read_csv('malicious_urls.csv')

# Map labels to integers
label_mapping = {'benign': 0, 'phishing': 1, 'defacement': 2}
data['label'] = data['type'].map(label_mapping)

In [3]:
def extract_features(url):
    features = {}
    
    # URL length
    features['url_length'] = len(url)
    
    # Check for IP address
    features['has_ip'] = 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0
    
    # Number of subdomains
    features['num_subdomains'] = url.count('.')
    
    # Presence of suspicious words
    suspicious_words = ['login', 'paypal', 'secure', 'bank', 'phishing']
    features['has_suspicious_words'] = 1 if any(word in url for word in suspicious_words) else 0
    
    # Special characters in URL
    features['num_special_chars'] = len(re.findall(r'[^a-zA-Z0-9./:_-]', url))
    
    # URL Entropy
    entropy = 0
    for char in set(url):
        prob = url.count(char) / len(url)
        entropy -= prob * math.log(prob, 2)
    features['url_entropy'] = entropy

    return features


In [4]:
features_df = pd.DataFrame([extract_features(url) for url in data['url']])

# Add the 'label' column
features_df['label'] = data['label']

# Drop rows with NaN values in 'label' column
features_df = features_df.dropna(subset=['label'])

# Split the dataset into features (X) and target (y)
X = features_df.drop('label', axis=1)
y = features_df['label']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)


In [6]:
class_weights = {0: 1.0, 1: 3.0, 2: 3.0}  # Adjust weights based on the class distribution
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight=class_weights)
model.fit(X_train, y_train)


In [7]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_mapping.keys()))


Accuracy: 0.7653222980526957

Classification Report:

              precision    recall  f1-score   support

      benign       0.86      0.84      0.85    128432
    phishing       0.41      0.48      0.44     22418
  defacement       0.68      0.68      0.68     28937

    accuracy                           0.77    179787
   macro avg       0.65      0.66      0.65    179787
weighted avg       0.78      0.77      0.77    179787



In [None]:
while True:
    user_input = input("Enter URLs separated by commas (or type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        print("Exiting...")
        break

    # Split the input into a list of URLs
    user_urls = [url.strip() for url in user_input.split(',')]
    
    # Extract features for user-input URLs
    user_features = pd.DataFrame([extract_features(url) for url in user_urls])
    
    # Predict using the trained model
    user_predictions = model.predict(user_features)
    
    # Display results
    for url, pred in zip(user_urls, user_predictions):
        predicted_label = list(label_mapping.keys())[list(label_mapping.values()).index(pred)]
        print(f"URL: {url}, Prediction: {predicted_label}")

Enter URLs separated by commas (or type 'exit' to quit):  GET /articles?include=author&fields[articles]=title,body,author&fields[people]=name HTTP/1.1


URL: GET /articles?include=author&fields[articles]=title, Prediction: benign
URL: body, Prediction: phishing
URL: author&fields[people]=name HTTP/1.1, Prediction: benign


Enter URLs separated by commas (or type 'exit' to quit):  push /bank/ /login /articles?include=author&fields[articles]=title,body,author&fields[people]=name HTTP/1.1


URL: push /bank/ /login /articles?include=author&fields[articles]=title, Prediction: benign
URL: body, Prediction: phishing
URL: author&fields[people]=name HTTP/1.1, Prediction: benign


Enter URLs separated by commas (or type 'exit' to quit):  push /bank/ /login /articles?include=author&fields[articles]=title,body,author&fields[people]=name HTTP/1.1


URL: push /bank/ /login /articles?include=author&fields[articles]=title, Prediction: benign
URL: body, Prediction: phishing
URL: author&fields[people]=name HTTP/1.1, Prediction: benign
