In [2]:
# ========================================
# Quantum Dataset Generator - Feature Extraction
# ========================================

# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import re
from urllib.parse import urlparse
import math
import os
from tqdm import tqdm

print("📦 All libraries imported successfully!")

# Set paths to your dataset files
train_data_path = "train_data.csv"  # Update with your actual path
test_data_path = "test_data.csv"    # Update with your actual path


📦 All libraries imported successfully!


In [3]:
# CELL 2: Load the datasets
print("Loading datasets...")
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Check if target column exists
if 'Label' not in train_df.columns:
    # Try to find the target column
    target_col = None
    for col in train_df.columns:
        if col.lower() in ['label', 'target', 'class', 'is_malicious']:
            target_col = col
            break
    
    if target_col:
        print(f"Using '{target_col}' as target column")
        train_df = train_df.rename(columns={target_col: 'target'})
        test_df = test_df.rename(columns={target_col: 'target'})
    else:
        # If no target column found, assume the last column is the target
        print("No explicit target column found, using last column as target")
        train_df = train_df.rename(columns={train_df.columns[-1]: 'target'})
        test_df = test_df.rename(columns={test_df.columns[-1]: 'target'})
else:
    train_df = train_df.rename(columns={'Label': 'target'})
    test_df = test_df.rename(columns={'Label': 'target'})

print(f"Training target distribution:\n{train_df['target'].value_counts()}")
print(f"Test target distribution:\n{test_df['target'].value_counts()}")

Loading datasets...
Training set shape: (1200000, 12)
Test set shape: (361934, 12)
Using 'label' as target column
Training target distribution:
target
good    1172747
bad       27253
Name: count, dtype: int64
Test target distribution:
target
good    353872
bad       8062
Name: count, dtype: int64


In [4]:
# CELL 3: Basic preprocessing
# Check for missing values
print("Missing values in training set:")
print(train_df.isnull().sum())
print("Missing values in test set:")
print(test_df.isnull().sum())

# Drop rows with missing values if any
train_df = train_df.dropna()
test_df = test_df.dropna()

print(f"Training set shape after cleaning: {train_df.shape}")
print(f"Test set shape after cleaning: {test_df.shape}")

Missing values in training set:
Unnamed: 0    0
url           0
url_len       0
ip_add        0
geo_loc       0
tld           0
who_is        0
https         0
js_len        0
js_obf_len    0
content       0
target        0
dtype: int64
Missing values in test set:
Unnamed: 0    0
url           0
url_len       0
ip_add        0
geo_loc       0
tld           0
who_is        0
https         0
js_len        0
js_obf_len    0
content       0
target        0
dtype: int64
Training set shape after cleaning: (1200000, 12)
Test set shape after cleaning: (361934, 12)


In [5]:
# CELL 4: Enhanced URL-based feature engineering
def extract_url_features(url):
    """Extract additional features from URL"""
    try:
        if not isinstance(url, str):
            url = str(url)
            
        parsed = urlparse(url)
        
        features = {}
        
        # Additional length features beyond the existing url_length
        features['path_length'] = len(parsed.path) if parsed.path else 0
        features['query_length'] = len(parsed.query) if parsed.query else 0
        features['fragment_length'] = len(parsed.fragment) if parsed.fragment else 0
        
        # Structural features
        features['num_dots'] = url.count('.')
        features['num_hyphens'] = url.count('-')
        features['num_underscores'] = url.count('_')
        features['num_slashes'] = url.count('/')
        features['num_questionmarks'] = url.count('?')
        features['num_equals'] = url.count('=')
        features['num_ampersands'] = url.count('&')
        features['num_at_signs'] = url.count('@')
        features['num_percent_signs'] = url.count('%')
        
        # Character composition
        features['num_digits'] = sum(1 for c in url if c.isdigit())
        features['num_letters'] = sum(1 for c in url if c.isalpha())
        features['num_special_chars'] = len(url) - features['num_digits'] - features['num_letters']
        
        # Entropy features (quantum-inspired)
        features['url_entropy'] = calculate_entropy(url)
        
        return features
    except:
        # Return default values if URL parsing fails
        return {feature: 0 for feature in [
            'path_length', 'query_length', 'fragment_length',
            'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes',
            'num_questionmarks', 'num_equals', 'num_ampersands', 
            'num_at_signs', 'num_percent_signs', 'num_digits',
            'num_letters', 'num_special_chars', 'url_entropy'
        ]}

def calculate_entropy(text):
    """Calculate Shannon entropy of a string efficiently"""
    if not text or not isinstance(text, str):
        return 0
        
    text_length = len(text)
    if text_length == 0:
        return 0
        
    # Use numpy for faster entropy calculation
    chars, counts = np.unique(list(text), return_counts=True)
    probabilities = counts / text_length
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # Avoid log(0)
    
    return entropy

# Apply URL feature extraction with progress bar
print("Extracting additional URL features from training set...")
url_features_train = []
for url in tqdm(train_df['url'], desc="Processing URLs"):
    url_features_train.append(extract_url_features(url))

url_features_train = pd.DataFrame(url_features_train)
train_df = pd.concat([train_df.reset_index(drop=True), url_features_train], axis=1)

print("Extracting additional URL features from test set...")
url_features_test = []
for url in tqdm(test_df['url'], desc="Processing URLs"):
    url_features_test.append(extract_url_features(url))

url_features_test = pd.DataFrame(url_features_test)
test_df = pd.concat([test_df.reset_index(drop=True), url_features_test], axis=1)

print(f"Added {len(url_features_train.columns)} additional URL-based features")

Extracting additional URL features from training set...


Processing URLs: 100%|██████████| 1200000/1200000 [00:34<00:00, 35229.42it/s]


Extracting additional URL features from test set...


Processing URLs: 100%|██████████| 361934/361934 [00:09<00:00, 36485.56it/s]


Added 16 additional URL-based features


In [6]:
# CELL 5: Enhanced HTML content features
# Let's use the exact column names from your dataset
url_col = 'url'
html_col = 'content'
target_col = 'label'

def extract_html_features(html_content):
    """Extract quantum-inspired features from HTML content"""
    if not isinstance(html_content, str):
        html_content = str(html_content)
    
    features = {}
    
    # Basic HTML structure features
    html_length = len(html_content)
    features['html_content_length'] = html_length
    
    # Only process HTML if it's not too long
    if html_length < 100000:  # Reasonable limit
        features['num_script_tags'] = html_content.count('<script')
        features['num_iframe_tags'] = html_content.count('<iframe')
        features['num_link_tags'] = html_content.count('<a')
        features['num_form_tags'] = html_content.count('<form')
        features['num_image_tags'] = html_content.count('<img')
        features['num_input_tags'] = html_content.count('<input')
        features['num_style_tags'] = html_content.count('<style')
        
        # Suspicious patterns
        features['num_external_links'] = html_content.count('http://') + html_content.count('https://')
        features['num_suspicious_keywords'] = sum(1 for keyword in ['eval', 'exec', 'document.write', 'innerHTML', 'fromCharCode'] 
                                               if keyword in html_content)
        
        # Quantum-inspired features
        features['html_entropy'] = calculate_entropy(html_content) if html_length < 50000 else 0
        
        # Tag ratios
        total_tags = html_content.count('<')
        if total_tags > 0:
            features['script_ratio'] = features['num_script_tags'] / total_tags
            features['iframe_ratio'] = features['num_iframe_tags'] / total_tags
            features['external_link_ratio'] = features['num_external_links'] / total_tags
        else:
            features['script_ratio'] = 0
            features['iframe_ratio'] = 0
            features['external_link_ratio'] = 0
    else:
        # Default values for very long HTML
        features.update({
            'num_script_tags': 0, 'num_iframe_tags': 0, 'num_link_tags': 0,
            'num_form_tags': 0, 'num_image_tags': 0, 'num_input_tags': 0,
            'num_style_tags': 0, 'num_external_links': 0, 'num_suspicious_keywords': 0,
            'html_entropy': 0, 'script_ratio': 0, 'iframe_ratio': 0, 'external_link_ratio': 0
        })
    
    return features

# Apply HTML feature extraction with progress bar
print("Extracting HTML features from training set...")
html_features_train = []
for html in tqdm(train_df[html_col], desc="Processing HTML"):
    html_features_train.append(extract_html_features(html))

html_features_train = pd.DataFrame(html_features_train)
train_df = pd.concat([train_df, html_features_train], axis=1)

print("Extracting HTML features from test set...")
html_features_test = []
for html in tqdm(test_df[html_col], desc="Processing HTML"):
    html_features_test.append(extract_html_features(html))

html_features_test = pd.DataFrame(html_features_test)
test_df = pd.concat([test_df, html_features_test], axis=1)

print(f"Added {len(html_features_train.columns)} HTML-based features")

Extracting HTML features from training set...


Processing HTML: 100%|██████████| 1200000/1200000 [05:18<00:00, 3769.49it/s]


Extracting HTML features from test set...


Processing HTML: 100%|██████████| 361934/361934 [01:36<00:00, 3742.84it/s]


Added 14 HTML-based features


In [7]:
# CELL 6: Create quantum-inspired features
def create_quantum_features(row):
    """Create features inspired by quantum concepts"""
    features = {}
    
    # Feature entanglement (correlation-inspired)
    features['entanglement_url_html'] = row.get('url_entropy', 0) * row.get('html_entropy', 0)
    
    # Superposition-inspired features
    html_len = row.get('html_content_length', 1)
    features['superposition_structure'] = (row.get('num_script_tags', 0) + row.get('num_iframe_tags', 0)) / max(html_len, 1) * 100
    
    # Quantum probability-inspired features
    url_len = max(row.get('url_len', 1), 1)
    features['prob_malicious_url'] = min(1.0, row.get('num_special_chars', 0) / url_len * 5)
    features['prob_malicious_html'] = min(1.0, (row.get('num_script_tags', 0) + row.get('num_iframe_tags', 0)) / max(html_len, 1) * 100)
    
    # Quantum state-inspired binary features
    features['has_suspicious_elements'] = 1 if (row.get('num_script_tags', 0) > 3 or 
                                              row.get('num_iframe_tags', 0) > 2 or
                                              row.get('num_suspicious_keywords', 0) > 5) else 0
    
    # JS-related quantum features
    js_len = max(row.get('js_len', 1), 1)
    js_obf_len = row.get('js_obf_len', 0)
    features['js_quantum_ratio'] = js_obf_len / js_len
    features['js_entropy_ratio'] = calculate_entropy(str(js_len)) / max(calculate_entropy(str(js_obf_len)), 1)
    
    return features

# Apply quantum feature creation with progress bar
print("Creating quantum-inspired features for training set...")
quantum_features_train = []
for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Quantum features"):
    quantum_features_train.append(create_quantum_features(row))

quantum_features_train = pd.DataFrame(quantum_features_train)
train_df = pd.concat([train_df, quantum_features_train], axis=1)

print("Creating quantum-inspired features for test set...")
quantum_features_test = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Quantum features"):
    quantum_features_test.append(create_quantum_features(row))

quantum_features_test = pd.DataFrame(quantum_features_test)
test_df = pd.concat([test_df, quantum_features_test], axis=1)

print(f"Added {len(quantum_features_train.columns)} quantum-inspired features")


Creating quantum-inspired features for training set...


Quantum features: 100%|██████████| 1200000/1200000 [01:31<00:00, 13143.80it/s]


Creating quantum-inspired features for test set...


Quantum features: 100%|██████████| 361934/361934 [00:27<00:00, 13023.56it/s]


Added 7 quantum-inspired features


In [9]:
# CELL: Check what quantum features actually exist in your ORIGINAL dataframe
print("Checking what quantum features actually exist in the dataset...")

# List all possible quantum features we expect
quantum_features_expected = [
    'entanglement_url_html', 'superposition_structure',
    'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements',
    'js_quantum_ratio', 'js_entropy_ratio'
]

# Check which ones actually exist
quantum_features_available = [f for f in quantum_features_expected if f in train_df.columns]

print(f"Expected quantum features: {quantum_features_expected}")
print(f"Available quantum features: {quantum_features_available}")

# Check if any are missing
missing_quantum = set(quantum_features_expected) - set(quantum_features_available)
if missing_quantum:
    print(f"Missing quantum features: {missing_quantum}")
    
    # Let's check why they might be missing by looking at the feature creation code
    print("\nChecking if missing features can be recreated...")
    
    # Recreate missing quantum features if possible
    if 'superposition_structure' in missing_quantum:
        print("Recreating superposition_structure...")
        train_df['superposition_structure'] = (train_df.get('num_script_tags', 0) + train_df.get('num_iframe_tags', 0)) / np.maximum(train_df.get('html_content_length', 1), 1) * 100
        test_df['superposition_structure'] = (test_df.get('num_script_tags', 0) + test_df.get('num_iframe_tags', 0)) / np.maximum(test_df.get('html_content_length', 1), 1) * 100
    
    if 'prob_malicious_html' in missing_quantum:
        print("Recreating prob_malicious_html...")
        train_df['prob_malicious_html'] = np.minimum(1.0, (train_df.get('num_script_tags', 0) + train_df.get('num_iframe_tags', 0)) / np.maximum(train_df.get('html_content_length', 1), 1) * 100)
        test_df['prob_malicious_html'] = np.minimum(1.0, (test_df.get('num_script_tags', 0) + test_df.get('num_iframe_tags', 0)) / np.maximum(test_df.get('html_content_length', 1), 1) * 100)
    
    if 'has_suspicious_elements' in missing_quantum:
        print("Recreating has_suspicious_elements...")
        train_df['has_suspicious_elements'] = ((train_df.get('num_script_tags', 0) > 3) | 
                                              (train_df.get('num_iframe_tags', 0) > 2) |
                                              (train_df.get('num_suspicious_keywords', 0) > 5)).astype(int)
        test_df['has_suspicious_elements'] = ((test_df.get('num_script_tags', 0) > 3) | 
                                             (test_df.get('num_iframe_tags', 0) > 2) |
                                             (test_df.get('num_suspicious_keywords', 0) > 5)).astype(int)

# Update the available features list
quantum_features_available = [f for f in quantum_features_expected if f in train_df.columns]
print(f"Quantum features after recreation: {quantum_features_available}")

# Extract target variables BEFORE creating quantum-only datasets
y_train = train_df['target']
y_test = test_df['target']

print(f"Target variable extracted: y_train shape {y_train.shape}, y_test shape {y_test.shape}")

# Now create a dataset with ALL quantum features + target
quantum_only_features = quantum_features_available
print(f"\nCreating quantum-only dataset with features: {quantum_only_features}")

# Check if we have the required HTML features for quantum features
required_html_features = ['num_script_tags', 'num_iframe_tags', 'html_content_length', 'num_suspicious_keywords']
missing_html_for_quantum = [f for f in required_html_features if f not in train_df.columns]

if missing_html_for_quantum:
    print(f"Warning: Missing HTML features needed for quantum features: {missing_html_for_quantum}")
    print("Some quantum features might not work properly")

# Create quantum-only datasets
X_train_quantum = train_df[quantum_only_features]
X_test_quantum = test_df[quantum_only_features]

print(f"Quantum training set shape: {X_train_quantum.shape}")
print(f"Quantum test set shape: {X_test_quantum.shape}")

# Check for missing values in quantum features
print("\nMissing values in quantum features:")
print(X_train_quantum.isnull().sum())

# Handle any missing values
if X_train_quantum.isnull().any().any():
    print("Filling missing values with 0...")
    X_train_quantum = X_train_quantum.fillna(0)
    X_test_quantum = X_test_quantum.fillna(0)

# Scale the quantum features
scaler_quantum = StandardScaler()
X_train_quantum_scaled = scaler_quantum.fit_transform(X_train_quantum)
X_test_quantum_scaled = scaler_quantum.transform(X_test_quantum)

# Create final quantum dataset
X_train_quantum_df = pd.DataFrame(X_train_quantum_scaled, columns=quantum_only_features)
X_test_quantum_df = pd.DataFrame(X_test_quantum_scaled, columns=quantum_only_features)

# Add target
X_train_quantum_df['target'] = y_train.values
X_test_quantum_df['target'] = y_test.values

# Save quantum-only dataset
train_quantum_path = "quantum_only_features_train.csv"
test_quantum_path = "quantum_only_features_test.csv"

X_train_quantum_df.to_csv(train_quantum_path, index=False)
X_test_quantum_df.to_csv(test_quantum_path, index=False)

print(f"💾 Quantum-only training dataset saved as '{train_quantum_path}'")
print(f"💾 Quantum-only test dataset saved as '{test_quantum_path}'")

print("✅ Quantum-only dataset created successfully!")

Checking what quantum features actually exist in the dataset...
Expected quantum features: ['entanglement_url_html', 'superposition_structure', 'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements', 'js_quantum_ratio', 'js_entropy_ratio']
Available quantum features: ['entanglement_url_html', 'superposition_structure', 'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements', 'js_quantum_ratio', 'js_entropy_ratio']
Quantum features after recreation: ['entanglement_url_html', 'superposition_structure', 'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements', 'js_quantum_ratio', 'js_entropy_ratio']
Target variable extracted: y_train shape (1200000,), y_test shape (361934,)

Creating quantum-only dataset with features: ['entanglement_url_html', 'superposition_structure', 'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements', 'js_quantum_ratio', 'js_entropy_ratio']
Quantum training set shape: (1200000, 7)
Quantum test set shape:

In [10]:
# CELL: Analyze Quantum-Only CSV Files
print("🔬 ANALYZING QUANTUM-ONLY DATASETS")
print("=" * 50)

# Load the quantum-only datasets
train_quantum_path = "quantum_only_features_train.csv"
test_quantum_path = "quantum_only_features_test.csv"

try:
    # Load the datasets
    quantum_train = pd.read_csv(train_quantum_path)
    quantum_test = pd.read_csv(test_quantum_path)
    
    print(f"📊 Quantum Training Dataset Shape: {quantum_train.shape}")
    print(f"📊 Quantum Test Dataset Shape: {quantum_test.shape}")
    print()
    
    # Display all columns (quantum features)
    print("🎯 QUANTUM FEATURES AVAILABLE:")
    print("-" * 30)
    for i, col in enumerate(quantum_train.columns, 1):
        if col != 'target':
            print(f"{i:2d}. {col}")
    
    print()
    
    # Feature statistics
    print("📈 QUANTUM FEATURES STATISTICS:")
    print("-" * 35)
    
    # Get only quantum features (exclude target)
    quantum_features = [col for col in quantum_train.columns if col != 'target']
    
    for feature in quantum_features:
        print(f"\n🔹 {feature}:")
        print(f"   Training - Min: {quantum_train[feature].min():.4f}, "
              f"Max: {quantum_train[feature].max():.4f}, "
              f"Mean: {quantum_train[feature].mean():.4f}")
        print(f"   Test     - Min: {quantum_test[feature].min():.4f}, "
              f"Max: {quantum_test[feature].max():.4f}, "
              f"Mean: {quantum_test[feature].mean():.4f}")
    
    print()
    
    # Target distribution
    print("🎯 TARGET DISTRIBUTION:")
    print("-" * 25)
    print("Training set:")
    print(quantum_train['target'].value_counts().sort_index())
    print("\nTest set:")
    print(quantum_test['target'].value_counts().sort_index())
    
    print()
    
    # Missing values check
    print("🔍 MISSING VALUES CHECK:")
    print("-" * 25)
    print("Training set missing values:")
    missing_train = quantum_train.isnull().sum()
    print(missing_train[missing_train > 0])
    
    print("\nTest set missing values:")
    missing_test = quantum_test.isnull().sum()
    print(missing_test[missing_test > 0])
    
    if missing_train.sum() == 0 and missing_test.sum() == 0:
        print("✅ No missing values found!")
    
    print()
    
    # Data types information
    print("📋 DATA TYPES:")
    print("-" * 15)
    print(quantum_train.dtypes)
    
    print()
    
    # Correlation with target
    print("📊 CORRELATION WITH TARGET:")
    print("-" * 30)
    correlations = quantum_train.corr()['target'].sort_values(ascending=False)
    
    # Display correlations (excluding target itself)
    correlations = correlations[correlations.index != 'target']
    
    for feature, corr in correlations.items():
        correlation_strength = "STRONG" if abs(corr) > 0.5 else "MODERATE" if abs(corr) > 0.3 else "WEAK"
        direction = "positive" if corr > 0 else "negative"
        print(f"{feature:25} : {corr:7.4f} ({correlation_strength} {direction} correlation)")
    
    print()
    
    # Summary
    print("✅ SUMMARY:")
    print("-" * 10)
    print(f"• Total quantum features: {len(quantum_features)}")
    print(f"• Training samples: {len(quantum_train):,}")
    print(f"• Test samples: {len(quantum_test):,}")
    print(f"• Features with strongest positive correlation: {correlations.head(3).index.tolist()}")
    print(f"• Features with strongest negative correlation: {correlations.tail(3).index.tolist()}")
    
except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    print("Please make sure the quantum-only CSV files exist in the current directory.")
    print("Expected files:")
    print(f"  - {train_quantum_path}")
    print(f"  - {test_quantum_path}")
    
except Exception as e:
    print(f"❌ An error occurred: {e}")

# Additional: Display first few rows for visual inspection
print("\n" + "=" * 60)
print("👀 FIRST 5 ROWS OF QUANTUM TRAINING DATASET:")
print("=" * 60)
try:
    display(quantum_train.head())
except:
    print(quantum_train.head())

print("\n" + "=" * 60)
print("📋 QUANTUM FEATURES DESCRIPTION:")
print("=" * 60)

# Create a description of what each quantum feature represents
quantum_feature_descriptions = {
    'entanglement_url_html': 'Quantum entanglement-inspired feature combining URL and HTML entropy',
    'superposition_structure': 'Superposition-inspired feature measuring script/iframe density in HTML',
    'prob_malicious_url': 'Quantum probability-inspired feature for URL maliciousness likelihood',
    'prob_malicious_html': 'Quantum probability-inspired feature for HTML maliciousness likelihood',
    'has_suspicious_elements': 'Quantum state binary feature indicating suspicious elements presence',
    'js_quantum_ratio': 'JavaScript obfuscation ratio inspired by quantum measurement principles',
    'js_entropy_ratio': 'JavaScript entropy ratio inspired by quantum information theory'
}

for feature in quantum_features:
    description = quantum_feature_descriptions.get(feature, "No description available")
    print(f"• {feature:25} : {description}")

🔬 ANALYZING QUANTUM-ONLY DATASETS
📊 Quantum Training Dataset Shape: (1200000, 8)
📊 Quantum Test Dataset Shape: (361934, 8)

🎯 QUANTUM FEATURES AVAILABLE:
------------------------------
 1. entanglement_url_html
 2. superposition_structure
 3. prob_malicious_url
 4. prob_malicious_html
 5. has_suspicious_elements
 6. js_quantum_ratio
 7. js_entropy_ratio

📈 QUANTUM FEATURES STATISTICS:
-----------------------------------

🔹 entanglement_url_html:
   Training - Min: -5.0351, Max: 5.2672, Mean: -0.0000
   Test     - Min: -4.5443, Max: 4.7738, Mean: -0.0023

🔹 superposition_structure:
   Training - Min: -1.2905, Max: 10.2113, Mean: -0.0000
   Test     - Min: -1.2905, Max: 9.6679, Mean: -0.0003

🔹 prob_malicious_url:
   Training - Min: -9.0761, Max: 0.6371, Mean: -0.0000
   Test     - Min: -8.8884, Max: 0.6371, Mean: -0.0009

🔹 prob_malicious_html:
   Training - Min: -1.2909, Max: 7.1771, Mean: -0.0000
   Test     - Min: -1.2909, Max: 7.1771, Mean: -0.0002

🔹 has_suspicious_elements:
   Tra

Unnamed: 0,entanglement_url_html,superposition_structure,prob_malicious_url,prob_malicious_html,has_suspicious_elements,js_quantum_ratio,js_entropy_ratio,target
0,-0.168778,-1.290511,-0.869515,-1.290865,-0.552329,-0.139775,0.195356,good
1,0.236668,0.094981,0.637054,0.095047,-0.552329,-0.139775,-0.746396,good
2,-0.084247,-0.649197,0.637054,-0.649356,-0.552329,-0.139775,0.801708,good
3,-1.251932,-0.060357,0.637054,-0.060337,1.810516,7.500982,-2.012619,bad
4,0.352408,-0.118024,0.637054,-0.118022,-0.552329,-0.139775,0.195356,good



📋 QUANTUM FEATURES DESCRIPTION:
• entanglement_url_html     : Quantum entanglement-inspired feature combining URL and HTML entropy
• superposition_structure   : Superposition-inspired feature measuring script/iframe density in HTML
• prob_malicious_url        : Quantum probability-inspired feature for URL maliciousness likelihood
• prob_malicious_html       : Quantum probability-inspired feature for HTML maliciousness likelihood
• has_suspicious_elements   : Quantum state binary feature indicating suspicious elements presence
• js_quantum_ratio          : JavaScript obfuscation ratio inspired by quantum measurement principles
• js_entropy_ratio          : JavaScript entropy ratio inspired by quantum information theory


In [13]:
%pip install -q --upgrade qiskit>=1.2.0 qiskit-algorithms>=0.3.1 qiskit-machine-learning>=0.8.3 qiskit-aer scikit-learn>=1.3.0 pandas>=2.0.0 matplotlib
import sys 
print(f"✅ Installation complete!") 
print(f"🐍 Python version: {sys.version_info.major}.{sys.version_info.minor}") 
# Verify installations 
try: 
    import qiskit 
    import qiskit_algorithms 
    import qiskit_machine_learning 
    print(f"✅ Qiskit version: {qiskit.version}") 
    print(f"✅ All packages installed successfully!") 
except ImportError as e: 
    print(f"❌ Import error: {e}") 
    print("🔄 Please restart runtime and try again")

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.2 which is incompatible.


Note: you may need to restart the kernel to use updated packages.
✅ Installation complete!
🐍 Python version: 3.13
✅ Qiskit version: <module 'qiskit.version' from 'd:\\thesisb1\\Lib\\site-packages\\qiskit\\version.py'>
✅ All packages installed successfully!


In [15]:
# CELL: Quantum SVM Training with Corrected Qiskit API
print("🔮 QUANTUM SVM TRAINING WITH CORRECTED QISKIT API")
print("=" * 60)

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Qiskit imports
from qiskit import QuantumCircuit
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_algorithms.utils import algorithm_globals

# CORRECTED: Updated FidelityQuantumKernel import and usage
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC
from qiskit.primitives import Sampler
from qiskit_aer import AerSimulator

print("📦 All libraries imported successfully!")
print("✅ Using corrected FidelityQuantumKernel API")

# Additional imports for evaluation
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import time
import joblib

# Load the quantum-only datasets
print("📁 Loading quantum-only datasets...")
quantum_train = pd.read_csv("quantum_only_features_train.csv")
quantum_test = pd.read_csv("quantum_only_features_test.csv")

# Prepare features and targets
X_train = quantum_train.drop('target', axis=1).values
y_train = quantum_train['target'].values
X_test = quantum_test.drop('target', axis=1).values
y_test = quantum_test['target'].values

print(f"📊 Dataset shapes:")
print(f"   Training: {X_train.shape}")
print(f"   Test: {X_test.shape}")
print(f"   Features: {quantum_train.drop('target', axis=1).columns.tolist()}")

# Reduce dataset size for quantum computation
MAX_SAMPLES = 500  # Reduced for faster execution
MAX_TEST_SAMPLES = 200

if len(X_train) > MAX_SAMPLES:
    print(f"⚠️  Dataset too large for quantum computation. Sampling {MAX_SAMPLES} samples...")
    indices = np.random.choice(len(X_train), MAX_SAMPLES, replace=False)
    X_train_small = X_train[indices]
    y_train_small = y_train[indices]
else:
    X_train_small = X_train
    y_train_small = y_train

if len(X_test) > MAX_TEST_SAMPLES:
    indices_test = np.random.choice(len(X_test), MAX_TEST_SAMPLES, replace=False)
    X_test_small = X_test[indices_test]
    y_test_small = y_test[indices_test]
else:
    X_test_small = X_test
    y_test_small = y_test

print(f"🔬 Using {len(X_train_small)} training samples and {len(X_test_small)} test samples")

# Normalize features for quantum computation
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaler.fit_transform(X_train_small)
X_test_scaled = scaler.transform(X_test_small)

print("✅ Data preprocessing completed!")

# Set random seed for reproducibility
algorithm_globals.random_seed = 42

# METHOD 1: Basic QSVM with ZZFeatureMap (CORRECTED)
def train_basic_qsvm(X_train, X_test, y_train, y_test):
    print("\n🚀 TRAINING BASIC QSVM WITH ZZFeatureMap")
    print("-" * 50)
    
    start_time = time.time()
    
    # Create feature map
    feature_dim = X_train.shape[1]
    feature_map = ZZFeatureMap(feature_dimension=feature_dim, reps=2, entanglement='linear')
    
    print(f"🔧 Feature Map: ZZFeatureMap with {feature_dim} dimensions, 2 repetitions")
    
    # CORRECTED: Create fidelity quantum kernel with proper parameters
    try:
        # Try the new API first
        quantum_kernel = FidelityQuantumKernel(
            feature_map=feature_map,
            fidelity=None,  # Let it use default
            enforce_psd=True
        )
    except TypeError:
        # Fallback to simpler initialization
        quantum_kernel = FidelityQuantumKernel(feature_map=feature_map)
    
    # Create QSVC (Quantum Support Vector Classifier)
    qsvc = QSVC(quantum_kernel=quantum_kernel)
    
    try:
        # Train the model
        print("⏳ Training QSVM (this may take a while...)")
        qsvc.fit(X_train, y_train)
        
        # Make predictions
        print("⏳ Making predictions...")
        y_pred = qsvc.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        
        training_time = time.time() - start_time
        
        print(f"✅ Basic QSVM Training Completed!")
        print(f"📊 Accuracy: {accuracy:.4f}")
        print(f"🎯 Precision: {precision:.4f}")
        print(f"🔍 Recall: {recall:.4f}")
        print(f"⚖️  F1-Score: {f1:.4f}")
        print(f"⏱️  Training Time: {training_time:.2f} seconds")
        
        return qsvc, y_pred, accuracy, training_time, feature_map
        
    except Exception as e:
        print(f"❌ Basic QSVM failed: {e}")
        import traceback
        traceback.print_exc()
        return None, None, 0, 0, None

# METHOD 2: Advanced QSVM with Custom Parameters (CORRECTED)
def train_advanced_qsvm(X_train, X_test, y_train, y_test):
    print("\n🚀 TRAINING ADVANCED QSVM WITH ENHANCED FEATURE MAP")
    print("-" * 50)
    
    start_time = time.time()
    
    # Create enhanced feature map
    feature_dim = X_train.shape[1]
    
    # Use different parameters for better performance
    feature_map = ZZFeatureMap(
        feature_dimension=feature_dim, 
        reps=2,  # Reduced for stability
        entanglement='linear'  # Changed to linear for stability
    )
    
    print(f"🔧 Feature Map: Enhanced ZZFeatureMap with {feature_dim} dimensions, 2 repetitions, linear entanglement")
    
    # CORRECTED: Create quantum kernel
    try:
        quantum_kernel = FidelityQuantumKernel(
            feature_map=feature_map,
            fidelity=None,
            enforce_psd=True
        )
    except TypeError:
        quantum_kernel = FidelityQuantumKernel(feature_map=feature_map)
    
    # Create QSVC with potential parameter tuning
    qsvc_advanced = QSVC(quantum_kernel=quantum_kernel)
    
    try:
        print("⏳ Training Advanced QSVM...")
        qsvc_advanced.fit(X_train, y_train)
        y_pred_advanced = qsvc_advanced.predict(X_test)
        
        accuracy_advanced = accuracy_score(y_test, y_pred_advanced)
        precision_advanced = precision_score(y_test, y_pred_advanced, average='weighted', zero_division=0)
        recall_advanced = recall_score(y_test, y_pred_advanced, average='weighted', zero_division=0)
        f1_advanced = f1_score(y_test, y_pred_advanced, average='weighted', zero_division=0)
        
        training_time = time.time() - start_time
        
        print(f"✅ Advanced QSVM Training Completed!")
        print(f"📊 Accuracy: {accuracy_advanced:.4f}")
        print(f"🎯 Precision: {precision_advanced:.4f}")
        print(f"🔍 Recall: {recall_advanced:.4f}")
        print(f"⚖️  F1-Score: {f1_advanced:.4f}")
        print(f"⏱️  Training Time: {training_time:.2f} seconds")
        
        return qsvc_advanced, y_pred_advanced, accuracy_advanced, training_time, feature_map
        
    except Exception as e:
        print(f"❌ Advanced QSVM failed: {e}")
        return None, None, 0, 0, None

# METHOD 3: Classical SVM for comparison
def train_classical_svm(X_train, X_test, y_train, y_test):
    print("\n🚀 TRAINING CLASSICAL SVM FOR COMPARISON")
    print("-" * 50)
    
    start_time = time.time()
    
    # Use RBF kernel which is similar to quantum kernels
    svm_classical = SVC(kernel='rbf', gamma='scale', random_state=42)
    svm_classical.fit(X_train, y_train)
    
    y_pred_classical = svm_classical.predict(X_test)
    accuracy_classical = accuracy_score(y_test, y_pred_classical)
    precision_classical = precision_score(y_test, y_pred_classical, average='weighted', zero_division=0)
    recall_classical = recall_score(y_test, y_pred_classical, average='weighted', zero_division=0)
    f1_classical = f1_score(y_test, y_pred_classical, average='weighted', zero_division=0)
    
    training_time = time.time() - start_time
    
    print(f"✅ Classical SVM Training Completed!")
    print(f"📊 Accuracy: {accuracy_classical:.4f}")
    print(f"🎯 Precision: {precision_classical:.4f}")
    print(f"🔍 Recall: {recall_classical:.4f}")
    print(f"⚖️  F1-Score: {f1_classical:.4f}")
    print(f"⏱️  Training Time: {training_time:.2f} seconds")
    
    return svm_classical, y_pred_classical, accuracy_classical, training_time

# Train all models
print("\n" + "="*60)
print("🏋️‍♂️ STARTING MODEL TRAINING")
print("="*60)

# Train Basic QSVM
qsvc_basic, y_pred_basic, acc_basic, time_basic, feature_map_basic = train_basic_qsvm(
    X_train_scaled, X_test_scaled, y_train_small, y_test_small
)

# Train Advanced QSVM
qsvc_advanced, y_pred_advanced, acc_advanced, time_advanced, feature_map_advanced = train_advanced_qsvm(
    X_train_scaled, X_test_scaled, y_train_small, y_test_small
)

# Train Classical SVM
svm_classical, y_pred_classical, acc_classical, time_classical = train_classical_svm(
    X_train_scaled, X_test_scaled, y_train_small, y_test_small
)

# Results Comparison
print("\n" + "="*60)
print("📊 COMPREHENSIVE RESULTS COMPARISON")
print("="*60)

results = {
    'Basic QSVM': (acc_basic, time_basic, y_pred_basic if qsvc_basic is not None else None),
    'Advanced QSVM': (acc_advanced, time_advanced, y_pred_advanced if qsvc_advanced is not None else None),
    'Classical SVM': (acc_classical, time_classical, y_pred_classical)
}

print(f"{'Model':<15} {'Accuracy':<10} {'Training Time (s)':<15} {'Status':<10}")
print("-" * 55)
for model, (acc, t_time, preds) in results.items():
    status = "✅ Success" if preds is not None else "❌ Failed"
    if preds is not None:
        print(f"{model:<15} {acc:<10.4f} {t_time:<15.2f} {status:<10}")
    else:
        print(f"{model:<15} {'N/A':<10} {t_time:<15.2f} {status:<10}")

# Find best model
successful_models = {k: v for k, v in results.items() if v[2] is not None}
if successful_models:
    best_model_name = max(successful_models.items(), key=lambda x: x[1][0])[0]
    best_accuracy = successful_models[best_model_name][0]
    print(f"\n🏆 BEST MODEL: {best_model_name} with accuracy {best_accuracy:.4f}")
else:
    print(f"\n⚠️  No quantum models succeeded. Using classical SVM as fallback.")
    best_model_name = "Classical SVM"
    best_accuracy = acc_classical

# Detailed evaluation for the best successful model
print("\n" + "="*60)
print("🔍 DETAILED EVALUATION OF BEST MODEL")
print("="*60)

if best_model_name == "Basic QSVM" and qsvc_basic is not None:
    best_model = qsvc_basic
    best_y_pred = y_pred_basic
    feature_map = feature_map_basic
elif best_model_name == "Advanced QSVM" and qsvc_advanced is not None:
    best_model = qsvc_advanced
    best_y_pred = y_pred_advanced
    feature_map = feature_map_advanced
else:
    best_model = svm_classical
    best_y_pred = y_pred_classical
    feature_map = None

print(f"📈 Detailed results for {best_model_name}:")
print(classification_report(y_test_small, best_y_pred, zero_division=0))

# Confusion Matrix
cm = confusion_matrix(y_test_small, best_y_pred)
print("📋 Confusion Matrix:")
print(cm)

# Feature importance analysis
print("\n🎯 Quantum Feature Analysis:")
feature_names = quantum_train.drop('target', axis=1).columns.tolist()
print("Features used:", feature_names)

# Quantum Circuit Information
if best_model_name != "Classical SVM" and feature_map is not None:
    print("\n🔬 QUANTUM CIRCUIT INFORMATION")
    print("-" * 35)
    try:
        print(f"Quantum circuit depth: {feature_map.decompose().depth()}")
        print(f"Number of quantum gates: {len(feature_map.decompose().data)}")
        print(f"Number of qubits: {feature_map.num_qubits}")
        
        # Display the feature map circuit
        print("\n📊 Feature Map Circuit:")
        print(feature_map.decompose().draw(output='text'))
    except Exception as e:
        print(f"Circuit analysis error: {e}")

# Save the trained models
print("\n💾 SAVING TRAINED MODELS")
print("-" * 25)

if qsvc_basic is not None:
    joblib.dump(qsvc_basic, 'qsvc_basic_model.pkl')
    print("✅ Basic QSVC model saved as 'qsvc_basic_model.pkl'")

if qsvc_advanced is not None:
    joblib.dump(qsvc_advanced, 'qsvc_advanced_model.pkl')
    print("✅ Advanced QSVC model saved as 'qsvc_advanced_model.pkl'")

joblib.dump(svm_classical, 'classical_svm_model.pkl')
print("✅ Classical SVM model saved as 'classical_svm_model.pkl'")

# Save the scaler
joblib.dump(scaler, 'quantum_feature_scaler.pkl')
print("✅ Feature scaler saved as 'quantum_feature_scaler.pkl'")

# Save feature names
feature_info = {
    'feature_names': feature_names,
    'feature_count': len(feature_names),
    'best_model': best_model_name,
    'best_accuracy': best_accuracy
}
joblib.dump(feature_info, 'model_metadata.pkl')
print("✅ Model metadata saved as 'model_metadata.pkl'")

print("\n✅ QUANTUM SVM TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 60)

# Final recommendations
print("\n💡 RECOMMENDATIONS FOR QUANTUM ML:")
print("-" * 40)
print("• Start with small datasets for quantum experiments")
print("• Use proper feature scaling for quantum circuits")
print("• Experiment with different feature map repetitions")
print("• Consider hybrid quantum-classical approaches for larger datasets")
print("• Monitor training time vs. accuracy trade-offs")

print(f"\n🎯 Next steps: Use the saved '{best_model_name}' model for predictions")

🔮 QUANTUM SVM TRAINING WITH CORRECTED QISKIT API
📦 All libraries imported successfully!
✅ Using corrected FidelityQuantumKernel API
📁 Loading quantum-only datasets...
📊 Dataset shapes:
   Training: (1200000, 7)
   Test: (361934, 7)
   Features: ['entanglement_url_html', 'superposition_structure', 'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements', 'js_quantum_ratio', 'js_entropy_ratio']
⚠️  Dataset too large for quantum computation. Sampling 500 samples...
🔬 Using 500 training samples and 200 test samples
✅ Data preprocessing completed!

🏋️‍♂️ STARTING MODEL TRAINING

🚀 TRAINING BASIC QSVM WITH ZZFeatureMap
--------------------------------------------------
🔧 Feature Map: ZZFeatureMap with 7 dimensions, 2 repetitions
⏳ Training QSVM (this may take a while...)
⏳ Making predictions...
✅ Basic QSVM Training Completed!
📊 Accuracy: 0.9800
🎯 Precision: 0.9604
🔍 Recall: 0.9800
⚖️  F1-Score: 0.9701
⏱️  Training Time: 879.35 seconds

🚀 TRAINING ADVANCED QSVM WITH ENHANCED FE

In [16]:
# CELL: Display Complete Results
print("\n" + "="*80)
print("📊 COMPLETE RESULTS SUMMARY")
print("="*80)

# Re-display results with full details
results = {
    'Basic QSVM': (acc_basic, time_basic, y_pred_basic if qsvc_basic is not None else None),
    'Advanced QSVM': (acc_advanced, time_advanced, y_pred_advanced if qsvc_advanced is not None else None),
    'Classical SVM': (acc_classical, time_classical, y_pred_classical)
}

print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Time (s)':<12} {'Status':<10}")
print("-" * 85)

for model, (acc, t_time, preds) in results.items():
    status = "✅ Success" if preds is not None else "❌ Failed"
    if preds is not None:
        # Calculate metrics for this model
        precision = precision_score(y_test_small, preds, average='weighted', zero_division=0)
        recall = recall_score(y_test_small, preds, average='weighted', zero_division=0)
        f1 = f1_score(y_test_small, preds, average='weighted', zero_division=0)
        
        print(f"{model:<20} {acc:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f} {t_time:<12.2f} {status:<10}")
    else:
        print(f"{model:<20} {'N/A':<10} {'N/A':<10} {'N/A':<10} {'N/A':<10} {t_time:<12.2f} {status:<10}")

# Show which model actually performed best
successful_models = {k: v for k, v in results.items() if v[2] is not None}
if successful_models:
    best_model_name = max(successful_models.items(), key=lambda x: x[1][0])[0]
    best_accuracy = successful_models[best_model_name][0]
    print(f"\n🏆 ACTUAL BEST MODEL: {best_model_name} with accuracy {best_accuracy:.4f}")
    
    # Show why Classical SVM was chosen if it wasn't the best
    if best_model_name != "Classical SVM":
        classical_acc = results['Classical SVM'][0]
        print(f"💡 Note: Classical SVM accuracy was {classical_acc:.4f}")
else:
    print(f"\n⚠️  No quantum models succeeded. Classical SVM accuracy: {acc_classical:.4f}")


📊 COMPLETE RESULTS SUMMARY
Model                Accuracy   Precision  Recall     F1-Score   Time (s)     Status    
-------------------------------------------------------------------------------------
Basic QSVM           0.9800     0.9604     0.9800     0.9701     879.35       ✅ Success 
Advanced QSVM        0.9800     0.9604     0.9800     0.9701     950.59       ✅ Success 
Classical SVM        0.9950     0.9950     0.9950     0.9946     0.01         ✅ Success 

🏆 ACTUAL BEST MODEL: Classical SVM with accuracy 0.9950


In [17]:
# CELL: Debug Individual Model Results
print("\n" + "="*60)
print("🔍 INDIVIDUAL MODEL STATUS CHECK")
print("="*60)

print(f"Basic QSVM Status: {'✅ Trained' if qsvc_basic is not None else '❌ Failed'}")
if qsvc_basic is not None:
    print(f"  - Accuracy: {acc_basic:.4f}")
    print(f"  - Training Time: {time_basic:.2f}s")

print(f"Advanced QSVM Status: {'✅ Trained' if qsvc_advanced is not None else '❌ Failed'}")
if qsvc_advanced is not None:
    print(f"  - Accuracy: {acc_advanced:.4f}")
    print(f"  - Training Time: {time_advanced:.2f}s")

print(f"Classical SVM Status: ✅ Trained")
print(f"  - Accuracy: {acc_classical:.4f}")
print(f"  - Training Time: {time_classical:.2f}s")

# Check if Advanced QSVM failed
if qsvc_advanced is None:
    print("\n❌ Advanced QSVM failed to train. Possible reasons:")
    print("   - Too complex feature map for the dataset size")
    print("   - Memory issues with full entanglement")
    print("   - Numerical instability")


🔍 INDIVIDUAL MODEL STATUS CHECK
Basic QSVM Status: ✅ Trained
  - Accuracy: 0.9800
  - Training Time: 879.35s
Advanced QSVM Status: ✅ Trained
  - Accuracy: 0.9800
  - Training Time: 950.59s
Classical SVM Status: ✅ Trained
  - Accuracy: 0.9950
  - Training Time: 0.01s


In [20]:
# CELL 7: Prepare the final dataset (FIXED)
# Select features for the final dataset
target_col = 'target'
feature_columns = [
    # Original features from your dataset
    'url_len', 'ip_add', 'geo_loc', 'tld', 'who_is', 'https', 
    'js_len', 'js_obf_len',
    
    # Enhanced URL features
    'path_length', 'query_length', 'fragment_length',
    'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes',
    'num_questionmarks', 'num_equals', 'num_ampersands', 
    'num_at_signs', 'num_percent_signs', 'num_digits',
    'num_letters', 'num_special_chars', 'url_entropy',
    
    # Enhanced HTML features
    'html_content_length', 'num_script_tags', 'num_iframe_tags', 
    'num_link_tags', 'num_form_tags', 'num_image_tags',
    'num_input_tags', 'num_style_tags', 'num_external_links',
    'num_suspicious_keywords', 'html_entropy', 'script_ratio',
    'iframe_ratio', 'external_link_ratio',
    
    # Quantum features
    'entanglement_url_html', 'superposition_structure',
    'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements',
    'js_quantum_ratio', 'js_entropy_ratio'
]

# Keep only features that actually exist in the dataframe
available_features = [f for f in feature_columns if f in train_df.columns]
print(f"Using {len(available_features)} available features")

# Create the feature datasets
X_train = train_df[available_features]
y_train = train_df[target_col]  # Use the correct target column
X_test = test_df[available_features]
y_test = test_df[target_col]    # Use the correct target column

print(f"Training feature matrix shape: {X_train.shape}")
print(f"Test feature matrix shape: {X_test.shape}")

# Convert target to binary (good=0, bad=1)
if y_train.dtype == 'object':
    y_train = y_train.map({'good': 0, 'bad': 1})
    y_test = y_test.map({'good': 0, 'bad': 1})
    print("Converted 'good'->0, 'bad'->1")

print(f"Target values after conversion: {np.unique(y_train)}")

Using 15 available features
Training feature matrix shape: (1200000, 15)
Test feature matrix shape: (361934, 15)
Converted 'good'->0, 'bad'->1
Target values after conversion: [0 1]


In [22]:
# CELL 8: Data normalization (FIXED)
print("Checking data types before scaling...")
print(X_train.dtypes)

# Identify non-numeric columns that need special handling
non_numeric_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

print(f"Non-numeric columns: {non_numeric_cols}")
print(f"Numeric columns: {numeric_cols}")

# Handle non-numeric columns (like IP addresses, categorical data)
if non_numeric_cols:
    print("Processing non-numeric columns...")
    
    # For IP addresses: convert to numeric representation
    if 'ip_add' in non_numeric_cols:
        print("Converting IP addresses to numeric format...")
        
        def ip_to_numeric(ip):
            try:
                if isinstance(ip, str) and '.' in ip:
                    # Convert IP to numeric representation
                    parts = ip.split('.')
                    if len(parts) == 4:
                        return (int(parts[0]) * 256**3 + int(parts[1]) * 256**2 + 
                                int(parts[2]) * 256 + int(parts[3]))
                return 0
            except:
                return 0
        
        X_train['ip_add_numeric'] = X_train['ip_add'].apply(ip_to_numeric)
        X_test['ip_add_numeric'] = X_test['ip_add'].apply(ip_to_numeric)
        numeric_cols.append('ip_add_numeric')
    
    # For other categorical columns, use one-hot encoding or label encoding
    categorical_cols = [col for col in non_numeric_cols if col != 'ip_add']
    
    if categorical_cols:
        print(f"Encoding categorical columns: {categorical_cols}")
        
        # Use label encoding for simplicity (or one-hot encoding if few categories)
        from sklearn.preprocessing import LabelEncoder
        
        for col in categorical_cols:
            # Check if it's worth encoding (not too many unique values)
            unique_vals = X_train[col].nunique()
            if unique_vals < 50:  # Only encode if reasonable number of categories
                le = LabelEncoder()
                # Fit on training data and transform both train and test
                X_train[col + '_encoded'] = le.fit_transform(X_train[col].astype(str))
                X_test[col + '_encoded'] = le.transform(X_test[col].astype(str))
                numeric_cols.append(col + '_encoded')
                print(f"  Encoded {col} with {unique_vals} categories")
            else:
                print(f"  Skipping {col} (too many categories: {unique_vals})")

# Now use only numeric columns for scaling
print(f"Final numeric columns for scaling: {numeric_cols}")

# Scale only the numeric features
scaler = StandardScaler()
X_train_scaled_numeric = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled_numeric = scaler.transform(X_test[numeric_cols])

# Convert back to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled_numeric, columns=numeric_cols)
X_test_scaled_df = pd.DataFrame(X_test_scaled_numeric, columns=numeric_cols)

# Add target column
if y_train.dtype == 'object':
    y_train_binary = y_train.map({'good': 0, 'bad': 1}).fillna(0)
    y_test_binary = y_test.map({'good': 0, 'bad': 1}).fillna(0)
else:
    y_train_binary = y_train
    y_test_binary = y_test

X_train_scaled_df['target'] = y_train_binary.values
X_test_scaled_df['target'] = y_test_binary.values

print(f"Final training set shape: {X_train_scaled_df.shape}")
print(f"Final test set shape: {X_test_scaled_df.shape}")



Checking data types before scaling...
url_len                    int64
ip_add                    object
geo_loc                   object
tld                       object
https                     object
js_len                   float64
js_obf_len               float64
url_entropy              float64
html_content_length        int64
num_script_tags            int64
script_ratio             float64
entanglement_url_html    float64
prob_malicious_url       float64
js_quantum_ratio         float64
js_entropy_ratio         float64
dtype: object
Non-numeric columns: ['ip_add', 'geo_loc', 'tld', 'https']
Numeric columns: ['url_len', 'js_len', 'js_obf_len', 'url_entropy', 'html_content_length', 'num_script_tags', 'script_ratio', 'entanglement_url_html', 'prob_malicious_url', 'js_quantum_ratio', 'js_entropy_ratio']
Processing non-numeric columns...
Converting IP addresses to numeric format...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['ip_add_numeric'] = X_train['ip_add'].apply(ip_to_numeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['ip_add_numeric'] = X_test['ip_add'].apply(ip_to_numeric)


Encoding categorical columns: ['geo_loc', 'tld', 'https']
  Skipping geo_loc (too many categories: 234)
  Skipping tld (too many categories: 1246)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col + '_encoded'] = le.fit_transform(X_train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col + '_encoded'] = le.transform(X_test[col].astype(str))


  Encoded https with 2 categories
Final numeric columns for scaling: ['url_len', 'js_len', 'js_obf_len', 'url_entropy', 'html_content_length', 'num_script_tags', 'script_ratio', 'entanglement_url_html', 'prob_malicious_url', 'js_quantum_ratio', 'js_entropy_ratio', 'ip_add_numeric', 'https_encoded']
Final training set shape: (1200000, 14)
Final test set shape: (361934, 14)


In [23]:
# CELL 9: Save the pre-encoded datasets as CSV
train_output_path = "quantum_phishing_features_train.csv"
test_output_path = "quantum_phishing_features_test.csv"

X_train_scaled_df.to_csv(train_output_path, index=False)
X_test_scaled_df.to_csv(test_output_path, index=False)

print(f"💾 Pre-encoded training dataset saved as '{train_output_path}'")
print(f"💾 Pre-encoded test dataset saved as '{test_output_path}'")


💾 Pre-encoded training dataset saved as 'quantum_phishing_features_train.csv'
💾 Pre-encoded test dataset saved as 'quantum_phishing_features_test.csv'


In [None]:
# CELL 10: Save feature list for reference (FIXED)
print(f"Number of available features: {len(available_features)}")

# Create a mapping of feature types based on the feature names
feature_types = []
for feature in available_features:
    if feature in ['url_len', 'ip_add', 'geo_loc', 'tld', 'who_is', 'https', 'js_len', 'js_obf_len']:
        feature_types.append('original')
    elif any(keyword in feature for keyword in ['path', 'query', 'fragment', 'num_', 'url_entropy']):
        feature_types.append('enhanced_url')
    elif any(keyword in feature for keyword in ['html', 'script', 'iframe', 'link', 'form', 'image', 'input', 'style', 'external', 'suspicious']):
        feature_types.append('enhanced_html')
    elif any(keyword in feature for keyword in ['entanglement', 'superposition', 'prob_malicious', 'has_suspicious', 'quantum', 'entropy_ratio']):
        feature_types.append('quantum')
    else:
        feature_types.append('other')

feature_info_df = pd.DataFrame({
    'feature_name': available_features,
    'feature_type': feature_types
})

feature_info_df.to_csv("feature_info.csv", index=False)
print("Feature info saved to 'feature_info.csv'")

print("✅ Feature extraction completed successfully!")

Number of available features: 15
Feature info saved to 'feature_info.csv'
✅ Feature extraction completed successfully!


In [19]:
# Check what columns you actually have
print("Current columns in your dataset:")
for i, col in enumerate(train_df.columns):
    print(f"{i}: {col}")

print(f"\nNumber of columns: {len(train_df.columns)}")

# Check if the first column is an unnamed index
if train_df.columns[0].startswith('Unnamed'):
    print(f"\nFirst column '{train_df.columns[0]}' appears to be an index column")
    
    # Check if we should remove it
    if train_df.columns[0] == 'Unnamed: 0':
        # Check if this is just sequential numbers (like an index)
        if train_df.iloc[:, 0].dtype in [np.int64, np.float64]:
            print("Removing unnamed index column...")
            train_df = train_df.iloc[:, 1:]  # Keep all columns except first
            test_df = test_df.iloc[:, 1:]
            print(f"Columns after removing index: {len(train_df.columns)}")

# Now check what columns remain
print("\nColumns after processing:")
for i, col in enumerate(train_df.columns):
    print(f"{i}: {col}")

# Let's see if we can identify the important columns
important_columns = ['url', 'url_len', 'ip_add', 'geo_loc', 'tld', 'who_is', 'https', 'js_len', 'js_obf_len', 'content', 'label']

# Find which of these important columns exist in your dataset
available_important_cols = []
for col in important_columns:
    if col in train_df.columns:
        available_important_cols.append(col)
    else:
        # Try to find similar columns
        for actual_col in train_df.columns:
            if col in actual_col.lower() or actual_col.lower() in col:
                print(f"Found similar: '{actual_col}' -> '{col}'")
                available_important_cols.append(actual_col)
                break

print(f"\nAvailable important columns: {available_important_cols}")

# If we found the key columns, use them
if 'url' in available_important_cols and 'label' in available_important_cols:
    url_col = 'url'
    html_col = 'content' if 'content' in available_important_cols else None
    target_col = 'label'
    
    print(f"\nUsing: URL='{url_col}', HTML='{html_col}', Target='{target_col}'")
    
    # Verify the target column
    print(f"Target values: {train_df[target_col].unique()}")
    print(f"Target counts:\n{train_df[target_col].value_counts()}")
else:
    print("\nCould not find key columns. Let's manually identify them:")
    
    # Look for URL column
    url_candidates = [col for col in train_df.columns if 'url' in col.lower()]
    if url_candidates:
        url_col = url_candidates[0]
        print(f"URL column: {url_col}")
    
    # Look for target/label column
    target_candidates = [col for col in train_df.columns if any(x in col.lower() for x in ['label', 'target', 'class', 'type'])]
    if target_candidates:
        target_col = target_candidates[0]
        print(f"Target column: {target_col}")
        print(f"Target values: {train_df[target_col].unique()}")
    
    # Look for content/HTML column  
    html_candidates = [col for col in train_df.columns if any(x in col.lower() for x in ['content', 'html', 'text', 'body'])]
    if html_candidates:
        html_col = html_candidates[0]
        print(f"HTML column: {html_col}")

# If you have many extra columns, let's select only the important ones
if len(train_df.columns) > 20:  # If you have too many columns
    print(f"\nToo many columns ({len(train_df.columns)}). Selecting only important features...")
    
    # Keep only columns that are in our important list or contain key words
    columns_to_keep = []
    for col in train_df.columns:
        if any(keyword in col.lower() for keyword in ['url', 'ip', 'geo', 'tld', 'whois', 'https', 'js', 'content', 'label', 'target', 'class']):
            columns_to_keep.append(col)
    
    if columns_to_keep:
        train_df = train_df[columns_to_keep]
        test_df = test_df[columns_to_keep]
        print(f"Reduced to {len(columns_to_keep)} columns: {columns_to_keep}")

Current columns in your dataset:
0: url
1: url_len
2: ip_add
3: geo_loc
4: tld
5: who_is
6: https
7: js_len
8: js_obf_len
9: content
10: target
11: path_length
12: query_length
13: fragment_length
14: num_dots
15: num_hyphens
16: num_underscores
17: num_slashes
18: num_questionmarks
19: num_equals
20: num_ampersands
21: num_at_signs
22: num_percent_signs
23: num_digits
24: num_letters
25: num_special_chars
26: url_entropy
27: html_content_length
28: num_script_tags
29: num_iframe_tags
30: num_link_tags
31: num_form_tags
32: num_image_tags
33: num_input_tags
34: num_style_tags
35: num_external_links
36: num_suspicious_keywords
37: html_entropy
38: script_ratio
39: iframe_ratio
40: external_link_ratio
41: entanglement_url_html
42: superposition_structure
43: prob_malicious_url
44: prob_malicious_html
45: has_suspicious_elements
46: js_quantum_ratio
47: js_entropy_ratio

Number of columns: 48

Columns after processing:
0: url
1: url_len
2: ip_add
3: geo_loc
4: tld
5: who_is
6: https
7: js

In [None]:
# ========================================
# Quantum Dataset Generator - Feature Extraction
# ========================================

# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import re
from urllib.parse import urlparse
import math
from tqdm import tqdm

print("📦 All libraries imported successfully!")

# Set paths to your dataset files
train_data_path = "train_data.csv"
test_data_path = "test_data.csv"

# CELL 1: Load and explore the datasets
print("Loading datasets...")
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

print("\nTraining set columns:")
print(train_df.columns.tolist())
print("\nFirst few rows of training data:")
print(train_df.head(3))

# CELL 2: Check what columns we actually have
print("\nAvailable columns in training data:")
for i, col in enumerate(train_df.columns):
    print(f"{i}: {col}")

# Based on your data sample, the columns are:
# ,url,url_len,ip_add,geo_loc,tld,who_is,https,js_len,js_obf_len,content,label

# Let's use the exact column names from your dataset
url_col = 'url'
html_col = 'content'
target_col = 'label'

print(f"\nUsing columns: URL='{url_col}', HTML='{html_col}', Target='{target_col}'")

# CELL 3: Basic preprocessing
print("Checking for missing values...")
print("Missing values in training set:")
print(train_df.isnull().sum())
print("Missing values in test set:")
print(test_df.isnull().sum())

# Drop rows with missing values if any
train_df = train_df.dropna()
test_df = test_df.dropna()

print(f"Training set shape after cleaning: {train_df.shape}")
print(f"Test set shape after cleaning: {test_df.shape}")

# CELL 4: Enhanced URL-based feature engineering
def extract_url_features(url):
    """Extract additional features from URL"""
    try:
        if not isinstance(url, str):
            url = str(url)
            
        parsed = urlparse(url)
        
        features = {}
        
        # Additional length features beyond the existing url_len
        features['path_length'] = len(parsed.path) if parsed.path else 0
        features['query_length'] = len(parsed.query) if parsed.query else 0
        features['fragment_length'] = len(parsed.fragment) if parsed.fragment else 0
        
        # Structural features
        features['num_dots'] = url.count('.')
        features['num_hyphens'] = url.count('-')
        features['num_underscores'] = url.count('_')
        features['num_slashes'] = url.count('/')
        features['num_questionmarks'] = url.count('?')
        features['num_equals'] = url.count('=')
        features['num_ampersands'] = url.count('&')
        features['num_at_signs'] = url.count('@')
        features['num_percent_signs'] = url.count('%')
        
        # Character composition
        features['num_digits'] = sum(1 for c in url if c.isdigit())
        features['num_letters'] = sum(1 for c in url if c.isalpha())
        features['num_special_chars'] = len(url) - features['num_digits'] - features['num_letters']
        
        # Entropy features (quantum-inspired)
        features['url_entropy'] = calculate_entropy(url)
        
        return features
    except:
        # Return default values if URL parsing fails
        return {feature: 0 for feature in [
            'path_length', 'query_length', 'fragment_length',
            'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes',
            'num_questionmarks', 'num_equals', 'num_ampersands', 
            'num_at_signs', 'num_percent_signs', 'num_digits',
            'num_letters', 'num_special_chars', 'url_entropy'
        ]}

def calculate_entropy(text):
    """Calculate Shannon entropy of a string efficiently"""
    if not text or not isinstance(text, str):
        return 0
        
    text_length = len(text)
    if text_length == 0:
        return 0
        
    # Use numpy for faster entropy calculation
    chars, counts = np.unique(list(text), return_counts=True)
    probabilities = counts / text_length
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # Avoid log(0)
    
    return entropy

# Apply URL feature extraction with progress bar
print("Extracting additional URL features from training set...")
url_features_train = []
for url in tqdm(train_df[url_col], desc="Processing URLs"):
    url_features_train.append(extract_url_features(url))

url_features_train = pd.DataFrame(url_features_train)
train_df = pd.concat([train_df.reset_index(drop=True), url_features_train], axis=1)

print("Extracting additional URL features from test set...")
url_features_test = []
for url in tqdm(test_df[url_col], desc="Processing URLs"):
    url_features_test.append(extract_url_features(url))

url_features_test = pd.DataFrame(url_features_test)
test_df = pd.concat([test_df.reset_index(drop=True), url_features_test], axis=1)

print(f"Added {len(url_features_train.columns)} additional URL-based features")

# CELL 5: Enhanced HTML content features
def extract_html_features(html_content):
    """Extract quantum-inspired features from HTML content"""
    if not isinstance(html_content, str):
        html_content = str(html_content)
    
    features = {}
    
    # Basic HTML structure features
    html_length = len(html_content)
    features['html_content_length'] = html_length
    
    # Only process HTML if it's not too long
    if html_length < 100000:  # Reasonable limit
        features['num_script_tags'] = html_content.count('<script')
        features['num_iframe_tags'] = html_content.count('<iframe')
        features['num_link_tags'] = html_content.count('<a')
        features['num_form_tags'] = html_content.count('<form')
        features['num_image_tags'] = html_content.count('<img')
        features['num_input_tags'] = html_content.count('<input')
        features['num_style_tags'] = html_content.count('<style')
        
        # Suspicious patterns
        features['num_external_links'] = html_content.count('http://') + html_content.count('https://')
        features['num_suspicious_keywords'] = sum(1 for keyword in ['eval', 'exec', 'document.write', 'innerHTML', 'fromCharCode'] 
                                               if keyword in html_content)
        
        # Quantum-inspired features
        features['html_entropy'] = calculate_entropy(html_content) if html_length < 50000 else 0
        
        # Tag ratios
        total_tags = html_content.count('<')
        if total_tags > 0:
            features['script_ratio'] = features['num_script_tags'] / total_tags
            features['iframe_ratio'] = features['num_iframe_tags'] / total_tags
            features['external_link_ratio'] = features['num_external_links'] / total_tags
        else:
            features['script_ratio'] = 0
            features['iframe_ratio'] = 0
            features['external_link_ratio'] = 0
    else:
        # Default values for very long HTML
        features.update({
            'num_script_tags': 0, 'num_iframe_tags': 0, 'num_link_tags': 0,
            'num_form_tags': 0, 'num_image_tags': 0, 'num_input_tags': 0,
            'num_style_tags': 0, 'num_external_links': 0, 'num_suspicious_keywords': 0,
            'html_entropy': 0, 'script_ratio': 0, 'iframe_ratio': 0, 'external_link_ratio': 0
        })
    
    return features

# Apply HTML feature extraction with progress bar
print("Extracting HTML features from training set...")
html_features_train = []
for html in tqdm(train_df[html_col], desc="Processing HTML"):
    html_features_train.append(extract_html_features(html))

html_features_train = pd.DataFrame(html_features_train)
train_df = pd.concat([train_df, html_features_train], axis=1)

print("Extracting HTML features from test set...")
html_features_test = []
for html in tqdm(test_df[html_col], desc="Processing HTML"):
    html_features_test.append(extract_html_features(html))

html_features_test = pd.DataFrame(html_features_test)
test_df = pd.concat([test_df, html_features_test], axis=1)

print(f"Added {len(html_features_train.columns)} HTML-based features")

# CELL 6: Create quantum-inspired features
def create_quantum_features(row):
    """Create features inspired by quantum concepts"""
    features = {}
    
    # Feature entanglement (correlation-inspired)
    features['entanglement_url_html'] = row.get('url_entropy', 0) * row.get('html_entropy', 0)
    
    # Superposition-inspired features
    html_len = row.get('html_content_length', 1)
    features['superposition_structure'] = (row.get('num_script_tags', 0) + row.get('num_iframe_tags', 0)) / max(html_len, 1) * 100
    
    # Quantum probability-inspired features
    url_len = max(row.get('url_len', 1), 1)
    features['prob_malicious_url'] = min(1.0, row.get('num_special_chars', 0) / url_len * 5)
    features['prob_malicious_html'] = min(1.0, (row.get('num_script_tags', 0) + row.get('num_iframe_tags', 0)) / max(html_len, 1) * 100)
    
    # Quantum state-inspired binary features
    features['has_suspicious_elements'] = 1 if (row.get('num_script_tags', 0) > 3 or 
                                              row.get('num_iframe_tags', 0) > 2 or
                                              row.get('num_suspicious_keywords', 0) > 5) else 0
    
    # JS-related quantum features
    js_len = max(row.get('js_len', 1), 1)
    js_obf_len = row.get('js_obf_len', 0)
    features['js_quantum_ratio'] = js_obf_len / js_len
    features['js_entropy_ratio'] = calculate_entropy(str(js_len)) / max(calculate_entropy(str(js_obf_len)), 1)
    
    return features

# Apply quantum feature creation with progress bar
print("Creating quantum-inspired features for training set...")
quantum_features_train = []
for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Quantum features"):
    quantum_features_train.append(create_quantum_features(row))

quantum_features_train = pd.DataFrame(quantum_features_train)
train_df = pd.concat([train_df, quantum_features_train], axis=1)

print("Creating quantum-inspired features for test set...")
quantum_features_test = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Quantum features"):
    quantum_features_test.append(create_quantum_features(row))

quantum_features_test = pd.DataFrame(quantum_features_test)
test_df = pd.concat([test_df, quantum_features_test], axis=1)

print(f"Added {len(quantum_features_train.columns)} quantum-inspired features")

# CELL 7: Prepare the final dataset
# Select features for the final dataset (using original column names)
feature_columns = [
    # Original features from your dataset
    'url_len', 'ip_add', 'geo_loc', 'tld', 'who_is', 'https', 
    'js_len', 'js_obf_len',
    
    # Enhanced URL features
    'path_length', 'query_length', 'fragment_length',
    'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes',
    'num_questionmarks', 'num_equals', 'num_ampersands', 
    'num_at_signs', 'num_percent_signs', 'num_digits',
    'num_letters', 'num_special_chars', 'url_entropy',
    
    # Enhanced HTML features
    'html_content_length', 'num_script_tags', 'num_iframe_tags', 
    'num_link_tags', 'num_form_tags', 'num_image_tags',
    'num_input_tags', 'num_style_tags', 'num_external_links',
    'num_suspicious_keywords', 'html_entropy', 'script_ratio',
    'iframe_ratio', 'external_link_ratio',
    
    # Quantum features
    'entanglement_url_html', 'superposition_structure',
    'prob_malicious_url', 'prob_malicious_html', 'has_suspicious_elements',
    'js_quantum_ratio', 'js_entropy_ratio'
]

# Keep only features that actually exist in the dataframe
available_features = [f for f in feature_columns if f in train_df.columns]
print(f"Using {len(available_features)} available features")

# Create the feature datasets
X_train = train_df[available_features]
y_train = train_df[target_col]
X_test = test_df[available_features]
y_test = test_df[target_col]

print(f"Training feature matrix shape: {X_train.shape}")
print(f"Test feature matrix shape: {X_test.shape}")

# CELL 8: Data normalization
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=available_features)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=available_features)

# Add target column (convert to binary if needed)
# Assuming 'good' = 0, 'bad' = 1 or similar
if y_train.dtype == 'object':
    y_train_binary = y_train.map({'good': 0, 'bad': 1}).fillna(0)
    y_test_binary = y_test.map({'good': 0, 'bad': 1}).fillna(0)
else:
    y_train_binary = y_train
    y_test_binary = y_test

X_train_scaled_df['target'] = y_train_binary.values
X_test_scaled_df['target'] = y_test_binary.values

# CELL 9: Save the pre-encoded datasets as CSV
train_output_path = "quantum_phishing_features_train.csv"
test_output_path = "quantum_phishing_features_test.csv"

X_train_scaled_df.to_csv(train_output_path, index=False)
X_test_scaled_df.to_csv(test_output_path, index=False)

print(f"💾 Pre-encoded training dataset saved as '{train_output_path}'")
print(f"💾 Pre-encoded test dataset saved as '{test_output_path}'")

# CELL 10: Save feature list for reference
feature_info_df = pd.DataFrame({
    'feature_name': available_features,
    'feature_type': ['original']*8 + ['enhanced_url']*16 + ['enhanced_html']*14 + ['quantum']*7
})
feature_info_df.to_csv("feature_info.csv", index=False)

print("✅ Feature extraction completed successfully!")