In [1]:
import pandas as pd
import numpy as np

print("‚úÖ Libraries imported")


‚úÖ Libraries imported


In [2]:
# Load dataset 
df = pd.read_csv("email_phishing_dataset_FINAL.csv")

print("‚úÖ Dataset loaded successfully")

‚úÖ Dataset loaded successfully


In [3]:
# View first few rows
df.head()

Unnamed: 0,email_subject_len,email_has_urgent_keyword,email_from_domain,web_url,web_url_len,web_ip_add,web_geo_loc,web_tld,web_who_is,web_https,...,content_entropy,domain_trust_score,email_domain_matches_url,email_url_domain_similarity,content_num_forms,content_num_inputs,content_num_scripts,content_suspicious_keywords,semantic_coherence_score,brand_consistency_score
0,32,0,spamassassin.zones.apache.org,http://tools.ietf.org/html/rfc1583,34,30.180.42.35,United States,org,complete,yes,...,4.620961,0.8,0,0.411765,0,0,5,0,0.043153,0.5
1,46,0,gmail.com>,http://www.quickfixgolf.com,27,150.66.16.42,Japan,com,complete,yes,...,4.742243,0.8,0,0.5,0,0,2,0,0.081125,0.5
2,21,0,telefonica.net>,http://www.lvnazarene.org,25,180.123.185.229,China,org,complete,yes,...,4.663432,0.8,0,0.4,0,0,2,0,0.0,0.5
3,99,1,gmail.com>,http://hatchersmartialarts.homestead.com/front...,51,46.97.122.170,Romania,com,complete,yes,...,4.971977,0.8,0,0.466667,0,1,2,2,0.07172,0.5
4,72,1,luebeck.de>,http://www.gabile.com/,22,94.145.85.24,Denmark,com,incomplete,no,...,4.338266,0.4,0,0.357143,0,1,24,3,0.0,0.5


In [4]:
# Dataset shape
print("Dataset shape:", df.shape)


Dataset shape: (8000, 39)


In [5]:
# Check data types
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   email_subject_len            8000 non-null   int64  
 1   email_has_urgent_keyword     8000 non-null   int64  
 2   email_from_domain            7822 non-null   object 
 3   web_url                      8000 non-null   object 
 4   web_url_len                  8000 non-null   int64  
 5   web_ip_add                   8000 non-null   object 
 6   web_geo_loc                  8000 non-null   object 
 7   web_tld                      8000 non-null   object 
 8   web_who_is                   8000 non-null   object 
 9   web_https                    8000 non-null   object 
 10  web_js_len                   8000 non-null   float64
 11  web_js_obf_len               8000 non-null   float64
 12  web_content                  8000 non-null   object 
 13  domain_age        

In [3]:
# email_from_domain missing values
df = df.dropna(subset=['email_from_domain'])

print("Shape after dropping email_from_domain missing:", df.shape)


Shape after dropping email_from_domain missing: (7822, 39)


In [4]:
# Create missingness indicator
df['domain_age_missing'] = df['domain_age'].isnull().astype(int)

# Impute missing domain_age with 0 (unknown/new domain)
df['domain_age'] = df['domain_age'].fillna(0)

print("Missing domain_age after handling:", df['domain_age'].isnull().sum())


Missing domain_age after handling: 0


In [5]:
print("\nFinal dataset shape:", df.shape)
print("\nAny remaining missing values?")
print(df.isnull().sum().sum())



Final dataset shape: (7822, 40)

Any remaining missing values?
0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7822 entries, 0 to 7999
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   email_subject_len            7822 non-null   int64  
 1   email_has_urgent_keyword     7822 non-null   int64  
 2   email_from_domain            7822 non-null   object 
 3   web_url                      7822 non-null   object 
 4   web_url_len                  7822 non-null   int64  
 5   web_ip_add                   7822 non-null   object 
 6   web_geo_loc                  7822 non-null   object 
 7   web_tld                      7822 non-null   object 
 8   web_who_is                   7822 non-null   object 
 9   web_https                    7822 non-null   object 
 10  web_js_len                   7822 non-null   float64
 11  web_js_obf_len               7822 non-null   float64
 12  web_content                  7822 non-null   object 
 13  domain_age             

In [6]:
# ENCODING NON-NUMERICAL FEATURES

df_encoded = df.copy()

# 1Ô∏è‚É£ email_from_domain ‚Üí frequency encoding
domain_freq = df_encoded['email_from_domain'].value_counts(normalize=True)
df_encoded['email_domain_freq'] = df_encoded['email_from_domain'].map(domain_freq)
df_encoded['email_domain_freq'] = df_encoded['email_domain_freq'].fillna(0)
df_encoded.drop(['email_from_domain'], axis=1, inplace=True)

# 2Ô∏è‚É£ web_geo_loc ‚Üí frequency encoding
geo_freq = df_encoded['web_geo_loc'].value_counts(normalize=True)
df_encoded['geo_freq'] = df_encoded['web_geo_loc'].map(geo_freq).fillna(0)
df_encoded.drop(['web_geo_loc'], axis=1, inplace=True)

# 3Ô∏è‚É£ web_tld ‚Üí trust score
trusted_tlds = ['com', 'org', 'net', 'edu', 'gov']
df_encoded['tld_trust_score'] = df_encoded['web_tld'].apply(
    lambda x: 1.0 if x in trusted_tlds else 0.0
)
df_encoded.drop(['web_tld'], axis=1, inplace=True)

# 4Ô∏è‚É£ web_who_is ‚Üí binary
df_encoded['web_who_is'] = df_encoded['web_who_is'].map({'complete': 1, 'incomplete': 0})

# 5Ô∏è‚É£ web_https ‚Üí binary
df_encoded['web_https'] = df_encoded['web_https'].map({'yes': 1, 'no': 0})

# 6Ô∏è‚É£ Drop noisy features
df_encoded.drop(['web_url', 'web_ip_add', 'web_content'], axis=1, inplace=True)

print("‚úÖ Encoding complete")
print("Final shape:", df_encoded.shape)


‚úÖ Encoding complete
Final shape: (7822, 37)


In [11]:
df.head()

Unnamed: 0,email_subject_len,email_has_urgent_keyword,email_from_domain,web_url,web_url_len,web_ip_add,web_geo_loc,web_tld,web_who_is,web_https,...,domain_trust_score,email_domain_matches_url,email_url_domain_similarity,content_num_forms,content_num_inputs,content_num_scripts,content_suspicious_keywords,semantic_coherence_score,brand_consistency_score,domain_age_missing
0,32,0,spamassassin.zones.apache.org,http://tools.ietf.org/html/rfc1583,34,30.180.42.35,United States,org,complete,yes,...,0.8,0,0.411765,0,0,5,0,0.043153,0.5,0
1,46,0,gmail.com>,http://www.quickfixgolf.com,27,150.66.16.42,Japan,com,complete,yes,...,0.8,0,0.5,0,0,2,0,0.081125,0.5,0
2,21,0,telefonica.net>,http://www.lvnazarene.org,25,180.123.185.229,China,org,complete,yes,...,0.8,0,0.4,0,0,2,0,0.0,0.5,0
3,99,1,gmail.com>,http://hatchersmartialarts.homestead.com/front...,51,46.97.122.170,Romania,com,complete,yes,...,0.8,0,0.466667,0,1,2,2,0.07172,0.5,0
4,72,1,luebeck.de>,http://www.gabile.com/,22,94.145.85.24,Denmark,com,incomplete,no,...,0.4,0,0.357143,0,1,24,3,0.0,0.5,0


In [12]:
df_encoded.head()

Unnamed: 0,email_subject_len,email_has_urgent_keyword,web_url_len,web_who_is,web_https,web_js_len,web_js_obf_len,domain_age,final_label,js_obfuscation_ratio,...,content_num_forms,content_num_inputs,content_num_scripts,content_suspicious_keywords,semantic_coherence_score,brand_consistency_score,domain_age_missing,email_domain_freq,geo_freq,tld_trust_score
0,32,0,34,1,1,137.0,0.0,11168.0,0,0.0,...,0,0,5,0,0.043153,0.5,0,0.001278,0.424572,1.0
1,46,0,27,1,1,94.0,0.0,9692.0,0,0.0,...,0,0,2,0,0.081125,0.5,0,0.071976,0.057658,1.0
2,21,0,25,1,1,44.5,0.0,2344.0,0,0.0,...,0,0,2,0,0.0,0.5,0,0.006009,0.094861,1.0
3,99,1,51,1,1,84.5,0.0,10335.0,0,0.0,...,0,1,2,2,0.07172,0.5,0,0.071976,0.002046,1.0
4,72,1,22,0,0,837.0,460.35,7421.0,1,0.549344,...,0,1,24,3,0.0,0.5,0,0.000128,0.004858,1.0


In [13]:
print(df_encoded.head())

   email_subject_len  email_has_urgent_keyword  web_url_len  web_who_is  \
0                 32                         0           34           1   
1                 46                         0           27           1   
2                 21                         0           25           1   
3                 99                         1           51           1   
4                 72                         1           22           0   

   web_https  web_js_len  web_js_obf_len  domain_age  final_label  \
0          1       137.0            0.00     11168.0            0   
1          1        94.0            0.00      9692.0            0   
2          1        44.5            0.00      2344.0            0   
3          1        84.5            0.00     10335.0            0   
4          0       837.0          460.35      7421.0            1   

   js_obfuscation_ratio  ...  content_num_forms  content_num_inputs  \
0              0.000000  ...                  0                

In [7]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop('final_label', axis=1)
y = df_encoded['final_label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print(X_train.shape, X_test.shape)


(6257, 36) (1565, 36)


In [8]:
# Count classes in training set
import pandas as pd

print("Training set class distribution:")
print(y_train.value_counts())

print("\nTest set class distribution:")
print(y_test.value_counts())


Training set class distribution:
final_label
0    3200
1    3057
Name: count, dtype: int64

Test set class distribution:
final_label
0    800
1    765
Name: count, dtype: int64


In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)


In [10]:
# First 5 rows of scaled training data
print("Scaled Training Data:")
display(X_train_scaled.head())

# First 5 rows of scaled test data
print("\nScaled Test Data:")
display(X_test_scaled.head())


Scaled Training Data:


Unnamed: 0,email_subject_len,email_has_urgent_keyword,web_url_len,web_who_is,web_https,web_js_len,web_js_obf_len,domain_age,js_obfuscation_ratio,url_has_ip,...,content_num_forms,content_num_inputs,content_num_scripts,content_suspicious_keywords,semantic_coherence_score,brand_consistency_score,domain_age_missing,email_domain_freq,geo_freq,tld_trust_score
0,0.136842,1.0,0.126354,1.0,1.0,0.152207,0.0,0.547386,0.0,0.0,...,0.0,0.142857,0.0,0.25,0.322002,1.0,0.0,1.0,1.0,1.0
1,0.098246,1.0,0.00361,1.0,1.0,0.180307,0.0,0.538528,0.0,0.0,...,0.0,0.0,0.083333,0.5,0.0,1.0,0.0,1.0,1.0,0.0
2,0.087719,0.0,0.021661,0.0,0.0,0.539515,0.31701,0.035365,0.584526,0.0,...,0.0,0.142857,0.333333,0.25,0.782408,1.0,0.0,0.0,0.069277,1.0
3,0.070175,1.0,0.212996,0.0,1.0,0.950474,0.934189,0.0,0.978669,0.0,...,0.0,0.285714,0.541667,0.75,0.223506,1.0,1.0,0.0,0.029819,1.0
4,0.189474,0.0,0.119134,1.0,1.0,0.20665,0.0,0.014252,0.0,0.0,...,0.0,0.0,0.083333,0.75,0.0,1.0,0.0,0.007117,1.0,1.0



Scaled Test Data:


Unnamed: 0,email_subject_len,email_has_urgent_keyword,web_url_len,web_who_is,web_https,web_js_len,web_js_obf_len,domain_age,js_obfuscation_ratio,url_has_ip,...,content_num_forms,content_num_inputs,content_num_scripts,content_suspicious_keywords,semantic_coherence_score,brand_consistency_score,domain_age_missing,email_domain_freq,geo_freq,tld_trust_score
0,0.112281,1.0,0.043321,0.0,1.0,0.227725,0.0,0.680386,0.0,0.0,...,0.0,0.142857,0.166667,0.75,0.0,1.0,0.0,0.007117,0.223193,1.0
1,0.014035,1.0,0.119134,0.0,0.0,0.730242,0.647518,0.013054,0.882602,0.0,...,0.0,0.428571,0.708333,0.75,0.29652,1.0,0.0,0.0,1.0,1.0
2,0.238596,1.0,0.057762,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.02669,1.0,1.0
3,0.122807,0.0,0.111913,0.0,0.0,0.839831,0.556276,0.0,0.659431,0.0,...,0.0,0.571429,0.833333,0.75,0.47363,1.0,1.0,0.001779,1.0,1.0
4,0.140351,0.0,0.090253,0.0,0.0,0.900948,0.837386,0.0,0.925418,0.0,...,0.0,0.428571,0.708333,0.5,0.0,1.0,1.0,0.001779,0.080723,1.0


In [11]:
#FEATURE SELECTION ON TRAIN DATA

from sklearn.feature_selection import mutual_info_classif

# Compute MI scores
mi_scores = mutual_info_classif(
    X_train_scaled,
    y_train,
    random_state=42
)

# Create MI ranking dataframe
mi_df = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'mi_score': mi_scores
}).sort_values(by='mi_score', ascending=False)

print(" Top MI Features")
mi_df.head(15)


 Top MI Features


Unnamed: 0,feature,mi_score
5,web_js_len,0.657304
8,js_obfuscation_ratio,0.52116
6,web_js_obf_len,0.519802
28,content_num_scripts,0.494982
22,content_entropy,0.472273
23,domain_trust_score,0.369185
33,email_domain_freq,0.324097
4,web_https,0.276868
29,content_suspicious_keywords,0.269552
3,web_who_is,0.261283


In [12]:
import numpy as np
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# ================= PARAMETERS =================
alpha = 0.6            # redundancy penalty
beta = 2.0             # cardinality constraint strength
k = 6                  # REQUIRED number of features
num_reads = 120
anneal_steps = 1200
# ==============================================


# =========================
# 1. FEATURE ‚Üí LABEL MI
# =========================
MI_feat_label = mutual_info_classif(
    X_train_scaled,
    y_train,
    random_state=42
)


# =========================
# 2. FEATURE ‚Üî FEATURE MI
# =========================
n_features = X_train_scaled.shape[1]
MI_feat_feat = np.zeros((n_features, n_features))

for i in range(n_features):
    for j in range(i + 1, n_features):
        MI = mutual_info_regression(
            X_train_scaled.iloc[:, [i]],
            X_train_scaled.iloc[:, j]
        )[0]
        MI_feat_feat[i, j] = MI
        MI_feat_feat[j, i] = MI


# =========================
# 3. BUILD QUBO (WITH CARDINALITY)
# =========================
Q = {}

# Linear terms
for i in range(n_features):
    Q[(i, i)] = (
        -MI_feat_label[i]           # relevance
        + beta * (1 - 2 * k)        # cardinality
    )

# Quadratic terms
for i in range(n_features):
    for j in range(i + 1, n_features):
        Q[(i, j)] = (
            alpha * MI_feat_feat[i, j]   # redundancy
            + 2 * beta                   # cardinality coupling
        )


# =========================
# 4. PURE PYTHON SA SAMPLER
# =========================
class SimulatedAnnealingSampler:

    def sample_qubo(self, Q, num_reads=100, steps=1500):
        n = max(max(i, j) for i, j in Q.keys()) + 1
        samples = []

        for _ in range(num_reads):
            x = np.random.randint(0, 2, size=n)
            energy = self._energy(Q, x)

            for step in range(steps):
                T = max(0.01, 1.0 - step / steps)
                i = np.random.randint(n)

                x[i] ^= 1
                new_energy = self._energy(Q, x)
                delta = new_energy - energy

                if delta < 0 or np.random.rand() < np.exp(-delta / T):
                    energy = new_energy
                else:
                    x[i] ^= 1

            samples.append((dict(enumerate(x)), energy))

        samples.sort(key=lambda s: s[1])
        return SamplerResponse(samples)

    def _energy(self, Q, x):
        e = 0.0
        for (i, j), q in Q.items():
            e += q * x[i] * x[j]
        return e


class SamplerResponse:
    def __init__(self, samples):
        self.first = Sample(samples[0][0], samples[0][1])


class Sample:
    def __init__(self, sample, energy):
        self.sample = sample
        self.energy = energy


# =========================
# 5. SOLVE QUBO
# =========================
sampler = SimulatedAnnealingSampler()
response = sampler.sample_qubo(
    Q,
    num_reads=num_reads,
    steps=anneal_steps
)

best_sample = response.first.sample


# =========================
# 6. SELECT FEATURES
# =========================
selected_indices = [i for i, v in best_sample.items() if v == 1]
selected_features = X_train_scaled.columns[selected_indices].tolist()

print("‚úÖ QUBO Selected Features:")
print(selected_features)
print(f"Total selected: {len(selected_features)}")

if len(selected_features) != k:
    print("‚ö† Cardinality slightly off ‚Äî increase beta if needed")

X_train_selected = X_train_scaled[selected_features]
X_test_selected = X_test_scaled[selected_features]

print("üìê Shapes after QUBO selection:")
print(X_train_selected.shape, X_test_selected.shape)


‚úÖ QUBO Selected Features:
['email_subject_len', 'web_js_len', 'url_num_hyphens', 'url_suspicious_chars', 'domain_contains_numbers', 'content_entropy']
Total selected: 6
üìê Shapes after QUBO selection:
(6257, 6) (1565, 6)


In [13]:
#CORRECTED IMPORTS

# Qiskit imports (CORRECTED for 2025)
from qiskit import QuantumCircuit
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_algorithms.utils import algorithm_globals

from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC
from qiskit.primitives import Sampler
from qiskit_aer import AerSimulator

print(" All libraries imported successfully!")
print(" Using FidelityQuantumKernel (current API)")

 All libraries imported successfully!
 Using FidelityQuantumKernel (current API)


In [None]:
# FINAL CORRECTED CODE - BATCH-WISE SVM & QSVM TRAINING

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time
from qiskit.visualization import circuit_drawer
from sklearn.utils import shuffle
import numpy as np

algorithm_globals.random_seed = 42

n_qubo_features = len(selected_features)
print(f"‚öõÔ∏è QSVM will use {n_qubo_features} qubits")

feature_map = ZZFeatureMap(
    feature_dimension=n_qubo_features,
    reps=1,
    entanglement="linear"
)

quantum_kernel = FidelityQuantumKernel(feature_map=feature_map)

print("‚úÖ Quantum kernel created")
print(f"‚Ä¢ Circuit depth: {feature_map.depth()}")

if n_qubo_features <= 13:
    print("\nüî¨ Quantum Feature Map Circuit:")
    print(feature_map.decompose().draw(output='text'))
else:
    print("\nüî¨ Quantum circuit too large to display")

BATCH_SIZE = 500        

# Shuffle training data
X_train_shuffled, y_train_shuffled = shuffle(
    X_train_selected,
    y_train,
    random_state=42
)

# Scale data to [0, œÄ] for quantum models
X_train_scaled_q = X_train_shuffled.values * np.pi
X_test_scaled_q = X_test_selected.values * np.pi

print("‚úÖ Data shuffled and œÄ-scaled")

svm_preds_list = []
qsvm_preds_list = []

total_batches = 0
skipped_batches = 0

print("\nüöÄ STARTING BATCH-WISE TRAINING\n")

for i in range(0, len(X_train_shuffled), BATCH_SIZE):
    batch_num = i // BATCH_SIZE + 1
    total_batches += 1

    X_batch = X_train_scaled_q[i:i+BATCH_SIZE]
    y_batch = y_train_shuffled.iloc[i:i+BATCH_SIZE].values

    unique_classes, class_counts = np.unique(y_batch, return_counts=True)
    class_dist = dict(zip(unique_classes, class_counts))

    print(f"üì¶ BATCH {batch_num}")
    print(f"Samples: {len(X_batch)}")
    print(f"Class distribution: {class_dist}")

    if len(unique_classes) < 2:
        print("‚è≠Ô∏è Skipped (single class batch)")
        skipped_batches += 1
        continue

    # Calculate class balance ratio
    balance_ratio = min(class_counts) / max(class_counts)
    print(f"Balance ratio: {balance_ratio:.3f}")

    # -------- Classical SVM --------
    print("üîµ Training Classical SVM...")
    svm_start = time.time()

    svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
    svm.fit(X_batch, y_batch)

    svm_time = time.time() - svm_start
    svm_decision = svm.decision_function(X_test_scaled_q)
    svm_pred = np.where(svm_decision >= 0, 1, 0)

    print(f"‚úÖ SVM trained in {svm_time:.2f}s")

    # -------- QSVM --------
    print("‚öõÔ∏è Training QSVM...")
    qsvm_start = time.time()

    qsvm = QSVC(
        quantum_kernel=quantum_kernel,
        C=1.0
    )

    try:
        qsvm.fit(X_batch, y_batch)
        qsvm_time = time.time() - qsvm_start

        # Use decision_function instead of predict for stability
        decision_scores = qsvm.decision_function(X_test_scaled_q)
        qsvm_pred = np.where(decision_scores >= 0, 1, 0)
    
        # Verify we got both classes
        unique_preds = np.unique(qsvm_pred)
        print(f"  Predicted classes: {unique_preds}")

        # SUCCESS - Add both predictions
        qsvm_preds_list.append(qsvm_pred)
        svm_preds_list.append(svm_pred)

        print(f"‚úÖ QSVM trained in {qsvm_time:.2f}s")
        print(f"‚è±Ô∏è QSVM is {qsvm_time / svm_time:.1f}√ó slower than SVM")

    except Exception as e:
        print(f"‚ùå QSVM failed: {e}")
        print("   ‚Üí Skipping this batch for both models")
        skipped_batches += 1

print(f"\n{'='*70}")
print(f"BATCH TRAINING COMPLETE")
print(f"{'='*70}")
print(f"Total batches: {total_batches}")
print(f"Skipped batches: {skipped_batches}")
print(f"Successful batches: {len(svm_preds_list)}")
print(f"Success rate: {len(svm_preds_list)/total_batches*100:.1f}%")

‚öõÔ∏è QSVM will use 6 qubits
‚úÖ Quantum kernel created
‚Ä¢ Circuit depth: 1

üî¨ Quantum Feature Map Circuit:
     ‚îå‚îÄ‚îÄ‚îÄ‚îê‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê                                               ¬ª
q_0: ‚î§ H ‚îú‚î§ P(2.0*x[0]) ‚îú‚îÄ‚îÄ‚ñ†‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚ñ†‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ¬ª
     ‚îú‚îÄ‚îÄ‚îÄ‚î§‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î§‚îå‚îÄ‚î¥‚îÄ‚îê‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê‚îå‚îÄ‚î¥‚îÄ‚îê     ¬ª
q_1: ‚î§ H ‚îú‚î§ P(2.0*x[1]) ‚îú‚î§ X ‚îú‚î§ P(2.0*(œÄ - x[0])*(œÄ - x[1])) ‚îú‚î§ X ‚îú‚îÄ‚îÄ‚ñ†‚îÄ‚îÄ¬ª
     ‚îú‚îÄ‚îÄ‚îÄ‚î§‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î§‚îî‚îÄ‚îÄ‚îÄ‚îò‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò‚îî‚îÄ‚îÄ‚îÄ‚îò‚îå‚îÄ‚î¥‚îÄ‚îê¬ª
q_2: ‚î§ H ‚îú‚î§ P(2.0*x[2]) ‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î

In [31]:
# PROFESSIONAL QUANTUM CIRCUIT VISUALIZATION - QISKIT STYLE

import plotly.graph_objects as go
import numpy as np

print("="*70)
print("PROFESSIONAL QUANTUM CIRCUIT DIAGRAM")
print("="*70)

# Create output directory
import os
output_dir = "visualization_outputs"
os.makedirs(output_dir, exist_ok=True)

# Get circuit information
circuit = feature_map.decompose()
n_qubits = circuit.num_qubits

print(f"\n‚öõÔ∏è Circuit Properties:")
print(f"   Qubits: {n_qubits}")
print(f"   Depth: {circuit.depth()}")
print(f"   Gates: {len(circuit.data)}")

# ============================================================
# PARSE CIRCUIT AND ORGANIZE GATES BY TIME STEPS
# ============================================================

# Organize gates into time steps (columns)
time_steps = {}
qubit_usage = [0] * n_qubits  # Track current time for each qubit

for instruction in circuit.data:
    gate = instruction[0]
    qubits_raw = instruction[1]
    
    # Handle different Qiskit versions
    qubits = []
    for q in qubits_raw:
        if hasattr(q, 'index'):
            qubits.append(q.index)
        elif hasattr(q, '_index'):
            qubits.append(q._index)
        else:
            # For newer versions, qubit might be directly indexable
            qubits.append(circuit.qubits.index(q))
    
    # Find the earliest time this gate can be placed
    max_time = max([qubit_usage[q] for q in qubits])
    
    # Place gate at this time
    if max_time not in time_steps:
        time_steps[max_time] = []
    
    time_steps[max_time].append({
        'gate': gate.name,
        'qubits': qubits,
        'params': gate.params if hasattr(gate, 'params') else []
    })
    
    # Update qubit usage
    for q in qubits:
        qubit_usage[q] = max_time + 1

# ============================================================
# CREATE PROFESSIONAL CIRCUIT DIAGRAM
# ============================================================

fig = go.Figure()

# Layout parameters
QUBIT_SPACING = 1.0
GATE_SPACING = 1.5
GATE_WIDTH = 0.6
GATE_HEIGHT = 0.4

max_time = max(time_steps.keys()) if time_steps else 0
circuit_length = (max_time + 1) * GATE_SPACING + 1

# Draw qubit lines (horizontal wires)
for qubit_idx in range(n_qubits):
    y = qubit_idx * QUBIT_SPACING
    
    # Main qubit line
    fig.add_trace(go.Scatter(
        x=[0, circuit_length],
        y=[y, y],
        mode='lines',
        line=dict(color='black', width=2),
        showlegend=False,
        hoverinfo='skip'
    ))
    
    # Qubit label on the left
    fig.add_annotation(
        x=-0.3,
        y=y,
        text=f'q<sub>{qubit_idx}</sub> |0‚ü©',
        showarrow=False,
        xanchor='right',
        font=dict(size=14, color='black')
    )

# Draw gates
for time_idx, gates in time_steps.items():
    x_pos = time_idx * GATE_SPACING + 1
    
    for gate_info in gates:
        gate_name = gate_info['gate']
        qubits = gate_info['qubits']
        params = gate_info['params']
        
        if len(qubits) == 1:
            # Single-qubit gate
            qubit = qubits[0]
            y_pos = qubit * QUBIT_SPACING
            
            # Determine gate color and label
            if gate_name.lower() == 'h':
                color = '#FF6B6B'  # Red for Hadamard
                label = 'H'
            elif gate_name.lower() in ['rz', 'p']:
                color = '#4ECDC4'  # Teal for phase gates
                label = 'P' if gate_name.lower() == 'p' else 'RZ'
            elif gate_name.lower() == 'rx':
                color = '#FFE66D'  # Yellow for RX
                label = 'RX'
            elif gate_name.lower() == 'ry':
                color = '#A8E6CF'  # Green for RY
                label = 'RY'
            else:
                color = '#95A5A6'  # Gray for others
                label = gate_name.upper()
            
            # Draw gate box
            fig.add_shape(
                type="rect",
                x0=x_pos - GATE_WIDTH/2,
                y0=y_pos - GATE_HEIGHT/2,
                x1=x_pos + GATE_WIDTH/2,
                y1=y_pos + GATE_HEIGHT/2,
                fillcolor=color,
                line=dict(color='black', width=2),
            )
            
            # Add gate label
            param_text = ""
            if len(params) > 0:
                # Simplify parameter display
                param_text = f"<br><sub>({str(params[0])[:15]})</sub>"
            
            fig.add_annotation(
                x=x_pos,
                y=y_pos,
                text=f"<b>{label}</b>{param_text}",
                showarrow=False,
                font=dict(size=12, color='white'),
                bgcolor=color,
                borderpad=4
            )
            
        elif len(qubits) == 2:
            # Two-qubit gate (controlled gate)
            control_qubit = qubits[0]
            target_qubit = qubits[1]
            
            y_control = control_qubit * QUBIT_SPACING
            y_target = target_qubit * QUBIT_SPACING
            
            # Draw vertical connecting line
            fig.add_trace(go.Scatter(
                x=[x_pos, x_pos],
                y=[y_control, y_target],
                mode='lines',
                line=dict(color='black', width=2),
                showlegend=False,
                hoverinfo='skip'
            ))
            
            # Draw control dot (filled circle)
            fig.add_trace(go.Scatter(
                x=[x_pos],
                y=[y_control],
                mode='markers',
                marker=dict(
                    size=15,
                    color='black',
                    symbol='circle',
                    line=dict(color='black', width=2)
                ),
                showlegend=False,
                hovertext=f'Control: q{control_qubit}',
                hoverinfo='text'
            ))
            
            # Draw target (depends on gate type)
            if gate_name.lower() in ['cx', 'cnot']:
                # CNOT: Draw ‚äï symbol (circle with cross)
                fig.add_trace(go.Scatter(
                    x=[x_pos],
                    y=[y_target],
                    mode='markers',
                    marker=dict(
                        size=25,
                        color='white',
                        symbol='circle',
                        line=dict(color='black', width=3)
                    ),
                    showlegend=False,
                    hovertext=f'Target: q{target_qubit}',
                    hoverinfo='text'
                ))
                
                # Add cross inside circle
                cross_size = 0.15
                # Vertical line of cross
                fig.add_trace(go.Scatter(
                    x=[x_pos, x_pos],
                    y=[y_target - cross_size, y_target + cross_size],
                    mode='lines',
                    line=dict(color='black', width=2),
                    showlegend=False,
                    hoverinfo='skip'
                ))
                # Horizontal line of cross
                fig.add_trace(go.Scatter(
                    x=[x_pos - cross_size, x_pos + cross_size],
                    y=[y_target, y_target],
                    mode='lines',
                    line=dict(color='black', width=2),
                    showlegend=False,
                    hoverinfo='skip'
                ))
                
            elif gate_name.lower() == 'cz':
                # CZ: Draw control dot on target too
                fig.add_trace(go.Scatter(
                    x=[x_pos],
                    y=[y_target],
                    mode='markers',
                    marker=dict(
                        size=15,
                        color='black',
                        symbol='circle',
                        line=dict(color='black', width=2)
                    ),
                    showlegend=False,
                    hovertext=f'Target: q{target_qubit}',
                    hoverinfo='text'
                ))
            else:
                # Other controlled gates: draw as box
                fig.add_shape(
                    type="rect",
                    x0=x_pos - GATE_WIDTH/2,
                    y0=y_target - GATE_HEIGHT/2,
                    x1=x_pos + GATE_WIDTH/2,
                    y1=y_target + GATE_HEIGHT/2,
                    fillcolor='#4ECDC4',
                    line=dict(color='black', width=2),
                )
                
                fig.add_annotation(
                    x=x_pos,
                    y=y_target,
                    text=f"<b>{gate_name.upper()}</b>",
                    showarrow=False,
                    font=dict(size=12, color='white')
                )

# Update layout
total_width = circuit_length * 100
total_height = n_qubits * 80

# If circuit is too wide, adjust spacing
if total_width > 2000:
    # Scale down for very wide circuits
    scale_factor = 2000 / total_width
    GATE_SPACING_DISPLAY = GATE_SPACING * scale_factor
    total_width = 2000
else:
    GATE_SPACING_DISPLAY = GATE_SPACING

fig.update_layout(
    title=f'‚öõÔ∏è Quantum Circuit Diagram - {n_qubits} Qubits',
    xaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=True,
        range=[-1, circuit_length + 0.5],
        title='Gate Position'
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=False,
        range=[-0.5, (n_qubits - 1) * QUBIT_SPACING + 0.5],
        scaleanchor="x",
        scaleratio=1
    ),
    plot_bgcolor='white',
    height=max(500, total_height),
    width=1400,  # Fixed width for browser
    showlegend=False,
    margin=dict(l=100, r=50, t=80, b=50),
    # Enable drag and zoom
    dragmode='pan',
    hovermode='closest'
)

# Add zoom/pan instructions
fig.add_annotation(
    text='üí° Use mouse wheel to zoom, click and drag to pan',
    xref='paper',
    yref='paper',
    x=0.5,
    y=-0.05,
    showarrow=False,
    font=dict(size=10, color='gray'),
    xanchor='center'
)

# Configure modebar (toolbar)
config = {
    'scrollZoom': True,
    'displayModeBar': True,
    'displaylogo': False,
    'modeBarButtonsToAdd': ['pan2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d'],
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'quantum_circuit',
        'height': total_height,
        'width': max(2000, int(circuit_length * 100)),
        'scale': 2
    }
}

# Save with config
filename = f"{output_dir}/quantum_circuit_professional.html"
fig.write_html(filename, config=config)

print(f"\n‚úÖ Professional circuit diagram saved: {filename}")
print(f"   Layout: Qiskit-style with proper gate boxes")
print(f"   Size: {n_qubits} qubits √ó {max_time + 1} time steps")
print(f"   Gates: H (red), P/RZ (teal), CNOT (‚äï)")
print(f"\nüí° How to view:")
print(f"   ‚Ä¢ Open in browser - will fit to window width")
print(f"   ‚Ä¢ Use mouse wheel to ZOOM in/out")
print(f"   ‚Ä¢ Click and DRAG to pan left/right")
print(f"   ‚Ä¢ Click camera icon to download high-res PNG")

# ============================================================
# CREATE LEGEND
# ============================================================

fig_legend = go.Figure()

gate_types = [
    ('H', '#FF6B6B', 'Hadamard Gate'),
    ('P/RZ', '#4ECDC4', 'Phase/Rotation Gate'),
    ('RX', '#FFE66D', 'X-Rotation Gate'),
    ('RY', '#A8E6CF', 'Y-Rotation Gate'),
    ('‚Ä¢‚îÄ‚äï', 'black', 'CNOT (Controlled-X)'),
    ('‚Ä¢‚îÄ‚Ä¢', 'black', 'CZ (Controlled-Z)')
]

for i, (label, color, desc) in enumerate(gate_types):
    fig_legend.add_trace(go.Scatter(
        x=[0.5],
        y=[i],
        mode='markers+text',
        marker=dict(size=40, color=color, symbol='square', line=dict(color='black', width=2)),
        text=label,
        textposition='middle center',
        textfont=dict(size=14, color='white' if color != '#FFE66D' else 'black'),
        name=desc,
        showlegend=True
    ))

fig_legend.update_layout(
    title='‚öõÔ∏è Quantum Gate Legend',
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
    height=400,
    width=600,
    showlegend=True,
    legend=dict(
        orientation="v",
        yanchor="middle",
        y=0.5,
        xanchor="left",
        x=0.7,
        font=dict(size=14)
    ),
    plot_bgcolor='white'
)

filename_legend = f"{output_dir}/quantum_circuit_legend.html"
fig_legend.write_html(filename_legend)
print(f"‚úÖ Gate legend saved: {filename_legend}")

print("\n" + "="*70)
print("‚úÖ PROFESSIONAL CIRCUIT VISUALIZATION COMPLETE")
print("="*70)
print(f"\nOpen {filename} in your browser to view!")

PROFESSIONAL QUANTUM CIRCUIT DIAGRAM

‚öõÔ∏è Circuit Properties:
   Qubits: 6
   Depth: 17
   Gates: 27

‚úÖ Professional circuit diagram saved: visualization_outputs/quantum_circuit_professional.html
   Layout: Qiskit-style with proper gate boxes
   Size: 6 qubits √ó 17 time steps
   Gates: H (red), P/RZ (teal), CNOT (‚äï)

üí° How to view:
   ‚Ä¢ Open in browser - will fit to window width
   ‚Ä¢ Use mouse wheel to ZOOM in/out
   ‚Ä¢ Click and DRAG to pan left/right
   ‚Ä¢ Click camera icon to download high-res PNG
‚úÖ Gate legend saved: visualization_outputs/quantum_circuit_legend.html

‚úÖ PROFESSIONAL CIRCUIT VISUALIZATION COMPLETE

Open visualization_outputs/quantum_circuit_professional.html in your browser to view!


In [15]:
# ENSEMBLE VOTING - MAJORITY AND WEIGHTED VOTING

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from scipy import stats
import numpy as np
import pandas as pd

print("\n" + "="*70)
print("GENERATING ENSEMBLE PREDICTIONS")
print("="*70)

# Verify we have predictions
print(f"\nTotal batches processed: {total_batches}")
print(f"Skipped batches: {skipped_batches}")
print(f"Successful batches: {len(svm_preds_list)}")

if len(svm_preds_list) == 0:
    print("\n‚ùå ERROR: No successful batches! Cannot perform ensemble voting.")
else:
    # ============================================================
    # STEP 1: CALCULATE BATCH WEIGHTS (based on accuracy)
    # ============================================================
    
    print("\n" + "="*70)
    print("CALCULATING BATCH WEIGHTS")
    print("="*70)
    
    svm_weights = []
    qsvm_weights = []
    
    for i in range(len(svm_preds_list)):
        svm_acc = accuracy_score(y_test, svm_preds_list[i])
        qsvm_acc = accuracy_score(y_test, qsvm_preds_list[i])
        svm_weights.append(svm_acc)
        qsvm_weights.append(qsvm_acc)
    
    svm_weights = np.array(svm_weights)
    qsvm_weights = np.array(qsvm_weights)
    
    print(f"\nüìä SVM Batch Weights (Accuracies):")
    print(f"   Values: {np.round(svm_weights, 4)}")
    print(f"   Mean: {svm_weights.mean():.4f}")
    print(f"   Std:  {svm_weights.std():.4f}")
    print(f"   Range: [{svm_weights.min():.4f}, {svm_weights.max():.4f}]")
    
    print(f"\nüìä QSVM Batch Weights (Accuracies):")
    print(f"   Values: {np.round(qsvm_weights, 4)}")
    print(f"   Mean: {qsvm_weights.mean():.4f}")
    print(f"   Std:  {qsvm_weights.std():.4f}")
    print(f"   Range: [{qsvm_weights.min():.4f}, {qsvm_weights.max():.4f}]")
    
    
    # ============================================================
    # STEP 2: CONVERT TO ENSEMBLE FORMAT
    # ============================================================
    
    # Convert from (n_models, n_samples) to (n_samples, n_models)
    svm_ensemble = np.array(svm_preds_list).T
    qsvm_ensemble = np.array(qsvm_preds_list).T
    
    print(f"\nüìã Ensemble Shape: {svm_ensemble.shape}")
    print(f"   (n_samples={svm_ensemble.shape[0]}, n_models={svm_ensemble.shape[1]})")
    
    
    # ============================================================
    # STEP 3: MAJORITY VOTING
    # ============================================================
    
    print("\n" + "="*70)
    print("METHOD 1: MAJORITY VOTING")
    print("="*70)
    
    # Each model gets 1 vote, class with most votes wins
    svm_majority_pred = stats.mode(svm_ensemble, axis=1, keepdims=False)[0]
    qsvm_majority_pred = stats.mode(qsvm_ensemble, axis=1, keepdims=False)[0]
    
    print("‚úÖ Majority voting completed")
    print(f"   Each of {svm_ensemble.shape[1]} models gets 1 vote")
    print(f"   Prediction = class with most votes")
    
    
    # ============================================================
    # STEP 4: WEIGHTED VOTING
    # ============================================================
    
    print("\n" + "="*70)
    print("METHOD 2: WEIGHTED VOTING")
    print("="*70)
    
    def weighted_vote(pred_matrix, weights, verbose=False):
        """
        Weighted voting ensemble
        
        Parameters:
        -----------
        pred_matrix : ndarray, shape (n_samples, n_models)
            Predictions from each model
        weights : ndarray, shape (n_models,)
            Weight for each model (typically accuracy)
            
        Returns:
        --------
        final_pred : ndarray, shape (n_samples,)
            Final predictions
        """
        final_pred = []
        disagreement_count = 0
        weight_made_difference = 0
        
        for i in range(pred_matrix.shape[0]):
            # Calculate weighted votes for each class
            votes = {}
            for j, pred in enumerate(pred_matrix[i]):
                votes[pred] = votes.get(pred, 0) + weights[j]
            
            # Get majority vote (for comparison)
            unique, counts = np.unique(pred_matrix[i], return_counts=True)
            majority = unique[np.argmax(counts)]
            
            # Get weighted vote
            weighted = max(votes, key=votes.get)
            
            final_pred.append(weighted)
            
            # Track when models disagree
            if len(unique) > 1:
                disagreement_count += 1
                if majority != weighted:
                    weight_made_difference += 1
        
        if verbose:
            print(f"\n   üìà Voting Diagnostics:")
            print(f"      Samples with model disagreement: {disagreement_count}/{len(pred_matrix)}")
            if disagreement_count > 0:
                print(f"      Times weights changed outcome: {weight_made_difference}")
                print(f"      Weight influence rate: {weight_made_difference/disagreement_count*100:.2f}%")
            else:
                print(f"      All models agreed on all samples")
        
        return np.array(final_pred)
    
    # Apply weighted voting
    svm_weighted_pred = weighted_vote(svm_ensemble, svm_weights, verbose=True)
    qsvm_weighted_pred = weighted_vote(qsvm_ensemble, qsvm_weights, verbose=True)
    
    print("\n‚úÖ Weighted voting completed")
    print(f"   Each model gets vote weight = its accuracy")
    print(f"   Prediction = class with highest weighted sum")
    
    
    # ============================================================
    # STEP 5: COMPARE VOTING METHODS
    # ============================================================
    
    print("\n" + "="*70)
    print("VOTING METHOD COMPARISON")
    print("="*70)
    
    # Check if predictions differ
    svm_diff = (svm_majority_pred != svm_weighted_pred).sum()
    qsvm_diff = (qsvm_majority_pred != qsvm_weighted_pred).sum()
    
    print(f"\nüîç Predictions that differ between methods:")
    print(f"   SVM:  {svm_diff}/{len(svm_majority_pred)} samples ({svm_diff/len(svm_majority_pred)*100:.2f}%)")
    print(f"   QSVM: {qsvm_diff}/{len(qsvm_majority_pred)} samples ({qsvm_diff/len(qsvm_majority_pred)*100:.2f}%)")
    
    if svm_diff == 0 and qsvm_diff == 0:
        print("\n   ‚ÑπÔ∏è  Note: Majority and weighted voting give identical results")
        print("   This happens when batch weights are very similar (stable training)")
    
    
    # ============================================================
    # STEP 6: DETAILED RESULTS - MAJORITY VOTING
    # ============================================================
    
    print("\n" + "="*70)
    print("RESULTS - MAJORITY VOTING")
    print("="*70)
    
    print("\nüîµ CLASSICAL SVM - MAJORITY VOTING")
    print("-" * 70)
    svm_maj_acc = accuracy_score(y_test, svm_majority_pred)
    print(f"Accuracy: {svm_maj_acc:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, svm_majority_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, svm_majority_pred))
    
    print("\n‚öõÔ∏è  QUANTUM SVM - MAJORITY VOTING")
    print("-" * 70)
    qsvm_maj_acc = accuracy_score(y_test, qsvm_majority_pred)
    print(f"Accuracy: {qsvm_maj_acc:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, qsvm_majority_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, qsvm_majority_pred))
    
    
    # ============================================================
    # STEP 7: DETAILED RESULTS - WEIGHTED VOTING
    # ============================================================
    
    print("\n" + "="*70)
    print("RESULTS - WEIGHTED VOTING")
    print("="*70)
    
    print("\nüîµ CLASSICAL SVM - WEIGHTED VOTING")
    print("-" * 70)
    svm_wt_acc = accuracy_score(y_test, svm_weighted_pred)
    print(f"Accuracy: {svm_wt_acc:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, svm_weighted_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, svm_weighted_pred))
    
    print("\n‚öõÔ∏è  QUANTUM SVM - WEIGHTED VOTING")
    print("-" * 70)
    qsvm_wt_acc = accuracy_score(y_test, qsvm_weighted_pred)
    print(f"Accuracy: {qsvm_wt_acc:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, qsvm_weighted_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, qsvm_weighted_pred))
    
    
    # ============================================================
    # STEP 8: SUMMARY TABLE
    # ============================================================
    
    print("\n" + "="*70)
    print("SUMMARY - ALL METHODS")
    print("="*70)
    
    summary_data = []
    
    # Get precision, recall, f1 for each method
    svm_maj_prf = precision_recall_fscore_support(y_test, svm_majority_pred, average='weighted')[:3]
    svm_wt_prf = precision_recall_fscore_support(y_test, svm_weighted_pred, average='weighted')[:3]
    qsvm_maj_prf = precision_recall_fscore_support(y_test, qsvm_majority_pred, average='weighted')[:3]
    qsvm_wt_prf = precision_recall_fscore_support(y_test, qsvm_weighted_pred, average='weighted')[:3]
    
    summary_data.append({
        'Model': 'SVM',
        'Method': 'Majority Vote',
        'Accuracy': f"{svm_maj_acc:.4f}",
        'Precision': f"{svm_maj_prf[0]:.4f}",
        'Recall': f"{svm_maj_prf[1]:.4f}",
        'F1-Score': f"{svm_maj_prf[2]:.4f}"
    })
    
    summary_data.append({
        'Model': 'SVM',
        'Method': 'Weighted Vote',
        'Accuracy': f"{svm_wt_acc:.4f}",
        'Precision': f"{svm_wt_prf[0]:.4f}",
        'Recall': f"{svm_wt_prf[1]:.4f}",
        'F1-Score': f"{svm_wt_prf[2]:.4f}"
    })
    
    summary_data.append({
        'Model': 'QSVM',
        'Method': 'Majority Vote',
        'Accuracy': f"{qsvm_maj_acc:.4f}",
        'Precision': f"{qsvm_maj_prf[0]:.4f}",
        'Recall': f"{qsvm_maj_prf[1]:.4f}",
        'F1-Score': f"{qsvm_maj_prf[2]:.4f}"
    })
    
    summary_data.append({
        'Model': 'QSVM',
        'Method': 'Weighted Vote',
        'Accuracy': f"{qsvm_wt_acc:.4f}",
        'Precision': f"{qsvm_wt_prf[0]:.4f}",
        'Recall': f"{qsvm_wt_prf[1]:.4f}",
        'F1-Score': f"{qsvm_wt_prf[2]:.4f}"
    })
    
    summary_df = pd.DataFrame(summary_data)
    print("\n", summary_df.to_string(index=False))
    
    
    # ============================================================
    # STEP 9: BATCH-WISE ACCURACY ANALYSIS
    # ============================================================
    
    print("\n" + "="*70)
    print("BATCH-WISE ACCURACY ANALYSIS")
    print("="*70)
    
    svm_batch_acc = [accuracy_score(y_test, preds) for preds in svm_preds_list]
    qsvm_batch_acc = [accuracy_score(y_test, preds) for preds in qsvm_preds_list]
    
    print(f"\nüìä SVM Individual Batch Accuracies:")
    for i, acc in enumerate(svm_batch_acc, 1):
        print(f"   Batch {i}: {acc:.4f}")
    
    print(f"\nüìä QSVM Individual Batch Accuracies:")
    for i, acc in enumerate(qsvm_batch_acc, 1):
        print(f"   Batch {i}: {acc:.4f}")
    
    print(f"\nüìà Statistics:")
    print(f"   SVM  - Mean: {np.mean(svm_batch_acc):.4f}, Std: {np.std(svm_batch_acc):.4f}")
    print(f"   QSVM - Mean: {np.mean(qsvm_batch_acc):.4f}, Std: {np.std(qsvm_batch_acc):.4f}")
    
    
    # ============================================================
    # STEP 10: CHOOSE FINAL PREDICTIONS
    # ============================================================
    
    print("\n" + "="*70)
    print("FINAL PREDICTIONS FOR DOWNSTREAM ANALYSIS")
    print("="*70)
    
    # Use weighted voting as final (you can change this)
    svm_final_pred = svm_weighted_pred
    qsvm_final_pred = qsvm_weighted_pred
    
    print("\n‚úÖ Using WEIGHTED VOTING as final predictions")
    print(f"   SVM Final Accuracy:  {accuracy_score(y_test, svm_final_pred):.4f}")
    print(f"   QSVM Final Accuracy: {accuracy_score(y_test, qsvm_final_pred):.4f}")
    
    print("\nüíæ Variables created:")
    print("   - svm_majority_pred, qsvm_majority_pred (majority voting)")
    print("   - svm_weighted_pred, qsvm_weighted_pred (weighted voting)")
    print("   - svm_final_pred, qsvm_final_pred (final predictions)")
    print("   - svm_preds_list, qsvm_preds_list (individual batch predictions)")
    print("   - svm_weights, qsvm_weights (batch weights)")
    
    print("\n" + "="*70)
    print("ENSEMBLE VOTING COMPLETE ‚úÖ")
    print("="*70)


GENERATING ENSEMBLE PREDICTIONS

Total batches processed: 13
Skipped batches: 0
Successful batches: 13

CALCULATING BATCH WEIGHTS

üìä SVM Batch Weights (Accuracies):
   Values: [0.9687 0.9725 0.9732 0.9719 0.9719 0.9719 0.9712 0.9738 0.9719 0.9725
 0.9706 0.9712 0.9738]
   Mean: 0.9719
   Std:  0.0013
   Range: [0.9687, 0.9738]

üìä QSVM Batch Weights (Accuracies):
   Values: [0.9617 0.9681 0.9681 0.9642 0.9668 0.9693 0.9642 0.9687 0.9655 0.9712
 0.9636 0.9642 0.9617]
   Mean: 0.9659
   Std:  0.0029
   Range: [0.9617, 0.9712]

üìã Ensemble Shape: (1565, 13)
   (n_samples=1565, n_models=13)

METHOD 1: MAJORITY VOTING
‚úÖ Majority voting completed
   Each of 13 models gets 1 vote
   Prediction = class with most votes

METHOD 2: WEIGHTED VOTING

   üìà Voting Diagnostics:
      Samples with model disagreement: 22/1565
      Times weights changed outcome: 0
      Weight influence rate: 0.00%

   üìà Voting Diagnostics:
      Samples with model disagreement: 127/1565
      Times weig

In [18]:
# COMPREHENSIVE VISUALIZATION AND EXPLAINABILITY - FIXED VERSION
# Saves all plots as HTML files for viewing in browser

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("COMPREHENSIVE VISUALIZATION & EXPLAINABILITY")
print("="*70)
print("\nüíæ All plots will be saved as HTML files")
print("   You can open them in your web browser\n")

# Create output directory for plots
import os
output_dir = "visualization_outputs"
os.makedirs(output_dir, exist_ok=True)

# ============================================================
# SECTION 1: DATASET OVERVIEW & EDA
# ============================================================

print("\nüìä SECTION 1: DATASET OVERVIEW")
print("-"*70)

# 1.1 Class Distribution
fig_class_dist = go.Figure()

class_counts = df['final_label'].value_counts()
total_count = len(df)

fig_class_dist.add_trace(go.Bar(
    x=['Legitimate (0)', 'Phishing (1)'],
    y=[class_counts[0], class_counts[1]],
    marker_color=['#2ecc71', '#e74c3c'],
    text=[f'{class_counts[0]}<br>({class_counts[0]/total_count:.1%})', 
          f'{class_counts[1]}<br>({class_counts[1]/total_count:.1%})'],
    textposition='auto',
    hovertemplate='<b>%{x}</b><br>Count: %{y}<extra></extra>'
))

fig_class_dist.update_layout(
    title={
        'text': 'üìß Dataset Class Distribution',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20, 'family': 'Arial Black'}
    },
    xaxis_title='Email Type',
    yaxis_title='Count',
    showlegend=False,
    height=500,
    template='plotly_white',
    font=dict(size=12)
)

filename = f"{output_dir}/01_class_distribution.html"
fig_class_dist.write_html(filename)
print(f"‚úÖ Class Distribution saved: {filename}")


# 1.2 Feature Statistics Heatmap
numerical_features = X_train_selected.columns[:6]  # Top 6 features
stats_data = []

for feat in numerical_features:
    stats_data.append({
        'Feature': feat,
        'Mean': X_train_selected[feat].mean(),
        'Std': X_train_selected[feat].std(),
        'Min': X_train_selected[feat].min(),
        'Max': X_train_selected[feat].max()
    })

stats_df = pd.DataFrame(stats_data)

fig_stats = go.Figure(data=go.Heatmap(
    z=stats_df[['Mean', 'Std', 'Min', 'Max']].values.T,
    x=stats_df['Feature'],
    y=['Mean', 'Std', 'Min', 'Max'],
    colorscale='Viridis',
    text=np.round(stats_df[['Mean', 'Std', 'Min', 'Max']].values.T, 3),
    texttemplate='%{text}',
    textfont={"size": 6},
    hovertemplate='Feature: %{x}<br>Stat: %{y}<br>Value: %{z:.3f}<extra></extra>'
))

fig_stats.update_layout(
    title='üìà Feature Statistics Overview (Top 6 Features)',
    xaxis_title='Features',
    yaxis_title='Statistics',
    height=400,
    template='plotly_white'
)

filename = f"{output_dir}/02_feature_statistics.html"
fig_stats.write_html(filename)
print(f"‚úÖ Feature Statistics saved: {filename}")


# ============================================================
# SECTION 2: FEATURE SELECTION EXPLAINABILITY
# ============================================================

print("\nüéØ SECTION 2: FEATURE SELECTION (QUBO)")
print("-"*70)

# 2.1 Mutual Information Scores
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X_train_scaled, y_train, random_state=42)
mi_df = pd.DataFrame({
    'Feature': X_train.columns,
    'MI_Score': mi_scores
}).sort_values('MI_Score', ascending=False)

# Top 15 MI features
fig_mi = go.Figure()

fig_mi.add_trace(go.Bar(
    x=mi_df.head(15)['MI_Score'],
    y=mi_df.head(15)['Feature'],
    orientation='h',
    marker=dict(
        color=mi_df.head(15)['MI_Score'],
        colorscale='Blues',
        showscale=True,
        colorbar=dict(title="MI Score")
    ),
    text=np.round(mi_df.head(15)['MI_Score'], 3),
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>MI Score: %{x:.4f}<extra></extra>'
))

fig_mi.update_layout(
    title='üéØ Top 15 Features by Mutual Information Score',
    xaxis_title='Mutual Information Score',
    yaxis_title='Features',
    height=600,
    template='plotly_white',
    yaxis=dict(autorange="reversed")
)

filename = f"{output_dir}/03_mutual_information.html"
fig_mi.write_html(filename)
print(f"‚úÖ Mutual Information saved: {filename}")


# 2.2 QUBO Selected Features Comparison
fig_qubo = go.Figure()

all_features = set(X_train.columns)
selected = set(selected_features)
not_selected = all_features - selected

fig_qubo.add_trace(go.Bar(
    name='Selected by QUBO',
    x=['Selected', 'Not Selected'],
    y=[len(selected), len(not_selected)],
    marker_color=['#3498db', '#95a5a6'],
    text=[len(selected), len(not_selected)],
    textposition='auto'
))

fig_qubo.update_layout(
    title=f'‚öõÔ∏è QUBO Feature Selection: {len(selected)}/{len(all_features)} Features Selected',
    yaxis_title='Number of Features',
    height=400,
    template='plotly_white',
    showlegend=False
)

filename = f"{output_dir}/04_qubo_selection.html"
fig_qubo.write_html(filename)
print(f"‚úÖ QUBO selection saved: {filename}")


# ============================================================
# SECTION 3: TRAINING PROCESS VISUALIZATION
# ============================================================

print("\nüöÄ SECTION 3: BATCH-WISE TRAINING ANALYSIS")
print("-"*70)

# 3.1 Batch Accuracy Stability
svm_batch_acc = [accuracy_score(y_test, preds) for preds in svm_preds_list]
qsvm_batch_acc = [accuracy_score(y_test, preds) for preds in qsvm_preds_list]

fig_batch = go.Figure()

# SVM line
fig_batch.add_trace(go.Scatter(
    x=list(range(1, len(svm_batch_acc) + 1)),
    y=svm_batch_acc,
    mode='lines+markers',
    name='Classical SVM',
    line=dict(color='#3498db', width=3),
    marker=dict(size=10, symbol='circle'),
    hovertemplate='<b>SVM Batch %{x}</b><br>Accuracy: %{y:.4f}<extra></extra>'
))

# QSVM line
fig_batch.add_trace(go.Scatter(
    x=list(range(1, len(qsvm_batch_acc) + 1)),
    y=qsvm_batch_acc,
    mode='lines+markers',
    name='Quantum SVM',
    line=dict(color='#e74c3c', width=3),
    marker=dict(size=10, symbol='diamond'),
    hovertemplate='<b>QSVM Batch %{x}</b><br>Accuracy: %{y:.4f}<extra></extra>'
))

# Add mean lines with better positioning
svm_mean = np.mean(svm_batch_acc)
qsvm_mean = np.mean(qsvm_batch_acc)

fig_batch.add_hline(
    y=svm_mean, 
    line_dash="dash", 
    line_color="#3498db",
    line_width=2,
    annotation_text=f"SVM Mean: {svm_mean:.4f}",
    annotation_position="top right",
    annotation=dict(
        font=dict(size=12, color="#3498db"),
        bgcolor="rgba(255, 255, 255, 0.8)",
        bordercolor="#3498db",
        borderwidth=1,
        borderpad=4
    )
)

fig_batch.add_hline(
    y=qsvm_mean, 
    line_dash="dash", 
    line_color="#e74c3c",
    line_width=2,
    annotation_text=f"QSVM Mean: {qsvm_mean:.4f}",
    annotation_position="bottom right",
    annotation=dict(
        font=dict(size=12, color="#e74c3c"),
        bgcolor="rgba(255, 255, 255, 0.8)",
        bordercolor="#e74c3c",
        borderwidth=1,
        borderpad=4
    )
)

fig_batch.update_layout(
    title='üìä Batch-wise Accuracy Stability',
    xaxis_title='Batch Number',
    yaxis_title='Accuracy',
    height=500,
    width=900,  # Added fixed width
    template='plotly_white',
    hovermode='x unified',
    margin=dict(r=150),  # Add right margin for annotations
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

filename = f"{output_dir}/05_batch_stability.html"
fig_batch.write_html(filename)
print(f"‚úÖ Batch stability saved: {filename}")




# 3.2 Batch Weight Distribution
fig_weights = go.Figure()

fig_weights.add_trace(go.Box(
    y=svm_weights,
    name='SVM Weights',
    marker_color='#3498db',
    boxmean='sd'
))

fig_weights.add_trace(go.Box(
    y=qsvm_weights,
    name='QSVM Weights',
    marker_color='#e74c3c',
    boxmean='sd'
))

fig_weights.update_layout(
    title='üì¶ Batch Weight Distribution (Accuracy-based)',
    yaxis_title='Weight (Accuracy)',
    height=500,
    template='plotly_white',
    showlegend=True
)

filename = f"{output_dir}/06_weight_distribution.html"
fig_weights.write_html(filename)
print(f"‚úÖ Weight distribution saved: {filename}")


# ============================================================
# SECTION 4: MODEL PERFORMANCE COMPARISON
# ============================================================

print("\nüéØ SECTION 4: MODEL PERFORMANCE COMPARISON")
print("-"*70)

# 4.1 Confusion Matrix Comparison
# Create subplot with 2 confusion matrices
fig_cm = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Classical SVM', 'Quantum SVM'),
    specs=[[{"type": "heatmap"}, {"type": "heatmap"}]]
)

# SVM confusion matrix
cm_svm = confusion_matrix(y_test, svm_final_pred)
cm_svm_norm = cm_svm.astype('float') / cm_svm.sum(axis=1)[:, np.newaxis]

fig_cm.add_trace(
    go.Heatmap(
        z=cm_svm_norm,
        x=['Predicted<br>Legitimate', 'Predicted<br>Phishing'],
        y=['Actual<br>Legitimate', 'Actual<br>Phishing'],
        text=cm_svm,
        texttemplate='<b>%{text}</b><br>(%{z:.1%})',
        colorscale='Blues',
        showscale=False,
        hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{text}<br>Rate: %{z:.2%}<extra></extra>'
    ),
    row=1, col=1
)

# QSVM confusion matrix
cm_qsvm = confusion_matrix(y_test, qsvm_final_pred)
cm_qsvm_norm = cm_qsvm.astype('float') / cm_qsvm.sum(axis=1)[:, np.newaxis]

fig_cm.add_trace(
    go.Heatmap(
        z=cm_qsvm_norm,
        x=['Predicted<br>Legitimate', 'Predicted<br>Phishing'],
        y=['Actual<br>Legitimate', 'Actual<br>Phishing'],
        text=cm_qsvm,
        texttemplate='<b>%{text}</b><br>(%{z:.1%})',
        colorscale='Reds',
        showscale=True,
        colorbar=dict(title="Rate", x=1.15),
        hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{text}<br>Rate: %{z:.2%}<extra></extra>'
    ),
    row=1, col=2
)

fig_cm.update_layout(
    title_text='üéØ Confusion Matrix Comparison',
    height=500,
    template='plotly_white'
)

filename = f"{output_dir}/07_confusion_matrix.html"
fig_cm.write_html(filename)
print(f"‚úÖ Confusion matrix saved: {filename}")


# 4.2 Performance Metrics Bar Chart
svm_prf = precision_recall_fscore_support(y_test, svm_final_pred, average='weighted')
qsvm_prf = precision_recall_fscore_support(y_test, qsvm_final_pred, average='weighted')

svm_acc = accuracy_score(y_test, svm_final_pred)
qsvm_acc = accuracy_score(y_test, qsvm_final_pred)

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
svm_scores = [svm_acc, svm_prf[0], svm_prf[1], svm_prf[2]]
qsvm_scores = [qsvm_acc, qsvm_prf[0], qsvm_prf[1], qsvm_prf[2]]

fig_metrics = go.Figure()

fig_metrics.add_trace(go.Bar(
    name='Classical SVM',
    x=metrics,
    y=svm_scores,
    marker_color='#3498db',
    text=[f'{v:.4f}' for v in svm_scores],
    textposition='auto',
    hovertemplate='<b>SVM %{x}</b><br>Score: %{y:.4f}<extra></extra>'
))

fig_metrics.add_trace(go.Bar(
    name='Quantum SVM',
    x=metrics,
    y=qsvm_scores,
    marker_color='#e74c3c',
    text=[f'{v:.4f}' for v in qsvm_scores],
    textposition='auto',
    hovertemplate='<b>QSVM %{x}</b><br>Score: %{y:.4f}<extra></extra>'
))

fig_metrics.update_layout(
    title='üìä Performance Metrics Comparison',
    yaxis_title='Score',
    yaxis=dict(range=[0.8, 1.0]),
    barmode='group',
    height=500,
    template='plotly_white',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

filename = f"{output_dir}/08_performance_metrics.html"
fig_metrics.write_html(filename)
print(f"‚úÖ Performance metrics saved: {filename}")


# 4.3 Per-Class Performance
svm_pc = precision_recall_fscore_support(y_test, svm_final_pred, average=None)
qsvm_pc = precision_recall_fscore_support(y_test, qsvm_final_pred, average=None)

class_names = ['Legitimate', 'Phishing']

fig_class_perf = make_subplots(
    rows=1, cols=3,
    subplot_titles=('Precision', 'Recall', 'F1-Score'),
    specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}]]
)

# Precision
fig_class_perf.add_trace(
    go.Bar(name='SVM', x=class_names, y=svm_pc[0], marker_color='#3498db',
           text=[f'{v:.3f}' for v in svm_pc[0]], textposition='auto'),
    row=1, col=1
)
fig_class_perf.add_trace(
    go.Bar(name='QSVM', x=class_names, y=qsvm_pc[0], marker_color='#e74c3c',
           text=[f'{v:.3f}' for v in qsvm_pc[0]], textposition='auto'),
    row=1, col=1
)

# Recall
fig_class_perf.add_trace(
    go.Bar(name='SVM', x=class_names, y=svm_pc[1], marker_color='#3498db',
           text=[f'{v:.3f}' for v in svm_pc[1]], textposition='auto', showlegend=False),
    row=1, col=2
)
fig_class_perf.add_trace(
    go.Bar(name='QSVM', x=class_names, y=qsvm_pc[1], marker_color='#e74c3c',
           text=[f'{v:.3f}' for v in qsvm_pc[1]], textposition='auto', showlegend=False),
    row=1, col=2
)

# F1-Score
fig_class_perf.add_trace(
    go.Bar(name='SVM', x=class_names, y=svm_pc[2], marker_color='#3498db',
           text=[f'{v:.3f}' for v in svm_pc[2]], textposition='auto', showlegend=False),
    row=1, col=3
)
fig_class_perf.add_trace(
    go.Bar(name='QSVM', x=class_names, y=qsvm_pc[2], marker_color='#e74c3c',
           text=[f'{v:.3f}' for v in qsvm_pc[2]], textposition='auto', showlegend=False),
    row=1, col=3
)

fig_class_perf.update_layout(
    title_text='üìà Per-Class Performance Comparison',
    height=500,
    template='plotly_white',
    barmode='group',
    showlegend=True
)

fig_class_perf.update_yaxes(range=[0.8, 1.0])

filename = f"{output_dir}/09_per_class_performance.html"
fig_class_perf.write_html(filename)
print(f"‚úÖ Per-class performance saved: {filename}")


# ============================================================
# SECTION 5: VOTING METHOD COMPARISON
# ============================================================

print("\nüó≥Ô∏è SECTION 5: VOTING METHOD COMPARISON")
print("-"*70)

# Calculate accuracies for all methods
svm_maj_acc = accuracy_score(y_test, svm_majority_pred)
svm_wt_acc = accuracy_score(y_test, svm_weighted_pred)
qsvm_maj_acc = accuracy_score(y_test, qsvm_majority_pred)
qsvm_wt_acc = accuracy_score(y_test, qsvm_weighted_pred)

fig_voting = go.Figure()

methods = ['Majority<br>Voting', 'Weighted<br>Voting']

fig_voting.add_trace(go.Bar(
    name='Classical SVM',
    x=methods,
    y=[svm_maj_acc, svm_wt_acc],
    marker_color='#3498db',
    text=[f'{svm_maj_acc:.4f}', f'{svm_wt_acc:.4f}'],
    textposition='auto'
))

fig_voting.add_trace(go.Bar(
    name='Quantum SVM',
    x=methods,
    y=[qsvm_maj_acc, qsvm_wt_acc],
    marker_color='#e74c3c',
    text=[f'{qsvm_maj_acc:.4f}', f'{qsvm_wt_acc:.4f}'],
    textposition='auto'
))

fig_voting.update_layout(
    title='üó≥Ô∏è Voting Method Comparison: Majority vs Weighted',
    yaxis_title='Accuracy',
    yaxis=dict(range=[0.85, 1.0]),
    barmode='group',
    height=500,
    template='plotly_white'
)

filename = f"{output_dir}/10_voting_comparison.html"
fig_voting.write_html(filename)
print(f"‚úÖ Voting comparison saved: {filename}")


# ============================================================
# SECTION 6: FEATURE CONTRIBUTION ANALYSIS
# ============================================================

print("\nüîç SECTION 6: FEATURE CONTRIBUTION ANALYSIS")
print("-"*70)

# Compare feature values for legitimate vs phishing emails
feature_contrib_data = []

# Get indices for each class
legit_indices = y_train[y_train == 0].index
phish_indices = y_train[y_train == 1].index

for feat in selected_features[:10]:  # Top 10 QUBO features
    legit_mean = X_train.loc[legit_indices, feat].mean()
    phish_mean = X_train.loc[phish_indices, feat].mean()
    diff = abs(phish_mean - legit_mean)
    
    feature_contrib_data.append({
        'Feature': feat,
        'Legitimate_Mean': legit_mean,
        'Phishing_Mean': phish_mean,
        'Difference': diff
    })

contrib_df = pd.DataFrame(feature_contrib_data).sort_values('Difference', ascending=False)

fig_contrib = go.Figure()

fig_contrib.add_trace(go.Bar(
    name='Legitimate Emails',
    y=contrib_df['Feature'],
    x=contrib_df['Legitimate_Mean'],
    orientation='h',
    marker_color='#2ecc71',
    text=[f'{v:.3f}' for v in contrib_df['Legitimate_Mean']],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Legitimate: %{x:.4f}<extra></extra>'
))

fig_contrib.add_trace(go.Bar(
    name='Phishing Emails',
    y=contrib_df['Feature'],
    x=contrib_df['Phishing_Mean'],
    orientation='h',
    marker_color='#e74c3c',
    text=[f'{v:.3f}' for v in contrib_df['Phishing_Mean']],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Phishing: %{x:.4f}<extra></extra>'
))

fig_contrib.update_layout(
    title='üîç Feature Value Comparison: Legitimate vs Phishing',
    xaxis_title='Mean Feature Value',
    yaxis_title='Features',
    barmode='group',
    height=600,
    template='plotly_white',
    yaxis=dict(autorange="reversed"),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

filename = f"{output_dir}/11_feature_contribution.html"
fig_contrib.write_html(filename)
print(f"‚úÖ Feature contribution saved: {filename}")

# Print insights
print(f"\n   Features with largest difference:")
for i, row in contrib_df.head(5).iterrows():
    print(f"   ‚Ä¢ {row['Feature']}: Legit={row['Legitimate_Mean']:.3f}, Phish={row['Phishing_Mean']:.3f}, Diff={row['Difference']:.3f}")


# ============================================================
# SECTION 7: DECISION BOUNDARY VISUALIZATION (2D PCA)
# ============================================================

print("\nüé® SECTION 7: DECISION BOUNDARY VISUALIZATION")
print("-"*70)

from sklearn.decomposition import PCA

# Reduce to 2D for visualization
pca = PCA(n_components=2, random_state=42)
X_test_pca = pca.fit_transform(X_test_scaled[selected_features])

# Create scatter plot
fig_boundary = go.Figure()

# Actual labels
fig_boundary.add_trace(go.Scatter(
    x=X_test_pca[y_test==0, 0],
    y=X_test_pca[y_test==0, 1],
    mode='markers',
    name='Legitimate (Actual)',
    marker=dict(size=8, color='#2ecc71', symbol='circle', opacity=0.6)
))

fig_boundary.add_trace(go.Scatter(
    x=X_test_pca[y_test==1, 0],
    y=X_test_pca[y_test==1, 1],
    mode='markers',
    name='Phishing (Actual)',
    marker=dict(size=8, color='#e74c3c', symbol='x', opacity=0.6)
))

# Misclassifications
misclass_idx = np.where(qsvm_final_pred != y_test)[0]
if len(misclass_idx) > 0:
    fig_boundary.add_trace(go.Scatter(
        x=X_test_pca[misclass_idx, 0],
        y=X_test_pca[misclass_idx, 1],
        mode='markers',
        name='Misclassified',
        marker=dict(size=12, color='yellow', symbol='circle-open', 
                   line=dict(width=2, color='black'))
    ))

fig_boundary.update_layout(
    title=f'üé® Decision Space (PCA 2D) - {len(misclass_idx)} Misclassifications',
    xaxis_title=f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)',
    yaxis_title=f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)',
    height=600,
    template='plotly_white'
)

filename = f"{output_dir}/12_decision_boundary.html"
fig_boundary.write_html(filename)
print(f"‚úÖ Decision boundary saved: {filename}")


# ============================================================
# FINAL SUMMARY
# ============================================================

print("\n" + "="*70)
print("‚úÖ ALL VISUALIZATIONS SAVED SUCCESSFULLY")
print("="*70)

print(f"\nüìÇ Output Directory: {output_dir}/")
print("\nüìä Generated Files:")
print("   01_class_distribution.html")
print("   02_feature_statistics.html")
print("   03_mutual_information.html")
print("   04_qubo_selection.html")
print("   05_batch_stability.html")
print("   06_weight_distribution.html")
print("   07_confusion_matrix.html")
print("   08_performance_metrics.html")
print("   09_per_class_performance.html")
print("   10_voting_comparison.html")
print("   11_feature_contribution.html")
print("   12_decision_boundary.html")



print("\nüéØ Key Insights:")
print(f"   ‚Ä¢ Dataset: {len(df)} samples ({class_counts[0]} legitimate, {class_counts[1]} phishing)")
print(f"   ‚Ä¢ Features: {len(selected_features)} selected by QUBO from {len(X_train.columns)} total")
print(f"   ‚Ä¢ SVM Accuracy: {svm_acc:.4f}")
print(f"   ‚Ä¢ QSVM Accuracy: {qsvm_acc:.4f}")
print(f"   ‚Ä¢ Batch Training: {len(svm_preds_list)} successful batches")
print(f"   ‚Ä¢ Misclassifications: {len(misclass_idx)}")

COMPREHENSIVE VISUALIZATION & EXPLAINABILITY

üíæ All plots will be saved as HTML files
   You can open them in your web browser


üìä SECTION 1: DATASET OVERVIEW
----------------------------------------------------------------------
‚úÖ Class Distribution saved: visualization_outputs/01_class_distribution.html
‚úÖ Feature Statistics saved: visualization_outputs/02_feature_statistics.html

üéØ SECTION 2: FEATURE SELECTION (QUBO)
----------------------------------------------------------------------
‚úÖ Mutual Information saved: visualization_outputs/03_mutual_information.html
‚úÖ QUBO selection saved: visualization_outputs/04_qubo_selection.html

üöÄ SECTION 3: BATCH-WISE TRAINING ANALYSIS
----------------------------------------------------------------------
‚úÖ Batch stability saved: visualization_outputs/05_batch_stability.html
‚úÖ Weight distribution saved: visualization_outputs/06_weight_distribution.html

üéØ SECTION 4: MODEL PERFORMANCE COMPARISON
--------------------------

In [19]:
# COMPREHENSIVE FEATURE COMPARISON - LEGITIMATE VS PHISHING
# Generates comparison charts for all features and MI-selected features

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
import os

print("="*70)
print("FEATURE COMPARISON: LEGITIMATE VS PHISHING")
print("="*70)

# Create output directory
output_dir = "visualization_outputs"
os.makedirs(output_dir, exist_ok=True)

# Get class indices
legit_indices = y_train[y_train == 0].index
phish_indices = y_train[y_train == 1].index

print(f"\nAnalyzing features...")
print(f"   Legitimate emails: {len(legit_indices)}")
print(f"   Phishing emails: {len(phish_indices)}")


# ============================================================
# SECTION 1: ALL FEATURES COMPARISON
# ============================================================

print("\nüìä SECTION 1: ALL FEATURES COMPARISON")
print("-"*70)

all_features = X_train.columns.tolist()
feature_comparison_data = []

for feat in all_features:
    legit_mean = X_train.loc[legit_indices, feat].mean()
    phish_mean = X_train.loc[phish_indices, feat].mean()
    diff = abs(phish_mean - legit_mean)
    
    feature_comparison_data.append({
        'Feature': feat,
        'Legitimate_Mean': legit_mean,
        'Phishing_Mean': phish_mean,
        'Absolute_Difference': diff,
        'Relative_Difference': diff / (legit_mean + 0.0001)  # Avoid division by zero
    })

all_features_df = pd.DataFrame(feature_comparison_data)
all_features_df = all_features_df.sort_values('Absolute_Difference', ascending=False)

print(f"‚úÖ Analyzed {len(all_features)} features")
print(f"\n   Top 5 features with largest differences:")
for i, row in all_features_df.head(5).iterrows():
    print(f"   {i+1}. {row['Feature']}: Legit={row['Legitimate_Mean']:.3f}, Phish={row['Phishing_Mean']:.3f}, Diff={row['Absolute_Difference']:.3f}")


# Create visualization for ALL features (top 20)
fig_all = go.Figure()

top_20_features = all_features_df.head(20)

fig_all.add_trace(go.Bar(
    name='Legitimate Emails',
    y=top_20_features['Feature'],
    x=top_20_features['Legitimate_Mean'],
    orientation='h',
    marker_color='#2ecc71',
    text=[f'{v:.3f}' for v in top_20_features['Legitimate_Mean']],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Legitimate: %{x:.4f}<extra></extra>'
))

fig_all.add_trace(go.Bar(
    name='Phishing Emails',
    y=top_20_features['Feature'],
    x=top_20_features['Phishing_Mean'],
    orientation='h',
    marker_color='#e74c3c',
    text=[f'{v:.3f}' for v in top_20_features['Phishing_Mean']],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Phishing: %{x:.4f}<extra></extra>'
))

fig_all.update_layout(
    title='üîç Feature Value Comparison: Legitimate vs Phishing (Top 20 by Difference)',
    xaxis_title='Mean Feature Value',
    yaxis_title='Features',
    barmode='group',
    height=800,
    width=1000,
    template='plotly_white',
    yaxis=dict(autorange="reversed"),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

filename = f"{output_dir}/feature_comparison_top20.html"
fig_all.write_html(filename)
print(f"\n‚úÖ Top 20 features chart saved: {filename}")


# Create comprehensive chart with ALL features (in batches of 25)
num_features = len(all_features)
batch_size = 25
num_batches = (num_features + batch_size - 1) // batch_size

print(f"\nüìä Creating charts for all {num_features} features in {num_batches} batches...")

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, num_features)
    batch_features = all_features_df.iloc[start_idx:end_idx]
    
    fig_batch = go.Figure()
    
    fig_batch.add_trace(go.Bar(
        name='Legitimate Emails',
        y=batch_features['Feature'],
        x=batch_features['Legitimate_Mean'],
        orientation='h',
        marker_color='#2ecc71',
        text=[f'{v:.3f}' for v in batch_features['Legitimate_Mean']],
        textposition='auto',
        hovertemplate='<b>%{y}</b><br>Legitimate: %{x:.4f}<extra></extra>'
    ))
    
    fig_batch.add_trace(go.Bar(
        name='Phishing Emails',
        y=batch_features['Feature'],
        x=batch_features['Phishing_Mean'],
        orientation='h',
        marker_color='#e74c3c',
        text=[f'{v:.3f}' for v in batch_features['Phishing_Mean']],
        textposition='auto',
        hovertemplate='<b>%{y}</b><br>Phishing: %{x:.4f}<extra></extra>'
    ))
    
    fig_batch.update_layout(
        title=f'üîç Feature Comparison - Batch {batch_idx + 1}/{num_batches} (Features {start_idx + 1}-{end_idx})',
        xaxis_title='Mean Feature Value',
        yaxis_title='Features',
        barmode='group',
        height=800,
        width=1000,
        template='plotly_white',
        yaxis=dict(autorange="reversed"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    
    filename = f"{output_dir}/feature_comparison_all_batch{batch_idx + 1:02d}.html"
    fig_batch.write_html(filename)
    print(f"   ‚úÖ Batch {batch_idx + 1}/{num_batches} saved: {filename}")


# ============================================================
# SECTION 2: MI-SELECTED FEATURES COMPARISON
# ============================================================

print("\nüéØ SECTION 2: MI-SELECTED FEATURES COMPARISON")
print("-"*70)

# Calculate Mutual Information
print("   Calculating Mutual Information scores...")
mi_scores = mutual_info_classif(X_train_scaled, y_train, random_state=42)
mi_df = pd.DataFrame({
    'Feature': X_train.columns,
    'MI_Score': mi_scores
}).sort_values('MI_Score', ascending=False)

# Select top features by MI (e.g., top 15)
n_top_mi = 15
top_mi_features = mi_df.head(n_top_mi)['Feature'].tolist()

print(f"‚úÖ Selected top {n_top_mi} features by Mutual Information")
print(f"\n   Top 5 MI features:")
for i, row in mi_df.head(5).iterrows():
    print(f"   {i+1}. {row['Feature']}: MI={row['MI_Score']:.4f}")

# Get comparison data for MI features
mi_feature_comparison = []

for feat in top_mi_features:
    legit_mean = X_train.loc[legit_indices, feat].mean()
    phish_mean = X_train.loc[phish_indices, feat].mean()
    diff = abs(phish_mean - legit_mean)
    mi_score = mi_df[mi_df['Feature'] == feat]['MI_Score'].values[0]
    
    mi_feature_comparison.append({
        'Feature': feat,
        'Legitimate_Mean': legit_mean,
        'Phishing_Mean': phish_mean,
        'Difference': diff,
        'MI_Score': mi_score
    })

mi_features_df = pd.DataFrame(mi_feature_comparison)
mi_features_df = mi_features_df.sort_values('MI_Score', ascending=False)

# Create MI features comparison chart
fig_mi = go.Figure()

fig_mi.add_trace(go.Bar(
    name='Legitimate Emails',
    y=mi_features_df['Feature'],
    x=mi_features_df['Legitimate_Mean'],
    orientation='h',
    marker_color='#2ecc71',
    text=[f'{v:.3f}' for v in mi_features_df['Legitimate_Mean']],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Legitimate: %{x:.4f}<br>MI Score: %{customdata:.4f}<extra></extra>',
    customdata=mi_features_df['MI_Score']
))

fig_mi.add_trace(go.Bar(
    name='Phishing Emails',
    y=mi_features_df['Feature'],
    x=mi_features_df['Phishing_Mean'],
    orientation='h',
    marker_color='#e74c3c',
    text=[f'{v:.3f}' for v in mi_features_df['Phishing_Mean']],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Phishing: %{x:.4f}<br>MI Score: %{customdata:.4f}<extra></extra>',
    customdata=mi_features_df['MI_Score']
))

fig_mi.update_layout(
    title=f'üéØ Top {n_top_mi} Features by Mutual Information: Legitimate vs Phishing',
    xaxis_title='Mean Feature Value',
    yaxis_title='Features (Ranked by MI Score)',
    barmode='group',
    height=700,
    width=1000,
    template='plotly_white',
    yaxis=dict(autorange="reversed"),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

filename = f"{output_dir}/feature_comparison_top_MI.html"
fig_mi.write_html(filename)
print(f"\n‚úÖ Top MI features chart saved: {filename}")


# ============================================================
# SECTION 3: QUBO-SELECTED FEATURES COMPARISON
# ============================================================

if 'selected_features' in locals() or 'selected_features' in globals():
    print("\n‚öõÔ∏è SECTION 3: QUBO-SELECTED FEATURES COMPARISON")
    print("-"*70)
    
    # Get comparison data for QUBO features
    qubo_feature_comparison = []
    
    for feat in selected_features:
        legit_mean = X_train.loc[legit_indices, feat].mean()
        phish_mean = X_train.loc[phish_indices, feat].mean()
        diff = abs(phish_mean - legit_mean)
        
        qubo_feature_comparison.append({
            'Feature': feat,
            'Legitimate_Mean': legit_mean,
            'Phishing_Mean': phish_mean,
            'Difference': diff
        })
    
    qubo_features_df = pd.DataFrame(qubo_feature_comparison)
    qubo_features_df = qubo_features_df.sort_values('Difference', ascending=False)
    
    print(f"‚úÖ Analyzed {len(selected_features)} QUBO-selected features")
    
    # Create QUBO features comparison chart
    fig_qubo = go.Figure()
    
    # Show top 15 or all if less
    n_display = min(15, len(qubo_features_df))
    display_qubo = qubo_features_df.head(n_display)
    
    fig_qubo.add_trace(go.Bar(
        name='Legitimate Emails',
        y=display_qubo['Feature'],
        x=display_qubo['Legitimate_Mean'],
        orientation='h',
        marker_color='#2ecc71',
        text=[f'{v:.3f}' for v in display_qubo['Legitimate_Mean']],
        textposition='auto',
        hovertemplate='<b>%{y}</b><br>Legitimate: %{x:.4f}<extra></extra>'
    ))
    
    fig_qubo.add_trace(go.Bar(
        name='Phishing Emails',
        y=display_qubo['Feature'],
        x=display_qubo['Phishing_Mean'],
        orientation='h',
        marker_color='#e74c3c',
        text=[f'{v:.3f}' for v in display_qubo['Phishing_Mean']],
        textposition='auto',
        hovertemplate='<b>%{y}</b><br>Phishing: %{x:.4f}<extra></extra>'
    ))
    
    fig_qubo.update_layout(
        title=f'‚öõÔ∏è QUBO-Selected Features: Legitimate vs Phishing (Top {n_display} by Difference)',
        xaxis_title='Mean Feature Value',
        yaxis_title='Features',
        barmode='group',
        height=700,
        width=1000,
        template='plotly_white',
        yaxis=dict(autorange="reversed"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    
    filename = f"{output_dir}/feature_comparison_QUBO_selected.html"
    fig_qubo.write_html(filename)
    print(f"\n‚úÖ QUBO features chart saved: {filename}")
    
    # If QUBO has many features, create full chart
    if len(qubo_features_df) > 15:
        fig_qubo_full = go.Figure()
        
        fig_qubo_full.add_trace(go.Bar(
            name='Legitimate Emails',
            y=qubo_features_df['Feature'],
            x=qubo_features_df['Legitimate_Mean'],
            orientation='h',
            marker_color='#2ecc71',
            text=[f'{v:.3f}' for v in qubo_features_df['Legitimate_Mean']],
            textposition='auto'
        ))
        
        fig_qubo_full.add_trace(go.Bar(
            name='Phishing Emails',
            y=qubo_features_df['Feature'],
            x=qubo_features_df['Phishing_Mean'],
            orientation='h',
            marker_color='#e74c3c',
            text=[f'{v:.3f}' for v in qubo_features_df['Phishing_Mean']],
            textposition='auto'
        ))
        
        fig_qubo_full.update_layout(
            title=f'‚öõÔ∏è All {len(selected_features)} QUBO-Selected Features: Legitimate vs Phishing',
            xaxis_title='Mean Feature Value',
            yaxis_title='Features',
            barmode='group',
            height=max(800, len(qubo_features_df) * 30),
            width=1000,
            template='plotly_white',
            yaxis=dict(autorange="reversed")
        )
        
        filename = f"{output_dir}/feature_comparison_QUBO_all.html"
        fig_qubo_full.write_html(filename)
        print(f"‚úÖ All QUBO features chart saved: {filename}")

else:
    print("\n‚ö†Ô∏è  QUBO selected_features not found, skipping QUBO comparison")


# ============================================================
# SECTION 4: SUMMARY STATISTICS
# ============================================================

print("\nüìä SECTION 4: SUMMARY STATISTICS")
print("-"*70)

# Create summary DataFrame
summary_data = {
    'Category': ['All Features', f'Top {n_top_mi} MI Features'],
    'Count': [len(all_features), n_top_mi],
    'Avg Difference': [
        all_features_df['Absolute_Difference'].mean(),
        mi_features_df['Difference'].mean()
    ],
    'Max Difference': [
        all_features_df['Absolute_Difference'].max(),
        mi_features_df['Difference'].max()
    ]
}

if 'selected_features' in locals() or 'selected_features' in globals():
    summary_data['Category'].append('QUBO Selected')
    summary_data['Count'].append(len(selected_features))
    summary_data['Avg Difference'].append(qubo_features_df['Difference'].mean())
    summary_data['Max Difference'].append(qubo_features_df['Difference'].max())

summary_df = pd.DataFrame(summary_data)

print("\n" + summary_df.to_string(index=False))

# Save summary
summary_df.to_csv(f"{output_dir}/feature_comparison_summary.csv", index=False)
print(f"\n‚úÖ Summary saved: {output_dir}/feature_comparison_summary.csv")


# ============================================================
# FINAL SUMMARY
# ============================================================

print("\n" + "="*70)
print("‚úÖ FEATURE COMPARISON COMPLETE")
print("="*70)

print(f"\nüìÇ Output Directory: {output_dir}/")
print("\nüìä Generated Files:")
print(f"   ‚Ä¢ feature_comparison_top20.html (Top 20 features)")
print(f"   ‚Ä¢ feature_comparison_all_batch01.html to batch{num_batches:02d}.html (All features)")
print(f"   ‚Ä¢ feature_comparison_top_MI.html (Top {n_top_mi} MI features)")
if 'selected_features' in locals() or 'selected_features' in globals():
    print(f"   ‚Ä¢ feature_comparison_QUBO_selected.html (QUBO features)")
    if len(selected_features) > 15:
        print(f"   ‚Ä¢ feature_comparison_QUBO_all.html (All QUBO features)")
print(f"   ‚Ä¢ feature_comparison_summary.csv (Summary statistics)")

print("\nüåê To view: Open any .html file in your web browser")
print("\nüéØ Key Insights:")
print(f"   ‚Ä¢ Total features analyzed: {len(all_features)}")
print(f"   ‚Ä¢ Feature with largest difference: {all_features_df.iloc[0]['Feature']}")
print(f"   ‚Ä¢ Difference: {all_features_df.iloc[0]['Absolute_Difference']:.3f}")
print(f"   ‚Ä¢ Top MI feature: {mi_df.iloc[0]['Feature']} (MI={mi_df.iloc[0]['MI_Score']:.4f})")

FEATURE COMPARISON: LEGITIMATE VS PHISHING

Analyzing features...
   Legitimate emails: 3200
   Phishing emails: 3057

üìä SECTION 1: ALL FEATURES COMPARISON
----------------------------------------------------------------------
‚úÖ Analyzed 36 features

   Top 5 features with largest differences:
   8. domain_age: Legit=7778.861, Phish=3747.043, Diff=4031.818
   6. web_js_len: Legit=107.271, Phish=559.780, Diff=452.509
   7. web_js_obf_len: Legit=0.000, Phish=366.361, Diff=366.361
   1. email_subject_len: Legit=48.022, Phish=31.881, Diff=16.141
   29. content_num_scripts: Legit=2.176, Phish=11.274, Diff=9.098

‚úÖ Top 20 features chart saved: visualization_outputs/feature_comparison_top20.html

üìä Creating charts for all 36 features in 2 batches...
   ‚úÖ Batch 1/2 saved: visualization_outputs/feature_comparison_all_batch01.html
   ‚úÖ Batch 2/2 saved: visualization_outputs/feature_comparison_all_batch02.html

üéØ SECTION 2: MI-SELECTED FEATURES COMPARISON
------------------------

In [25]:
# ADVANCED EXPLAINABILITY & ERROR ANALYSIS - FIXED VERSION
# Saves all plots as HTML files

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
import os

print("="*70)
print("ADVANCED EXPLAINABILITY & ERROR ANALYSIS")
print("="*70)

# Create output directory
output_dir = "visualization_outputs"
os.makedirs(output_dir, exist_ok=True)

# ============================================================
# SECTION 1: ERROR ANALYSIS
# ============================================================

print("\n‚ùå SECTION 1: ERROR ANALYSIS")
print("-"*70)

# 1.1 Error Type Analysis
svm_errors = (svm_final_pred != y_test.values)
qsvm_errors = (qsvm_final_pred != y_test.values)

# Categorize errors
both_correct = (~svm_errors) & (~qsvm_errors)
both_wrong = svm_errors & qsvm_errors
svm_only_wrong = svm_errors & (~qsvm_errors)
qsvm_only_wrong = (~svm_errors) & qsvm_errors

error_counts = {
    'Both Correct': both_correct.sum(),
    'SVM Wrong,\nQSVM Correct': qsvm_only_wrong.sum(),
    'QSVM Wrong,\nSVM Correct': svm_only_wrong.sum(),
    'Both Wrong': both_wrong.sum()
}

fig_errors = go.Figure()

colors = ['#2ecc71', '#3498db', '#e74c3c', '#95a5a6']

fig_errors.add_trace(go.Bar(
    x=list(error_counts.keys()),
    y=list(error_counts.values()),
    marker_color=colors,
    text=list(error_counts.values()),
    textposition='auto',
    texttemplate='<b>%{text}</b><br>',
    hovertemplate='<b>%{x}</b><br><extra></extra>'
))

fig_errors.update_layout(
    title='‚ùå Error Analysis: Where Do Models Agree/Disagree?',
    yaxis_title='Number of Samples',
    height=500,
    width=900,
    template='plotly_white',
    showlegend=False
)

filename = f"{output_dir}/advanced_01_error_analysis.html"
fig_errors.write_html(filename)
print(f"‚úÖ Error analysis saved: {filename}")

print(f"   Both correct: {error_counts['Both Correct']} ({error_counts['Both Correct']/len(y_test)*100:.2f}%)")
print(f"   SVM wrong, QSVM correct: {qsvm_only_wrong.sum()}")
print(f"   QSVM wrong, SVM correct: {svm_only_wrong.sum()}")
print(f"   Both wrong: {both_wrong.sum()}")


# 1.2 False Positive vs False Negative Analysis
svm_fp = ((svm_final_pred == 1) & (y_test.values == 0)).sum()
svm_fn = ((svm_final_pred == 0) & (y_test.values == 1)).sum()
qsvm_fp = ((qsvm_final_pred == 1) & (y_test.values == 0)).sum()
qsvm_fn = ((qsvm_final_pred == 0) & (y_test.values == 1)).sum()

fig_fpfn = go.Figure()

fig_fpfn.add_trace(go.Bar(
    name='False Positives',
    x=['SVM', 'QSVM'],
    y=[svm_fp, qsvm_fp],
    marker_color='#e67e22',
    text=[svm_fp, qsvm_fp],
    textposition='auto',
    hovertemplate='<b>%{x}</b><br>False Positives: %{y}<extra></extra>'
))

fig_fpfn.add_trace(go.Bar(
    name='False Negatives',
    x=['SVM', 'QSVM'],
    y=[svm_fn, qsvm_fn],
    marker_color='#e74c3c',
    text=[svm_fn, qsvm_fn],
    textposition='auto',
    hovertemplate='<b>%{x}</b><br>False Negatives: %{y}<extra></extra>'
))

fig_fpfn.update_layout(
    title='üö® False Positives vs False Negatives',
    yaxis_title='Error Count',
    barmode='group',
    height=500,
    width=800,
    template='plotly_white'
)

filename = f"{output_dir}/advanced_02_false_pos_neg.html"
fig_fpfn.write_html(filename)
print(f"‚úÖ FP/FN analysis saved: {filename}")

print(f"\nüìä Error Types:")
print(f"   SVM  - False Positives: {svm_fp}, False Negatives: {svm_fn}")
print(f"   QSVM - False Positives: {qsvm_fp}, False Negatives: {qsvm_fn}")


# ============================================================
# SECTION 2: FEATURE CONTRIBUTION TO ERRORS
# ============================================================

print("\nüîç SECTION 2: FEATURE CONTRIBUTION TO ERRORS")
print("-"*70)

# Compare feature values for correct vs incorrect predictions
error_indices = np.where(qsvm_errors)[0]
correct_indices = np.where(~qsvm_errors)[0]

# Calculate mean feature values
features_error = X_test_scaled[selected_features].iloc[error_indices].mean()
features_correct = X_test_scaled[selected_features].iloc[correct_indices].mean()
features_diff = (features_error - features_correct).abs().sort_values(ascending=False)

fig_error_feat = go.Figure()

fig_error_feat.add_trace(go.Bar(
    x=features_diff.head(10).values,
    y=features_diff.head(10).index,
    orientation='h',
    marker=dict(
        color=features_diff.head(10).values,
        colorscale='Reds',
        showscale=True,
        colorbar=dict(title="Difference")
    ),
    text=[f'{v:.3f}' for v in features_diff.head(10).values],
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Difference: %{x:.4f}<extra></extra>'
))

fig_error_feat.update_layout(
    title='üîç Features Most Associated with Errors ',
    xaxis_title='Mean Feature Value Difference (Error vs Correct)',
    yaxis_title='Features',
    height=600,
    width=900,
    template='plotly_white',
    yaxis=dict(autorange="reversed")
)

filename = f"{output_dir}/advanced_03_error_features.html"
fig_error_feat.write_html(filename)
print(f"‚úÖ Error features saved: {filename}")

print(f"   Features with largest difference in error cases:")
for i, (feat, diff) in enumerate(features_diff.head(5).items(), 1):
    print(f"   {i}. {feat}: {diff:.4f}")


# ============================================================
# SECTION 3: QUANTUM CIRCUIT INSIGHTS
# ============================================================

print("\n‚öõÔ∏è SECTION 3: QUANTUM CIRCUIT INSIGHTS")
print("-"*70)

print(f"\nüî¨ Quantum Circuit Properties:")
print(f"   Qubits: {feature_map.num_qubits}")
print(f"   Circuit Depth: {feature_map.depth()}")
print(f"   Number of Parameters: {feature_map.num_parameters}")
print(f"   Entanglement: {feature_map.entanglement}")
print(f"   Repetitions: {feature_map.reps}")

# Circuit complexity visualization
fig_circuit = go.Figure()

fig_circuit.add_trace(go.Indicator(
    mode="number+gauge",
    value=feature_map.num_qubits,
    domain={'x': [0, 0.3], 'y': [0, 1]},
    title={'text': "Qubits"},
    gauge={'axis': {'range': [None, 20]},
           'bar': {'color': "#3498db"}}
))

fig_circuit.add_trace(go.Indicator(
    mode="number+gauge",
    value=feature_map.depth(),
    domain={'x': [0.35, 0.65], 'y': [0, 1]},
    title={'text': "Circuit<br>Depth"},
    gauge={'axis': {'range': [None, 100]},
           'bar': {'color': "#e74c3c"}}
))

fig_circuit.add_trace(go.Indicator(
    mode="number+gauge",
    value=feature_map.num_parameters,
    domain={'x': [0.7, 1], 'y': [0, 1]},
    title={'text': "Parameters"},
    gauge={'axis': {'range': [None, 50]},
           'bar': {'color': "#2ecc71"}}
))

fig_circuit.update_layout(
    title='‚öõÔ∏è Quantum Circuit Complexity',
    height=400,
    template='plotly_white'
)

filename = f"{output_dir}/advanced_04_quantum_circuit.html"
fig_circuit.write_html(filename)
print(f"‚úÖ Circuit insights saved: {filename}")


# ============================================================
# SECTION 4: MODEL AGREEMENT ANALYSIS
# ============================================================

print("\nü§ù SECTION 4: MODEL AGREEMENT ANALYSIS")
print("-"*70)

# Agreement matrix
agreement = (svm_final_pred == qsvm_final_pred)
agreement_by_class = pd.DataFrame({
    'Actual': y_test.values,
    'Agreement': agreement
}).groupby('Actual')['Agreement'].agg(['sum', 'count', 'mean'])

fig_agreement = go.Figure()

class_names = ['Legitimate', 'Phishing']

fig_agreement.add_trace(go.Bar(
    x=class_names,
    y=[agreement_by_class.loc[0, 'mean'], agreement_by_class.loc[1, 'mean']],
    marker_color=['#2ecc71', '#e74c3c'],
    text=[f"{agreement_by_class.loc[0, 'mean']:.2%}", 
          f"{agreement_by_class.loc[1, 'mean']:.2%}"],
    textposition='auto',
    hovertemplate='<b>%{x}</b><br>Agreement: %{y:.2%}<extra></extra>'
))

fig_agreement.update_layout(
    title='ü§ù SVM-QSVM Agreement Rate by Class',
    yaxis_title='Agreement Rate',
    yaxis=dict(range=[0, 1], tickformat='.0%'),
    height=500,
    width=800,
    template='plotly_white',
    showlegend=False
)

filename = f"{output_dir}/advanced_05_model_agreement.html"
fig_agreement.write_html(filename)
print(f"‚úÖ Agreement analysis saved: {filename}")

print(f"   Overall agreement: {agreement.mean():.2%}")
print(f"   Legitimate class: {agreement_by_class.loc[0, 'mean']:.2%}")
print(f"   Phishing class: {agreement_by_class.loc[1, 'mean']:.2%}")


# ============================================================
# SECTION 5: DIFFICULT SAMPLES ANALYSIS
# ============================================================

print("\nüéØ SECTION 5: DIFFICULT SAMPLES ANALYSIS")
print("-"*70)

# Calculate how many models disagreed for each sample
svm_ensemble = np.array(svm_preds_list).T
qsvm_ensemble = np.array(qsvm_preds_list).T

# Variance in predictions (measure of difficulty)
svm_variance = np.var(svm_ensemble, axis=1)
qsvm_variance = np.var(qsvm_ensemble, axis=1)

# Identify most difficult samples (high variance)
svm_difficult = np.argsort(svm_variance)[-20:]  # Top 20 most difficult
qsvm_difficult = np.argsort(qsvm_variance)[-20:]

fig_difficult = make_subplots(
    rows=1, cols=2,
    subplot_titles=('SVM Prediction Variance', 'QSVM Prediction Variance')
)

fig_difficult.add_trace(
    go.Histogram(
        x=svm_variance,
        marker_color='#3498db',
        nbinsx=50,
        name='SVM',
        showlegend=False
    ),
    row=1, col=1
)

fig_difficult.add_trace(
    go.Histogram(
        x=qsvm_variance,
        marker_color='#e74c3c',
        nbinsx=50,
        name='QSVM',
        showlegend=False
    ),
    row=1, col=2
)

fig_difficult.update_layout(
    title_text='üéØ Sample Difficulty Distribution (Batch Prediction Variance)',
    height=500,
    width=1000,
    template='plotly_white'
)

fig_difficult.update_xaxes(title_text="Variance", row=1, col=1)
fig_difficult.update_xaxes(title_text="Variance", row=1, col=2)
fig_difficult.update_yaxes(title_text="Count", row=1, col=1)

filename = f"{output_dir}/advanced_06_difficult_samples.html"
fig_difficult.write_html(filename)
print(f"‚úÖ Difficult samples saved: {filename}")

print(f"   SVM: {len(svm_difficult)} samples with high prediction variance")
print(f"   QSVM: {len(qsvm_difficult)} samples with high prediction variance")
print(f"   Mean variance - SVM: {svm_variance.mean():.4f}, QSVM: {qsvm_variance.mean():.4f}")


# ============================================================
# SECTION 6: COMPARATIVE PERFORMANCE RADAR
# ============================================================

print("\nüì° SECTION 6: COMPARATIVE PERFORMANCE RADAR")
print("-"*70)

# Calculate all metrics
svm_metrics = precision_recall_fscore_support(y_test, svm_final_pred, average='weighted')
qsvm_metrics = precision_recall_fscore_support(y_test, qsvm_final_pred, average='weighted')

categories = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Specificity', 'FNR (inverted)']

# Calculate specificity, FP, FN
svm_tn = ((svm_final_pred == 0) & (y_test.values == 0)).sum()
svm_fp = ((svm_final_pred == 1) & (y_test.values == 0)).sum()
svm_tp = ((svm_final_pred == 1) & (y_test.values == 1)).sum()
svm_fn = ((svm_final_pred == 0) & (y_test.values == 1)).sum()
svm_spec = svm_tn / (svm_tn + svm_fp)
svm_fnr = svm_fn / (svm_fn + svm_tp)  # False Negative Rate
svm_fnr_inverted = 1 - svm_fnr  # Invert so higher is better on radar

qsvm_tn = ((qsvm_final_pred == 0) & (y_test.values == 0)).sum()
qsvm_fp = ((qsvm_final_pred == 1) & (y_test.values == 0)).sum()
qsvm_tp = ((qsvm_final_pred == 1) & (y_test.values == 1)).sum()
qsvm_fn = ((qsvm_final_pred == 0) & (y_test.values == 1)).sum()
qsvm_spec = qsvm_tn / (qsvm_tn + qsvm_fp)
qsvm_fnr = qsvm_fn / (qsvm_fn + qsvm_tp)  # False Negative Rate
qsvm_fnr_inverted = 1 - qsvm_fnr  # Invert so higher is better on radar

svm_values = [accuracy_score(y_test, svm_final_pred), 
              svm_metrics[0], svm_metrics[1], svm_metrics[2], svm_spec, svm_fnr_inverted]
qsvm_values = [accuracy_score(y_test, qsvm_final_pred), 
               qsvm_metrics[0], qsvm_metrics[1], qsvm_metrics[2], qsvm_spec, qsvm_fnr_inverted]

print(f"\nüìä Calculated Metrics:")
print(f"   SVM  - Specificity: {svm_spec:.4f}, FNR: {svm_fnr:.4f} ({svm_fn} false negatives)")
print(f"   QSVM - Specificity: {qsvm_spec:.4f}, FNR: {qsvm_fnr:.4f} ({qsvm_fn} false negatives)")

fig_radar = go.Figure()

fig_radar.add_trace(go.Scatterpolar(
    r=svm_values,
    theta=categories,
    fill='toself',
    name='Classical SVM',
    line_color='#3498db',
    fillcolor='rgba(52, 152, 219, 0.3)'
))

fig_radar.add_trace(go.Scatterpolar(
    r=qsvm_values,
    theta=categories,
    fill='toself',
    name='Quantum SVM',
    line_color='#e74c3c',
    fillcolor='rgba(231, 76, 60, 0.3)'
))

fig_radar.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0.85, 1.0]
        )
    ),
    title='üì° Comprehensive Performance Radar Chart',
    height=600,
    width=800,
    template='plotly_white',
    showlegend=True
)

filename = f"{output_dir}/advanced_07_performance_radar.html"
fig_radar.write_html(filename)
print(f"‚úÖ Radar chart saved: {filename}")


# ============================================================
# FINAL SUMMARY
# ============================================================

print("\n" + "="*70)
print("‚úÖ ADVANCED EXPLAINABILITY COMPLETE")
print("="*70)

print(f"\nüìÇ Output Directory: {output_dir}/")
print("\nüìä Generated Advanced Visualizations:")
print("   advanced_01_error_analysis.html")
print("   advanced_02_false_pos_neg.html")
print("   advanced_03_error_features.html")
print("   advanced_04_quantum_circuit.html")
print("   advanced_05_model_agreement.html")
print("   advanced_06_difficult_samples.html")
print("   advanced_07_performance_radar.html")

print("\nüí° Key Explainability Insights:")
print(f"   ‚Ä¢ Models agree on {error_counts['Both Correct']/len(y_test)*100:.1f}% of predictions")
print(f"   ‚Ä¢ Challenging cases where both fail: {both_wrong.sum()} samples")
print(f"   ‚Ä¢ Most error-associated feature: {features_diff.index[0]}")
print(f"   ‚Ä¢ Quantum circuit uses {feature_map.num_qubits} qubits at depth {feature_map.depth()}")

print("\nüéØ These visualizations help understand:")
print("   ‚Üí Where models make mistakes")
print("   ‚Üí Why models disagree")
print("   ‚Üí Which samples are inherently difficult")
print("   ‚Üí How quantum circuits encode information")
print("   ‚Üí Feature importance in predictions")



ADVANCED EXPLAINABILITY & ERROR ANALYSIS

‚ùå SECTION 1: ERROR ANALYSIS
----------------------------------------------------------------------
‚úÖ Error analysis saved: visualization_outputs/advanced_01_error_analysis.html
   Both correct: 1480 (94.57%)
   SVM wrong, QSVM correct: 43
   QSVM wrong, SVM correct: 40
   Both wrong: 2
‚úÖ FP/FN analysis saved: visualization_outputs/advanced_02_false_pos_neg.html

üìä Error Types:
   SVM  - False Positives: 0, False Negatives: 42
   QSVM - False Positives: 17, False Negatives: 28

üîç SECTION 2: FEATURE CONTRIBUTION TO ERRORS
----------------------------------------------------------------------
‚úÖ Error features saved: visualization_outputs/advanced_03_error_features.html
   Features with largest difference in error cases:
   1. content_entropy: 0.0987
   2. web_js_len: 0.0732
   3. url_num_hyphens: 0.0345
   4. url_suspicious_chars: 0.0260
   5. domain_contains_numbers: 0.0107

‚öõÔ∏è SECTION 3: QUANTUM CIRCUIT INSIGHTS
---------------

In [32]:
# CURSE OF DIMENSIONALITY - FINDING THE BREAKING POINT
# Incrementally add features and find when accuracy DROPS

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
import time
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("CURSE OF DIMENSIONALITY - FINDING THE BREAKING POINT")
print("="*70)

# Create output directory
import os
output_dir = "visualization_outputs"
os.makedirs(output_dir, exist_ok=True)

# ============================================================
# STEP 1: RANK FEATURES BY IMPORTANCE
# ============================================================

print("\nüìä STEP 1: RANKING FEATURES BY IMPORTANCE")
print("-"*70)

# Calculate Mutual Information
print("   Calculating Mutual Information scores...")
mi_scores = mutual_info_classif(X_train_scaled, y_train, random_state=42)
mi_df = pd.DataFrame({
    'Feature': X_train.columns,
    'MI_Score': mi_scores
}).sort_values('MI_Score', ascending=False)

print(f"‚úÖ Features ranked by MI score")
print(f"\n   Top 10 features:")
for i, row in mi_df.head(10).iterrows():
    print(f"   {i+1}. {row['Feature']}: {row['MI_Score']:.4f}")


# ============================================================
# STEP 2: INCREMENTAL FEATURE TESTING - CLASSICAL SVM
# ============================================================

print("\nüîµ STEP 2: TESTING CLASSICAL SVM (Incremental Features)")
print("-"*70)

max_features_to_test = min(30, len(mi_df))  # Test up to 30 features
print(f"   Testing from 1 to {max_features_to_test} features...")
print(f"   This will take a few minutes...\n")

svm_results = []

for n_features in range(1, max_features_to_test + 1):
    print(f"   [{n_features:2d}/{max_features_to_test}] Training with {n_features} features...", end=' ')
    
    # Get top N features
    selected = mi_df.head(n_features)['Feature'].tolist()
    
    # Prepare data
    X_train_subset = X_train_scaled[selected]
    X_test_subset = X_test_scaled[selected]
    
    # Train SVM
    start_time = time.time()
    svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
    svm.fit(X_train_subset, y_train)
    train_time = time.time() - start_time
    
    # Evaluate
    train_acc = svm.score(X_train_subset, y_train)
    test_acc = svm.score(X_test_subset, y_test)
    
    svm_results.append({
        'n_features': n_features,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'gap': train_acc - test_acc,
        'train_time': train_time
    })
    
    print(f"Test Acc: {test_acc:.4f} ({train_time:.2f}s)")

svm_df = pd.DataFrame(svm_results)

# Find optimal and breaking points for SVM
svm_peak_idx = svm_df['test_accuracy'].idxmax()
svm_optimal = svm_df.iloc[svm_peak_idx]['n_features']
svm_peak_acc = svm_df.iloc[svm_peak_idx]['test_accuracy']

print(f"\n‚úÖ SVM Testing Complete!")
print(f"\nüéØ SVM Results:")
print(f"   Peak accuracy: {svm_peak_acc:.4f} at {int(svm_optimal)} features")

# Find where accuracy starts consistently dropping
svm_threshold = svm_peak_acc - 0.005  # 0.5% drop threshold
svm_drop_point = None
for idx in range(svm_peak_idx, len(svm_df)):
    if svm_df.iloc[idx]['test_accuracy'] < svm_threshold:
        svm_drop_point = svm_df.iloc[idx]['n_features']
        break

if svm_drop_point:
    print(f"   Curse starts: After {svm_drop_point} features (accuracy drops)")
else:
    print(f"   Curse not observed in tested range")




# ============================================================
# STEP 4: THE CLASSIC CURSE OF DIMENSIONALITY GRAPH
# ============================================================

print("\nüìà STEP 4: CREATING THE CLASSIC CURSE GRAPH")
print("-"*70)

fig = go.Figure()

# SVM curve
fig.add_trace(go.Scatter(
    x=svm_df['n_features'],
    y=svm_df['test_accuracy'],
    mode='lines+markers',
    name='Classical SVM',
    line=dict(color='#3498db', width=4),
    marker=dict(size=10, symbol='circle'),
    hovertemplate='<b>SVM: %{x} features</b><br>Test Acc: %{y:.4f}<extra></extra>'
))



# Mark SVM optimal point
fig.add_trace(go.Scatter(
    x=[svm_optimal],
    y=[svm_peak_acc],
    mode='markers',
    name=f'SVM Peak ({int(svm_optimal)} features)',
    marker=dict(size=20, color='gold', symbol='star', line=dict(color='black', width=2)),
    hovertemplate=f'<b>SVM OPTIMAL</b><br>{int(svm_optimal)} features<br>Acc: {svm_peak_acc:.4f}<extra></extra>'
))


# Add curse of dimensionality annotation
if svm_drop_point:
    fig.add_vline(
        x=svm_drop_point,
        line_dash="dash",
        line_color="red",
        line_width=2,
        annotation_text=f"‚ö†Ô∏è Curse Begins<br>({svm_drop_point} features)",
        annotation_position="top"
    )

# Add optimal zone shading
fig.add_vrect(
    x0=max(1, svm_optimal - 2),
    x1=svm_optimal + 2,
    fillcolor="green",
    opacity=0.1,
    layer="below",
    line_width=0,
    annotation_text="Optimal Zone",
    annotation_position="top left"
)

fig.update_layout(
    title={
        'text': 'üìà THE CURSE OF DIMENSIONALITY<br><sub>Test Accuracy vs Number of Features</sub>',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 24}
    },
    xaxis_title='Number of Features',
    yaxis_title='Test Accuracy',
    xaxis=dict(
        tickmode='linear',
        tick0=1,
        dtick=1,
        gridcolor='lightgray'
    ),
    yaxis=dict(
        range=[min(svm_df['test_accuracy'].min(), 0.85), 1.0],
        gridcolor='lightgray'
    ),
    height=700,
    width=1200,
    template='plotly_white',
    hovermode='x unified',
    legend=dict(
        orientation="v",
        yanchor="bottom",
        y=0.02,
        xanchor="right",
        x=0.98,
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="black",
        borderwidth=1
    ),
    font=dict(size=14)
)

filename_curse = f"{output_dir}/curse_of_dimensionality_CLASSIC.html"
fig.write_html(filename_curse)
print(f"‚úÖ Classic curse graph saved: {filename_curse}")


# ============================================================
# STEP 5: DETAILED ANALYSIS CHARTS
# ============================================================

print("\nüìä STEP 5: CREATING DETAILED ANALYSIS")
print("-"*70)

# 5.1 Train vs Test Accuracy (Overfitting Detection)
fig_train_test = go.Figure()

# SVM Train
fig_train_test.add_trace(go.Scatter(
    x=svm_df['n_features'],
    y=svm_df['train_accuracy'],
    mode='lines',
    name='SVM Train',
    line=dict(color='#3498db', width=2, dash='dash'),
    showlegend=True
))

# SVM Test
fig_train_test.add_trace(go.Scatter(
    x=svm_df['n_features'],
    y=svm_df['test_accuracy'],
    mode='lines+markers',
    name='SVM Test',
    line=dict(color='#3498db', width=4),
    marker=dict(size=8),
    showlegend=True
))


fig_train_test.update_layout(
    title='üìä Train vs Test Accuracy (Overfitting Analysis)',
    xaxis_title='Number of Features',
    yaxis_title='Accuracy',
    height=600,
    width=1000,
    template='plotly_white',
    legend=dict(x=0.7, y=0.15)
)

filename_train_test = f"{output_dir}/curse_train_vs_test.html"
fig_train_test.write_html(filename_train_test)
print(f"‚úÖ Train vs Test chart saved: {filename_train_test}")


# 5.2 Training Time Analysis
fig_time = go.Figure()

fig_time.add_trace(go.Bar(
    x=svm_df['n_features'],
    y=svm_df['train_time'],
    name='SVM',
    marker_color='#3498db'
))



fig_time.update_layout(
    title='‚è±Ô∏è Training Time vs Number of Features',
    xaxis_title='Number of Features',
    yaxis_title='Training Time (seconds)',
    yaxis_type='log',
    height=500,
    width=1000,
    template='plotly_white',
    barmode='group'
)

filename_time = f"{output_dir}/curse_training_time.html"
fig_time.write_html(filename_time)
print(f"‚úÖ Training time chart saved: {filename_time}")


# ============================================================
# FINAL SUMMARY
# ============================================================

print("\n" + "="*70)
print("‚úÖ CURSE OF DIMENSIONALITY ANALYSIS COMPLETE")
print("="*70)

print(f"\nüéØ CLASSICAL SVM:")
print(f"   ‚úÖ Optimal: {int(svm_optimal)} features")
print(f"   ‚úÖ Peak Accuracy: {svm_peak_acc:.4f}")
if svm_drop_point:
    print(f"   ‚ö†Ô∏è  Curse begins: After {svm_drop_point} features")
    print(f"   üìä Recommendation: Use {int(svm_optimal)} to {svm_drop_point-1} features")
else:
    print(f"   üìä Recommendation: Use {int(svm_optimal)} features")



print(f"\nüìÅ Generated Files:")
print(f"   ‚Ä¢ curse_of_dimensionality_CLASSIC.html (Main graph)")
print(f"   ‚Ä¢ curse_train_vs_test.html")
print(f"   ‚Ä¢ curse_training_time.html")

print(f"\nüí° ANSWER TO YOUR QUESTION:")
print(f"   For SVM: Use maximum {int(svm_optimal)} features before curse hits")

print(f"\n   This is where adding more features STOPS helping!")

# Save results to CSV
svm_df.to_csv(f"{output_dir}/curse_svm_results.csv", index=False)

print(f"\n‚úÖ Results saved to CSV files")

CURSE OF DIMENSIONALITY - FINDING THE BREAKING POINT

üìä STEP 1: RANKING FEATURES BY IMPORTANCE
----------------------------------------------------------------------
   Calculating Mutual Information scores...
‚úÖ Features ranked by MI score

   Top 10 features:
   6. web_js_len: 0.6573
   9. js_obfuscation_ratio: 0.5212
   7. web_js_obf_len: 0.5198
   29. content_num_scripts: 0.4950
   23. content_entropy: 0.4723
   24. domain_trust_score: 0.3692
   34. email_domain_freq: 0.3241
   5. web_https: 0.2769
   30. content_suspicious_keywords: 0.2696
   4. web_who_is: 0.2613

üîµ STEP 2: TESTING CLASSICAL SVM (Incremental Features)
----------------------------------------------------------------------
   Testing from 1 to 30 features...
   This will take a few minutes...

   [ 1/30] Training with 1 features... Test Acc: 0.9738 (1.05s)
   [ 2/30] Training with 2 features... Test Acc: 0.9738 (0.62s)
   [ 3/30] Training with 3 features... Test Acc: 0.9738 (0.68s)
   [ 4/30] Training with 4