In [1]:
# ========================================
# CELL 1: INSTALL & VERIFY ENVIRONMENT
# ========================================

# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import re
from urllib.parse import urlparse
import math
import os
from tqdm import tqdm

print("üì¶ All libraries imported successfully!")

# Probe the quantum stack. A missing package is reported on stdout rather
# than raised, so the cell itself never fails on ImportError.
try:
    import qiskit
    import qiskit_algorithms
    import qiskit_machine_learning
except ImportError as import_failure:
    print(f"‚ùå Import error: {import_failure}")
    print("üîÑ Please restart runtime and try again")
else:
    # Reached only when all three imports succeeded.
    print(f"‚úÖ Qiskit version: {qiskit.__version__}")
    print("‚úÖ All packages installed successfully!")


üì¶ All libraries imported successfully!
‚úÖ Qiskit version: 1.4.4
‚úÖ All packages installed successfully!


In [4]:
# ========================================
# CELL 2: LOAD CLASSICAL DATASET
# ========================================

import pandas as pd

# CSV with the phishing records, resolved relative to the working directory
file_path = "email_phising1.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

print("‚úÖ Dataset loaded successfully!")
print("üìä Shape:", df.shape)
# Last expression of the cell: rich preview of the first five rows
df.head(5)


‚úÖ Dataset loaded successfully!
üìä Shape: (8000, 20)


Unnamed: 0,email_subject_len,email_has_urgent_keyword,email_from_domain,email_url_len,email_num_links_in_email,email_label,web_Unnamed: 0,web_url,web_url_len,web_ip_add,web_geo_loc,web_tld,web_who_is,web_https,web_js_len,web_js_obf_len,web_content,Unnamed: 17,domain_age,final_label
0,32,0,spamassassin.zones.apache.org,39,2,0,975100,http://tools.ietf.org/html/rfc1583,34,30.180.42.35,United States,org,complete,yes,137.0,0.0,"Conversations, sharing on agriculture and ecol...",,11168.0,0
1,46,0,gmail.com>,33,1,0,784809,http://www.quickfixgolf.com,27,150.66.16.42,Japan,com,complete,yes,94.0,0.0,Abiola irele virginians also describe a featur...,,9692.0,0
2,21,0,telefonica.net>,0,0,0,185195,http://www.lvnazarene.org,25,180.123.185.229,China,org,complete,yes,44.5,0.0,Wire-guided rocket. tony accardo battle law en...,,2344.0,0
3,99,1,gmail.com>,0,0,0,478858,http://hatchersmartialarts.homestead.com/front...,51,46.97.122.170,Romania,com,complete,yes,84.5,0.0,Among states that help describe the atlantic b...,,10335.0,0
4,72,1,luebeck.de>,0,0,1,287298,http://www.gabile.com/,22,94.145.85.24,Denmark,com,incomplete,no,837.0,460.35,honkers kunnilingus schlong testicles sixtynin...,,7421.0,1


In [5]:
# ========================================
# CELL 3: CLEANING & BASIC PREPROCESSING
# ========================================

# Discard every column whose header starts with "Unnamed".
# NOTE(review): the pattern is anchored at the start of the name, so a column
# such as 'web_Unnamed: 0' is kept - confirm that this is intended.
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]

# Missing entries in the free-text columns become empty strings
# (columns absent from the frame are skipped).
text_cols = ['email_from_domain', 'web_url', 'web_content', 'web_who_is', 'web_geo_loc', 'web_tld']
for col in (name for name in text_cols if name in df.columns):
    df[col] = df[col].fillna("")

# Whatever is still missing, in any remaining column, is replaced by 0
df = df.fillna(0)

print("‚úÖ Basic preprocessing done!")
df.info()


‚úÖ Basic preprocessing done!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   email_subject_len         8000 non-null   int64  
 1   email_has_urgent_keyword  8000 non-null   int64  
 2   email_from_domain         8000 non-null   object 
 3   email_url_len             8000 non-null   int64  
 4   email_num_links_in_email  8000 non-null   int64  
 5   email_label               8000 non-null   int64  
 6   web_Unnamed: 0            8000 non-null   int64  
 7   web_url                   8000 non-null   object 
 8   web_url_len               8000 non-null   int64  
 9   web_ip_add                8000 non-null   object 
 10  web_geo_loc               8000 non-null   object 
 11  web_tld                   8000 non-null   object 
 12  web_who_is                8000 non-null   object 
 13  web_https                 8000 no

In [7]:
# ========================================
# CELL 4: DEFINE QUANTUM-INSPIRED FEATURE FUNCTIONS
# ========================================
# Install TextBlob
%pip install -q textblob

# Download necessary NLTK corpora for sentiment analysis
import nltk
# Fetch the NLTK resources one after another, in the original order.
for resource in ('brown', 'punkt', 'averaged_perceptron_tagger', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)


from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm

# --- A. Semantic Similarity (subject vs body) ---
def semantic_similarity(subject, body):
    """Cosine similarity between the TF-IDF vectors of ``subject`` and ``body``.

    A TF-IDF model is fitted on just these two texts and the cosine of the
    resulting pair of vectors is returned.

    Args:
        subject: First text. Anything that is not a non-blank ``str`` scores 0.
        body: Second text. Same rule as ``subject``.

    Returns:
        0.0 when either input is unusable or when no vocabulary can be built
        from the pair; otherwise the cosine similarity. The denominator
        carries a +1e-6 guard, so identical texts score marginally below 1.
    """
    if not isinstance(subject, str) or not isinstance(body, str):
        return 0.0
    if not subject.strip() or not body.strip():
        return 0.0

    try:
        tfidf = TfidfVectorizer().fit_transform([subject, body]).toarray()
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither text
        # contains a token it keeps, e.g. single-character words only. Treat
        # that like blank input instead of aborting the caller's loop.
        return 0.0

    subject_vec, body_vec = tfidf
    return np.dot(subject_vec, body_vec) / (norm(subject_vec) * norm(body_vec) + 1e-6)

# --- B. Domain Trust Score ---
def domain_trust(domain, https, age):
    """Heuristic trust score in [0, 1] built from TLD, HTTPS flag and domain age.

    Args:
        domain: Host name. Case is ignored, as are surrounding whitespace,
            angle brackets and dots (e.g. the trailing ">" in "gmail.com>").
        https: Counts as secure only when it reads "yes" (case-insensitive,
            surrounding whitespace ignored).
        age: Domain age in days. Non-numeric, NaN or negative ages count as 0.

    Returns:
        ``0.4 * tld + 0.4 * https + 0.2 * age_score`` where ``tld`` and
        ``https`` are 0 or 1 and ``age_score`` grows linearly with age and
        saturates at 1 once the domain is one year (365 days) old.
    """
    host = str(domain).strip(" \t\r\n<>.").lower()
    # Match whole labels instead of substrings so "evil.organic.com" is not
    # mistaken for ".org". The first label is skipped because it has no leading
    # dot ("org.com" must not qualify); of the rest only the TLD and the label
    # right before it are considered ("example.org", "health.gov.uk").
    tail_labels = host.split(".")[1:][-2:]
    tld_score = 1 if any(label in ("gov", "edu", "org") for label in tail_labels) else 0

    https_score = 1 if str(https).strip().lower() == "yes" else 0

    try:
        age_days = float(age)
    except (TypeError, ValueError):
        age_days = 0.0
    if age_days != age_days:  # NaN is the only value that differs from itself
        age_days = 0.0
    age_score = min(max(age_days / 365, 0.0), 1.0)

    return (tld_score * 0.4) + (https_score * 0.4) + (age_score * 0.2)

# --- C. Sentiment Interference Score (contradictory tone) ---
def sentiment_interference(text):
    """Absolute gap between the TextBlob polarity and subjectivity of ``text``.

    ``text`` is coerced with ``str()`` before it is analysed.
    """
    tone = TextBlob(str(text)).sentiment
    polarity_value, subjectivity_value = tone.polarity, tone.subjectivity
    # Author's rationale: phishing emails often have positive but highly
    # subjective tone.
    return abs(polarity_value - subjectivity_value)

# --- D. Entropy of Attachment / URL ---
def string_entropy(s):
    """Shannon entropy of the characters of ``s`` (bits per character) divided by 8.

    Args:
        s: Text to measure. Anything that is not a non-empty ``str`` scores 0
            instead of raising.

    Returns:
        0 for unusable input, otherwise ``entropy / 8``. The quotient stays
        within [0, 1] as long as ``s`` has at most 256 distinct characters.
    """
    if not isinstance(s, str) or len(s) == 0:
        return 0
    length = len(s)
    # dict.fromkeys de-duplicates the characters while keeping first-seen order
    prob = [s.count(c) / length for c in dict.fromkeys(s)]
    entropy = -sum(p * math.log2(p) for p in prob if p > 0)
    return entropy / 8  # 8 bits per character is used as the scale


Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...


In [8]:
# ========================================
# CELL 5: APPLY FEATURE EXTRACTION
# ========================================

def _quantum_row_features(row):
    """Return [semantic similarity, domain trust, sentiment gap, URL entropy] for one row."""
    # NOTE(review): 'email_subject_len' is a numeric length column, so the
    # "subject" compared with the body is the digits of that length rather
    # than subject text - confirm this is intended.
    subj = str(row.get('email_subject_len', ''))
    body = str(row.get('web_content', ''))
    domain = str(row.get('email_from_domain', ''))
    https = str(row.get('web_https', ''))
    url = str(row.get('web_url', ''))
    age = row.get('domain_age', 0)

    return [
        semantic_similarity(subj, body),
        domain_trust(domain, https, age),
        sentiment_interference(body),
        string_entropy(url),
    ]


# One feature list per row of df, in row order; tqdm reports progress
quantum_features = [
    _quantum_row_features(row)
    for _, row in tqdm(df.iterrows(), total=df.shape[0])
]

# Wrap the new features in their own frame
qf_columns = [
    'q_semantic_similarity',
    'q_domain_trust',
    'q_sentiment_interference',
    'q_url_entropy',
]
qf_df = pd.DataFrame(quantum_features, columns=qf_columns)

# Place them to the right of the original columns; both sides get a fresh
# 0..n-1 index so the rows line up by position
hybrid_df = pd.concat([df.reset_index(drop=True), qf_df.reset_index(drop=True)], axis=1)

print("‚úÖ Quantum-inspired features added successfully!")
print("üìä New shape:", hybrid_df.shape)
hybrid_df.head()


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8000/8000 [00:46<00:00, 170.47it/s]

‚úÖ Quantum-inspired features added successfully!
üìä New shape: (8000, 23)





Unnamed: 0,email_subject_len,email_has_urgent_keyword,email_from_domain,email_url_len,email_num_links_in_email,email_label,web_Unnamed: 0,web_url,web_url_len,web_ip_add,...,web_https,web_js_len,web_js_obf_len,web_content,domain_age,final_label,q_semantic_similarity,q_domain_trust,q_sentiment_interference,q_url_entropy
0,32,0,spamassassin.zones.apache.org,39,2,0,975100,http://tools.ietf.org/html/rfc1583,34,30.180.42.35,...,yes,137.0,0.0,"Conversations, sharing on agriculture and ecol...",11168.0,0,0.0,1.0,0.437818,0.509593
1,46,0,gmail.com>,33,1,0,784809,http://www.quickfixgolf.com,27,150.66.16.42,...,yes,94.0,0.0,Abiola irele virginians also describe a featur...,9692.0,0,0.0,0.6,0.363636,0.507533
2,21,0,telefonica.net>,0,0,0,185195,http://www.lvnazarene.org,25,180.123.185.229,...,yes,44.5,0.0,Wire-guided rocket. tony accardo battle law en...,2344.0,0,0.0,0.6,0.378788,0.486708
3,99,1,gmail.com>,0,0,0,478858,http://hatchersmartialarts.homestead.com/front...,51,46.97.122.170,...,yes,84.5,0.0,Among states that help describe the atlantic b...,10335.0,0,0.0,0.6,0.2,0.486029
4,72,1,luebeck.de>,0,0,1,287298,http://www.gabile.com/,22,94.145.85.24,...,no,837.0,460.35,honkers kunnilingus schlong testicles sixtynin...,7421.0,1,0.0,0.2,0.347301,0.480669


In [9]:
# ========================================
# CELL 6: NORMALIZE SELECTED NUMERIC FEATURES
# ========================================

# Target columns are excluded from scaling: standardizing them would replace
# the class labels with z-scores in the saved dataset.
label_cols = ['email_label', 'final_label']

scaler = StandardScaler()
# NOTE(review): 'web_Unnamed: 0' looks like a leftover row id and is still
# scaled as a feature here - confirm whether it should be dropped instead.
numeric_cols = [
    name
    for name in hybrid_df.select_dtypes(include=[np.number]).columns
    if name not in label_cols
]

# Zero-mean / unit-variance scaling of the remaining numeric columns, written
# back into hybrid_df in place
hybrid_df[numeric_cols] = scaler.fit_transform(hybrid_df[numeric_cols])

print("‚úÖ Numeric features normalized for ML/QML readiness.")


‚úÖ Numeric features normalized for ML/QML readiness.


In [11]:
# ========================================
# CELL 7: SAVE HYBRID DATASET
# ========================================

# Write hybrid_df to a CSV in the working directory, without the row index
output_path = "hybrid_phishing_dataset.csv"
hybrid_df.to_csv(path_or_buf=output_path, index=False)

print(f"üíæ Hybrid dataset saved successfully at: {output_path}")


üíæ Hybrid dataset saved successfully at: hybrid_phishing_dataset.csv


In [12]:
# ========================================
# CELL 8: DEFINE QUANTUM-INSPIRED FEATURE FUNCTIONS FOR RAW DATA
# ========================================

from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import math
from tqdm import tqdm

# --- Feature functions ---
def semantic_similarity(subject, body):
    """Cosine similarity between the TF-IDF vectors of ``subject`` and ``body``.

    A TF-IDF model is fitted on just these two texts and the cosine of the
    resulting pair of vectors is returned.

    Args:
        subject: First text. Anything that is not a non-blank ``str`` scores 0.
        body: Second text. Same rule as ``subject``.

    Returns:
        0.0 when either input is unusable or when no vocabulary can be built
        from the pair; otherwise the cosine similarity. The denominator
        carries a +1e-6 guard, so identical texts score marginally below 1.
    """
    if not isinstance(subject, str) or not isinstance(body, str):
        return 0.0
    if not subject.strip() or not body.strip():
        return 0.0

    try:
        tfidf = TfidfVectorizer().fit_transform([subject, body]).toarray()
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither text
        # contains a token it keeps, e.g. single-character words only. Treat
        # that like blank input instead of aborting the caller's loop.
        return 0.0

    subject_vec, body_vec = tfidf
    return np.dot(subject_vec, body_vec) / (norm(subject_vec) * norm(body_vec) + 1e-6)

def domain_trust(domain, https, age):
    """Heuristic trust score in [0, 1] built from TLD, HTTPS flag and domain age.

    Args:
        domain: Host name. Case is ignored, as are surrounding whitespace,
            angle brackets and dots (e.g. the trailing ">" in "gmail.com>").
        https: Counts as secure only when it reads "yes" (case-insensitive,
            surrounding whitespace ignored).
        age: Domain age in days. Non-numeric, NaN or negative ages count as 0.

    Returns:
        ``0.4 * tld + 0.4 * https + 0.2 * age_score`` where ``tld`` and
        ``https`` are 0 or 1 and ``age_score`` grows linearly with age and
        saturates at 1 once the domain is one year (365 days) old.
    """
    host = str(domain).strip(" \t\r\n<>.").lower()
    # Match whole labels instead of substrings so "evil.organic.com" is not
    # mistaken for ".org". The first label is skipped because it has no leading
    # dot ("org.com" must not qualify); of the rest only the TLD and the label
    # right before it are considered ("example.org", "health.gov.uk").
    tail_labels = host.split(".")[1:][-2:]
    tld_score = 1 if any(label in ("gov", "edu", "org") for label in tail_labels) else 0

    https_score = 1 if str(https).strip().lower() == "yes" else 0

    try:
        age_days = float(age)
    except (TypeError, ValueError):
        age_days = 0.0
    if age_days != age_days:  # NaN is the only value that differs from itself
        age_days = 0.0
    age_score = min(max(age_days / 365, 0.0), 1.0)

    return (tld_score * 0.4) + (https_score * 0.4) + (age_score * 0.2)

def sentiment_interference(text):
    """Return ``abs(polarity - subjectivity)`` of TextBlob's sentiment for ``text``.

    The argument is passed through ``str()`` first, so non-string values are
    analysed via their string form.
    """
    scores = TextBlob(str(text)).sentiment
    gap = scores.polarity - scores.subjectivity
    return abs(gap)

def string_entropy(s):
    """Per-character Shannon entropy of ``s`` in bits, scaled down by 8.

    Returns 0 when ``s`` is not a string or is empty.
    """
    if not (isinstance(s, str) and s):
        return 0

    total = len(s)
    # Relative frequency of every distinct character, in first-seen order
    shares = (s.count(ch) / total for ch in dict.fromkeys(s))
    bits = -sum(share * math.log2(share) for share in shares if share > 0)
    return bits / 8  # normalized


In [13]:
# ========================================
# CELL 9: GENERATE RAW HYBRID DATASET
# ========================================

# One [similarity, trust, sentiment gap, URL entropy] list per row of df
quantum_features = []

for _, record in tqdm(df.iterrows(), total=df.shape[0]):
    # NOTE(review): 'email_subject_len' is a numeric length column, so the
    # "subject" compared with the body is the digits of that length rather
    # than subject text - confirm this is intended.
    subject_text = str(record.get('email_subject_len', ''))
    body_text = str(record.get('web_content', ''))
    sender_domain = str(record.get('email_from_domain', ''))
    https_flag = str(record.get('web_https', ''))
    page_url = str(record.get('web_url', ''))
    domain_age = record.get('domain_age', 0)

    quantum_features.append([
        semantic_similarity(subject_text, body_text),
        domain_trust(sender_domain, https_flag, domain_age),
        sentiment_interference(body_text),
        string_entropy(page_url),
    ])

qf_columns = [
    'q_semantic_similarity',
    'q_domain_trust',
    'q_sentiment_interference',
    'q_url_entropy',
]
qf_df = pd.DataFrame(quantum_features, columns=qf_columns)

# Put the new columns to the right of the classical ones; both frames are
# re-indexed 0..n-1 first so rows pair up by position
hybrid_df = pd.concat([df.reset_index(drop=True), qf_df.reset_index(drop=True)], axis=1)

# No scaling is applied in this cell
print("‚úÖ Hybrid dataset ready with classical + quantum features")
print("üìä Shape:", hybrid_df.shape)
hybrid_df.head()


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8000/8000 [00:46<00:00, 171.25it/s]

‚úÖ Hybrid dataset ready with classical + quantum features
üìä Shape: (8000, 23)





Unnamed: 0,email_subject_len,email_has_urgent_keyword,email_from_domain,email_url_len,email_num_links_in_email,email_label,web_Unnamed: 0,web_url,web_url_len,web_ip_add,...,web_https,web_js_len,web_js_obf_len,web_content,domain_age,final_label,q_semantic_similarity,q_domain_trust,q_sentiment_interference,q_url_entropy
0,32,0,spamassassin.zones.apache.org,39,2,0,975100,http://tools.ietf.org/html/rfc1583,34,30.180.42.35,...,yes,137.0,0.0,"Conversations, sharing on agriculture and ecol...",11168.0,0,0.0,1.0,0.437818,0.509593
1,46,0,gmail.com>,33,1,0,784809,http://www.quickfixgolf.com,27,150.66.16.42,...,yes,94.0,0.0,Abiola irele virginians also describe a featur...,9692.0,0,0.0,0.6,0.363636,0.507533
2,21,0,telefonica.net>,0,0,0,185195,http://www.lvnazarene.org,25,180.123.185.229,...,yes,44.5,0.0,Wire-guided rocket. tony accardo battle law en...,2344.0,0,0.0,0.6,0.378788,0.486708
3,99,1,gmail.com>,0,0,0,478858,http://hatchersmartialarts.homestead.com/front...,51,46.97.122.170,...,yes,84.5,0.0,Among states that help describe the atlantic b...,10335.0,0,0.0,0.6,0.2,0.486029
4,72,1,luebeck.de>,0,0,1,287298,http://www.gabile.com/,22,94.145.85.24,...,no,837.0,460.35,honkers kunnilingus schlong testicles sixtynin...,7421.0,1,0.0,0.2,0.347301,0.480669


In [14]:
# ========================================
# CELL 10: SAVE RAW HYBRID DATASET
# ========================================

# Write the current hybrid_df to a second CSV, without the row index
output_path = "hybrid_phishing_dataset_raw.csv"
hybrid_df.to_csv(path_or_buf=output_path, index=False)
print(f"üíæ Hybrid dataset saved at: {output_path}")


üíæ Hybrid dataset saved at: hybrid_phishing_dataset_raw.csv
