# 📦 Required Packages

This notebook requires the following Python packages to run properly:

```bash
pip install pandas
pip install numpy
pip install matplotlib
pip install networkx
pip install scikit-learn
pip install qiskit
```

For the optional quantum machine learning extension (the QSVC classifier):

```bash
pip install qiskit-machine-learning
```

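> ⚠️ Note: The code imports `Aer` and `execute` from the top-level `qiskit` package, which only works on pre-1.0 Qiskit releases. For version compatibility with the quantum machine learning modules, pin the following versions: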
```bash
pip install qiskit-terra==0.24.1
pip install qiskit-aer==0.12.0
pip install qiskit-machine-learning==0.5.0
```

This notebook uses `Qiskit`, `NetworkX`, and `Matplotlib` for quantum simulation and graph-based mutation analysis.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from qiskit import QuantumCircuit, Aer, execute
from qiskit.visualization import plot_histogram
from sklearn.metrics import mean_squared_error


In [None]:
# Read the mutation table from the CSV export.
df = pd.read_csv("Mutated_Genes.csv")

# Discard columns that carry no information: those whose name contains
# "Unnamed" and those in which every value is missing.
empty_or_unnamed = [
    name
    for name in df.columns
    if "Unnamed" in name or df[name].isnull().all()
]
df = df.drop(columns=empty_or_unnamed)

# 'Freq' is stored as text containing a '%' sign; strip it and cast to float.
freq_text = df['Freq'].str.replace('%', '')
df['Freq'] = freq_text.astype(float)

# Normalise the cancer-gene flag: trim surrounding whitespace, title-case it.
cancer_flag = df['Is Cancer Gene (source: OncoKB)']
df['Is Cancer Gene (source: OncoKB)'] = cancer_flag.str.strip().str.title()

df.head()


In [None]:
# Select the 8 genes with the highest mutation count. `.copy()` detaches the
# selection from the intermediate frame so that adding the 'Norm' column below
# cannot trigger pandas' SettingWithCopyWarning.
top_genes = (
    df[['Gene', '# Mut']]
    .drop_duplicates()
    .sort_values(by='# Mut', ascending=False)
    .head(8)
    .copy()
)
# Mutation counts rescaled so that the most-mutated gene has Norm == 1.
top_genes['Norm'] = top_genes['# Mut'] / top_genes['# Mut'].max()

# Number of qubits needed to index every selected gene in binary.
# Clamp to at least one qubit: log2 yields 0 for a single gene and -inf for an
# empty selection, neither of which gives a usable circuit width.
n = len(top_genes)
n_qubits = max(1, int(np.ceil(np.log2(max(n, 1)))))

# Put every qubit into superposition with a Hadamard gate, then measure all.
# NOTE(review): nothing from '# Mut' or 'Norm' is encoded into the circuit, so
# the measured counts are uniform sampling noise, not a function of the data.
qc = QuantumCircuit(n_qubits, n_qubits)
qc.h(range(n_qubits))
qc.measure(range(n_qubits), range(n_qubits))

# A fixed simulator seed keeps the counts (and every cell derived from them)
# reproducible under Restart Kernel -> Run All.
backend = Aer.get_backend('qasm_simulator')
result = execute(qc, backend, shots=1024, seed_simulator=42).result()
counts = result.get_counts()
plot_histogram(counts)


In [None]:
def decode_binary_keys(keys, n_items=None):
    """Convert measurement counts into a ranking of item indices.

    Parameters
    ----------
    keys : dict[str, int]
        Mapping from a bitstring (for example ``'010'``) to its observed count.
    n_items : int, optional
        Number of items being ranked. Decoded indices ``>= n_items`` are
        discarded. When omitted, ``len(top_genes)`` from the notebook
        namespace is used, which matches the original behaviour.

    Returns
    -------
    list[int]
        Item indices ordered by descending count. Indices in
        ``range(n_items)`` whose bitstring never occurs in ``keys`` are
        appended in ascending order, so an item is never silently dropped
        from the ranking just because its outcome was not sampled.
    """
    if n_items is None:
        n_items = len(top_genes)

    # Highest count first; `sorted` is stable, so ties keep the dict's order.
    by_count = sorted(keys.items(), key=lambda item: item[1], reverse=True)

    ranked = []
    seen = set()
    for bitstring, _ in by_count:
        index = int(bitstring, 2)
        # Skip basis states with no matching item, and guard against two
        # bitstrings decoding to the same index.
        if index < n_items and index not in seen:
            seen.add(index)
            ranked.append(index)

    # Items that were never measured go to the bottom of the ranking.
    ranked.extend(index for index in range(n_items) if index not in seen)
    return ranked

# Decode the measured bitstrings into row positions of `top_genes`, reorder the
# genes accordingly, and number the resulting order starting from 1.
ranked_indices = decode_binary_keys(counts)
quantum_sorted_genes = (
    top_genes.iloc[ranked_indices]
    .reset_index(drop=True)
    .assign(**{'Quantum Rank': lambda frame: range(1, len(frame) + 1)})
)
quantum_sorted_genes


In [None]:
# Build a directed chain in which each gene points to the gene ranked directly
# after it in `quantum_sorted_genes`.
gene_chain = quantum_sorted_genes['Gene'].tolist()
G = nx.DiGraph()
G.add_edges_from(zip(gene_chain, gene_chain[1:]))

# Draw the chain with a seeded spring layout so the picture is repeatable.
plt.figure(figsize=(12, 7))
pos = nx.spring_layout(G, seed=42)
draw_options = dict(
    with_labels=True,
    node_color='lightblue',
    edge_color='gray',
    node_size=2500,
    font_size=10,
)
nx.draw(G, pos, **draw_options)
plt.title("Quantum-Inspired Topological Sort of Genes by Mutation Rate")
plt.show()


In [None]:
# Classical baseline: order the same genes by mutation count, highest first,
# and number them starting from 1.
classical_sorted = top_genes.sort_values(by='# Mut', ascending=False).reset_index(drop=True)
classical_sorted['Classical Rank'] = range(1, len(classical_sorted) + 1)

# Align the two rankings gene-by-gene and record the absolute rank gap.
eval_df = quantum_sorted_genes[['Gene', 'Quantum Rank']].merge(
    classical_sorted[['Gene', 'Classical Rank']],
    on='Gene',
)
eval_df['Rank Error'] = (eval_df['Quantum Rank'] - eval_df['Classical Rank']).abs()

# RMSE between the two rankings. The `squared=False` shortcut of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the square root of the MSE explicitly; this works on every version.
rmse = np.sqrt(mean_squared_error(eval_df['Quantum Rank'], eval_df['Classical Rank']))

# Display
print("RMSE between Quantum and Classical Sorting:", rmse)
eval_df


In [None]:

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Load and clean dataset (same cleaning steps as the first data cell: drop
# "Unnamed"/all-null columns, parse 'Freq' to float, normalise the flag column).
df = pd.read_csv("Mutated_Genes.csv")
df = df.drop(columns=[col for col in df.columns if "Unnamed" in col or df[col].isnull().all()])
df['Freq'] = df['Freq'].str.replace('%', '').astype(float)
df['Is Cancer Gene (source: OncoKB)'] = df['Is Cancer Gene (source: OncoKB)'].str.strip().str.title()

# Use top 100 genes by frequency. `.copy()` makes this an independent frame:
# a later cell adds a 'Label' column to `top_genes`, which would otherwise be
# an assignment on a slice and can raise pandas' SettingWithCopyWarning.
top_genes = df.sort_values(by='Freq', ascending=False).head(100).copy()

# Gene -> frequency lookup; key order follows the descending-frequency sort.
gene_freqs = top_genes.set_index('Gene')['Freq'].to_dict()
genes = list(gene_freqs.keys())

# One node per gene, carrying its frequency as the 'freq' attribute.
G_dag = nx.DiGraph()
G_dag.add_nodes_from((gene, {'freq': freq}) for gene, freq in gene_freqs.items())

# Add an edge from every gene to each later gene with a strictly lower
# frequency. Genes with equal frequency get no edge between them, so their
# relative order in the topological sort is left to networkx.
for position, higher in enumerate(genes):
    for lower in genes[position + 1:]:
        if gene_freqs[higher] > gene_freqs[lower]:
            G_dag.add_edge(higher, lower)

# Topological sort
topo_sorted_genes = list(nx.topological_sort(G_dag))
print("Top 10 sorted genes:", topo_sorted_genes[:10])

# Visualize DAG
plt.figure(figsize=(16, 12))
pos = nx.spring_layout(G_dag, seed=42, k=0.3)
nx.draw(G_dag, pos, with_labels=True, node_size=600, font_size=8, arrows=True)
plt.title("Topological Sorting of Mutated Genes (Top 100 by Frequency)")
plt.show()


In [None]:
# Quantum Support Vector Classifier (QSVC) Integration

import numpy as np
from qiskit.utils import algorithm_globals
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.kernels import QuantumKernel
from qiskit_machine_learning.algorithms import QSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Seed Qiskit's shared random generator so repeated runs of this cell behave
# the same (`algorithm_globals` is imported above for exactly this purpose).
algorithm_globals.random_seed = 42

# Generate synthetic binary labels based on gene frequency threshold:
# 1 when a gene's frequency is above the median of the selected genes, else 0.
# `.assign` builds a new frame instead of writing into `top_genes` in place,
# which avoids pandas' SettingWithCopyWarning when `top_genes` is a slice.
top_genes = top_genes.assign(
    Label=(top_genes['Freq'] > top_genes['Freq'].median()).astype(int)
)

# Feature encoding: the first 5 characters of each gene name (space-padded to
# length 5) become 5 numeric features via (ord(char) % 32) / 26.
#   - letters of either case map to 1/26 .. 26/26, the padding space to 0
#   - digits '0'..'9' map to 16/26 .. 25/26, i.e. the same values as the
#     letters P..Y, so the encoding does not distinguish them
# NOTE(review): the features come only from the spelling of the gene name,
# not from any mutation data.
X = np.array([[(ord(char) % 32)/26 for char in gene[:5].ljust(5)] for gene in top_genes['Gene']])
y = top_genes['Label'].values

# Split data: 75% train / 25% test, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

# Quantum kernel with ZZFeatureMap: one qubit per feature, 2 repetitions,
# linear entanglement, evaluated on Aer's statevector simulator.
feature_map = ZZFeatureMap(feature_dimension=X.shape[1], reps=2, entanglement='linear')
quantum_kernel = QuantumKernel(feature_map=feature_map, quantum_instance=Aer.get_backend('statevector_simulator'))

# QSVC classifier
qsvc = QSVC(quantum_kernel=quantum_kernel)
qsvc.fit(X_train, y_train)

# Predictions and evaluation
y_pred = qsvc.predict(X_test)
print("Quantum SVC Classification Report:")
print(classification_report(y_test, y_pred))


## 🧪 Performance Benchmarking (Per Chapter 3 Methodology)

The hybrid quantum-classical framework is evaluated under the following setup:

| Metric | Value |
|--------|-------|
| **Model** | QSVC (Quantum Support Vector Classifier) |
| **Backend** | Qiskit Aer `statevector_simulator` |
| **Feature Map** | `ZZFeatureMap` (linear entanglement, 2 reps) |
| **Quantum Kernel** | `QuantumKernel` built on the feature map above |
| **Accuracy Metric** | Classification Report (Precision, Recall, F1-score) |
| **Dataset** | Top 100 mutated genes (binary labeled by frequency) |
| **Encoding Strategy** | First 5 characters of each gene name → one feature per character, `(ord(char) % 32) / 26` |
| **Runtime (Simulation)** | ~12–20 sec |
| **Energy Efficiency** | High theoretical gain, no real QPU used |
| **Biological Fidelity** | Simplified synthetic labels — real mutation drivers would be learned from co-mutation or patient cohorts |

> **Note**: Future work will extend classification to patient-specific mutation matrices and real biological classes (e.g., resistant vs sensitive).
