# Chapter 3 Implementation: Classical and Quantum Hybrid Analysis of Mutated Genes

This notebook follows the methodology described in Chapter 3 of the dissertation. It includes data preprocessing, visualization, graph-theoretic analysis, classical deep learning, quantum/hybrid model development, and evaluation of results.

In [None]:

# ============================
# Import Required Packages
# ============================

# Data handling & preprocessing
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Graph-theoretic analysis
import networkx as nx

# Machine learning - Classical CNN
import tensorflow as tf
from tensorflow.keras import layers, models

# Quantum / Hybrid modeling
import cirq
import sympy
import qiskit
import tensorflow_quantum as tfq

# Preprocessing utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:

import matplotlib.pyplot as plt

# Draw the chapter workflow as a vertical stack of labelled boxes joined by arrows.
fig, ax = plt.subplots(figsize=(10,6))
ax.axis('off')

# Workflow steps, listed in the order they are carried out
steps = [
    "Data Preprocessing & Visualization",
    "Graph-Theoretic Analysis (Toposort + Centrality)",
    "Classical CNN Baseline (TensorFlow)",
    "Quantum/Hybrid Model (Qiskit + TFQ)",
    "Evaluation & Interpretation"
]

# Plot steps as boxes: the first step gets the largest y so the flow reads top-to-bottom
y_positions = list(range(len(steps)))[::-1]
for y, step in zip(y_positions, steps):
    ax.text(0.5, y, step, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.5", edgecolor="black", facecolor="lightblue"))

# Arrows between steps. annotate() draws from xytext (tail) to xy (head), so the
# head must be the lower point for the arrow to point at the *next* step.
for y in y_positions[:-1]:
    ax.annotate("", xy=(0.5, y-0.9), xytext=(0.5, y-0.5),
                arrowprops=dict(arrowstyle="->", lw=2))

ax.set_ylim(-1, len(steps)+1)
ax.set_xlim(0, 1)
plt.show()


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import tensorflow as tf
from tensorflow.keras import layers, models
# Quantum imports
import tensorflow_quantum as tfq
import cirq
import sympy
import qiskit


## 1. Data Preprocessing & Visualization

In [None]:

# Input CSV files, read relative to the notebook's working directory
file_paths = [
    "Mutated_Genes.csv",
    "Mutated_Genes (1).csv",
    "Mutated_Genes (2).csv"
]

# One raw DataFrame per file, in the same order as file_paths
dataframes = list(map(pd.read_csv, file_paths))

# Cleaning function
def clean_gene_df(df):
    """Return a cleaned copy of one mutated-genes table.

    Cleaning steps:
      * drop every column whose name starts with ``Unnamed``;
      * drop rows whose ``Gene`` value is missing;
      * if a ``Freq`` column exists, remove ``%`` and surrounding whitespace
        and convert it to a number;
      * convert ``MutSig(Q-value)``, ``# Mut`` and ``Profiled Samples`` to
        numbers when those columns exist.

    Values that cannot be parsed as numbers become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain a ``Gene`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``Gene`` column.
    """
    # Column labels are cast to str so the pattern match also works when a
    # label is not a string.
    keep_cols = ~df.columns.astype(str).str.contains('^Unnamed')

    # Take an explicit copy before assigning columns; writing into the result
    # of .loc/.dropna directly is chained assignment (SettingWithCopyWarning).
    df = df.loc[:, keep_cols].dropna(subset=['Gene']).copy()

    if 'Freq' in df.columns:
        # regex=False: '%' is a literal character, not a pattern
        freq_text = df['Freq'].astype(str).str.replace('%', '', regex=False).str.strip()
        df['Freq'] = pd.to_numeric(freq_text, errors='coerce')

    numeric_cols = ['MutSig(Q-value)', '# Mut', 'Profiled Samples']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

# Clean every table, stack them, and keep only the first row seen for each gene
cleaned_dfs = list(map(clean_gene_df, dataframes))
merged_df = (
    pd.concat(cleaned_dfs, ignore_index=True)
    .drop_duplicates(subset=['Gene'])
)
print("Final shape:", merged_df.shape)

# Histogram (with KDE overlay) of the non-missing mutation frequencies
plt.figure(figsize=(10,6))
sns.histplot(merged_df['Freq'].dropna(), bins=50, kde=True).set(
    title="Distribution of Mutation Frequency Across Genes",
    xlabel="Mutation Frequency (%)",
    ylabel="Count",
)
plt.show()


## 2. Graph-Theoretic Analysis

In [None]:

# Rank genes from highest to lowest mutation frequency
sorted_genes = merged_df.sort_values(by="Freq", ascending=False)

# Names of the 200 top-ranked genes, in rank order
genes_list = sorted_genes["Gene"].head(200).tolist()

# Directed chain graph: one node per gene, each gene linked to the next-ranked gene
G = nx.DiGraph()
for gene in genes_list:
    G.add_node(gene)
for current_gene, next_gene in zip(genes_list, genes_list[1:]):
    G.add_edge(current_gene, next_gene)

# Topological ordering of the chain
topo_sorted_genes = list(nx.topological_sort(G))
print("Top 30 genes in topological order:")
print(topo_sorted_genes[:30])

# Degree centrality per gene; report the ten largest values
deg_centrality = nx.degree_centrality(G)
ranked_by_degree = sorted(deg_centrality.items(), key=lambda pair: pair[1], reverse=True)
top_degree = ranked_by_degree[:10]
print("\nTop 10 genes by degree centrality:")
print(top_degree)


## 3. Classical Deep Learning Baseline (CNN)

In [None]:

# Example setup for CNN classification (mutation presence prediction)
# Dummy labels created for illustration purposes (replace with real phenotype labels)
# NOTE(review): the dummy label is a threshold on 'Freq', and 'Freq' is also one of
# the input features, so the accuracy reported here only shows that the pipeline runs.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Raw feature matrix (missing values -> 0) and binary label (Freq above the median)
X = merged_df[['# Mut', 'Freq', 'Profiled Samples']].fillna(0).values
y = (merged_df['Freq'] > merged_df['Freq'].median()).astype(int).values

# Split BEFORE scaling so that test-set statistics never influence training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize data: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Add a trailing channel axis: (samples, features) -> (samples, features, 1) for Conv1D
X_train = np.expand_dims(X_train, -1)
X_test = np.expand_dims(X_test, -1)

# Build CNN model
cnn_model = models.Sequential([
    layers.Conv1D(32, 2, activation='relu', input_shape=(X_train.shape[1], 1)),
    layers.MaxPooling1D(2),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# evaluate() returns [loss, accuracy] in the order given to compile()
cnn_eval = cnn_model.evaluate(X_test, y_test)
print("CNN Test Evaluation:", cnn_eval)


## 4. Quantum/Hybrid Model Prototype

In [None]:

# Qubits for the TensorFlow Quantum example: three grid qubits in row 0, columns 0-2
qubits = [cirq.GridQubit(row=0, col=column) for column in range(3)]

def create_quantum_model(qubits):
    """Build the parameterised prototype circuit.

    The circuit applies a Hadamard gate to every qubit, then a CZ gate between
    the first and second qubit, then an X-rotation on the first qubit whose
    angle is the free sympy symbol ``theta``.

    Args:
        qubits: Indexable sequence of cirq qubits; elements 0 and 1 are
            accessed, so at least two qubits are required.

    Returns:
        The assembled ``cirq.Circuit``.
    """
    hadamards = [cirq.H(target) for target in qubits]
    cz_gate = cirq.CZ(qubits[0], qubits[1])
    rotation = cirq.rx(sympy.Symbol("theta"))(qubits[0])

    circuit = cirq.Circuit()
    circuit.append(hadamards)
    circuit.append(cz_gate)
    circuit.append(rotation)
    return circuit

# Build the parameterised circuit and show its diagram
quantum_model = create_quantum_model(qubits)
print(quantum_model)

# Serialise the circuit into a TensorFlow string tensor.
# NOTE(review): this wraps the model circuit itself and is not used below; the
# inputs fed to a PQC layer are normally the data-encoding circuits — confirm intent.
quantum_data = tfq.convert_to_tensor([quantum_model])
theta = sympy.Symbol("theta")

# Measure Pauli-Z on the first qubit.
# NOTE(review): after H on both qubits and CZ(q0, q1), qubit 0 on its own looks
# maximally mixed, so <Z> on it would be 0 for every theta — verify that this
# prototype circuit can actually produce a trainable signal.
readout_operators = [cirq.Z(qubits[0])]

quantum_layer = tfq.layers.PQC(quantum_model, readout_operators)

# PQC consumes a batch of serialised circuits (scalar tf.string per sample).
# Declaring that input lets Keras build the model, which summary() requires.
hybrid_model = models.Sequential([
    layers.Input(shape=(), dtype=tf.string),
    quantum_layer,
    layers.Dense(1, activation='sigmoid')
])

# summary() prints the table itself and returns None, so it is not wrapped in print()
hybrid_model.summary()


## 5. Evaluation & Interpretation

In [None]:

# Compare classical CNN vs hybrid (example metrics placeholder)
# cnn_eval is the [loss, accuracy] list returned by evaluate(); index 1 is accuracy
print("CNN Accuracy:", cnn_eval[1])
print("Hybrid model will be trained and compared similarly (requires mapping dataset into quantum circuits).")

# Interpretation: prioritizing cancer-related genes
if "Is Cancer Gene (source: OncoKB)" in merged_df.columns:
    # Genes whose OncoKB flag contains "Yes"; missing flags count as not flagged
    cancer_genes = merged_df[merged_df["Is Cancer Gene (source: OncoKB)"].str.contains("Yes", na=False)]["Gene"]
    # Build the lookup set once instead of once per gene inside the comprehension
    cancer_gene_set = set(cancer_genes)
    important_in_order = [g for g in topo_sorted_genes if g in cancer_gene_set]
    print("Cancer-relevant genes in topological order (first 20):")
    print(important_in_order[:20])
