<a href="https://colab.research.google.com/github/pavan-charan/Fake-news-detection-QML/blob/main/Quantum_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The dataset paths used later in this notebook live under
# /content/drive/MyDrive, so this cell must run before the data is loaded.
# NOTE(review): google.colab is presumably only available inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages this notebook imports (IPython shell magic).
# - pennylane & pennylane-lightning: quantum circuits and the lightning.qubit simulator.
# - scikit-learn: TF-IDF, Truncated SVD, scaling, SVM classifiers and metrics.
# - pandas, numpy, openpyxl: data manipulation; openpyxl is presumably the
#   engine pandas uses for the .xlsx files read later — confirm.
# - seaborn: confusion-matrix heatmaps.
# NOTE(review): matplotlib is imported later but is not listed here; presumably
# it is already present in the runtime or installed as a seaborn dependency — confirm.
!pip install pennylane pennylane-lightning scikit-learn pandas numpy openpyxl seaborn --quiet


# Fake News Detection using QML

This notebook demonstrates a project on fake news detection using both classical and quantum machine learning techniques. The goal is to compare the performance of these methods on a real-world dataset of COVID-19 related tweets. The notebook is structured as follows:
1. **Data Preprocessing**: Loading, cleaning, and preparing the text data.
2. **Classical Models**: Training and evaluating classical machine learning models (Linear SVC and RBF-SVM) as a baseline.
3. **Quantum-Enhanced Model (QSVC)**: Implementing and evaluating a quantum-enhanced Support Vector Classifier.

In [None]:
# Import necessary libraries for the project.
import numpy as np
import pandas as pd
import re, time, gc

# Scikit-learn utilities for feature extraction, dimensionality reduction, preprocessing, and model evaluation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Libraries for plotting and visualization.
import matplotlib.pyplot as plt
import seaborn as sns

# PennyLane for quantum machine learning.
import pennylane as qml

def clean_text(s: str) -> str:
    """Normalize a raw tweet into lowercase, single-space-separated text.

    The text is lowercased, then URLs, @mentions, the ``#`` symbol (the tag
    word itself is kept) and every character other than ``a-z``, ``0-9``,
    apostrophes and whitespace are replaced by a space. Finally, runs of
    whitespace are collapsed and the result is stripped.

    Returns an empty string when ``s`` is not a ``str``.
    """
    if not isinstance(s, str):
        return ""

    # Applied in order; every match is replaced by a single space.
    blank_out = (
        r"http\S+|www\.\S+",   # URLs
        r"@\w+",               # @mentions
        r"#",                  # hash symbol only; the hashtag word survives
        r"[^a-z0-9'\s]",       # anything that is not alphanumeric, ' or whitespace
    )

    cleaned = s.lower()
    for pattern in blank_out:
        cleaned = re.sub(pattern, " ", cleaned)

    # Collapse the gaps left behind by the substitutions above.
    return re.sub(r"\s+", " ", cleaned).strip()

def timer():
    """Create a lap timer.

    Returns a callable ``lap(msg="")``. Each call prints ``"<msg>: <seconds>s"``
    with the wall-clock time elapsed since the previous call (or since the
    timer was created, for the first call) and then restarts the clock.
    The callable returns a ``(None, None)`` pair, which callers can ignore.
    """
    mark = [time.time()]  # one-element list so the closure can update it

    def lap(msg=""):
        printed = print(f"{msg}: {time.time()-mark[0]:.2f}s")
        mark[0] = time.time()
        return printed, None

    return lap

def report(name, y_true, y_pred):
    """Print and return the accuracy and macro-averaged F1 of a set of predictions.

    Args:
        name: Label printed in front of the scores (e.g. the model name).
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A ``(accuracy, macro_f1)`` tuple.
    """
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="macro")
    # The separator is an em dash; it was previously stored as mis-decoded
    # UTF-8 bytes and showed up as garbage in the output.
    print(f"{name} — Acc: {acc:.4f}, F1: {f1:.4f}")
    return acc, f1

def plot_confusion_matrix(y_true, y_pred, labels, title):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Tick labels used on both axes.
        title: Title shown above the heatmap.
    """
    counts = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        counts,
        annot=True,      # write the count inside each cell
        fmt='d',         # counts are integers
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    plt.show()


## 1. Data Preprocessing

In this section, we load the dataset, clean the text data, and prepare it for the models. The main steps include:
- Loading the training, validation, and test sets.
- Cleaning the text by removing URLs, mentions, and special characters.
- Encoding the labels ('real' and 'fake') into numerical format.
- Applying the TF-IDF vectorizer to convert text into numerical features.

In [None]:
# Folder on the mounted Drive that holds the dataset Excel files.
DATA_DIR = "/content/drive/MyDrive/CovidDataset"

# Load the training, validation, and test datasets from Excel files.
train_df = pd.read_excel(f"{DATA_DIR}/Constraint_English_Train.xlsx")
val_df   = pd.read_excel(f"{DATA_DIR}/Constraint_English_Val.xlsx")
test_df  = pd.read_excel(f"{DATA_DIR}/english_test_with_labels.xlsx")

# Clean the 'tweet' column in each DataFrame.
# Missing cells are replaced by "" *before* the string cast: astype(str) on its
# own turns NaN into the literal text "nan", which clean_text would then keep
# as a word instead of producing an empty tweet.
for df in (train_df, val_df, test_df):
    df["tweet"] = df["tweet"].fillna("").astype(str).map(clean_text)

# Encode the 'label' column into numerical format (e.g., 'real' -> 1, 'fake' -> 0).
# The encoder is fitted on the labels of all three splits so that every label
# value seen in any split can be transformed.
le = LabelEncoder()
le.fit(pd.concat([train_df["label"], val_df["label"], test_df["label"]]))

# Transform the labels for each dataset.
y_train = le.transform(train_df["label"])
y_val   = le.transform(val_df["label"])
y_test  = le.transform(test_df["label"])

# Create lists of the cleaned tweet text for each dataset.
X_train_text = train_df["tweet"].tolist()
X_val_text   = val_df["tweet"].tolist()
X_test_text  = test_df["tweet"].tolist()


In [None]:
# Turn the cleaned tweets into sparse TF-IDF feature matrices.
tfidf = TfidfVectorizer(
    ngram_range=(1,2),    # unigrams and bigrams
    min_df=2,             # drop terms seen in fewer than 2 documents
    max_features=5000,    # cap the vocabulary at 5000 terms
)

# The vocabulary and IDF weights are learned from the training tweets only;
# validation and test tweets are projected onto that same vocabulary.
Xtr_tfidf = tfidf.fit_transform(X_train_text)
Xv_tfidf, Xt_tfidf = (tfidf.transform(texts) for texts in (X_val_text, X_test_text))

# Report the (n_samples, n_features) shape of each split.
print("TF-IDF shapes:", *(m.shape for m in (Xtr_tfidf, Xv_tfidf, Xt_tfidf)))


## 2. Classical Models

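We first train and evaluate classical machine learning models as a baseline for comparison. The models used are:
- **Linear Support Vector Classifier (`LinearSVC`)**, trained on the full TF-IDF features.
- **SVC with a Radial Basis Function (RBF) kernel**, trained on the TF-IDF features reduced to 10 components with Truncated SVD.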

In [None]:
# Train a Linear Support Vector Classifier on the full TF-IDF features.
# random_state is fixed so repeated runs give the same model, matching the
# seed used for the Truncated SVD steps elsewhere in this notebook.
clf = LinearSVC(random_state=42)
clf.fit(Xtr_tfidf, y_train)

# Make predictions on the validation and test sets.
pred_val = clf.predict(Xv_tfidf)
pred_test = clf.predict(Xt_tfidf)

# Print the classification report and plot the confusion matrix for each split.
for split, y_true, y_hat in (
    ("Validation", y_val, pred_val),
    ("Test", y_test, pred_test),
):
    print(f"\n=== Linear SVC {split} Report ===")
    print(classification_report(y_true, y_hat, target_names=le.classes_))
    plot_confusion_matrix(y_true, y_hat, le.classes_, f"Linear SVC - {split}")


In [None]:
# Compress the TF-IDF features to 10 dense components with Truncated SVD.
# The projection is fitted on the training split and reused for the others.
svd = TruncatedSVD(n_components=10, random_state=42)
Ztr = svd.fit_transform(Xtr_tfidf)
Zv, Zt = (svd.transform(m) for m in (Xv_tfidf, Xt_tfidf))

# Fit an RBF-kernel SVC on the reduced training features.
rbf = SVC(kernel="rbf", C=2.0).fit(Ztr, y_train)

# Predict both held-out splits.
pred_val, pred_test = rbf.predict(Zv), rbf.predict(Zt)

# Validation split: text report followed by the confusion-matrix plot.
print(
    "\n=== RBF-SVM Validation Report ===",
    classification_report(y_val, pred_val, target_names=le.classes_),
    sep="\n",
)
plot_confusion_matrix(y_val, pred_val, le.classes_, "RBF SVM - Validation")

# Test split: text report followed by the confusion-matrix plot.
print(
    "\n=== RBF-SVM Test Report ===",
    classification_report(y_test, pred_test, target_names=le.classes_),
    sep="\n",
)
plot_confusion_matrix(y_test, pred_test, le.classes_, "RBF SVM - Test")


## 3. Quantum-Enhanced Model (QSVC)

Now, we implement a quantum-enhanced version of the Support Vector Classifier (QSVC). This involves:
- Reducing the dimensionality of the data using Truncated SVD.
- Scaling the features to the range $[0, 2\pi]$ so they can be used as rotation angles in the quantum circuit.
- Defining a quantum feature map to encode the data into a quantum state.
- Creating a quantum kernel to compute the similarity between data points in the quantum feature space.
- Training an SVC with the precomputed quantum kernel.

In [None]:
# Project the TF-IDF features onto 10 Truncated-SVD components, one per qubit
# of the quantum circuit. The projection is fitted on the training split only
# and then applied unchanged to the validation and test splits.
svd = TruncatedSVD(n_components=10, random_state=42)
Ztr = svd.fit_transform(Xtr_tfidf)
Zv, Zt = (svd.transform(m) for m in (Xv_tfidf, Xt_tfidf))

# Sanity check: the second dimension should now be 10.
print("Ztr shape:", Ztr.shape)


In [None]:
# How many training samples the quantum model is trained on.
TRAIN_Q = 2500

# Rescale every SVD component to [0, 2*pi] so it can serve as a rotation angle.
# The per-feature min/max come from the training split; validation and test
# values are mapped with those same statistics.
# NOTE(review): held-out values outside the training range presumably land
# outside [0, 2*pi] since no clipping is requested — confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 2*np.pi))
Xtr_s = scaler.fit_transform(Ztr)
Xv_s, Xt_s = (scaler.transform(Z) for Z in (Zv, Zt))

# Function to cap the number of samples to be used.
def cap(X, y, n, seed=42):
    """Return a random subset of at most ``n`` distinct rows of ``X`` and ``y``.

    Rows are drawn without replacement, so no sample appears twice, and the
    subset never exceeds the data actually available. A seeded generator keeps
    the selection reproducible between runs.

    Args:
        X: Array of samples, indexable by an integer index array.
        y: Array of labels aligned with ``X``.
        n: Maximum number of rows to keep.
        seed: Seed for the random generator (default 42).

    Returns:
        ``(X_subset, y_subset)`` with ``min(n, len(X))`` rows, in random order.
    """
    n = min(n, len(X))  # a cap can shrink the data but never enlarge it
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(X), size=n, replace=False)
    return X[idx], y[idx]

# Assemble the data used by the quantum model: the training split is
# sub-sampled to TRAIN_Q rows via cap(), while the validation and test splits
# are used in full.
Xtr_q, ytr_q = cap(Xtr_s, y_train, TRAIN_Q)
(Xv_q, yv_q), (Xt_q, yt_q) = (Xv_s, y_val), (Xt_s, y_test)

print("Quantum train size =", len(Xtr_q))


In [None]:
# Set the number of qubits to match the number of features (10).
n_qubits = 10

# Set up the quantum device, preferring the 'lightning.qubit' simulator and
# falling back to 'default.qubit' if it cannot be created.
# Only Exception is caught: a bare except would also swallow KeyboardInterrupt
# and SystemExit, and the reason for the fallback is printed rather than hidden.
try:
    dev = qml.device("lightning.qubit", wires=n_qubits)
except Exception as err:
    print(f"lightning.qubit unavailable ({err!r})")
    dev = qml.device("default.qubit", wires=n_qubits)
    print("Using default.qubit backend")
else:
    print("Using lightning.qubit backend")

# Number of repetitions for the feature map circuit.
REPS = 3

def feature_map(x):
    """Apply the data-encoding circuit for feature vector ``x``.

    The same layer is repeated ``REPS`` times. In each layer every wire ``w``
    receives ``RY(x[w])`` followed by ``RZ(x[w]**2)``, and then a CZ gate is
    applied between each pair of neighbouring wires ``(w, w+1)``.
    ``x`` must provide at least ``n_qubits`` entries.
    """
    for _ in range(REPS):
        # Data-dependent single-qubit rotations.
        for wire in range(n_qubits):
            angle = x[wire]
            qml.RY(angle, wires=wire)
            qml.RZ(angle**2, wires=wire)
        # Linear chain of CZ gates between adjacent wires.
        for left in range(n_qubits - 1):
            qml.CZ(wires=[left, left + 1])

# Quantum node bound to `dev`: runs the feature map and returns the full state.
@qml.qnode(dev)
def quantum_state(x):
    """Run ``feature_map(x)`` on ``dev`` and return the resulting quantum state.

    Args:
        x: Feature vector passed straight to ``feature_map``.

    Returns:
        The value of ``qml.state()`` after the feature map has been applied;
        presumably a complex vector of length ``2**n_qubits`` — confirm against
        the PennyLane documentation.
    """
    feature_map(x)
    return qml.state()


In [None]:
# Memo of already-simulated states, keyed by the feature vector as a tuple.
state_cache = {}

def get_state(x_tuple):
    """Return the quantum state for ``x_tuple``, simulating it at most once.

    ``x_tuple`` is the feature vector as a hashable tuple. On a cache miss the
    state is computed with ``quantum_state`` and stored in ``state_cache``.
    """
    try:
        return state_cache[x_tuple]
    except KeyError:
        state = quantum_state(np.array(x_tuple))
        state_cache[x_tuple] = state
        return state

def kernel_matrix(A, B):
    """Compute the quantum kernel matrix between two sets of data points.

    Entry ``K[i, j]`` is the squared overlap ``|<psi(A[i]) | psi(B[j])>|**2``
    of the states returned by ``get_state`` for the two feature vectors.

    Args:
        A: Sequence of feature vectors (rows of the result).
        B: Sequence of feature vectors (columns of the result).

    Returns:
        A float array of shape ``(len(A), len(B))``.
    """
    NA, NB = len(A), len(B)
    K = np.zeros((NA, NB))
    if NA == 0 or NB == 0:
        return K

    # The states of B are the same for every row of K, so fetch them once and
    # stack them instead of re-building tuples and re-querying the cache for
    # every (i, j) pair.
    states_b = np.array([np.asarray(get_state(tuple(b))) for b in B])

    for i in range(NA):
        psi_a = np.asarray(get_state(tuple(A[i])))
        # conj(psi_a) . psi_b for every b at once; same value as np.vdot(psi_a, psi_b).
        K[i, :] = np.abs(states_b @ psi_a.conj()) ** 2
        if (i+1) % 50 == 0:
            print(f"Built {i+1}/{NA} rows")
    return K


In [None]:
# Gram matrix of the (sub-sampled) training set against itself.
print("Building training kernel...")
K_train = kernel_matrix(Xtr_q, Xtr_q)

# Fit an SVC directly on the precomputed quantum kernel.
print("Training QSVC...")
qsvc = SVC(kernel="precomputed", C=2.0).fit(K_train, ytr_q)

# For prediction, each held-out sample needs its kernel values against the
# training samples, i.e. matrices of shape (n_eval, n_train).
print("Building validation kernel...")
K_val = kernel_matrix(Xv_q, Xtr_q)

print("Building test kernel...")
K_test = kernel_matrix(Xt_q, Xtr_q)

# Predict both held-out splits.
pred_val_q, pred_test_q = qsvc.predict(K_val), qsvc.predict(K_test)

# Validation split: text report followed by the confusion-matrix plot.
print(
    "\nQuantum QSVC Results",
    "\nValidation Report:",
    classification_report(yv_q, pred_val_q, target_names=le.classes_),
    sep="\n",
)
plot_confusion_matrix(yv_q, pred_val_q, le.classes_, "Quantum QSVC - Val")

# Test split: text report followed by the confusion-matrix plot.
print(
    "\nTest Report:",
    classification_report(yt_q, pred_test_q, target_names=le.classes_),
    sep="\n",
)
plot_confusion_matrix(yt_q, pred_test_q, le.classes_, "Quantum QSVC - Test")
