In [3]:
import os
os.environ['DWAVE_API_TOKEN'] = 'Actual-DW-key'

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset from CSV file
def load_dataset(filepath):
    df = pd.read_csv(filepath)
    X = df.iloc[:, :-1]  # All columns except the last one as features
    y = df.iloc[:, -1]   # The last column as the target (defect label)
    return X, y

# Preprocessing the data: normalize features
def preprocess_data(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

# Split data into training and testing sets
def split_data(X, y, test_size=0.2, random_state=42):
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Example usage
X, y = load_dataset('AEEM_JIRA/EQ.csv')
X_scaled = preprocess_data(X)
X_train, X_test, y_train, y_test = split_data(X_scaled, y)

In [28]:
from dwave.system import DWaveSampler, EmbeddingComposite
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define a QUBO matrix for feature selection with better scaling
def create_qubo(X, y, regularization_factor=0.1):
    num_features = X.shape[1]
    
    # Define a Random Forest classifier for feature evaluation
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Fit the model to calculate feature importances
    clf.fit(X, y)
    feature_importances = clf.feature_importances_
    
    # Normalize the feature importances to ensure they are not too small or too large
    feature_importances = (feature_importances - np.min(feature_importances)) / (np.max(feature_importances) - np.min(feature_importances))
    
    # Initialize QUBO matrix (zeros)
    Q = np.zeros((num_features, num_features))
    
    # Objective: Select features that maximize their importance
    for i in range(num_features):
        Q[i, i] = -feature_importances[i]  # Negative importance to encourage selection
    
    # Add regularization term to penalize too many selected features
    for i in range(num_features):
        Q[i, i] += regularization_factor  # Regularization penalty
    
    # Print the QUBO matrix to verify it looks correct
    #print("QUBO Matrix:", Q)
    
    return Q

# Use a Quantum Annealer to solve the QUBO problem
def solve_qubo(Q):
    sampler = EmbeddingComposite(DWaveSampler())
    response = sampler.sample_qubo(Q, num_reads=100)
    solution = response.first.sample
    return solution

# Tune the regularization factor
regularization_factor = 0.25  # Start with a smaller penalty
Q = create_qubo(X_train, y_train, regularization_factor)

# Solve the QUBO problem using the D-Wave quantum annealer or a classical solver
selected_features = solve_qubo(Q)

# Get the selected features from the solution
selected_feature_indices = [i for i in range(len(selected_features)) if selected_features[i] == 1]
X_train_selected = X_train[:, selected_feature_indices]
X_test_selected = X_test[:, selected_feature_indices]

print(len(selected_feature_indices), selected_feature_indices)

17 [6, 10, 17, 20, 25, 27, 32, 34, 35, 37, 40, 42, 49, 52, 57, 58, 59]


In [29]:
from dwave.samplers import SimulatedAnnealingSampler

def solve_qubo_classically(Q):
    sampler = SimulatedAnnealingSampler()
    response = sampler.sample_qubo(Q, num_reads=100)
    solution = response.first.sample
    return solution

# Test QUBO solution with classical simulated annealing
selected_features_classical = solve_qubo_classically(Q)
sf_indices_c = [i for i in range(len(selected_features_classical)) if selected_features_classical[i] == 1]
print("Selected Features (Classical Solver):", len(sf_indices_c), sf_indices_c)

Selected Features (Classical Solver): 17 [6, 10, 17, 20, 25, 27, 32, 34, 35, 37, 40, 42, 49, 52, 57, 58, 59]


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Train a classifier on the selected features
def train_model(X_train, y_train, X_test, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")

# Example usage
train_model(X_train_selected, y_train, X_test_selected, y_test)

Accuracy: 0.6923
Precision: 0.6333
Recall: 0.6786
F1 Score: 0.6552
ROC-AUC Score: 0.6906


In [31]:
from sklearn.feature_selection import RFE

def recursive_feature_elimination(X_train, y_train):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    rfe = RFE(estimator=clf, n_features_to_select=23)  # Adjust the number of features to select
    rfe.fit(X_train, y_train)
    
    return rfe.support_  # Returns a boolean array indicating selected features

# Example usage
selected_features_rfe = recursive_feature_elimination(X_train, y_train)
X_train_rfe = X_train[:, selected_features_rfe]
X_test_rfe = X_test[:, selected_features_rfe]

# Train and evaluate model using features selected by RFE
train_model(X_train_rfe, y_train, X_test_rfe, y_test)


Accuracy: 0.6923
Precision: 0.6250
Recall: 0.7143
F1 Score: 0.6667
ROC-AUC Score: 0.6950


In [32]:
from sklearn.linear_model import LogisticRegression

def l1_regularization(X_train, y_train):
    clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
    clf.fit(X_train, y_train)
    
    # Get selected features (non-zero coefficients)
    selected_features_l1 = clf.coef_[0] != 0
    return selected_features_l1

# Example usage
selected_features_l1 = l1_regularization(X_train, y_train)
X_train_l1 = X_train[:, selected_features_l1]
X_test_l1 = X_test[:, selected_features_l1]

# Train and evaluate model using features selected by L1 regularization
train_model(X_train_l1, y_train, X_test_l1, y_test)

Accuracy: 0.5692
Precision: 0.5000
Recall: 0.5714
F1 Score: 0.5333
ROC-AUC Score: 0.5695


In [33]:
import time

# Classical solver timing
start_time = time.time()
selected_features_classical = solve_qubo_classically(Q)
classical_time = time.time() - start_time
print(f"Classical Solver Time: {classical_time:.4f} seconds")

# Quantum solver timing
start_time = time.time()
selected_features_quantum = solve_qubo(Q)
quantum_time = time.time() - start_time
print(f"Quantum Solver Time: {quantum_time:.4f} seconds")

Classical Solver Time: 0.1139 seconds
Quantum Solver Time: 1.8182 seconds


In [35]:
from sklearn.metrics import accuracy_score

def evaluate_multiple_runs(Q, num_runs=100):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    classical_results = []
    quantum_results = []

    for _ in range(num_runs):
        # Classical solver
        selected_classical = solve_qubo_classically(Q)
        X_train_classical = X_train[:, [i for i, x in enumerate(selected_classical) if x == 1]]
        X_test_classical = X_test[:, [i for i, x in enumerate(selected_classical) if x == 1]]
        clf.fit(X_train_classical, y_train)
        y_pred_classical = clf.predict(X_test_classical)
        classical_results.append(accuracy_score(y_test, y_pred_classical))

        # Quantum solver
        selected_quantum = solve_qubo(Q)
        X_train_quantum = X_train[:, [i for i, x in enumerate(selected_quantum) if x == 1]]
        X_test_quantum = X_test[:, [i for i, x in enumerate(selected_quantum) if x == 1]]
        clf.fit(X_train_quantum, y_train)
        y_pred_quantum = clf.predict(X_test_quantum)
        quantum_results.append(accuracy_score(y_test, y_pred_quantum))

    print(f"Average Accuracy (Classical): {np.mean(classical_results):.4f}")
    print(f"Average Accuracy (Quantum): {np.mean(quantum_results):.4f}")
    print(f"Standard Deviation (Classical): {np.std(classical_results):.4f}")
    print(f"Standard Deviation (Quantum): {np.std(quantum_results):.4f}")

# Compare results of multiple runs
evaluate_multiple_runs(Q, num_runs=100)


Average Accuracy (Classical): 0.6154
Average Accuracy (Quantum): 0.6154
Standard Deviation (Classical): 0.0000
Standard Deviation (Quantum): 0.0000
