In [7]:
#pip install numpy==1.26.4
# !pip install openml dimod dwave-system
#!pip install xlsxwriter
#!pip install dwave-ocean-sdk
#!pip install imblearn
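# Set your DWAVE_API_TOKEN environment variable before using the D-Wave samplers (see the commented os.environ line in the imports cell)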
#!pip install openpyxl

In [1]:
import os
import pandas as pd
import numpy as np
import time
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif
import dimod
from dwave.system import EmbeddingComposite, DWaveSampler, LeapHybridCQMSampler
from dwave.samplers import SimulatedAnnealingSampler, TabuSampler, RandomSampler, PlanarGraphSolver, SteepestDescentSolver
import xlsxwriter

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

#os.environ['DWAVE_API_TOKEN'] = 'Actual-DW-key'

In [2]:
# Function for dataset processing
def dataset(df, seed_value):
    """Turn a raw defect-prediction frame into scaled train/test arrays.

    The last column of ``df`` is the class label (cast to ``int``); all other
    columns are features.

    The train/test split is done FIRST, and every data-dependent step
    (mean imputation, SMOTE oversampling, min-max scaling) is fitted on the
    training fold only. Fitting the imputer or SMOTE on the full data before
    splitting lets information from test rows leak into training (SMOTE
    interpolates between neighbours, so synthetic training points could be
    built from test points) and fills the test set with synthetic samples,
    which inflates the reported metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Features in all columns but the last; label in the last column.
    seed_value : int
        Seed used for the split and for SMOTE.

    Returns
    -------
    X_train_scaled, X_test_scaled : numpy.ndarray
        Min-max scaled features. The training array includes SMOTE samples;
        the test array contains only original rows.
    y_train, y_test
        Labels aligned with the two arrays above.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1].astype(int)

    # Split before any fitting; stratify so the (imbalanced) minority class
    # is represented in both folds.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed_value, stratify=y
    )

    # Impute missing values with the training-fold column means
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train).astype(float)
    X_test = imputer.transform(X_test).astype(float)

    # Handle imbalanced data using SMOTE -- training fold only
    smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=seed_value)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Normalize the features using the training-fold range
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

# Function for mutual information-based feature selection
def feature_selection_mutual(X_train_scaled, X_test_scaled, y_train, solver_file='TS,default.csv',
                             alpha=0.99, num_reads=100, random_state=None):
    """Select features by minimising a mutual-information QUBO.

    A binary quadratic model is built with one binary variable per feature:
    the diagonal holds ``-alpha * MI(feature_i, y)`` and every off-diagonal
    entry ``Q[i, j]`` holds ``(1 - alpha) * MI(feature_i, y)``. The model is
    handed to the sampler named in ``solver_file`` and the lowest-energy
    sample decides which columns are kept.

    Parameters
    ----------
    X_train_scaled, X_test_scaled : numpy.ndarray
        2-D feature arrays with the same number of columns.
    y_train : array-like
        Training labels used for the mutual-information estimate.
    solver_file : str
        ``"<solver>,<dataset file name>"``. ``<solver>`` is one of
        ``TS``, ``SA``, ``PG``, ``RS``, ``SD``, ``QPU``, ``DW``; both parts
        are combined into the problem label passed to the sampler.
    alpha : float
        Weight of the diagonal term versus the off-diagonal term.
    num_reads : int
        ``num_reads`` passed to ``sample`` (not used for ``DW``).
    random_state : int or None
        Forwarded to ``mutual_info_classif``; ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    X_train_selected, X_test_selected : numpy.ndarray
        The input arrays restricted to the selected columns.
    features : numpy.ndarray of int
        Sorted indices of the selected columns.
    info
        ``sampleset.info`` as returned by the sampler.

    Raises
    ------
    ValueError
        If the solver code is unknown or the best sample selects no feature.
    """
    known_solvers = ('TS', 'SA', 'PG', 'RS', 'SD', 'QPU', 'DW')
    # partition() keeps file names that themselves contain commas intact and
    # does not fail when the comma is missing.
    solver_name, _, filename = solver_file.partition(',')
    if solver_name not in known_solvers:
        # Fail before the (slow) MI estimate and before any remote call.
        raise ValueError(
            f"Unknown solver code {solver_name!r}; expected one of {', '.join(known_solvers)}"
        )
    problem_label = f"{solver_name}-{filename}"

    relevance = np.asarray(
        mutual_info_classif(X_train_scaled, y_train, random_state=random_state), dtype=float
    )
    num_features = X_train_scaled.shape[1]

    # The off-diagonal entry Q[i, j] was previously obtained by re-estimating
    # MI(feature_i, y) once per (i, j) pair; the value never depended on j, so
    # the per-feature estimate is broadcast across each row instead of being
    # recomputed num_features**2 times.
    # NOTE(review): this term therefore measures feature-vs-target information,
    # not feature-vs-feature redundancy. If pairwise redundancy was intended
    # (e.g. MI between feature i and feature j), this matrix needs a different
    # estimator -- confirm the intended formulation.
    Q = np.repeat(relevance[:, np.newaxis], num_features, axis=1) * (1 - alpha)
    np.fill_diagonal(Q, -relevance * alpha)

    bqm = dimod.BinaryQuadraticModel(Q, "BINARY")

    if solver_name == 'DW':
        # The hybrid solver takes a CQM; the BQM becomes its objective and no
        # constraints are added.
        cqm = dimod.ConstrainedQuadraticModel()
        cqm.set_objective(bqm)
        sampleset = LeapHybridCQMSampler().sample_cqm(cqm, label=problem_label)
    else:
        sampler_factories = {
            'TS': TabuSampler,
            'SA': SimulatedAnnealingSampler,
            'PG': PlanarGraphSolver,
            'RS': RandomSampler,
            'SD': SteepestDescentSolver,
            'QPU': lambda: EmbeddingComposite(DWaveSampler()),
        }
        solver = sampler_factories[solver_name]()
        sampleset = solver.sample(bqm, num_reads=num_reads, label=problem_label)

    # Lowest-energy sample. (Ranking by the value of the first variable before
    # the energy would bias the choice towards samples that drop feature 0.)
    best = sampleset.first

    # Read the selection from the variable labels rather than from the
    # iteration order of the sample, which is not guaranteed to be 0..n-1.
    features = np.array(sorted(int(var) for var, bit in best.sample.items() if bit), dtype=int)
    if features.size == 0:
        raise ValueError(f"Sampler returned an empty feature selection for {problem_label!r}")

    X_train_selected = X_train_scaled[:, features]
    X_test_selected = X_test_scaled[:, features]

    return X_train_selected, X_test_selected, features, sampleset.info

# Function to return full features
def feature_full(X_train_scaled, X_test_scaled, y_train):
    """Baseline "selector" that keeps every feature.

    The two feature arrays are returned untouched. The string ``'full'``
    stands in for the selected-feature indices and ``'X'`` for the sampler
    info, so the result has the same four-part shape as the other selectors.
    ``y_train`` is accepted only for signature parity and is not used.
    """
    return (X_train_scaled, X_test_scaled, 'full', 'X')

# Function to train the model and calculate metrics
def model_accuracy(X_train, X_test, y_train, y_test, seed_value):
    """Fit an SVC on the training data and score it on the test data.

    Intended for binary labels: sensitivity, specificity and G-mean are read
    from a 2x2 confusion matrix.

    Parameters
    ----------
    X_train, X_test : array-like, 2-D
        Feature matrices with the same number of columns.
    y_train, y_test : array-like
        Class labels.
    seed_value : int
        Passed to ``SVC(random_state=...)``.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1 Score'``,
        ``'AUC'``, ``'G-mean'``, ``'Matthew'``, ``'Cohen'`` and
        ``'Feature No'`` (number of columns in ``X_train``).

    Raises
    ------
    ValueError
        If the classifier was not trained on exactly two classes.
    """
    # Probability calibration is not requested: none of the metrics below use
    # predict_proba (AUC is computed from decision_function), and enabling it
    # only adds an internal cross-validation to fit().
    clf = SVC(random_state=seed_value)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_scores = clf.decision_function(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_scores, multi_class='ovr')

    if len(clf.classes_) != 2:
        raise ValueError(
            f"model_accuracy expects binary labels, got classes {list(clf.classes_)}"
        )
    # Pin the label order so the matrix is always 2x2, even when one class is
    # missing from y_test or from the predictions.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    tn, fp, fn, tp = conf_matrix.ravel()
    # A class with no test samples yields a zero denominator; report 0.0
    # instead of dividing by zero.
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    g_mean = (sensitivity * specificity) ** 0.5
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    num_columns = X_train.shape[1]

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc,
        'G-mean': g_mean,
        'Matthew': mcc,
        'Cohen': kappa,
        'Feature No': num_columns
    }

def _prepare_frame(csv_data, selected_path, file_name):
    """Encode the label column as 0/1 and prune unusable feature columns."""
    label_name = csv_data.columns[-1]

    # Dataset-family specific label encoding ('?' marks a missing value)
    if 'NASA' in selected_path or 'TEST' in selected_path:
        csv_data = csv_data.replace('?', np.nan)
        labels = pd.to_numeric(csv_data.iloc[:, -1].map({'Y': 1, 'N': 0}), errors='coerce')
    elif 'JIRA' in selected_path:
        csv_data = csv_data.replace('?', np.nan)
        labels = csv_data.iloc[:, -1].apply(lambda x: 0 if x == 0 else 1)
    elif 'AEEEM' in selected_path:
        csv_data = csv_data.replace('?', np.nan)
        labels = csv_data.iloc[:, -1].map({'buggy': 1, 'clean': 0})
    else:
        labels = csv_data.iloc[:, -1]

    # Values outside the mapping become NaN. Left alone, an all-NaN label
    # column would be dropped below and the last feature would silently be
    # used as the label, so stop here with a clear message instead.
    if labels.isna().any():
        raise ValueError(
            f"{file_name}: label column {label_name!r} has missing or unmapped values"
        )

    # Drop feature columns with a single unique value or all missing values.
    # Only features are pruned so the label always stays the last column.
    feature_frame = csv_data.iloc[:, :-1]
    feature_frame = feature_frame.loc[:, feature_frame.nunique() != 1]
    feature_frame = feature_frame.dropna(axis=1, how='all')

    return pd.concat([feature_frame, labels.rename(label_name)], axis=1)


def _write_results_sheet(results_df, excel_file_path, sheet_name):
    """Write one sheet, appending to the workbook when it already exists."""
    if os.path.exists(excel_file_path):
        with pd.ExcelWriter(excel_file_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
            results_df.to_excel(writer, sheet_name=sheet_name, index=False)
    else:
        with pd.ExcelWriter(excel_file_path, engine="xlsxwriter") as writer:
            results_df.to_excel(writer, sheet_name=sheet_name, index=False)


def main():
    """Run every feature-selection approach on each CSV in the selected folder.

    For each dataset file the label column is encoded, the data is split and
    preprocessed by ``dataset``, each approach selects features, an SVC is
    scored by ``model_accuracy``, and one sheet per dataset is written to
    ``results.xlsx`` inside the selected folder.
    """
    #path = '/content/drive/MyDrive/Colab Notebooks/dwExp/'
    path = 'dw_datasets/'
    folder_path = [path + 'NASA/', path + 'JIRA/', path + 'AEEEM/', path+'TEST/']
    selected_path = folder_path[3]
    num_runs = 1
    base_seed = 41  # Set your desired seed value
    excel_file_path = selected_path + 'results.xlsx'

    result_columns = [
        'Approach', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'G-mean',
        'Matthew', 'Cohen', 'Feature No', 'Elapsed Time', 'Features', 'Info (Micro Sec)',
    ]
    metric_columns = result_columns[1:10]

    # Sorted so datasets are processed in a deterministic order
    dataset_file_names = sorted(
        file_name for file_name in os.listdir(selected_path) if file_name.endswith('.csv')
    )

    for file_name in dataset_file_names:
        print(file_name)
        file_path = os.path.join(selected_path, file_name)
        csv_data = _prepare_frame(pd.read_csv(file_path), selected_path, file_name)

        # Feature names exclude the label (last column), so the index -> name
        # lookup below and the 'full' listing never report the label.
        feature_columns = [str(column) for column in csv_data.columns[:-1]]

        # Collect rows in a list and build the frame once, instead of
        # re-concatenating a DataFrame on every iteration.
        records = []

        for run in range(num_runs):
            # A distinct seed per run; a fixed seed would make repeated runs
            # exact duplicates of each other.
            seed_value = base_seed + run
            X_train_scaled, X_test_scaled, y_train, y_test = dataset(csv_data, seed_value)

            function_dict = {
                'All_feature': (feature_full, (X_train_scaled, X_test_scaled, y_train)),
                'approaches_DW_Hybrid_mutual': (feature_selection_mutual, (X_train_scaled, X_test_scaled, y_train, f'DW,{file_name}')),
                'approaches_DW_QPU_mutual': (feature_selection_mutual, (X_train_scaled, X_test_scaled, y_train, f'QPU,{file_name}')),
            }

            for function_name, (function, args) in function_dict.items():
                # Elapsed time covers feature selection plus model training/scoring
                start_time = time.time()
                X_train_selected, X_test_selected, features, info = function(*args)
                metrics = model_accuracy(X_train_selected, X_test_selected, y_train, y_test, seed_value)
                elapsed_time = time.time() - start_time

                if isinstance(features, str) and features == 'full':
                    feature_names = feature_columns
                else:
                    feature_names = [feature_columns[idx] for idx in features]

                record = {'Approach': function_name}
                record.update({key: metrics[key] for key in metric_columns})
                record['Elapsed Time'] = elapsed_time
                record['Features'] = ', '.join(feature_names)
                record['Info (Micro Sec)'] = info
                records.append(record)

        results_df = pd.DataFrame(records, columns=result_columns)

        # splitext keeps dotted base names distinct; Excel caps sheet names
        # at 31 characters.
        sheet_name = os.path.splitext(file_name)[0][:31]
        _write_results_sheet(results_df, excel_file_path, sheet_name)

if __name__ == "__main__":
    main()

KC4.csv
