In [7]:
#pip install numpy==1.26.4
# !pip install openml dimod dwave-system
#!pip install xlsxwriter
#!pip install dwave-ocean-sdk
#!pip install imblearn
# Set your DWAVE_API_TOKEN
#!pip install openpyxl

In [9]:
import os
import pandas as pd
import numpy as np
import time
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif
import dimod
from dwave.system import EmbeddingComposite, DWaveSampler, LeapHybridCQMSampler
from dwave.samplers import SimulatedAnnealingSampler, TabuSampler, RandomSampler, PlanarGraphSolver, SteepestDescentSolver
import xlsxwriter

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

os.environ['DWAVE_API_TOKEN'] = 'Actual-DW-key'

In [29]:
def dataset(df, seed_value):
    """
    Prepares a dataset by imputing missing values, handling imbalanced data, and normalizing features.

    Parameters:
    - df (pandas.DataFrame): Input dataframe where the last column is the target variable.
    - seed_value (int): Random seed for reproducibility in data splitting and SMOTE.

    Returns:
    - X_train_scaled (numpy.ndarray): Scaled training features.
    - X_test_scaled (numpy.ndarray): Scaled testing features.
    - y_train (numpy.ndarray): Training labels.
    - y_test (numpy.ndarray): Testing labels.

    Notes:
    - Missing values in the features are imputed with the mean of each column.
    - SMOTE is used to handle class imbalance by oversampling the minority class.
    - The data is split into training and testing sets with an 80-20 split.
    - Features are normalized using MinMaxScaler.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Impute missing values with the mean
    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(X)

    X = X.astype(float)
    y = y.astype(int)

    # Handle imbalanced data using SMOTE
    smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=seed_value)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=seed_value)

    # Normalize the features
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test
#--------------------------------------------------------------------
def feature_selection_mutual(X_train_scaled, X_test_scaled, y_train, solver_file='TS,default.csv'):
    """
    Selects features based on mutual information using a specified solver.

    Parameters:
    - X_train_scaled (numpy.ndarray): Scaled training features.
    - X_test_scaled (numpy.ndarray): Scaled testing features.
    - y_train (numpy.ndarray): Training labels.
    - solver_file (str): File identifier for selecting the solver. Format is '<solver_name>,<filename>'.

    Returns:
    - X_train_selected (numpy.ndarray): Scaled training features with selected features.
    - X_test_selected (numpy.ndarray): Scaled testing features with selected features.
    - features (numpy.ndarray): Indices of selected features.
    - sampleset_info (dict): Information about the sampling process.

    Notes:
    - The function calculates mutual information between features and selects the most informative features using binary quadratic programming.
    - Supported solvers include TabuSampler, SimulatedAnnealingSampler, PlanarGraphSolver, RandomSampler, SteepestDescentSolver, EmbeddingComposite, and LeapHybridCQMSampler.
    """
    solver_name = solver_file.split(',')[0]
    filename = solver_file.split(',')[1]
    problem_label = f"{solver_name}-{filename}"
    mutual_info_matrix_features = mutual_info_classif(X_train_scaled, y_train)

    num_features = X_test_scaled.shape[1]
    mutual_info_matrix_pairs = np.zeros((num_features, num_features))

    for i in range(num_features):
        for j in range(num_features):
            mutual_info_matrix_pairs[i, j] = mutual_info_classif(X_train_scaled[:, i].reshape(-1, 1), y_train)

    alpha = 0.99
    Rxy = mutual_info_matrix_features
    Q = mutual_info_matrix_pairs * (1 - alpha)
    np.fill_diagonal(Q, -Rxy * alpha)

    bqm = dimod.BinaryQuadraticModel(Q, "BINARY")

    if solver_name == 'TS':
        solver = TabuSampler()
    elif solver_name == 'SA':
        solver = SimulatedAnnealingSampler()
    elif solver_name == 'PG':
        solver = PlanarGraphSolver()
    elif solver_name == 'RS':
        solver = RandomSampler()
    elif solver_name == 'SD':
        solver = SteepestDescentSolver()
    elif solver_name == 'QPU':
        solver = EmbeddingComposite(DWaveSampler())
    elif solver_name == 'DW':
        cqm = dimod.ConstrainedQuadraticModel()
        cqm.set_objective(bqm)
        solver = LeapHybridCQMSampler()

    sampleset = solver.sample(bqm, num_reads=100, label=problem_label) if solver_name != 'DW' else solver.sample_cqm(cqm, label=problem_label)

    best = sorted(sampleset.data(), key=lambda x: (list(x.sample.values())[0], x.energy))[0]
    is_selected = np.array([bool(val) for val in best.sample.values()])
    features = np.array([i for i, val in enumerate(is_selected) if val])

    X_train_selected = X_train_scaled[:, features]
    X_test_selected = X_test_scaled[:, features]

    return X_train_selected, X_test_selected, features, sampleset.info
#--------------------------------------------------------------------
# Function to return full features
def feature_full(X_train_scaled, X_test_scaled, y_train):
    features = 'full'
    info = 'X'
    return X_train_scaled, X_test_scaled, features, info
#--------------------------------------------------------------------
def model_accuracy(X_train, X_test, y_train, y_test, seed_value):
    """
    Evaluates the performance of an SVC model on the provided dataset.

    Parameters:
    - X_train (numpy.ndarray): Training features.
    - X_test (numpy.ndarray): Testing features.
    - y_train (numpy.ndarray): Training labels.
    - y_test (numpy.ndarray): Testing labels.
    - seed_value (int): Random seed for reproducibility.

    Returns:
    - dict: A dictionary containing various performance metrics:
        - 'Accuracy': Classification accuracy.
        - 'Precision': Weighted precision score.
        - 'Recall': Weighted recall score.
        - 'F1 Score': Weighted F1 score.
        - 'AUC': Area Under the ROC Curve.
        - 'G-mean': Geometric mean of sensitivity and specificity.
        - 'Matthew': Matthews correlation coefficient.
        - 'Cohen': Cohen's kappa score.
        - 'Feature No': Number of features used.

    Notes:
    - The function uses a Support Vector Classifier (SVC) with probability estimates.
    - Metrics include accuracy, precision, recall, F1 score, AUC, sensitivity, specificity, G-mean, MCC, and kappa.
    """
    clf = SVC(probability=True, random_state=seed_value)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)
    y_scores = clf.decision_function(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_scores, multi_class='ovr')

    conf_matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = conf_matrix.ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    g_mean = (sensitivity * specificity) ** 0.5
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    num_columns = X_train.shape[1]

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc,
        'G-mean': g_mean,
        'Matthew': mcc,
        'Cohen': kappa,
        'Feature No': num_columns
    }
#--------------------------------------------------------------------
def get_info(dictionary, key):
    # Check if the key exists directly in the dictionary
    if key in dictionary:
        return dictionary[key]
    
    # Check if the 'timing' key exists and the key is in the nested 'timing' dictionary
    if 'timing' in dictionary and key in dictionary['timing']:
        return dictionary['timing'][key]
    
    # Return None if the key is not found in either case
    return '---'
#--------------------------------------------------------------------
def main():
    """
    Main function to preprocess datasets, apply feature selection methods, and evaluate models.
    The results are saved to an Excel file.

    Steps:
    1. Set file paths and directories for datasets and results.
    2. Iterate over dataset files in the selected directory.
    3. Preprocess each dataset based on its type (NASA, JIRA, AEEEM).
    4. Apply feature selection methods and evaluate models.
    5. Save the results to an Excel file, with each dataset's results in a separate sheet.

    Notes:
    - Handles different preprocessing for NASA, JIRA, and AEEEM datasets.
    - Uses `feature_full` and `feature_selection_mutual` for feature selection.
    - Evaluates models with `model_accuracy`.
    """
    num_runs = 1
    #data_path = '/content/drive/MyDrive/Colab Notebooks/dwExp/'
    data_path = 'dw_datasets/'
    result_path = 'dw_results/'
    folder_path = [data_path + 'NASA/', data_path + 'JIRA/', data_path + 'AEEEM/', data_path+'TEST/']
    #selected_path = folder_path[3]
    for selected_path in folder_path[-1:]:
        data_dir_name = selected_path.split('/')[-2] #such as NASA, JIRA, etc. 

        result_file_path = f'{result_path}{data_dir_name}_results.xlsx'

        dataset_file_names = [file_name for file_name in os.listdir(selected_path) if file_name.endswith('.csv')]
        results_dict = {}

        for file_name in dataset_file_names:
            print(file_name)
            results_df = pd.DataFrame()
            file_path = os.path.join(selected_path, file_name)
            csv_data = pd.read_csv(file_path)

            # Handle NASA dataset specific preprocessing
            if 'NASA' in selected_path:
                csv_data.replace('?', np.nan, inplace=True)
                mapping = {'Y': 1, 'N': 0}
                csv_data.iloc[:, -1] = csv_data.iloc[:, -1].map(mapping)
                # Ensure that all values are correctly mapped
                csv_data.iloc[:, -1] = pd.to_numeric(csv_data.iloc[:, -1], errors='coerce')

            elif 'JIRA' in selected_path or 'TEST' in selected_path:
                csv_data.replace('?', np.nan, inplace=True)
                csv_data.iloc[:, -1] = csv_data.iloc[:, -1].apply(lambda x: 0 if x == 0 else 1)

            elif 'AEEEM' in selected_path:
                csv_data.replace('?', np.nan, inplace=True)
                mapping = {'buggy': 1, 'clean': 0}
                csv_data.iloc[:, -1] = csv_data.iloc[:, -1].map(mapping)

            # Drop columns with a single unique value or all missing values
            csv_data = csv_data.loc[:, csv_data.nunique() != 1]
            csv_data = csv_data.dropna(axis=1, how='all')

            print(file_name, ":", csv_data.shape)

            for run in range(num_runs):
                seed_value = 41  # Set your desired seed value
                for init_feature in range(100, 801, 100):
                    X_train_scaled, X_test_scaled, y_train, y_test = dataset(csv_data, seed_value)
                    
                    X_train_scaled = X_train_scaled[:, 0:init_feature]
                    X_test_scaled = X_test_scaled[:, 0:init_feature]
                    
                    print("Current Feature Set:", X_train_scaled.shape, X_test_scaled.shape)

                    function_dict = {
                        'All_feature': (feature_full, (X_train_scaled, X_test_scaled, y_train)),
                        'approaches_DW_Hybrid_mutual': (feature_selection_mutual, (X_train_scaled, X_test_scaled, y_train, f'DW,{data_dir_name}_{file_name.split(".")[0]}')),
                        'approaches_DW_QPU_mutual': (feature_selection_mutual, (X_train_scaled, X_test_scaled, y_train, f'QPU,{data_dir_name}_{file_name.split(".")[0]}')),
                    }


                    for function_name, (function, args) in function_dict.items():
                        start_time = time.time()
                        X_train_selected, X_test_selected, features, info = function(*args)
                        metrics = model_accuracy(X_train_selected, X_test_selected, y_train, y_test, seed_value)
                        elapsed_time = time.time() - start_time #in seconds

                        if isinstance(features, str) and features == 'full':
                            feature_names = csv_data.columns.tolist()
                        else:
                            feature_names = [csv_data.columns[idx] for idx in features]

                        results_df = pd.concat([results_df, pd.DataFrame({
                            'Approach': [function_name],
                            'Accuracy': [metrics['Accuracy']],
                            'Precision': [metrics['Precision']],
                            'Recall': [metrics['Recall']],
                            'F1 Score': [metrics['F1 Score']],
                            'AUC': [metrics['AUC']],
                            'G-mean': [metrics['G-mean']],
                            'Matthew': [metrics['Matthew']],
                            'Cohen': [metrics['Cohen']],
                            'Feature No': [metrics['Feature No']],
                            'Elapsed Time (sec)': [elapsed_time], #in seconds
                            'qpu_access_time': [get_info(info, 'qpu_access_time')],
                            'charge_time': [get_info(info, 'charge_time')],
                            'run_time': [get_info(info, 'run_time')],
                            'qpu_sampling_time': [get_info(info, 'qpu_sampling_time')],
                            'qpu_anneal_time_per_sample': [get_info(info, 'qpu_anneal_time_per_sample')],
                            'qpu_readout_time_per_sample': [get_info(info, 'qpu_readout_time_per_sample')],
                            'qpu_access_overhead_time': [get_info(info, 'qpu_access_overhead_time')],
                            'qpu_programming_time': [get_info(info, 'qpu_programming_time')],
                            'qpu_delay_time_per_sample': [get_info(info, 'qpu_delay_time_per_sample')],
                            'total_post_processing_time': [get_info(info, 'total_post_processing_time')],
                            'post_processing_overhead_time': [get_info(info, 'post_processing_overhead_time')],
                            'problem_label': [get_info(info, 'problem_label')],
                            'problem_id': [get_info(info, 'problem_id')],
                            'Features': [', '.join(feature_names)],
                            'Time Unit': ['Micro Sec']
                        })], ignore_index=True)

            if os.path.exists(result_file_path):
                with pd.ExcelWriter(result_file_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
                    results_df.to_excel(writer, sheet_name=file_name.split('.')[0], index=False)
            else:
                with pd.ExcelWriter(result_file_path, engine="xlsxwriter") as writer:
                    results_df.to_excel(writer, sheet_name=file_name.split('.')[0], index=False)

if __name__ == "__main__":
    main()

activemq.csv
activemq.csv : (1884, 66)
Current Feature Set: (2545, 30) (637, 30)
Current Feature Set: (2545, 35) (637, 35)
Current Feature Set: (2545, 40) (637, 40)
Current Feature Set: (2545, 45) (637, 45)
Current Feature Set: (2545, 50) (637, 50)
Current Feature Set: (2545, 55) (637, 55)
Current Feature Set: (2545, 60) (637, 60)
Current Feature Set: (2545, 65) (637, 65)


In [6]:
#Just for testing things...........
data_path = 'dw_datasets/'
result_path = 'dw_results/'
folder_path = [data_path + 'NASA/', data_path + 'JIRA/', data_path + 'AEEEM/', data_path+'TEST/']

for selected_path in folder_path:
    print(selected_path.split('/')[-2])

NASA
JIRA
AEEEM
TEST


In [52]:
d1 = {'constraint_labels': [], 'qpu_access_time': 15987, 'charge_time': 4306878, 'run_time': 4306878, 'problem_id': '7a116022-e988-47e3-9129-193e6111a58c', 'problem_label': 'DW-TEST_MC2'}

d2 = {'timing': {'qpu_sampling_time': 12732.0, 'qpu_anneal_time_per_sample': 20.0, 'qpu_readout_time_per_sample': 86.74, 'qpu_access_time': 28495.16, 'qpu_access_overhead_time': 662.84, 'qpu_programming_time': 15763.16, 'qpu_delay_time_per_sample': 20.58, 'total_post_processing_time': 23.0, 'post_processing_overhead_time': 23.0}, 'problem_id': '7d1388ee-1bbd-40f0-9a5b-e24a275ad0ea', 'problem_label': 'QPU-TEST_MC2'}

In [53]:
def show_dict(var_dict):
    for key in var_dict: 
        if isinstance(var_dict[key], dict):
            show_dict(var_dict[key])
        else: 
            print(key, var_dict[key])

show_dict(d1)  
print()
show_dict(d2)       

constraint_labels []
qpu_access_time 15987
charge_time 4306878
run_time 4306878
problem_id 7a116022-e988-47e3-9129-193e6111a58c
problem_label DW-TEST_MC2

qpu_sampling_time 12732.0
qpu_anneal_time_per_sample 20.0
qpu_readout_time_per_sample 86.74
qpu_access_time 28495.16
qpu_access_overhead_time 662.84
qpu_programming_time 15763.16
qpu_delay_time_per_sample 20.58
total_post_processing_time 23.0
post_processing_overhead_time 23.0
problem_id 7d1388ee-1bbd-40f0-9a5b-e24a275ad0ea
problem_label QPU-TEST_MC2


In [56]:
getinfo2(d2, 'qpu_access_overhead_time')

662.84

In [8]:
for x in range(100, 801, 100):
    print(x)

100
200
300
400
500
600
700
800


In [6]:
arr = [1, 2, 3, 4]
print(arr[:3])

[1, 2, 3]
