In [1]:
#To compare with simple selectKBest algorithm by selecting same number of best features we find from QA process above. 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Dataset names and corresponding number of features to select
dataset_feature_map = {
    'EQ.csv': 39,
    'JDT.csv': 43,
    'Lucene.csv': 40,
    'Mylyn.csv': 38,
    'PDE.csv': 39,
    'activemq.csv': 44,
    'groovy.csv': 44,
    'hbase.csv': 43,
    'hive.csv': 42,
    'jruby.csv': 46,
    'wicket.csv': 45
}

# Set random seed
seed_value = 42

def load_data(file_path):
    """ Load CSV data """
    return pd.read_csv(file_path)

def feature_selection(X, y, k):
    """ Perform feature selection with SelectKBest """
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    return X_new, selector

def defect_prediction(X_train, X_test, y_train, y_test):
    """ Train a SVC classifier and predict defects """
    classifier = SVC(random_state=seed_value)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    
    # Print performance metrics
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

def process_dataset(file, k):
    """ Process a single dataset file with k selected features """
    # Load dataset
    data = load_data(file)
    
    # Split features and labels
    X = data.iloc[:, :-1]  # All columns except the last one
    y = data.iloc[:, -1]   # Last column is the label

    # Impute missing values with the mean
    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(X)

    X = X.astype(float)
    y = y.astype(int)

    # Handle imbalanced data using SMOTE
    smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=seed_value)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=seed_value)

    # Normalize the features
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Perform feature selection with k features
    X_train_fs, selector = feature_selection(X_train_scaled, y_train, k=k)
    X_test_fs = selector.transform(X_test_scaled)
    
    # Perform defect prediction
    defect_prediction(X_train_fs, X_test_fs, y_train, y_test)

# Process each dataset with corresponding number of features
for dataset, k in dataset_feature_map.items():
    print(f"Processing {dataset} with {k} features...")
    process_dataset(dataset, k)


Processing EQ.csv with 39 features...
Accuracy: 0.8076923076923077
              precision    recall  f1-score   support

           0       0.85      0.74      0.79        39
           1       0.77      0.87      0.82        39

    accuracy                           0.81        78
   macro avg       0.81      0.81      0.81        78
weighted avg       0.81      0.81      0.81        78

Processing JDT.csv with 43 features...
Accuracy: 0.8107255520504731
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       164
           1       0.83      0.77      0.80       153

    accuracy                           0.81       317
   macro avg       0.81      0.81      0.81       317
weighted avg       0.81      0.81      0.81       317

Processing Lucene.csv with 40 features...
Accuracy: 0.848605577689243
              precision    recall  f1-score   support

           0       0.88      0.85      0.86       142
           1       0.81      0.8

  f = msb / msw


Accuracy: 0.7758887171561051
              precision    recall  f1-score   support

           0       0.76      0.81      0.79       329
           1       0.79      0.74      0.76       318

    accuracy                           0.78       647
   macro avg       0.78      0.78      0.78       647
weighted avg       0.78      0.78      0.78       647

Processing PDE.csv with 39 features...
Accuracy: 0.7732558139534884
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       258
           1       0.79      0.74      0.76       258

    accuracy                           0.77       516
   macro avg       0.77      0.77      0.77       516
weighted avg       0.77      0.77      0.77       516

Processing activemq.csv with 44 features...
Accuracy: 0.8618524332810047
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       321
           1       0.89      0.82      0.86       316

    accuracy