Load packages

In [2]:
# Import the necessary packages
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif, SelectKBest


# Load the dataset
df = pd.read_csv('binary2.arff.csv')

# Split data into X (features) and Y (target)
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert categorical labels to binary labels
le = LabelEncoder()
Y = le.fit_transform(Y)  # Encode 'ALL' as 0 and 'AML' as 1

In [1]:
X

NameError: name 'X' is not defined

Filter Feature Selection Techniques

In [31]:
from sklearn.feature_selection import mutual_info_classif, f_classif, chi2, SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Feature selection methods: Mutual Information, F-Score, Chi-Square
k = 100  # Number of top features to select

# Scaling for Chi-Square
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Dictionary to store results
results = {
    'Mutual Information': {'SVM': {}, 'Random Forest': {}},
    'F-Score': {'SVM': {}, 'Random Forest': {}},
    'Chi-Square': {'SVM': {}, 'Random Forest': {}}
}

# Mutual Information
selector_mi = SelectKBest(mutual_info_classif, k=k)
X_selected_mi = selector_mi.fit_transform(X, Y)

# F-Score
selector_f = SelectKBest(f_classif, k=k)
X_selected_f = selector_f.fit_transform(X, Y)

# Chi-Square
selector_chi2 = SelectKBest(chi2, k=k)
X_selected_chi2 = selector_chi2.fit_transform(X_scaled, Y)

# Define models
svm = SVC(kernel='linear', random_state=12)
rf = RandomForestClassifier(n_estimators=13, random_state=13)

# Feature selection and model training for each method
for method, X_selected in zip(['Mutual Information', 'F-Score', 'Chi-Square'], 
                              [X_selected_mi, X_selected_f, X_selected_chi2]):

    # SVM
    X_train, X_test, Y_train, Y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=13)
    svm.fit(X_train, Y_train)
    Y_pred_svm = svm.predict(X_test)
    accuracy_svm = accuracy_score(Y_test, Y_pred_svm)
    classification_rep_svm = classification_report(Y_test, Y_pred_svm)
    
    # Store SVM results
    results[method]['SVM']['Accuracy'] = accuracy_svm
    results[method]['SVM']['Classification Report'] = classification_rep_svm
    
    # Random Forest
    X_train, X_test, Y_train, Y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=13)
    rf.fit(X_train, Y_train)
    Y_pred_rf = rf.predict(X_test)
    accuracy_rf = accuracy_score(Y_test, Y_pred_rf)
    classification_rep_rf = classification_report(Y_test, Y_pred_rf)
    
    # Store Random Forest results
    results[method]['Random Forest']['Accuracy'] = accuracy_rf
    results[method]['Random Forest']['Classification Report'] = classification_rep_rf

# Output the results
for method, models in results.items():
    print(f"--- {method} ---")
    for model, metrics in models.items():
        print(f"{model} Accuracy: {metrics['Accuracy']}")
        print(f"{model} Classification Report:\n{metrics['Classification Report']}")
        print("\n")


--- Mutual Information ---
SVM Accuracy: 0.9333333333333333
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.75      0.86         4

    accuracy                           0.93        15
   macro avg       0.96      0.88      0.91        15
weighted avg       0.94      0.93      0.93        15



Random Forest Accuracy: 0.9333333333333333
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.75      0.86         4

    accuracy                           0.93        15
   macro avg       0.96      0.88      0.91        15
weighted avg       0.94      0.93      0.93        15



--- F-Score ---
SVM Accuracy: 0.9333333333333333
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96 

Cross Validation Techniques

array([[-103, -251,  101, ...,  284,  -98, 2112],
       [-228,  -26,   74, ...,  188,   27, 1323],
       [ 100,  306,   84, ...,  117,  425, 1033],
       ...,
       [ 748, 1087,  536, ..., 2776, 6763,  412],
       [ 309, 1174,  154, ..., 1897, 1777,  393],
       [  50,  284,   41, ...,  162,  108,  286]], dtype=int64)

In [None]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(svm, X_selected_mi, Y, cv=5)
print("Cross-validation scores for Mutual Information (SVM):", cv_scores)
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores for Mutual Information (SVM): [1.         1.         0.85714286 0.92857143 1.        ]
Mean Cross-validation score: 0.9571428571428571


Recursive Feature Elimination (RFE)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Define models
svm = SVC(kernel='linear', random_state=12)
rf = RandomForestClassifier(n_estimators=3, random_state=13)

# Recursive Feature Elimination (RFE) with SVM
rfe_svm = RFE(estimator=svm, n_features_to_select=100)
X_selected_rfe_svm = rfe_svm.fit_transform(X, Y)

# Split data for SVM
X_train_rfe_svm, X_test_rfe_svm, Y_train, Y_test = train_test_split(X_selected_rfe_svm, Y, test_size=0.2, random_state=14)

# Train and evaluate SVM
svm.fit(X_train_rfe_svm, Y_train)
Y_pred_rfe_svm = svm.predict(X_test_rfe_svm)
accuracy_rfe_svm = accuracy_score(Y_test, Y_pred_rfe_svm)
classification_rep_rfe_svm = classification_report(Y_test, Y_pred_rfe_svm)

print("SVM RFE Accuracy:", accuracy_rfe_svm)
print("SVM RFE Classification Report:\n", classification_rep_rfe_svm)

# Recursive Feature Elimination (RFE) with Random Forest
rfe_rf = RFE(estimator=rf, n_features_to_select=100)
X_selected_rfe_rf = rfe_rf.fit_transform(X, Y)

# Split data for Random Forest
X_train_rfe_rf, X_test_rfe_rf, Y_train, Y_test = train_test_split(X_selected_rfe_rf, Y, test_size=0.2, random_state=13)

# Train and evaluate Random Forest
rf.fit(X_train_rfe_rf, Y_train)
Y_pred_rfe_rf = rf.predict(X_test_rfe_rf)
accuracy_rfe_rf = accuracy_score(Y_test, Y_pred_rfe_rf)
classification_rep_rfe_rf = classification_report(Y_test, Y_pred_rfe_rf)

print("RF RFE Accuracy:", accuracy_rfe_rf)
print("RF RFE Classification Report:\n", classification_rep_rfe_rf)


SVM RFE Accuracy: 1.0
SVM RFE Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         3

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

RF RFE Accuracy: 0.7333333333333333
RF RFE Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.91      0.83        11
           1       0.50      0.25      0.33         4

    accuracy                           0.73        15
   macro avg       0.63      0.58      0.58        15
weighted avg       0.70      0.73      0.70        15



Sequential Feature Selector (SFS)

In [None]:
# Import necessary modules for Sequential Feature Selection
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
import time

# Define models
svm = SVC(kernel='linear', random_state=12)
rf = RandomForestClassifier(n_estimators=10, random_state=13)  # Reduced estimators for faster computation

# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Define Sequential Feature Selector for SVM and RF
n_features_to_select = 100  # Specify the number of features to select for faster computation

# Sequential Feature Selector for SVM
start_time = time.time()
sfs_svm = SequentialFeatureSelector(svm, n_features_to_select=n_features_to_select, direction='forward', n_jobs=-1)
X_selected_sfs_svm = sfs_svm.fit_transform(X_scaled, Y)
end_time = time.time()
print(f"SVM Sequential Feature Selection completed in {end_time - start_time:.2f} seconds")

# Split data for SVM
X_train_sfs_svm, X_test_sfs_svm, Y_train, Y_test = train_test_split(X_selected_sfs_svm, Y, test_size=0.2, random_state=14)

# Train and evaluate SVM
svm.fit(X_train_sfs_svm, Y_train)
Y_pred_sfs_svm = svm.predict(X_test_sfs_svm)
accuracy_sfs_svm = accuracy_score(Y_test, Y_pred_sfs_svm)
classification_rep_sfs_svm = classification_report(Y_test, Y_pred_sfs_svm)

print("SVM SFS Accuracy:", accuracy_sfs_svm)
print("SVM SFS Classification Report:\n", classification_rep_sfs_svm)

# Sequential Feature Selector for Random Forest
start_time = time.time()
sfs_rf = SequentialFeatureSelector(rf, n_features_to_select=n_features_to_select, direction='forward', n_jobs=-1)
X_selected_sfs_rf = sfs_rf.fit_transform(X_scaled, Y)
end_time = time.time()
print(f"Random Forest Sequential Feature Selection completed in {end_time - start_time:.2f} seconds")

# Split data for Random Forest
X_train_sfs_rf, X_test_sfs_rf, Y_train, Y_test = train_test_split(X_selected_sfs_rf, Y, test_size=0.2, random_state=13)

# Train and evaluate Random Forest
rf.fit(X_train_sfs_rf, Y_train)
Y_pred_sfs_rf = rf.predict(X_test_sfs_rf)
accuracy_sfs_rf = accuracy_score(Y_test, Y_pred_sfs_rf)
classification_rep_sfs_rf = classification_report(Y_test, Y_pred_sfs_rf)

print("Random Forest SFS Accuracy:", accuracy_sfs_rf)
print("Random Forest SFS Classification Report:\n", classification_rep_sfs_rf)


KeyboardInterrupt: 

Genetic Algorithm

In [14]:
import numpy as np
from deap import base, creator, tools, algorithms
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

# Define models
svm = SVC(kernel='linear', random_state=12)
rf = RandomForestClassifier(n_estimators=10, random_state=13)

# Scale the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Define the fitness function
def evaluate(individual, model, X_train, X_test, y_train, y_test):
    selected_features = [i for i in range(len(individual)) if individual[i] == 1]
    if len(selected_features) == 0:
        return 0.0,  # Return 0 if no features are selected
    
    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]
    
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_test_selected)
    
    return accuracy_score(y_test, predictions),

# Enforce selection of exactly 100 features
def init_fixed_size_individual(icls, size, n_selected_features):
    individual = [0] * size
    selected_indices = np.random.choice(size, n_selected_features, replace=False)
    for index in selected_indices:
        individual[index] = 1
    return icls(individual)

# GA Setup
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("individual", init_fixed_size_individual, creator.Individual, size=X.shape[1], n_selected_features=100)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Use SVM or Random Forest here
def genetic_algorithm_feature_selection(model, n_population=50, n_generations=20):
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=13)
    
    toolbox.register("evaluate", evaluate, model=model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
    
    pop = toolbox.population(n=n_population)
    hof = tools.HallOfFame(1)
    
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=n_generations, halloffame=hof, verbose=True)
    
    # Extract the best individual and the selected features
    best_individual = hof[0]
    selected_features = [i for i in range(len(best_individual)) if best_individual[i] == 1]
    
    return selected_features, best_individual

# Run the genetic algorithm for SVM
selected_features_svm, best_individual_svm = genetic_algorithm_feature_selection(svm)
print(f"Selected features for SVM: {selected_features_svm}")
print(f"Number of features selected: {len(selected_features_svm)}")

# Run the genetic algorithm for Random Forest
selected_features_rf, best_individual_rf = genetic_algorithm_feature_selection(rf)
print(f"Selected features for Random Forest: {selected_features_rf}")
print(f"Number of features selected: {len(selected_features_rf)}")


ModuleNotFoundError: No module named 'deap'

Performance Metrices for GA

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

def evaluate_model_performance(model, X_train, X_test, y_train, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict the labels
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Classification report
    class_report = classification_report(y_test, y_pred)
    
    # Calculate ROC-AUC score (only for binary classification and for models that support predict_proba)
    roc_auc = None
    if hasattr(model, "predict_proba") and len(np.unique(y_test)) == 2:
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_prob)
    
    return accuracy, conf_matrix, class_report, roc_auc

# Enable probability estimation in SVM
svm = SVC(kernel='linear', random_state=12, probability=True)
rf = RandomForestClassifier(n_estimators=10, random_state=13)

# Get the selected features from GA for both models (assuming you've already run GA for both models)
selected_features_svm, _ = genetic_algorithm_feature_selection(svm)
selected_features_rf, _ = genetic_algorithm_feature_selection(rf)

# Filter the dataset based on the selected features
X_train_ga_svm, X_test_ga_svm, y_train_svm, y_test_svm = train_test_split(X_scaled[:, selected_features_svm], Y, test_size=0.2, random_state=14)
X_train_ga_rf, X_test_ga_rf, y_train_rf, y_test_rf = train_test_split(X_scaled[:, selected_features_rf], Y, test_size=0.2, random_state=13)

# Evaluate SVM model using the selected features
svm_accuracy, svm_conf_matrix, svm_class_report, svm_roc_auc = evaluate_model_performance(svm, X_train_ga_svm, X_test_ga_svm, y_train_svm, y_test_svm)

# Print SVM results
print("\n--- SVM Model Performance ---")
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"Confusion Matrix:\n{svm_conf_matrix}")
print(f"Classification Report:\n{svm_class_report}")
if svm_roc_auc is not None:
    print(f"ROC-AUC Score: {svm_roc_auc:.4f}")

# Evaluate Random Forest model using the selected features
rf_accuracy, rf_conf_matrix, rf_class_report, rf_roc_auc = evaluate_model_performance(rf, X_train_ga_rf, X_test_ga_rf, y_train_rf, y_test_rf)

# Print Random Forest results
print("\n--- Random Forest Model Performance ---")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Confusion Matrix:\n{rf_conf_matrix}")
print(f"Classification Report:\n{rf_class_report}")
if rf_roc_auc is not None:
    print(f"ROC-AUC Score: {rf_roc_auc:.4f}")
