Load Packages

In [None]:
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

Load Dataset and Separate Features from Labels

In [None]:
# Read the CSV export of the dataset. The last column is treated as the class
# label and every column before it as a feature.
df = pd.read_csv('binary.arff.csv')
label_position = df.shape[1] - 1

X = df.iloc[:, :label_position].values      # feature matrix
raw_labels = df.iloc[:, label_position].values

# Turn the label strings into integer codes. LabelEncoder numbers the distinct
# labels in sorted order (presumably 'ALL' -> 0 and 'AML' -> 1 for this data;
# verify against le.classes_).
le = LabelEncoder().fit(raw_labels)
Y = le.transform(raw_labels)

SVM Before Feature Selection 

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn import metrics

# Baseline: linear SVM trained on all features, no feature selection.

# Hold out 20% of the samples as a test set (fixed seed so the split repeats).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=14)

# probability=True enables predict_proba, which supplies the continuous
# scores that ROC-AUC needs.
svm = SVC(kernel='linear', random_state=12, probability=True)

# Train the model
svm.fit(X_train, Y_train)

# Hard class predictions (for accuracy / report / confusion matrix) and the
# predicted probability of the positive class (for ROC-AUC).
Y_pred = svm.predict(X_test)
Y_score = svm.predict_proba(X_test)[:, 1]

# Threshold-based metrics
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)

# ROC-AUC is a ranking metric: it has to be computed from continuous scores.
# Feeding it the 0/1 predictions collapses the ROC curve to a single operating
# point and misreports the model's discriminative ability.
roc_auc = roc_auc_score(Y_test, Y_score)

# Output the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))
print(f"\nROC-AUC: {roc_auc:.4f}")

# Perform n-fold cross-validation on the full dataset. cross_val_score clones
# the estimator for every fold, so the fitted `svm` above is not reused.
n = 2
cv_scores = cross_val_score(svm, X, Y, cv=n)

# Print cross-validation scores
print("\nCross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())


RF Before Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn import metrics

# Baseline: random forest trained on all features, no feature selection.

# Hold out 20% of the samples as a test set (fixed seed so the split repeats).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)

# Build the Random Forest model
rf = RandomForestClassifier(n_estimators=3, random_state=13)

# Train the model
rf.fit(X_train, Y_train)

# Hard class predictions (for accuracy / report / confusion matrix) and the
# predicted probability of the positive class (for ROC-AUC).
Y_pred = rf.predict(X_test)
Y_score = rf.predict_proba(X_test)[:, 1]

# Threshold-based metrics
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)

# ROC-AUC is a ranking metric: it has to be computed from continuous scores.
# Feeding it the 0/1 predictions collapses the ROC curve to a single operating
# point and misreports the model's discriminative ability.
roc_auc = metrics.roc_auc_score(Y_test, Y_score)

# Output the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))
print(f"\nROC-AUC: {roc_auc:.4f}")

# Perform n-fold cross-validation on the full dataset. cross_val_score clones
# the estimator for every fold, so the fitted `rf` above is not reused.
n = 5
cv_scores = cross_val_score(rf, X, Y, cv=n)

# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())


Filter Feature Selection Techniques for SVM (K features)

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2
from sklearn import metrics

import numpy as np

# mutual_info_classif is called without a random_state, so it draws from
# NumPy's global RNG; seed it so the selected feature set is the same on
# every run.
np.random.seed(1111)

# Load the dataset
df = pd.read_csv('binary.arff.csv')

# Split data into X (features) and Y (target): the last column is the label.
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert the label strings to integer codes
le = LabelEncoder()
Y = le.fit_transform(Y)

# Split BEFORE scaling (80:20). Fitting the scaler on the full dataset would
# let the test rows influence the min/max used to transform the training
# rows, i.e. leak test information into training.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=14)

# Min-Max scale using statistics learned from the training rows only. The
# training matrix is therefore non-negative, as chi2 requires when it is
# fitted; the test matrix is transformed with the same parameters.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Number of top features to select
k = 250

# Feature Selection Techniques (each selector is fitted on the training rows
# and then applied unchanged to the test rows)
# Mutual Information
mi_selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_train_mi = mi_selector.fit_transform(X_train, Y_train)
X_test_mi = mi_selector.transform(X_test)

# F-Score (ANOVA)
f_selector = SelectKBest(score_func=f_classif, k=k)
X_train_f = f_selector.fit_transform(X_train, Y_train)
X_test_f = f_selector.transform(X_test)

# Chi-Square (with scaled data)
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_train_chi2 = chi2_selector.fit_transform(X_train, Y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Function to train and evaluate the model
def evaluate_svm(X_train, X_test, Y_train, Y_test, n_folds=2):
    """Fit a linear SVM and print hold-out and cross-validation metrics.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.
        Y_train: Training labels.
        Y_test: Test labels.
        n_folds: Number of cross-validation folds run on the training data.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        None. Accuracy, the classification report, the confusion matrix,
        ROC-AUC and the cross-validation scores are printed.
    """
    svm = SVC(kernel='linear', random_state=12, probability=True)
    svm.fit(X_train, Y_train)

    # Hard predictions feed the threshold-based metrics; the positive-class
    # probability feeds ROC-AUC, which needs a continuous score.
    Y_pred = svm.predict(X_test)
    Y_score = svm.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(Y_test, Y_pred)
    classification_rep = classification_report(Y_test, Y_pred)
    roc_auc = roc_auc_score(Y_test, Y_score)

    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:\n", classification_rep)
    print("Confusion Matrix:")
    print(confusion_matrix(Y_test, Y_pred))
    print(f"\nROC-AUC Score: {roc_auc:.4f}")

    # Cross-validate on the training data only. cross_val_score clones the
    # estimator per fold, so the model fitted above is left untouched.
    # NOTE(review): if the caller already selected features using all of
    # X_train, these fold scores are optimistic; wrapping selector + model in
    # a Pipeline would avoid that.
    cv_scores = cross_val_score(svm, X_train, Y_train, cv=n_folds)
    print(f"\nCross-validation Scores: {cv_scores}")
    print(f"Mean Cross-validation Accuracy: {cv_scores.mean():.4f}")

# Run the same SVM evaluation once per filter technique. Each entry pairs the
# technique's display name with the train/test matrices reduced by that
# technique's selector.
selected_feature_sets = [
    ("Mutual Information", X_train_mi, X_test_mi),
    ("F-Score", X_train_f, X_test_f),
    ("Chi-Square", X_train_chi2, X_test_chi2),
]

for technique, train_subset, test_subset in selected_feature_sets:
    print(f"\nEvaluating with {technique} Selected Features:")
    evaluate_svm(train_subset, test_subset, Y_train, Y_test)


Filter Feature Selection Techniques for RF (K features)

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier

# Set the seed for reproducibility across NumPy random operations
# (mutual_info_classif is called without a random_state and therefore draws
# from NumPy's global RNG).
np.random.seed(1111)

# Load the dataset
df = pd.read_csv('binary.arff.csv')

# Split data into X (features) and Y (target): the last column is the label.
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert the label strings to integer codes
le = LabelEncoder()
Y = le.fit_transform(Y)

# Split BEFORE scaling (80:20). Fitting the scaler on the full dataset would
# let the test rows influence the min/max used to transform the training
# rows, i.e. leak test information into training.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=14)

# Min-Max scale using statistics learned from the training rows only. The
# training matrix is therefore non-negative, as chi2 requires when it is
# fitted; the test matrix is transformed with the same parameters.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Number of top features to select
k = 250

# Feature Selection Techniques (each selector is fitted on the training rows
# and then applied unchanged to the test rows)
# Mutual Information
mi_selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_train_mi = mi_selector.fit_transform(X_train, Y_train)
X_test_mi = mi_selector.transform(X_test)

# F-Score (ANOVA)
f_selector = SelectKBest(score_func=f_classif, k=k)
X_train_f = f_selector.fit_transform(X_train, Y_train)
X_test_f = f_selector.transform(X_test)

# Chi-Square (with scaled data)
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_train_chi2 = chi2_selector.fit_transform(X_train, Y_train)
X_test_chi2 = chi2_selector.transform(X_test)


# This section evaluates filter selection for the Random Forest, so the model
# trained here must be a RandomForestClassifier. The previous version of this
# cell re-defined the SVM evaluator verbatim and never trained a forest.
def evaluate_rf(X_train, X_test, Y_train, Y_test):
    """Fit a random forest and print hold-out and cross-validation metrics.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.
        Y_train: Training labels.
        Y_test: Test labels.

    Returns:
        None. Accuracy, the classification report, the confusion matrix,
        ROC-AUC and the cross-validation scores are printed.
    """
    # Same hyper-parameters as the other Random Forest cells in this notebook.
    rf = RandomForestClassifier(n_estimators=3, random_state=13)
    rf.fit(X_train, Y_train)

    # Hard predictions feed the threshold-based metrics; the positive-class
    # probability feeds ROC-AUC, which needs a continuous score.
    Y_pred = rf.predict(X_test)
    Y_score = rf.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(Y_test, Y_pred)
    classification_rep = classification_report(Y_test, Y_pred)
    roc_auc = roc_auc_score(Y_test, Y_score)

    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:\n", classification_rep)
    print("Confusion Matrix:")
    print(confusion_matrix(Y_test, Y_pred))
    print(f"\nROC-AUC Score: {roc_auc:.4f}")

    # Perform n-fold cross-validation on the training data only.
    # NOTE(review): the RF baseline cell uses 5 folds; 2 is kept here because
    # it is what this cell used before -- confirm which is intended.
    n = 2
    cv_scores = cross_val_score(rf, X_train, Y_train, cv=n)
    print(f"\nCross-validation Scores: {cv_scores}")
    print(f"Mean Cross-validation Accuracy: {cv_scores.mean():.4f}")


# Evaluate the model using Mutual Information selected features
print("\nEvaluating with Mutual Information Selected Features:")
evaluate_rf(X_train_mi, X_test_mi, Y_train, Y_test)

# Evaluate the model using F-Score selected features
print("\nEvaluating with F-Score Selected Features:")
evaluate_rf(X_train_f, X_test_f, Y_train, Y_test)

# Evaluate the model using Chi-Square selected features
print("\nEvaluating with Chi-Square Selected Features:")
evaluate_rf(X_train_chi2, X_test_chi2, Y_train, Y_test)


Wrapper Feature Selection for SVM Using RFE (Recursive Feature Elimination)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Wrapper-style selection: recursive feature elimination driven by a linear
# SVM, followed by a fresh SVM fit on the surviving features.

# 80/20 hold-out split of the features and labels
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=14
)

# How many features RFE should keep
k = 250

# Base estimator; RFE drops one feature per iteration (step=1) until k remain
svm = SVC(kernel='linear', random_state=12, probability=True)
rfe_svm = RFE(estimator=svm, n_features_to_select=k, step=1)

# Learn the feature mask on the training rows, then apply it to both splits
X_train_rfe = rfe_svm.fit_transform(X_train, Y_train)
X_test_rfe = rfe_svm.transform(X_test)

# Refit the SVM on the reduced training matrix
svm.fit(X_train_rfe, Y_train)

# Hard predictions plus positive-class probabilities for the test rows
Y_pred = svm.predict(X_test_rfe)
positive_proba = svm.predict_proba(X_test_rfe)[:, 1]

# Hold-out metrics
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)
roc_auc = roc_auc_score(Y_test, positive_proba)
conf_matrix = confusion_matrix(Y_test, Y_pred)

# Report
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(conf_matrix)
print(f"\nROC-AUC Score: {roc_auc:.4f}")


Wrapper Feature Selection for Random Forest Using RFE (Recursive Feature Elimination)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Wrapper-style selection: recursive feature elimination driven by a random
# forest, followed by a fresh forest fit on the surviving features.

# 80/20 hold-out split of the features and labels
split = train_test_split(X, Y, test_size=0.2, random_state=13)
X_train, X_test, Y_train, Y_test = split

# Base estimator used both inside RFE and for the final model
rf = RandomForestClassifier(n_estimators=3, random_state=13)

# Keep the k top-ranked features, eliminating one per iteration (step=1)
k = 250
rfe_rf = RFE(estimator=rf, n_features_to_select=k, step=1)
rfe_rf.fit(X_train, Y_train)

# Reduce both splits to the columns RFE retained
X_train_rfe, X_test_rfe = (rfe_rf.transform(part) for part in (X_train, X_test))

# Refit the forest on the reduced training matrix and predict the test rows
rf.fit(X_train_rfe, Y_train)
Y_pred = rf.predict(X_test_rfe)

# Hold-out metrics; ROC-AUC uses the predicted probability of class 1
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)
class1_proba = rf.predict_proba(X_test_rfe)[:, 1]
roc_auc = roc_auc_score(Y_test, class1_proba)

# Report
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))
print(f"\nROC-AUC Score: {roc_auc:.4f}")
