Load Packages

In [44]:
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

Load Dataset and split

In [45]:
# Load the dataset
df = pd.read_csv('binary.arff.csv')

# Split data into X (features) and Y (target)
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert categorical labels to binary labels
le = LabelEncoder()
Y = le.fit_transform(Y)  # Encode 'ALL' as 0 and 'AML' as 1

SVM Before Feature Selection 

In [46]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn import metrics

# Split the dataset into training and test data (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=14)

# Build the model
svm = SVC(kernel='linear', random_state=12, probability=True)  # Set probability=True for ROC-AUC

# Train the model
svm.fit(X_train, Y_train)

# Predict the labels for the test set
Y_pred = svm.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)

from sklearn import metrics

# Calculate ROC AUC score
roc_auc = metrics.roc_auc_score(Y_test, Y_pred)


# Output the results

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))
print(f"\nROC-AUC: {roc_auc:.4f}")

# Perform n-fold cross-validation
n = 2
cv_scores = cross_val_score(svm, X, Y, cv=n)

# Print cross-validation scores
print("\nCross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())


Accuracy: 0.9333333333333333

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.75      1.00      0.86         3

    accuracy                           0.93        15
   macro avg       0.88      0.96      0.91        15
weighted avg       0.95      0.93      0.94        15

Confusion Matrix:
[[11  1]
 [ 0  3]]

ROC-AUC: 0.9583

Cross-validation Scores: [0.91666667 0.88888889]
Mean Cross-validation Accuracy: 0.9027777777777777


RF Before Feature Selection

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the dataset into training and test data (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)

# Build the Random Forest model
rf = RandomForestClassifier(n_estimators=3, random_state=13)

# Train the model
rf.fit(X_train, Y_train)

# Predict the labels for the test set
Y_pred = rf.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)

from sklearn import metrics

# Calculate ROC AUC score
roc_auc = metrics.roc_auc_score(Y_test, Y_pred)


# Output the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))
print(f"\nROC-AUC: {roc_auc:.4f}")

# Perform n-fold cross-validation
n = 5
cv_scores = cross_val_score(rf, X, Y, cv=n)

# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())


Accuracy: 0.8666666666666667

Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92        11
           1       1.00      0.50      0.67         4

    accuracy                           0.87        15
   macro avg       0.92      0.75      0.79        15
weighted avg       0.89      0.87      0.85        15

Confusion Matrix:
[[11  0]
 [ 2  2]]

ROC-AUC: 0.7500
Cross-validation Scores: [0.86666667 0.66666667 0.57142857 0.78571429 0.78571429]
Mean Cross-validation Accuracy: 0.7352380952380951


Filter Feature Selection techniques for SVM (K features)

In [48]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2
from sklearn import metrics

# Load the dataset
df = pd.read_csv('binary.arff.csv')

# Split data into X (features) and Y (target)
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert categorical labels to binary labels
le = LabelEncoder()
Y = le.fit_transform(Y)  # Encode 'ALL' as 0 and 'AML' as 1

# Apply Min-Max Scaling to ensure non-negative values for Chi-Square
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and test data (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=14)

# Number of top features to select
k = 250

# Feature Selection Techniques
# Mutual Information
mi_selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_train_mi = mi_selector.fit_transform(X_train, Y_train)
X_test_mi = mi_selector.transform(X_test)

# F-Score (ANOVA)
f_selector = SelectKBest(score_func=f_classif, k=k)
X_train_f = f_selector.fit_transform(X_train, Y_train)
X_test_f = f_selector.transform(X_test)

# Chi-Square (with scaled data)
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_train_chi2 = chi2_selector.fit_transform(X_train, Y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Function to train and evaluate the model
def evaluate_svm(X_train, X_test, Y_train, Y_test):
    svm = SVC(kernel='linear', random_state=12, probability=True)
    svm.fit(X_train, Y_train)
    Y_pred = svm.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(Y_test, Y_pred)
    classification_rep = classification_report(Y_test, Y_pred)
    roc_auc = roc_auc_score(Y_test, svm.predict_proba(X_test)[:, 1])

    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:\n", classification_rep)
    print("Confusion Matrix:")
    print(confusion_matrix(Y_test, Y_pred))
    print(f"\nROC-AUC Score: {roc_auc:.4f}")

    # Perform n-fold cross-validation
    n = 2
    cv_scores = cross_val_score(svm, X_train, Y_train, cv=n)
    print(f"\nCross-validation Scores: {cv_scores}")
    print(f"Mean Cross-validation Accuracy: {cv_scores.mean():.4f}")

# Evaluate the model using Mutual Information selected features
print("\nEvaluating with Mutual Information Selected Features:")
evaluate_svm(X_train_mi, X_test_mi, Y_train, Y_test)

# Evaluate the model using F-Score selected features
print("\nEvaluating with F-Score Selected Features:")
evaluate_svm(X_train_f, X_test_f, Y_train, Y_test)

# Evaluate the model using Chi-Square selected features
print("\nEvaluating with Chi-Square Selected Features:")
evaluate_svm(X_train_chi2, X_test_chi2, Y_train, Y_test)



Evaluating with Mutual Information Selected Features:
Accuracy: 0.9333

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.75      1.00      0.86         3

    accuracy                           0.93        15
   macro avg       0.88      0.96      0.91        15
weighted avg       0.95      0.93      0.94        15

Confusion Matrix:
[[11  1]
 [ 0  3]]

ROC-AUC Score: 1.0000

Cross-validation Scores: [0.96551724 0.96428571]
Mean Cross-validation Accuracy: 0.9649

Evaluating with F-Score Selected Features:
Accuracy: 0.9333

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.75      1.00      0.86         3

    accuracy                           0.93        15
   macro avg       0.88      0.96      0.91        15
weighted avg       0.95      0.93      0.94        15

Confusion Matr

Filter Feature Selection Techniques for RF (K features)

In [49]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import metrics

# Load the dataset
df = pd.read_csv('binary.arff.csv')

# Split data into X (features) and Y (target)
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert categorical labels to binary labels
le = LabelEncoder()
Y = le.fit_transform(Y)  # Encode 'ALL' as 0 and 'AML' as 1

# Apply Min-Max Scaling to ensure non-negative values for Chi-Square
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and test data (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=13)

# Number of top features to select
k = 250

# Feature Selection Techniques
# Mutual Information
mi_selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_train_mi = mi_selector.fit_transform(X_train, Y_train)
X_test_mi = mi_selector.transform(X_test)

# F-Score (ANOVA)
f_selector = SelectKBest(score_func=f_classif, k=k)
X_train_f = f_selector.fit_transform(X_train, Y_train)
X_test_f = f_selector.transform(X_test)

# Chi-Square (with scaled data)
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_train_chi2 = chi2_selector.fit_transform(X_train, Y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Function to train and evaluate the Random Forest model
def evaluate_rf(X_train, X_test, Y_train, Y_test):
    # Build the Random Forest model
    rf = RandomForestClassifier(n_estimators=3, random_state=13)
    
    # Train the model
    rf.fit(X_train, Y_train)
    
    # Predict the labels for the test set
    Y_pred = rf.predict(X_test)
    
    # Calculate the accuracy
    accuracy = accuracy_score(Y_test, Y_pred)
    classification_rep = classification_report(Y_test, Y_pred)
    
    # Calculate ROC AUC score
    roc_auc = metrics.roc_auc_score(Y_test, Y_pred)
    
    # Output the results
    print("Accuracy:", accuracy)
    print("\nClassification Report:\n", classification_rep)
    print("Confusion Matrix:")
    print(confusion_matrix(Y_test, Y_pred))
    print(f"\nROC-AUC: {roc_auc:.4f}")
    
    # Perform n-fold cross-validation
    n = 5
    cv_scores = cross_val_score(rf, X_train, Y_train, cv=n)
    
    # Print cross-validation scores
    print("\nCross-validation Scores:", cv_scores)
    print("Mean Cross-validation Accuracy:", cv_scores.mean())

# Evaluate the model using Mutual Information selected features
print("\nEvaluating with Mutual Information Selected Features:")
evaluate_rf(X_train_mi, X_test_mi, Y_train, Y_test)

# Evaluate the model using F-Score selected features
print("\nEvaluating with F-Score Selected Features:")
evaluate_rf(X_train_f, X_test_f, Y_train, Y_test)

# Evaluate the model using Chi-Square selected features
print("\nEvaluating with Chi-Square Selected Features:")
evaluate_rf(X_train_chi2, X_test_chi2, Y_train, Y_test)



Evaluating with Mutual Information Selected Features:
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00         4

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Confusion Matrix:
[[11  0]
 [ 0  4]]

ROC-AUC: 1.0000

Cross-validation Scores: [1.         0.91666667 0.90909091 0.90909091 1.        ]
Mean Cross-validation Accuracy: 0.9469696969696969

Evaluating with F-Score Selected Features:
Accuracy: 0.8666666666666667

Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92        11
           1       1.00      0.50      0.67         4

    accuracy                           0.87        15
   macro avg       0.92      0.75      0.79        15
weighted avg       0.