Load packages

In [None]:
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

Load Dataset and split

In [None]:

# Load the dataset
df = pd.read_csv('binary.arff.csv')

# Split data into X (features) and Y (target)
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert categorical labels to binary labels
le = LabelEncoder()
Y = le.fit_transform(Y)  # Encode 'ALL' as 0 and 'AML' as 1


Data Visualization

In [None]:
# head
display(df.head())

# columns
display(df.columns)


SVM

In [None]:
# Split the dataset into training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=14)

# Build the model
svm = SVC(kernel='linear',  random_state=12)

# Train the model
svm.fit(X_train, Y_train)

# Predict the labels for the test set
Y_pred = svm.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))


Confusion Matrix

In [None]:
from sklearn import metrics
# Calculate confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Plot confusion matrix
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
disp.plot(cmap='Blues')

plt.title('Confusion Matrix')
plt.show()

Cross Validation Score for SVM

In [None]:
# Perform n-fold cross-validation
n = 2
cv_scores = cross_val_score(svm, X, Y, cv=n)

# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())

Cross Validation Graph

In [None]:
# Print cross-validation scores
print("\nCross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())

# Plot Cross-validation Scores
plt.figure(figsize=(8, 6))
plt.bar(range(1, n + 1), cv_scores, color='skyblue')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Cross-validation Scores')
plt.ylim(0, 1)  # Limit y-axis to 0-1 for accuracy scores
plt.grid(True)
plt.show()

ROC-AUC

In [None]:

# Calculate ROC AUC score
roc_auc = metrics.roc_auc_score(Y_test, Y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")

# Plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_pred)
plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='green', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()



Classificaion Report

In [None]:
# Get the classification report
classification_rep = classification_report(Y_test, Y_pred, target_names=le.classes_, output_dict=True)

# Extract precision, recall, and f1-score values
metrics = ['precision', 'recall', 'f1-score']
classes = list(le.classes_)
values = {metric: [classification_rep[class_name][metric] for class_name in classes] for metric in metrics}

# Plotting
plt.figure(figsize=(10, 6))

# Bar positions and width
bar_width = 0.2
index = range(len(classes))

# Plot bars for each metric
for i, metric in enumerate(metrics):
    plt.bar([pos + i * bar_width for pos in index], values[metric], bar_width, label=metric)

plt.xlabel('Classes')
plt.ylabel('Score')
plt.title('Classification Report Metrics')
plt.xticks([pos + bar_width for pos in index], classes)
plt.ylim(0, 1)
plt.legend()
plt.grid(True)
plt.show()

Random Forest

In [None]:
# Split the dataset into training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)

# Build the Random Forest model
rf = RandomForestClassifier(n_estimators=3, random_state=13)

# Train the model
rf.fit(X_train, Y_train)

# Predict the labels for the test set
Y_pred = rf.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))

Confusion Matrix

In [None]:
from sklearn import metrics
# Calculate confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Plot confusion matrix
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
disp.plot(cmap='Blues')

plt.title('Confusion Matrix')
plt.show()

Cross Validation Score for Random Forest

In [None]:
# Perform n-fold cross-validation
n = 5
cv_scores = cross_val_score(rf, X, Y, cv=n)

# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())

Cross Validation graph

In [None]:
# Print cross-validation scores
print("\nCross-validation Scores:", cv_scores)
print("Mean Cross-validation Accuracy:", cv_scores.mean())

# Plot Cross-validation Scores
plt.figure(figsize=(8, 6))
plt.bar(range(1, n + 1), cv_scores, color='skyblue')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Cross-validation Scores')
plt.ylim(0, 1)  # Limit y-axis to 0-1 for accuracy scores
plt.grid(True)
plt.show()

ROC-AUC

In [None]:

# Calculate ROC AUC score
roc_auc = metrics.roc_auc_score(Y_test, Y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")

# Plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_pred)
plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='green', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()



Classification report

In [None]:
# Get the classification report
classification_rep = classification_report(Y_test, Y_pred, target_names=le.classes_, output_dict=True)

# Extract precision, recall, and f1-score values
metrics = ['precision', 'recall', 'f1-score']
classes = list(le.classes_)
values = {metric: [classification_rep[class_name][metric] for class_name in classes] for metric in metrics}

# Plotting
plt.figure(figsize=(10, 6))

# Bar positions and width
bar_width = 0.2
index = range(len(classes))

# Plot bars for each metric
for i, metric in enumerate(metrics):
    plt.bar([pos + i * bar_width for pos in index], values[metric], bar_width, label=metric)

plt.xlabel('Classes')
plt.ylabel('Score')
plt.title('Classification Report Metrics')
plt.xticks([pos + bar_width for pos in index], classes)
plt.ylim(0, 1)
plt.legend()
plt.grid(True)
plt.show()

Mutual Information:


In [None]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np

# Compute mutual information between each feature and the target
mi = mutual_info_classif(X, Y)

# Sort features based on mutual information
mi_sorted_indices = np.argsort(mi)[::-1]  # Sort in descending order

# Get top 10 features based on mutual information
top_10_mi_features = df.columns[mi_sorted_indices[:10]]

print("Top 10 features based on Mutual Information:")
print(top_10_mi_features)



Anova F-test

In [None]:
from sklearn.feature_selection import f_classif

# Perform ANOVA F-test
f_scores, p_values = f_classif(X, Y)

# Sort features based on F-scores
f_sorted_indices = np.argsort(f_scores)[::-1]

# Get top 10 features based on ANOVA F-test (for example)
top_10_f_features = df.columns[f_sorted_indices[:10]]

print("Top 10 features based on ANOVA F-test:")
print(top_10_f_features)


Chi-Square Test

In [None]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# Chi-square test requires non-negative values, so we scale the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Perform Chi-Square test
chi_scores, p_values = chi2(X_scaled, Y)

# Sort features based on Chi-square scores
chi_sorted_indices = np.argsort(chi_scores)[::-1]

# Get top 10 features based on Chi-square test (for example)
top_10_chi_features = df.columns[chi_sorted_indices[:10]]

print("Top 10 features based on Chi-Square Test:")
print(top_10_chi_features)


Convert Top Feature Names to Indices

In [None]:
# Convert column names to indices
def get_feature_indices(df, top_features):
    return [df.columns.get_loc(feature) for feature in top_features]

# Get top 10 features for each selection technique
top_10_mi_indices = get_feature_indices(df, top_10_mi_features)
top_10_f_indices = get_feature_indices(df, top_10_f_features)
top_10_chi_indices = get_feature_indices(df, top_10_chi_features)


SVM Model Performance After Filter Feature Selection

In [None]:
# Define the function to evaluate the SVM model
def evaluate_svm(X, Y, top_features, technique_name):
    # Filter the dataset with the top features
    X_selected = X[:, top_features]
    
    # Split the dataset into training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=14)
    
    # Build the SVM model
    svm = SVC(kernel='linear', random_state=12)
    
    # Train the model
    svm.fit(X_train, Y_train)
    
    # Predict the labels for the test set
    Y_pred = svm.predict(X_test)
    
    # Calculate the accuracy
    accuracy = accuracy_score(Y_test, Y_pred)
    
    # Calculate ROC AUC score
    roc_auc = roc_auc_score(Y_test, Y_pred)
    
    # Print classification report
    classification_rep = classification_report(Y_test, Y_pred)
    
    # Print confusion matrix
    conf_matrix = confusion_matrix(Y_test, Y_pred)
    
    # Display the metrics with the technique name
    print(f"SVM Performance using {technique_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print("\nClassification Report:\n", classification_rep)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\n")

# Evaluate SVM using the top 10 features selected by Mutual Information
evaluate_svm(X, Y, top_10_mi_indices, "Mutual Information")

# Evaluate SVM using the top 10 features selected by F-Score
evaluate_svm(X, Y, top_10_f_indices, "F-Score")

# Evaluate SVM using the top 10 features selected by Chi-Square
evaluate_svm(X, Y, top_10_chi_indices, "Chi-Square")


Random Forest Model Performance After Filter Feature Selection

In [None]:
# Define the function to evaluate the Random Forest model
def evaluate_rf(X, Y, top_features, technique_name):
    # Filter the dataset with the top features
    X_selected = X[:, top_features]
    
    # Split the dataset into training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=13)
    
    # Build the Random Forest model
    rf = RandomForestClassifier(n_estimators=100, random_state=13)
    
    # Train the model
    rf.fit(X_train, Y_train)
    
    # Predict the labels for the test set
    Y_pred = rf.predict(X_test)
    
    # Calculate the accuracy
    accuracy = accuracy_score(Y_test, Y_pred)
    
    # Calculate ROC AUC score
    roc_auc = roc_auc_score(Y_test, Y_pred)
    
    # Print classification report
    classification_rep = classification_report(Y_test, Y_pred)
    
    # Print confusion matrix
    conf_matrix = confusion_matrix(Y_test, Y_pred)
    
    # Display the metrics with the technique name
    print(f"Random Forest Performance using {technique_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print("\nClassification Report:\n", classification_rep)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\n")

# Evaluate Random Forest using the top 10 features selected by Mutual Information
evaluate_rf(X, Y, top_10_mi_indices, "Mutual Information")

# Evaluate Random Forest using the top 10 features selected by F-Score
evaluate_rf(X, Y, top_10_f_indices, "F-Score")

# Evaluate Random Forest using the top 10 features selected by Chi-Square
evaluate_rf(X, Y, top_10_chi_indices, "Chi-Square")


Feature Importance with RF

In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('binary.arff.csv')

# Split data into X (features) and Y (target)
X = df.iloc[:, :-1].values  # Features
Y = df.iloc[:, -1].values   # Labels

# Convert categorical labels to binary labels
le = LabelEncoder()
Y = le.fit_transform(Y)  # Encode 'ALL' as 0 and 'AML' as 1

# Split the dataset into training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Build and fit the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)

# Get the feature importance
importances = rf.feature_importances_

# Sort features by importance
sorted_indices = importances.argsort()[::-1]

# Select the top n most important features (for example, the top 500 features)
n_features_to_select = 100  # Adjust this number based on your requirement
top_n_indices = sorted_indices[:n_features_to_select]

# Select the top features from the original dataset
X_selected = X[:, top_n_indices]

# Print the names of the selected features
selected_feature_names = df.columns[top_n_indices]
print("Selected top {} features:".format(n_features_to_select))
print(selected_feature_names)


Selected top 100 features:
Index(['M63379_at', 'M63138_at', 'U22376_cds2_s_at', 'X61587_at', 'M14636_at',
       'M84526_at', 'U57721_at', 'M23197_at', 'L09209_s_at', 'D88422_at',
       'M16652_s_at', 'Z49194_at', 'M31523_at', 'M27891_at', 'M74088_s_at',
       'D38073_at', 'U90549_at', 'M11722_at', 'U05259_rna1_at', 'X62320_at',
       'M16038_at', 'HG4582-HT4987_at', 'X78669_at', 'X13973_at', 'L11669_at',
       'X59417_at', 'U41635_at', 'M63838_s_at', 'U62136_at', 'L21954_at',
       'U36621_cds2_at', 'M29474_at', 'M29696_at', 'HG2591-HT2687_s_at',
       'J05243_at', 'X66533_at', 'M19507_at', 'U10868_at', 'M32304_s_at',
       'M83652_s_at', 'X12451_at', 'HG2981-HT3127_s_at', 'Z15115_at',
       'X85116_rna1_s_at', 'U51127_at', 'Y00787_s_at', 'U56833_at',
       'X82240_rna1_at', 'L19437_at', 'S50223_at', 'U09578_at', 'M27783_s_at',
       'U34877_at', 'D28532_at', 'X52142_at', 'M26708_s_at', 'M55150_at',
       'U60319_at', 'X12447_at', 'D87076_at', 'M63959_at', 'U94836_at',
    

RFE Using Random Forest After Feature Importance Selection

In [15]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Use the selected top features based on Random Forest feature importance
X_selected = X[:, top_n_indices]  # Already selected top 100 features

# Split the dataset with selected features into training and test data
X_train_selected, X_test_selected, Y_train, Y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=42)

# Initialize a new Random Forest model for RFE
rf_for_rfe = RandomForestClassifier(n_estimators=100, random_state=42)

# Apply RFE to select the most important features out of the top 100 selected features
n_features_to_select_rfe = 50  # You can adjust this based on your requirement
rfe = RFE(estimator=rf_for_rfe, n_features_to_select=n_features_to_select_rfe)

# Fit the RFE model
rfe.fit(X_train_selected, Y_train)

# Get the selected features after RFE
X_train_rfe = rfe.transform(X_train_selected)
X_test_rfe = rfe.transform(X_test_selected)

# Train a new Random Forest model on the reduced feature set
rf_after_rfe = RandomForestClassifier(n_estimators=100, random_state=42)
rf_after_rfe.fit(X_train_rfe, Y_train)

# Make predictions and evaluate the model
Y_pred = rf_after_rfe.predict(X_test_rfe)
accuracy = accuracy_score(Y_test, Y_pred)

# Print the accuracy of the model after RFE
print("Accuracy after RFE:", accuracy)

# Print the ranking of features (1 means selected, higher means less important)
print("Feature ranking after RFE:")
print(rfe.ranking_)

# Optionally, print the names of the features that were selected after RFE
selected_rfe_feature_names = selected_feature_names[rfe.support_]
print("Selected features after RFE:")
print(selected_rfe_feature_names)


Accuracy after RFE: 0.9333333333333333
Feature ranking after RFE:
[ 1  1  1  1  1  1  1  1  1  1 20  1  1  1  1 48  1  1  1  1  1 10 49  1
 13  1  1  1 18 37  1  1  1 27  1 33  1 36  1  1  1  1 41  1 29  1 39 35
 45 50 34  1 15 51 16  1  1  7  9 11  4 28  1 21 24 19 44 12 14 30 32  1
  1 42 22 43  1  1  8 26 31 40  1  1 38 25 23  1  5  1  6  1  3  1  1  2
 17  1 46 47]
Selected features after RFE:
Index(['M63379_at', 'M63138_at', 'U22376_cds2_s_at', 'X61587_at', 'M14636_at',
       'M84526_at', 'U57721_at', 'M23197_at', 'L09209_s_at', 'D88422_at',
       'Z49194_at', 'M31523_at', 'M27891_at', 'M74088_s_at', 'U90549_at',
       'M11722_at', 'U05259_rna1_at', 'X62320_at', 'M16038_at', 'X13973_at',
       'X59417_at', 'U41635_at', 'M63838_s_at', 'U36621_cds2_at', 'M29474_at',
       'M29696_at', 'J05243_at', 'M19507_at', 'M32304_s_at', 'M83652_s_at',
       'X12451_at', 'HG2981-HT3127_s_at', 'X85116_rna1_s_at', 'Y00787_s_at',
       'M27783_s_at', 'M26708_s_at', 'M55150_at', 'S76617_at', 

Performance Metrices of RF model after RFE

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

# Predict the labels for the test set
Y_pred = rf_after_rfe.predict(X_test_rfe)

# Calculate Accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate Precision
precision = precision_score(Y_test, Y_pred)
print(f"Precision: {precision:.4f}")

# Calculate Recall
recall = recall_score(Y_test, Y_pred)
print(f"Recall: {recall:.4f}")

# Calculate F1-Score
f1 = f1_score(Y_test, Y_pred)
print(f"F1-Score: {f1:.4f}")

# Calculate ROC AUC Score
roc_auc = roc_auc_score(Y_test, Y_pred)
print(f"ROC AUC: {roc_auc:.4f}")

# Print Classification Report (Precision, Recall, F1-Score for each class)
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))

# Print Confusion Matrix
print("Confusion Matrix:")
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)


Accuracy: 0.9333
Precision: 0.7500
Recall: 1.0000
F1-Score: 0.8571
ROC AUC: 0.9583

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.75      1.00      0.86         3

    accuracy                           0.93        15
   macro avg       0.88      0.96      0.91        15
weighted avg       0.95      0.93      0.94        15

Confusion Matrix:
[[11  1]
 [ 0  3]]


In [19]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming X and Y are already defined as the features and target labels

# Split the dataset into training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the SVM model with a linear kernel
svm_for_rfe = SVC(kernel='linear', random_state=42)

# Apply RFE with the SVM model to select a smaller subset of features
n_features_to_select_rfe = 50  # Adjust this based on your requirement
rfe = RFE(estimator=svm_for_rfe, n_features_to_select=n_features_to_select_rfe)

# Fit the RFE model
rfe.fit(X_train, Y_train)

# Get the selected features after RFE
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train a new SVM model on the reduced feature set
svm_after_rfe = SVC(kernel='linear', random_state=42)
svm_after_rfe.fit(X_train_rfe, Y_train)

# Make predictions and evaluate the model
Y_pred = svm_after_rfe.predict(X_test_rfe)
accuracy = accuracy_score(Y_test, Y_pred)

# Print the accuracy of the model after RFE
print("Accuracy after RFE:", accuracy)

# Print the ranking of features (1 means selected, higher means less important)
print("Feature ranking after RFE:")
print(rfe.ranking_)

# Print the confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))

# Correctly print the names of the features that were selected after RFE
# Assuming 'df' contains the original dataset with all feature columns and labels
# df.columns[:-1] excludes the label column to match the features
selected_feature_names_rf = df.columns[:-1]  # Assuming the last column is the label column
selected_rfe_feature_names = selected_feature_names_rf[rfe.support_]

print("Selected features after RFE:")
print(selected_rfe_feature_names)


Accuracy after RFE: 1.0
Feature ranking after RFE:
[2504 6687 6227 ... 6829  185 5817]
Confusion Matrix:
[[12  0]
 [ 0  3]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         3

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Selected features after RFE:
Index(['hum_alu_at', 'AFFX-HUMRGE/M10098_3_at', 'D21261_at',
       'HG1612-HT1612_at', 'HG3549-HT3751_at', 'J04164_at', 'L20941_at',
       'M11147_at', 'M11722_at', 'M13792_at', 'M19507_at', 'M26602_at',
       'M27891_at', 'M31627_at', 'M33680_at', 'M63138_at', 'M69043_at',
       'M91036_rna1_at', 'M92287_at', 'M96326_rna1_at', 'U05259_rna1_at',
       'U14968_at', 'U46751_at', 'U51004_at', 'X15940_at', 'X17042_at',
       'X51466_at', 'X55715_at', 'X59417_at', 'Y00433_at', 'Z23090_at',
