Deep Feature extraction like i extracted in matlab 

In [1]:

from tensorflow.keras.models import load_model, Model
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import os
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Function to print with Markdown
def printmd(string):
    display(Markdown(string))


# Load and preprocess dataset
image_dir = Path(r'E:\Abroad period research\new idea implementation codes\Second part of the paper\justchest_Unet_Segmented_Dataset')

filepaths = list(image_dir.glob(r'**/*.png'))
labels = list(map(lambda x: os.path.split(os.path.split(x)[0])[1], filepaths))

filepaths = pd.Series(filepaths, name='Filepath').astype(str)
labels = pd.Series(labels, name='Label')

image_df = pd.concat([filepaths, labels], axis=1)
image_df = image_df.sample(frac=1).reset_index(drop=True)

# Split data into training, validation, and test sets
train_df, temp_df = train_test_split(image_df, train_size=0.7, shuffle=True, random_state=1)
val_df, test_df = train_test_split(temp_df, train_size=0.5, shuffle=True, random_state=1)

# Create data generators with augmentation
def create_gen():
    train_generator = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    val_test_generator = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
    )

    train_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        batch_size=32,
        shuffle=True,
        seed=0
    )

    val_images = val_test_generator.flow_from_dataframe(
        dataframe=val_df,
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        batch_size=32,
        shuffle=False
    )

    test_images = val_test_generator.flow_from_dataframe(
        dataframe=test_df,
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        batch_size=32,
        shuffle=False
    )

    return train_images, val_images, test_images


# Load the pre-trained model
model_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\10 features results\Mobilenetv2_finetuned_with_CLR_and_GradientAccum.h5'
loaded_model = load_model(model_path)

# Define the feature extraction model
# Adjust the layer as needed; here, we use the fourth-last layer's output
feature_extractor = Model(inputs=loaded_model.input, outputs=loaded_model.layers[-4].output)

# Directory to save extracted features
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\10 features results\extracted_features'
os.makedirs(feature_dir, exist_ok=True)

def extract_features(data_gen, set_name):
    features = []
    labels = []
    
    # Loop through batches in the data generator
    for batch_images, batch_labels in data_gen:
        # Extract features for the batch
        batch_features = feature_extractor.predict(batch_images)
        features.extend(batch_features)
        labels.extend(batch_labels)
        
        # Break if we’ve covered all images in the generator
        if data_gen.batch_index == 0:
            break

    # Convert to numpy arrays
    features = np.array(features)
    labels = np.array(labels)
    
    # Save extracted features and labels to a file
    with open(os.path.join(feature_dir, f"{set_name}_features.pkl"), 'wb') as f:
        pickle.dump((features, labels), f)

# Create data generators with augmentation
train_images, val_images, test_images = create_gen()

# Extract features for training, validation, and test sets
extract_features(train_images, "train")
extract_features(val_images, "val")
extract_features(test_images, "test")

print("Features extracted and saved.")




Found 28537 validated image filenames belonging to 4 classes.
Found 6115 validated image filenames belonging to 4 classes.
Found 6116 validated image filenames belonging to 4 classes.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 901ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 905ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 883ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 820ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 756ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 753ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 749ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 754ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 731ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 746ms/step
[1m1/1[0m [32

Classification of features using saved extracted features

In [1]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle
import os
import numpy as np

# Directory to save extracted features and models
feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/extracted_features'
model_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/saved_models'
os.makedirs(model_dir, exist_ok=True)

# Load extracted features
with open(os.path.join(feature_dir, "train_features.pkl"), 'rb') as f:
    train_features, train_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "val_features.pkl"), 'rb') as f:
    val_features, val_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "test_features.pkl"), 'rb') as f:
    test_features, test_labels = pickle.load(f)

# Convert one-hot encoded labels to single-class labels
train_labels = np.argmax(train_labels, axis=1)
val_labels = np.argmax(val_labels, axis=1)
test_labels = np.argmax(test_labels, axis=1)

# Combine training and validation features and labels
combined_features = np.vstack([train_features, val_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier on combined training and validation data
clf = DecisionTreeClassifier(random_state=0)
clf.fit(combined_features, combined_labels)

# Save the trained Decision Tree model
tree_model_path = os.path.join(model_dir, 'decision_tree_model.pkl')
with open(tree_model_path, 'wb') as f:
    pickle.dump(clf, f)

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print(f"Combined Training Classification Report:\n", classification_report(combined_labels, train_predictions))

# Evaluate on the test set
test_predictions = clf.predict(test_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print(f"Test Classification Report:\n", classification_report(test_labels, test_predictions))

print(f"Decision Tree model saved to: {tree_model_path}")


Combined Training Accuracy: 100.00%
Combined Training Confusion Matrix:
 [[1017    0]
 [   0 1023]]
Combined Training Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1017
           1       1.00      1.00      1.00      1023

    accuracy                           1.00      2040
   macro avg       1.00      1.00      1.00      2040
weighted avg       1.00      1.00      1.00      2040

Test Accuracy: 99.72%
Test Confusion Matrix:
 [[183   0]
 [  1 176]]
Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       183
           1       1.00      0.99      1.00       177

    accuracy                           1.00       360
   macro avg       1.00      1.00      1.00       360
weighted avg       1.00      1.00      1.00       360

Decision Tree model saved to: E:/Abroad period research/new idea implementation codes/Second part of the p

In [3]:
from sklearn.tree import export_text
import pickle

# Load the trained decision tree model
with open(tree_model_path, 'rb') as f:
    clf = pickle.load(f)

# Extract decision rules
tree_rules = export_text(clf, feature_names=[f'feature_{i}' for i in range(train_features.shape[1])])

# Display decision rules
print("Extracted Decision Tree Rules:\n")
print(tree_rules)


Extracted Decision Tree Rules:

|--- feature_14 <= 0.50
|   |--- feature_96 <= 1.98
|   |   |--- feature_79 <= 0.41
|   |   |   |--- feature_37 <= 0.04
|   |   |   |   |--- feature_125 <= 1.95
|   |   |   |   |   |--- feature_26 <= 2.38
|   |   |   |   |   |   |--- feature_108 <= 0.50
|   |   |   |   |   |   |   |--- feature_93 <= 2.56
|   |   |   |   |   |   |   |   |--- feature_107 <= 0.68
|   |   |   |   |   |   |   |   |   |--- class: 3
|   |   |   |   |   |   |   |   |--- feature_107 >  0.68
|   |   |   |   |   |   |   |   |   |--- feature_103 <= 3.10
|   |   |   |   |   |   |   |   |   |   |--- feature_88 <= 0.63
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 31
|   |   |   |   |   |   |   |   |   |   |--- feature_88 >  0.63
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- feature_103 >  3.10
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- feature_93 >  2.56
|   | 

USing Rulefit on the extracted deep features

In [3]:
import os
import pickle
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Directory to save extracted features and models
feature_dir = 'extracted_features'
model_dir = 'saved_models'
os.makedirs(model_dir, exist_ok=True)

# Load extracted features
with open(os.path.join(feature_dir, "train_features.pkl"), 'rb') as f:
    train_features, train_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "val_features.pkl"), 'rb') as f:
    val_features, val_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "test_features.pkl"), 'rb') as f:
    test_features, test_labels = pickle.load(f)

# Check shapes of features and labels
print("Train Features Shape:", train_features.shape)
print("Train Labels Shape:", train_labels.shape)
print("Validation Features Shape:", val_features.shape)
print("Validation Labels Shape:", val_labels.shape)

# Ensure labels are aligned with features
assert train_features.shape[0] == train_labels.shape[0], "Mismatch between train features and labels."
assert val_features.shape[0] == val_labels.shape[0], "Mismatch between validation features and labels."

# Convert multi-dimensional labels to single class labels (if needed)
train_labels = np.argmax(train_labels, axis=1)  # Assuming one-hot encoding
val_labels = np.argmax(val_labels, axis=1)      # Assuming one-hot encoding
test_labels = np.argmax(test_labels, axis=1)    # If test labels are also one-hot encoded

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding or using a threshold
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
classification_report_str = classification_report(test_labels, test_predictions_discrete)
print("Classification Report:")
print(classification_report_str)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/rulefit_rules_on_deep_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Train Features Shape: (1680, 128)
Train Labels Shape: (1680, 2)
Validation Features Shape: (360, 128)
Validation Labels Shape: (360, 2)
Combined Training + Validation Features Shape: (2040, 128)
Combined Training + Validation Labels Shape: (2040,)
Testing Accuracy: 1.0000
Confusion Matrix:
[[183   0]
 [  0 177]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       183
           1       1.00      1.00      1.00       177

    accuracy                           1.00       360
   macro avg       1.00      1.00      1.00       360
weighted avg       1.00      1.00      1.00       360

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
171                    feature_21 > 0.10284307599067688  rule  0.255339   
288   feature_21 <= 0.0033734366297721863 & feature_...  rule -0.248009   
447   feature_43 <= 1.2528671026229858 & feature_21 ...  rule -0.134909   
320   

  model = cd_fast.enet_coordinate_descent(


Calculate Statistical Features for Training, Validation, and Testing Sets

In [5]:
import numpy as np
import pickle
import pandas as pd
from scipy.stats import skew, entropy, kurtosis, variation, iqr
import os

# Load extracted features
feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/extracted_features'
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/10 statistical_features'
os.makedirs(stat_feature_dir, exist_ok=True)

with open(os.path.join(feature_dir, "train_features.pkl"), 'rb') as f:
    train_features, train_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "val_features.pkl"), 'rb') as f:
    val_features, val_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "test_features.pkl"), 'rb') as f:
    test_features, test_labels = pickle.load(f)

# Function to calculate statistical features
def calculate_statistical_features(features):
    stats_features = []
    for f in features:
        stats = {
            'mean': np.mean(f),
            'std_dev': np.std(f),
            'skewness': skew(f),
            'variance': np.var(f),
            'median': np.median(f),
            'range': np.ptp(f),  # Peak-to-peak range
            'entropy': entropy(np.abs(f) + 1e-6),  # Add small constant to avoid log(0)
            'energy': np.sum(f ** 2),
            'contrast': np.std(f) ** 2,  # Contrast as variance
            'mean_abs_dev': np.mean(np.abs(f - np.mean(f)))
        }
        stats_features.append(stats)
    return pd.DataFrame(stats_features)

# Convert labels to 1D if necessary (e.g., if labels are one-hot encoded)
def convert_labels_to_1d(labels):
    if len(labels.shape) > 1 and labels.shape[1] > 1:
        # Assuming the labels are one-hot encoded, convert to single-class labels
        return np.argmax(labels, axis=1)
    return labels

# Calculate and save statistical features in CSV files
for set_name, features, labels in [("train", train_features, train_labels), 
                                   ("val", val_features, val_labels), 
                                   ("test", test_features, test_labels)]:
    stats_features_df = calculate_statistical_features(features)
    labels_1d = convert_labels_to_1d(labels)  # Ensure labels are 1D
    stats_features_df['label'] = labels_1d  # Append 1D labels to the DataFrame
    csv_file_path = os.path.join(stat_feature_dir, f"{set_name}_stat_features.csv")
    stats_features_df.to_csv(csv_file_path, index=False)

print("Statistical features calculated and saved to CSV files.")


Statistical features calculated and saved to CSV files.


Load Statistical Features and Train & Evaluate Decision Tree Classifier

In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/statistical_features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "train_stat_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "val_stat_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "test_stat_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier on combined training and validation data
clf = DecisionTreeClassifier(random_state=0)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


FileNotFoundError: [Errno 2] No such file or directory: 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/statistical_features\\train_stat_features.csv'

Using Rulefit on the statistical features

In [7]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/statistical_features'

# Load the CSV files
train_data = pd.read_csv(os.path.join(input_dir, "train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "test_stat_features.csv"))

# Separate features and labels for each dataset
train_features = train_data.drop(columns=['label']).values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/rulefit_rules_on_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Combined Training + Validation Features Shape: (34652, 10)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8300
Confusion Matrix:
[[1032  406   35    3]
 [ 216 1125  133   13]
 [   0    0 1556    4]
 [   0    4  226 1363]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8269    0.6992    0.7577      1476
           1     0.7329    0.7566    0.7445      1487
           2     0.7979    0.9974    0.8866      1560
           3     0.9855    0.8556    0.9160      1593

    accuracy                         0.8300      6116
   macro avg     0.8358    0.8272    0.8262      6116
weighted avg     0.8380    0.8300    0.8286      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
1021  entropy <= 3.8998141288757324 & skewness > 0.9...  rule  0.273116   
793   skewness <= 1.000166416168213 & entropy > 4.01...  rule -0.170730   
657   entropy > 3.959961771965027 & mean > 0.7950559...  rule -0.127935   
1984  mean <= 1.1481293439865112 & entropy > 3.89978...  rule  0.122074   
1

Code for evaluating the statistical features (Mutual information and feature importance appraoch)
I used Mutual Information and Feature Importance to directly assess how well your statistical features relate to the class labels. If many features have low MI scores or feature importance values close to zero, they may not be contributing to the classification, and we might need to generate new features or revisit the feature extraction method.


In [8]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

# Load the statistical features and labels from CSV
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/testing code on brain tumor dataset/statistical_features'

# Load the CSV file
train_df = pd.read_csv(f"{stat_feature_dir}/train_stat_features.csv")

# Separate features and labels
train_features = train_df.drop(columns=['label']).values  # Drop the 'label' column to get features
train_labels = train_df['label'].values  # Extract the labels

# Names of the statistical features (based on the columns in the CSV)
stat_feature_names = train_df.columns[:-1]  # All columns except 'label' are feature names

# 1. Mutual Information (MI) for each feature
# Mutual Information helps to see which features carry the most information about the target
mi_scores = mutual_info_classif(train_features, train_labels, discrete_features=False)
mi_scores_df = pd.DataFrame({'Feature': stat_feature_names, 'MI Score': mi_scores})
mi_scores_df.sort_values(by='MI Score', ascending=False, inplace=True)

# Display MI scores for each feature
print(mi_scores_df)


        Feature  MI Score
2      skewness  0.586474
6       entropy  0.535292
0          mean  0.382283
7        energy  0.318985
5         range  0.312447
9  mean_abs_dev  0.305015
1       std_dev  0.302740
3      variance  0.302729
8      contrast  0.302717
4        median  0.253758


Drop features with more than 50% of data 0s

In [9]:
import os
import pandas as pd

# Define directories
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/statistical_features'
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/filtered_statistical_features'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load datasets
train_data = pd.read_csv(os.path.join(input_dir, "train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "test_stat_features.csv"))

# Function to find features with more than 50% zeros
def find_zero_features(df):
    zero_features = [col for col in df.columns if (df[col] == 0).mean() > 0.5]
    return zero_features

# Identify features with more than 50% zeros in each dataset
train_zero_features = find_zero_features(train_data)
val_zero_features = find_zero_features(val_data)
test_zero_features = find_zero_features(test_data)

# Union of features with more than 50% zeros across all datasets
all_zero_features = set(train_zero_features).union(set(val_zero_features)).union(set(test_zero_features))

# Display features with more than 50% zeros in any dataset
print(f"Features with more than 50% zeros in any dataset: {list(all_zero_features)}")

# Drop these features from all datasets
train_filtered = train_data.drop(columns=all_zero_features)
val_filtered = val_data.drop(columns=all_zero_features)
test_filtered = test_data.drop(columns=all_zero_features)

# Save filtered datasets
train_filtered.to_csv(os.path.join(output_dir, "filtered_train_stat_features.csv"), index=False)
val_filtered.to_csv(os.path.join(output_dir, "filtered_val_stat_features.csv"), index=False)
test_filtered.to_csv(os.path.join(output_dir, "filtered_test_stat_features.csv"), index=False)

print("Filtered features have been saved in the 'filtered_statistical_features' directory.")


Features with more than 50% zeros in any dataset: ['median']
Filtered features have been saved in the 'filtered_statistical_features' directory.


tree evaluation using filtered_statistical_features

In [10]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/filtered_statistical_features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "filtered_train_stat_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "filtered_val_stat_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "filtered_test_stat_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Train Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_stat_features, train_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on training, validation, and test sets
for set_name, features, labels in [("Train", train_stat_features, train_labels), 
                                   ("Validation", val_stat_features, val_labels), 
                                   ("Test", test_stat_features, test_labels)]:
    predictions = clf.predict(features)
    accuracy = accuracy_score(labels, predictions)
    print(f"{set_name} Accuracy: {accuracy * 100:.4f}%")
    print(f"{set_name} Confusion Matrix:\n", confusion_matrix(labels, predictions))
    
    # Print the classification report with four decimal points
    print_classification_report(set_name, labels, predictions)


Train Accuracy: 100.0000%
Train Confusion Matrix:
 [[7173    0    0    0]
 [   0 7171    0    0]
 [   0    0 7117    0]
 [   0    0    0 7076]]
Train Classification Report:
  Class 0: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 1: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 2: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 3: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Accuracy: 1.0000
  Class macro avg: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class weighted avg: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000

Validation Accuracy: 52.8046%
Validation Confusion Matrix:
 [[1029  409  102    3]
 [ 322  960  236   16]
 [1494    0    1   20]
 [   8   14  262 1239]]
Validation Classification Report:
  Class 0: Precision: 0.3607, Recall: 0.6669, F1-Score: 0.4682
  Class 1: Precision: 0.6941, Recall: 0.6258, F1-Score: 0.6582
  Class 2: Precision: 0.0017, Recall: 0.0007, F1-Score: 0.0009
  Class 3: Precision: 0.9695, Reca

Rulefit evaluation on filtered statistical features

In [11]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/filtered_statistical_features'

# Load the CSV files
train_data = pd.read_csv(os.path.join(stat_feature_dir, "filtered_train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(stat_feature_dir, "filtered_val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(stat_feature_dir, "filtered_test_stat_features.csv"))

# Separate features and labels for each dataset
train_features = train_data.drop(columns=['label']).values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/rulefit_rules_on_filtered_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")
			


Combined Training + Validation Features Shape: (34652, 9)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8394
Confusion Matrix:
[[1069  368   37    2]
 [ 190 1148  135   14]
 [   0    0 1555    5]
 [   0    3  228 1362]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8491    0.7243    0.7817      1476
           1     0.7558    0.7720    0.7638      1487
           2     0.7954    0.9968    0.8848      1560
           3     0.9848    0.8550    0.9153      1593

    accuracy                         0.8394      6116
   macro avg     0.8463    0.8370    0.8364      6116
weighted avg     0.8481    0.8394    0.8385      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
1965  skewness > 0.9461686015129089 & entropy <= 3.8...  rule  0.196359   
490   skewness <= 1.000166416168213 & entropy > 4.01...  rule -0.139293   
1851  entropy > 3.998006582260132 & mean > 0.8762070...  rule -0.132793   
686   entropy > 3.8705025911331177 & mean > 1.293648...  rule -0.115594   
1

Selection 9 best features

In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/9 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=9)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "9_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "9_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "9_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['mean', 'std_dev', 'skewness', 'variance', 'range', 'entropy', 'energy',
       'contrast', 'mean_abs_dev'],
      dtype='object')
Feature selection completed and saved.


Decision tree evaluation on 9 selected features

In [13]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/9 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "9_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "9_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "9_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Train Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_stat_features, train_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on training, validation, and test sets
for set_name, features, labels in [("Train", train_stat_features, train_labels), 
                                   ("Validation", val_stat_features, val_labels), 
                                   ("Test", test_stat_features, test_labels)]:
    predictions = clf.predict(features)
    accuracy = accuracy_score(labels, predictions)
    print(f"{set_name} Accuracy: {accuracy * 100:.4f}%")
    print(f"{set_name} Confusion Matrix:\n", confusion_matrix(labels, predictions))
    
    # Print the classification report with four decimal points
    print_classification_report(set_name, labels, predictions)


Train Accuracy: 100.0000%
Train Confusion Matrix:
 [[7173    0    0    0]
 [   0 7171    0    0]
 [   0    0 7117    0]
 [   0    0    0 7076]]
Train Classification Report:
  Class 0: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 1: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 2: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 3: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Accuracy: 1.0000
  Class macro avg: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class weighted avg: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000

Validation Accuracy: 52.8046%
Validation Confusion Matrix:
 [[1029  409  102    3]
 [ 322  960  236   16]
 [1494    0    1   20]
 [   8   14  262 1239]]
Validation Classification Report:
  Class 0: Precision: 0.3607, Recall: 0.6669, F1-Score: 0.4682
  Class 1: Precision: 0.6941, Recall: 0.6258, F1-Score: 0.6582
  Class 2: Precision: 0.0017, Recall: 0.0007, F1-Score: 0.0009
  Class 3: Precision: 0.9695, Reca

Rulefit evaluation on 9 important features

In [16]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/9 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "9_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "9_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "9_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/rulefit_rules_on_9_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 9)
Validation Features Shape: (6115, 9)
Testing Features Shape: (6116, 9)
Combined Training + Validation Features Shape: (34652, 9)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8394
Confusion Matrix:
[[1069  368   37    2]
 [ 190 1148  135   14]
 [   0    0 1555    5]
 [   0    3  228 1362]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8491    0.7243    0.7817      1476
           1     0.7558    0.7720    0.7638      1487
           2     0.7954    0.9968    0.8848      1560
           3     0.9848    0.8550    0.9153      1593

    accuracy                         0.8394      6116
   macro avg     0.8463    0.8370    0.8364      6116
weighted avg     0.8481    0.8394    0.8385      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
1965  skewness > 0.9461686015129089 & entropy <= 3.8...  rule  0.196359   
490   skewness <= 1.000166416168213 & entropy > 4.01...  rule -0.139293   
1851  entropy > 3.998006582260132 & mean > 0.8762070...  rule -0.132793   
686   entropy > 3.8705025911331177 & mean > 1.293648...  rule -0.115594   
1

Selecting 6 most important features based on mutual importance 

In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/6 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=6)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "6_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "6_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "6_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['mean', 'skewness', 'range', 'entropy', 'energy', 'mean_abs_dev'], dtype='object')
Feature selection completed and saved.


Evaluating decision tree on 6 important features

In [21]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/6 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "6_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "6_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "6_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Train Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_stat_features, train_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on training, validation, and test sets
for set_name, features, labels in [("Train", train_stat_features, train_labels), 
                                   ("Validation", val_stat_features, val_labels), 
                                   ("Test", test_stat_features, test_labels)]:
    predictions = clf.predict(features)
    accuracy = accuracy_score(labels, predictions)
    print(f"{set_name} Accuracy: {accuracy * 100:.4f}%")
    print(f"{set_name} Confusion Matrix:\n", confusion_matrix(labels, predictions))
    
    # Print the classification report with four decimal points
    print_classification_report(set_name, labels, predictions)


Train Accuracy: 100.0000%
Train Confusion Matrix:
 [[7173    0    0    0]
 [   0 7171    0    0]
 [   0    0 7117    0]
 [   0    0    0 7076]]
Train Classification Report:
  Class 0: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 1: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 2: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class 3: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Accuracy: 1.0000
  Class macro avg: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
  Class weighted avg: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000

Validation Accuracy: 51.8070%
Validation Confusion Matrix:
 [[1002  428  110    3]
 [ 331  926  264   13]
 [1494    1    1   19]
 [  13   26  245 1239]]
Validation Classification Report:
  Class 0: Precision: 0.3528, Recall: 0.6494, F1-Score: 0.4572
  Class 1: Precision: 0.6705, Recall: 0.6037, F1-Score: 0.6353
  Class 2: Precision: 0.0016, Recall: 0.0007, F1-Score: 0.0009
  Class 3: Precision: 0.9725, Reca

Evaluating Rulefit on 6 important features

In [22]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/6 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "6_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "6_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "6_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/rulefit_rules_on_6_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 6)
Validation Features Shape: (6115, 6)
Testing Features Shape: (6116, 6)
Combined Training + Validation Features Shape: (34652, 6)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8373
Confusion Matrix:
[[1049  388   37    2]
 [ 187 1152  133   15]
 [   0    0 1555    5]
 [   0    3  225 1365]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8487    0.7107    0.7736      1476
           1     0.7466    0.7747    0.7604      1487
           2     0.7974    0.9968    0.8860      1560
           3     0.9841    0.8569    0.9161      1593

    accuracy                         0.8373      6116
   macro avg     0.8442    0.8348    0.8340      6116
weighted avg     0.8461    0.8373    0.8362      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
423   variance <= 3.8998141288757324 & std_dev > 0.9...  rule  0.279309   
633   mean > 0.8762070536613464 & variance > 3.99800...  rule -0.196973   
365   std_dev <= 1.000166416168213 & variance > 4.01...  rule -0.133797   
568   mean <= 1.1481293439865112 & variance > 3.8997...  rule  0.126030   
1

Selecting 3 most important features

In [23]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/3 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=3)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "3_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "3_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "3_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['mean', 'skewness', 'entropy'], dtype='object')
Feature selection completed and saved.


Decision tree evaluation on 3 most important features

In [24]:
import os
import pandas as pd
import numpy as np

# Define the output directory where the CFS-selected features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/3 best features'

# Load the CSV files
train_data = pd.read_csv(os.path.join(input_dir, "3_training_selected_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "3_validation_selected_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "3_testing_selected_features.csv"))

# Separate features and labels for each dataset
train_features = train_data.drop(columns=['label']).values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

# Display the shape of the combined data
print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Train a decision tree classifier on the combined training and validation data
clf = DecisionTreeClassifier(random_state=42)
clf.fit(train_val_features, train_val_labels)

# Predict on the test set
test_predictions = clf.predict(test_features)

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate the confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)



Combined Training + Validation Features Shape: (34652, 3)
Combined Training + Validation Labels Shape: (34652,)
Testing Accuracy: 0.8190
Confusion Matrix:
[[1072  321   77    6]
 [ 313  990  160   24]
 [   0    0 1555    5]
 [   6   17  178 1392]]


Rulefit evaluation using 3 most important features

In [25]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer

# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/3 best features'

# Load the CSV files
train_data = pd.read_csv(os.path.join(input_dir, "3_training_selected_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "3_validation_selected_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "3_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_data.drop(columns=['label']).values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names based on your dataset
feature_names = ['skewness', 'mean', 'entropy']  # Update based on your actual features

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding or using a threshold
# Since it's multiclass, we can round to the nearest integer or use argmax if using probabilities
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())


# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/rulefit_rules_on_3_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Combined Training + Validation Features Shape: (34652, 3)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8136
Confusion Matrix:
[[ 991  443   40    2]
 [ 269 1068  135   15]
 [   0    0 1555    5]
 [   0    5  226 1362]]
Top Rules from RuleFit Model:
                                                   rule  type      coef  \
1442  mean <= 0.9528847336769104 & entropy > 4.01117...  rule -0.157365   
933   entropy <= 3.917411684989929 & mean > 0.900384...  rule  0.137359   
97    skewness > 0.8413510024547577 & entropy <= 3.9...  rule  0.124651   
712   entropy > 3.8997843265533447 & skewness > 1.14...  rule -0.107435   
465   mean <= 0.9497046768665314 & entropy > 3.99800...  rule -0.105526   

       support  importance  
1442  0.337171    0.074393  
933   0.280428    0.061703  
97    0.349507    0.059435  
712   0.369243    0.051848  
465   0.348684    0.050289  
Rules have been saved to E:/Abroad period research/new idea implementation codes/Second part of the paper/10 features results/rulefit_rules_on_3_statistical_features.txt.
