Deep Feature extraction using trained custom MobileNetV2 model

In [1]:

from tensorflow.keras.models import load_model, Model
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import os
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Function to print with Markdown
def printmd(string):
    display(Markdown(string))


# Load and preprocess dataset
image_dir = Path(r'E:\Abroad period research\new idea implementation codes\Second part of the paper\justchest_Unet_Segmented_Dataset')

filepaths = list(image_dir.glob(r'**/*.png'))
labels = list(map(lambda x: os.path.split(os.path.split(x)[0])[1], filepaths))

filepaths = pd.Series(filepaths, name='Filepath').astype(str)
labels = pd.Series(labels, name='Label')

image_df = pd.concat([filepaths, labels], axis=1)
image_df = image_df.sample(frac=1).reset_index(drop=True)

# Split data into training, validation, and test sets
train_df, temp_df = train_test_split(image_df, train_size=0.7, shuffle=True, random_state=1)
val_df, test_df = train_test_split(temp_df, train_size=0.5, shuffle=True, random_state=1)

# Create data generators with augmentation
def create_gen():
    train_generator = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    val_test_generator = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
    )

    train_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        batch_size=32,
        shuffle=True,
        seed=0
    )

    val_images = val_test_generator.flow_from_dataframe(
        dataframe=val_df,
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        batch_size=32,
        shuffle=False
    )

    test_images = val_test_generator.flow_from_dataframe(
        dataframe=test_df,
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        batch_size=32,
        shuffle=False
    )

    return train_images, val_images, test_images


# Load the pre-trained model
model_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\26 features results\Mobilenetv2_finetuned_with_CLR_and_GradientAccum.h5'
loaded_model = load_model(model_path)

# Define the feature extraction model
# Adjust the layer as needed; here, we use the fourth-last layer's output
feature_extractor = Model(inputs=loaded_model.input, outputs=loaded_model.layers[-4].output)

# Directory to save extracted features
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\26 features results\extracted_features'
os.makedirs(feature_dir, exist_ok=True)

def extract_features(data_gen, set_name):
    features = []
    labels = []
    
    # Loop through batches in the data generator
    for batch_images, batch_labels in data_gen:
        # Extract features for the batch
        batch_features = feature_extractor.predict(batch_images)
        features.extend(batch_features)
        labels.extend(batch_labels)
        
        # Break if we’ve covered all images in the generator
        if data_gen.batch_index == 0:
            break

    # Convert to numpy arrays
    features = np.array(features)
    labels = np.array(labels)
    
    # Save extracted features and labels to a file
    with open(os.path.join(feature_dir, f"{set_name}_features.pkl"), 'wb') as f:
        pickle.dump((features, labels), f)

# Create data generators with augmentation
train_images, val_images, test_images = create_gen()

# Extract features for training, validation, and test sets
extract_features(train_images, "train")
extract_features(val_images, "val")
extract_features(test_images, "test")

print("Features extracted and saved.")




Found 28537 validated image filenames belonging to 4 classes.
Found 6115 validated image filenames belonging to 4 classes.
Found 6116 validated image filenames belonging to 4 classes.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 878ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 746ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 774ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 791ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━

Classification of features using saved extracted features

In [2]:
# # Import necessary libraries
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle
import os
import numpy as np

# Directory to save extracted features and models
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\26 features results\extracted_features'
model_dir = 'saved_models'
os.makedirs(model_dir, exist_ok=True)

# Load extracted features
with open(os.path.join(feature_dir, "train_features.pkl"), 'rb') as f:
    train_features, train_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "val_features.pkl"), 'rb') as f:
    val_features, val_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "test_features.pkl"), 'rb') as f:
    test_features, test_labels = pickle.load(f)

# Convert one-hot encoded labels to single-class labels
train_labels = np.argmax(train_labels, axis=1)
val_labels = np.argmax(val_labels, axis=1)
test_labels = np.argmax(test_labels, axis=1)

# Train Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_features, train_labels)

# Save the trained Decision Tree model
tree_model_path = os.path.join(model_dir, 'decision_tree_model.pkl')
with open(tree_model_path, 'wb') as f:
    pickle.dump(clf, f)

# Evaluate on training, validation, and test sets
for set_name, features, labels in [("Train", train_features, train_labels), ("Validation", val_features, val_labels), ("Test", test_features, test_labels)]:
    predictions = clf.predict(features)
    accuracy = accuracy_score(labels, predictions)
    print(f"{set_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{set_name} Confusion Matrix:\n", confusion_matrix(labels, predictions))
    print(f"{set_name} Classification Report:\n", classification_report(labels, predictions))

print(f"Decision Tree model saved to: {tree_model_path}")


Train Accuracy: 100.00%
Train Confusion Matrix:
 [[7161    0    0    0]
 [   0 7095    0    0]
 [   0    0 7189    0]
 [   0    0    0 7092]]
Train Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7161
           1       1.00      1.00      1.00      7095
           2       1.00      1.00      1.00      7189
           3       1.00      1.00      1.00      7092

    accuracy                           1.00     28537
   macro avg       1.00      1.00      1.00     28537
weighted avg       1.00      1.00      1.00     28537

Validation Accuracy: 86.35%
Validation Confusion Matrix:
 [[1280  160   89    4]
 [ 227 1100  129   74]
 [   0    0 1454    0]
 [   5    6  141 1446]]
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.83      0.84      1533
           1       0.87      0.72      0.79      1530
           2       0.80      1.00      0.89 

In [3]:
from sklearn.tree import export_text
import pickle

# Load the trained decision tree model
with open(tree_model_path, 'rb') as f:
    clf = pickle.load(f)

# Extract decision rules
tree_rules = export_text(clf, feature_names=[f'feature_{i}' for i in range(train_features.shape[1])])

# Display decision rules
print("Extracted Decision Tree Rules:\n")
print(tree_rules)


Extracted Decision Tree Rules:

|--- feature_70 <= 0.39
|   |--- feature_96 <= 2.19
|   |   |--- feature_37 <= 0.58
|   |   |   |--- feature_125 <= 1.24
|   |   |   |   |--- feature_23 <= 0.18
|   |   |   |   |   |--- feature_101 <= 1.32
|   |   |   |   |   |   |--- feature_33 <= 0.40
|   |   |   |   |   |   |   |--- feature_112 <= 1.08
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- feature_112 >  1.08
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- feature_33 >  0.40
|   |   |   |   |   |   |   |--- feature_4 <= 0.32
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- feature_4 >  0.32
|   |   |   |   |   |   |   |   |--- class: 3
|   |   |   |   |   |--- feature_101 >  1.32
|   |   |   |   |   |   |--- feature_88 <= 0.31
|   |   |   |   |   |   |   |--- feature_45 <= 1.74
|   |   |   |   |   |   |   |   |--- feature_7 <= 3.29
|   |   |   |   |   |   |   |   |   |--- feature_8 <= 1.06
|   |   |   |  

USing Rulefit on the extracted deep features

In [4]:
import os
import pickle
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix

# Directory to save extracted features and models
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\26 features results\extracted_features'
model_dir = 'saved_models'
os.makedirs(model_dir, exist_ok=True)

# Load extracted features
with open(os.path.join(feature_dir, "train_features.pkl"), 'rb') as f:
    train_features, train_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "val_features.pkl"), 'rb') as f:
    val_features, val_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "test_features.pkl"), 'rb') as f:
    test_features, test_labels = pickle.load(f)

# Check shapes of features and labels
print("Train Features Shape:", train_features.shape)
print("Train Labels Shape:", train_labels.shape)
print("Validation Features Shape:", val_features.shape)
print("Validation Labels Shape:", val_labels.shape)

# Ensure labels are aligned with features
assert train_features.shape[0] == train_labels.shape[0], "Mismatch between train features and labels."
assert val_features.shape[0] == val_labels.shape[0], "Mismatch between validation features and labels."

# Convert multi-dimensional labels to single class labels (if needed)
train_labels = np.argmax(train_labels, axis=1)  # Assuming one-hot encoding
val_labels = np.argmax(val_labels, axis=1)      # Assuming one-hot encoding
test_labels = np.argmax(test_labels, axis=1)    # If test labels are also one-hot encoded

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding or using a threshold
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)
print(f"{set_name} Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions_discrete))
print(f"{set_name} Classification Report:\n", classification_report(test_labels, test_predictions_discrete))

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_deep_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Train Features Shape: (28537, 128)
Train Labels Shape: (28537, 4)
Validation Features Shape: (6115, 128)
Validation Labels Shape: (6115, 4)
Combined Training + Validation Features Shape: (34652, 128)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.9472
Confusion Matrix:
[[1360  124   14    0]
 [  94 1399   72    2]
 [   0    0 1549    0]
 [   0    2   15 1485]]
Test Confusion Matrix:
 [[1360  124   14    0]
 [  94 1399   72    2]
 [   0    0 1549    0]
 [   0    2   15 1485]]
Test Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.91      0.92      1498
           1       0.92      0.89      0.90      1567
           2       0.94      1.00      0.97      1549
           3       1.00      0.99      0.99      1502

    accuracy                           0.95      6116
   macro avg       0.95      0.95      0.95      6116
weighted avg       0.95      0.95      0.95      6116

Top Rules from RuleFit Model:
                                                  rule  type      coef  \
518  feature_27 <= 1.4795796871185303 & feature_57 ...  rule  0.247160   
424  feature_90 > 1.8816329836845398 & feature_115 ...  rule  0.240747   
186                  feature_12

Calculate Statistical Features for Training, Validation, and Testing Sets

In [5]:
import numpy as np
import pickle
import pandas as pd
from scipy.stats import skew, entropy, kurtosis, variation, iqr
import os

# Load extracted features
feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/extracted_features'
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/statistical_features'
os.makedirs(stat_feature_dir, exist_ok=True)

with open(os.path.join(feature_dir, "train_features.pkl"), 'rb') as f:
    train_features, train_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "val_features.pkl"), 'rb') as f:
    val_features, val_labels = pickle.load(f)
    
with open(os.path.join(feature_dir, "test_features.pkl"), 'rb') as f:
    test_features, test_labels = pickle.load(f)

# Convert one-hot encoded labels to single class labels, if needed
if len(train_labels.shape) > 1 and train_labels.shape[1] > 1:
    train_labels = np.argmax(train_labels, axis=1)

if len(val_labels.shape) > 1 and val_labels.shape[1] > 1:
    val_labels = np.argmax(val_labels, axis=1)

if len(test_labels.shape) > 1 and test_labels.shape[1] > 1:
    test_labels = np.argmax(test_labels, axis=1)

# Function to calculate signal-to-noise ratio
def signal_to_noise(f):
    mean = np.mean(f)
    std = np.std(f)
    return mean / (std + 1e-6)  # Adding small constant to avoid division by zero

# Function to calculate more advanced statistical features from deep features
def calculate_statistical_features(features):
    stats_features = []
    for f in features:
        stats = {
            'mean': np.mean(f),
            'std_dev': np.std(f),
            'variance': np.var(f),
            'median': np.median(f),
            'range': np.ptp(f),  # Peak-to-peak range
            'skewness': skew(f),
            'kurtosis': kurtosis(f),
            'entropy': entropy(np.abs(f) + 1e-6),  # Add small constant to avoid log(0)
            'energy': np.sum(f ** 2),  # Sum of squared elements
            'contrast': np.std(f) ** 2,  # Contrast as variance
            'mean_abs_dev': np.mean(np.abs(f - np.mean(f))),
            'min_value': np.min(f),
            'max_value': np.max(f),
            'iqr': iqr(f),  # Interquartile range
            'percentile_25': np.percentile(f, 25),
            'percentile_50': np.percentile(f, 50),  # Median
            'percentile_75': np.percentile(f, 75),
            'signal_to_noise': signal_to_noise(f),
            'coef_of_var': variation(f),  # Coefficient of variation
            'autocorrelation': np.corrcoef(f[:-1], f[1:])[0, 1] if len(f) > 1 else 0,  # Lag-1 autocorrelation
            'shannon_entropy': -np.sum(f * np.log2(f + 1e-6)),  # Shannon entropy for diversity measure
            'root_mean_square': np.sqrt(np.mean(f ** 2)),  # Root mean square
            'harmonic_mean': len(f) / np.sum(1.0 / (f + 1e-6)),  # Harmonic mean
            'geometric_mean': np.exp(np.mean(np.log(f + 1e-6))),  # Geometric mean
            'std_error_mean': np.std(f) / np.sqrt(len(f)),  # Standard error of the mean
            'median_abs_dev': np.median(np.abs(f - np.median(f))),  # Median absolute deviation
        }
        stats_features.append(stats)
    return stats_features

# Function to save features and labels as CSV
def save_statistical_features_as_csv(features, labels, set_name):
    df = pd.DataFrame(features)
    df['label'] = labels  # Ensure labels are a 1D array
    df.to_csv(os.path.join(stat_feature_dir, f"{set_name}_stat_features.csv"), index=False)

# Calculate and save statistical features for each dataset
for set_name, features, labels in [("train", train_features, train_labels), 
                                   ("val", val_features, val_labels), 
                                   ("test", test_features, test_labels)]:
    stats_features = calculate_statistical_features(features)
    save_statistical_features_as_csv(stats_features, labels, set_name)

print("Statistical features calculated and saved in CSV format.")

Statistical features calculated and saved in CSV format.


Load Statistical Features and Train & Evaluate Decision Tree Classifier

In [None]:

import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/statistical_features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "train_stat_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "val_stat_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "test_stat_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 91.9283%
Combined Training Confusion Matrix:
 [[7373 1236   70   15]
 [ 663 7857   94   11]
 [  43   90 8214  296]
 [   6   40  233 8411]]
Combined Training Classification Report:
  Class 0: Precision: 0.9119, Recall: 0.8481, F1-Score: 0.8788
  Class 1: Precision: 0.8519, Recall: 0.9110, F1-Score: 0.8804
  Class 2: Precision: 0.9539, Recall: 0.9504, F1-Score: 0.9521
  Class 3: Precision: 0.9631, Recall: 0.9679, F1-Score: 0.9655
  Accuracy: 0.9193
  Class macro avg: Precision: 0.9202, Recall: 0.9193, F1-Score: 0.9192
  Class weighted avg: Precision: 0.9203, Recall: 0.9193, F1-Score: 0.9192

Test Accuracy: 88.3257%
Test Confusion Matrix:
 [[1194  262   40    2]
 [ 157 1307   93   10]
 [   0    0 1537   12]
 [   2    7  129 1364]]
Test Classification Report:
  Class 0: Precision: 0.8825, Recall: 0.7971, F1-Score: 0.8376
  Class 1: Precision: 0.8293, Recall: 0.8341, F1-Score: 0.8317
  Class 2: Precision: 0.8544, Recall: 0.9923, F1-Score: 0.9182
  Class 3: Precis

Using Rulefit on the statistical features

In [1]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/statistical_features'

# Load the CSV files
train_data = pd.read_csv(os.path.join(input_dir, "train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "test_stat_features.csv"))

# Separate features and labels for each dataset
train_features = train_data.drop(columns=['label']).values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Combined Training + Validation Features Shape: (34652, 26)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8726
Confusion Matrix:
[[1154  320   24    0]
 [ 136 1308  119    4]
 [   0    0 1540    9]
 [   0    1  166 1335]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8946    0.7704    0.8278      1498
           1     0.8029    0.8347    0.8185      1567
           2     0.8329    0.9942    0.9064      1549
           3     0.9904    0.8888    0.9368      1502

    accuracy                         0.8726      6116
   macro avg     0.8802    0.8720    0.8724      6116
weighted avg     0.8790    0.8726    0.8721      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
2362  autocorrelation > -0.0024353615008294582 & sig...  rule -0.147717   
1609  max_value > 2.8537813425064087 & kurtosis > -0...  rule  0.127445   
2304  mean > 1.0546014308929443 & geometric_mean > 0...  rule -0.112176   
1304  geometric_mean <= 0.0011798902414739132 & perc...  rule  0.121715   
9

Code for evaluating the statistical features (Mutual information and feature importance appraoch)
I used Mutual Information and Feature Importance to directly assess how well your statistical features relate to the class labels. If many features have low MI scores or feature importance values close to zero, they may not be contributing to the classification, and we might need to generate new features or revisit the feature extraction method.


In [8]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

# Load the statistical features and labels from CSV
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/statistical_features'

# Load the CSV file
train_df = pd.read_csv(f"{stat_feature_dir}/train_stat_features.csv")

# Separate features and labels
train_features = train_df.drop(columns=['label']).values  # Drop the 'label' column to get features
train_labels = train_df['label'].values  # Extract the labels

# Names of the statistical features (based on the columns in the CSV)
stat_feature_names = train_df.columns[:-1]  # All columns except 'label' are feature names

# 1. Mutual Information (MI) for each feature
# Mutual Information helps to see which features carry the most information about the target
mi_scores = mutual_info_classif(train_features, train_labels, discrete_features=False)
mi_scores_df = pd.DataFrame({'Feature': stat_feature_names, 'MI Score': mi_scores})
mi_scores_df.sort_values(by='MI Score', ascending=False, inplace=True)

# Display MI scores for each feature
print(mi_scores_df)


             Feature  MI Score
22     harmonic_mean  0.606683
5           skewness  0.598683
18       coef_of_var  0.573632
17   signal_to_noise  0.573415
23    geometric_mean  0.552168
6           kurtosis  0.542938
7            entropy  0.533806
16     percentile_75  0.402670
13               iqr  0.402660
0               mean  0.375342
19   autocorrelation  0.358632
21  root_mean_square  0.322491
8             energy  0.322483
20   shannon_entropy  0.313630
12         max_value  0.306126
4              range  0.306105
2           variance  0.298579
10      mean_abs_dev  0.298576
9           contrast  0.298568
1            std_dev  0.298555
24    std_error_mean  0.298528
3             median  0.266166
25    median_abs_dev  0.265641
15     percentile_50  0.260394
14     percentile_25  0.008844
11         min_value  0.000000


Drop features with more than 50% of data 0s

In [9]:
import os
import pandas as pd

# Define directories
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/statistical_features'
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load datasets
train_data = pd.read_csv(os.path.join(input_dir, "train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "test_stat_features.csv"))

# Function to find features with more than 50% zeros
def find_zero_features(df):
    zero_features = [col for col in df.columns if (df[col] == 0).mean() > 0.5]
    return zero_features

# Identify features with more than 50% zeros in each dataset
train_zero_features = find_zero_features(train_data)
val_zero_features = find_zero_features(val_data)
test_zero_features = find_zero_features(test_data)

# Union of features with more than 50% zeros across all datasets
all_zero_features = set(train_zero_features).union(set(val_zero_features)).union(set(test_zero_features))

# Display features with more than 50% zeros in any dataset
print(f"Features with more than 50% zeros in any dataset: {list(all_zero_features)}")

# Drop these features from all datasets
train_filtered = train_data.drop(columns=all_zero_features)
val_filtered = val_data.drop(columns=all_zero_features)
test_filtered = test_data.drop(columns=all_zero_features)

# Save filtered datasets
train_filtered.to_csv(os.path.join(output_dir, "filtered_train_stat_features.csv"), index=False)
val_filtered.to_csv(os.path.join(output_dir, "filtered_val_stat_features.csv"), index=False)
test_filtered.to_csv(os.path.join(output_dir, "filtered_test_stat_features.csv"), index=False)

print("Filtered features have been saved in the 'filtered_statistical_features' directory.")


Features with more than 50% zeros in any dataset: ['median', 'median_abs_dev', 'min_value', 'percentile_25', 'percentile_50']
Filtered features have been saved in the 'filtered_statistical_features' directory.


tree evaluation using filtered_statistical_features

In [None]:


import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "filtered_train_stat_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "filtered_val_stat_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "filtered_test_stat_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 91.6801%
Combined Training Confusion Matrix:
 [[7332 1283   61   18]
 [ 663 7857   88   17]
 [  43  113 8102  385]
 [  10   24  178 8478]]
Combined Training Classification Report:
  Class 0: Precision: 0.9110, Recall: 0.8433, F1-Score: 0.8759
  Class 1: Precision: 0.8469, Recall: 0.9110, F1-Score: 0.8778
  Class 2: Precision: 0.9612, Recall: 0.9374, F1-Score: 0.9492
  Class 3: Precision: 0.9528, Recall: 0.9756, F1-Score: 0.9641
  Accuracy: 0.9168
  Class macro avg: Precision: 0.9180, Recall: 0.9168, F1-Score: 0.9167
  Class weighted avg: Precision: 0.9181, Recall: 0.9168, F1-Score: 0.9167

Test Accuracy: 88.9307%
Test Confusion Matrix:
 [[1204  253   38    3]
 [ 159 1310   84   14]
 [   0    0 1537   12]
 [   4    3  107 1388]]
Test Classification Report:
  Class 0: Precision: 0.8808, Recall: 0.8037, F1-Score: 0.8405
  Class 1: Precision: 0.8365, Recall: 0.8360, F1-Score: 0.8363
  Class 2: Precision: 0.8703, Recall: 0.9923, F1-Score: 0.9273
  Class 3: Precis

Rulefit evaluation on filtered statistical features

In [6]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Load the CSV files
train_data = pd.read_csv(os.path.join(stat_feature_dir, "filtered_train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(stat_feature_dir, "filtered_val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(stat_feature_dir, "filtered_test_stat_features.csv"))

# Separate features and labels for each dataset
train_features = train_data.drop(columns=['label']).values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_filtered_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")
			


Combined Training + Validation Features Shape: (34652, 21)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8710
Confusion Matrix:
[[1145  327   26    0]
 [ 151 1307  106    3]
 [   0    2 1538    9]
 [   0    1  164 1337]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8835    0.7644    0.8196      1498
           1     0.7984    0.8341    0.8159      1567
           2     0.8386    0.9929    0.9093      1549
           3     0.9911    0.8901    0.9379      1502

    accuracy                         0.8710      6116
   macro avg     0.8779    0.8704    0.8707      6116
weighted avg     0.8768    0.8710    0.8704      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
1057  autocorrelation > -0.048355357721447945 & coef...  rule  0.120761   
691   mean > 0.8756935596466064 & coef_of_var <= 1.2...  rule -0.113350   
826   mean > 1.0546014308929443 & geometric_mean > 0...  rule -0.106838   
58    iqr <= 3.206506848335266 & geometric_mean <= 0...  rule  0.117154   
1

applying mutual information based approach to select 18 best best features among filtered_statistical_features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/18 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 18 features
selector = SelectKBest(score_func=mutual_info_classif, k=18)  # Select top 18 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "18_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "18_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "18_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['mean', 'std_dev', 'range', 'skewness', 'kurtosis', 'entropy', 'energy',
       'mean_abs_dev', 'max_value', 'iqr', 'percentile_75', 'signal_to_noise',
       'coef_of_var', 'autocorrelation', 'shannon_entropy', 'root_mean_square',
       'harmonic_mean', 'geometric_mean'],
      dtype='object')
Feature selection completed and saved.


Evaluating decision tree using 18 selected statistical features

In [7]:
# import os
# import numpy as np
# import pandas as pd
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# # Define the directory containing the CSV files
# stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/18 best features'

# # Load the CSV files
# train_df = pd.read_csv(os.path.join(stat_feature_dir, "18_training_selected_features.csv"))
# val_df = pd.read_csv(os.path.join(stat_feature_dir, "18_validation_selected_features.csv"))
# test_df = pd.read_csv(os.path.join(stat_feature_dir, "18_testing_selected_features.csv"))

# # Separate features and labels
# train_stat_features = train_df.drop(columns=['label']).values
# train_labels = train_df['label'].values

# val_stat_features = val_df.drop(columns=['label']).values
# val_labels = val_df['label'].values

# test_stat_features = test_df.drop(columns=['label']).values
# test_labels = test_df['label'].values

# # Combine training and validation data for final training
# combined_features = np.vstack([train_stat_features, val_stat_features])
# combined_labels = np.hstack([train_labels, val_labels])

# # Train Decision Tree Classifier on combined training and validation data
# clf = DecisionTreeClassifier(random_state=0)
# clf.fit(combined_features, combined_labels)

# # Function to print classification report with four decimal points
# def print_classification_report(set_name, labels, predictions):
#     report = classification_report(labels, predictions, output_dict=True)
#     print(f"{set_name} Classification Report:")
#     for label, metrics in report.items():
#         if label == 'accuracy':
#             print(f"  Accuracy: {metrics:.4f}")
#         else:
#             print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
#     print()

# # Evaluate on the combined training set
# train_predictions = clf.predict(combined_features)
# train_accuracy = accuracy_score(combined_labels, train_predictions)
# print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
# print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
# print_classification_report("Combined Training", combined_labels, train_predictions)

# # Evaluate on the test set
# test_predictions = clf.predict(test_stat_features)
# test_accuracy = accuracy_score(test_labels, test_predictions)
# print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
# print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
# print_classification_report("Test", test_labels, test_predictions)

import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/18 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "18_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "18_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "18_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters on combined training and validation data
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 91.6744%
Combined Training Confusion Matrix:
 [[7329 1286   61   18]
 [ 662 7858   88   17]
 [  43  113 8102  385]
 [  10   24  178 8478]]
Combined Training Classification Report:
  Class 0: Precision: 0.9111, Recall: 0.8430, F1-Score: 0.8757
  Class 1: Precision: 0.8467, Recall: 0.9111, F1-Score: 0.8777
  Class 2: Precision: 0.9612, Recall: 0.9374, F1-Score: 0.9492
  Class 3: Precision: 0.9528, Recall: 0.9756, F1-Score: 0.9641
  Accuracy: 0.9167
  Class macro avg: Precision: 0.9179, Recall: 0.9168, F1-Score: 0.9167
  Class weighted avg: Precision: 0.9180, Recall: 0.9167, F1-Score: 0.9167

Test Accuracy: 88.8816%
Test Confusion Matrix:
 [[1202  256   37    3]
 [ 160 1310   83   14]
 [   0    0 1537   12]
 [   4    3  108 1387]]
Test Classification Report:
  Class 0: Precision: 0.8799, Recall: 0.8024, F1-Score: 0.8394
  Class 1: Precision: 0.8349, Recall: 0.8360, F1-Score: 0.8355
  Class 2: Precision: 0.8708, Recall: 0.9923, F1-Score: 0.9276
  Class 3: Precis

Using Rulefit on 18 features

In [8]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/18 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "18_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "18_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "18_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_18_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 18)
Validation Features Shape: (6115, 18)
Testing Features Shape: (6116, 18)
Combined Training + Validation Features Shape: (34652, 18)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8698
Confusion Matrix:
[[1145  329   24    0]
 [ 159 1296  109    3]
 [   0    0 1540    9]
 [   0    1  162 1339]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8781    0.7644    0.8173      1498
           1     0.7970    0.8271    0.8118      1567
           2     0.8392    0.9942    0.9102      1549
           3     0.9911    0.8915    0.9387      1502

    accuracy                         0.8698      6116
   macro avg     0.8764    0.8693    0.8695      6116
weighted avg     0.8752    0.8698    0.8692      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
242           root_mean_square <= 0.0009085230121854693  rule  0.164117   
322                       kurtosis > 3.9242721796035767  rule -0.130895   
2473  root_mean_square <= 0.0011798902414739132 & me...  rule  0.130629   
390   percentile_75 <= 1.2989734411239624 & mean > 0...  rule -0.116706   
2

Selecting 15 statistical features based on mutual importance

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/15 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=15)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "15_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "15_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "15_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['mean', 'range', 'skewness', 'kurtosis', 'entropy', 'energy', 'iqr',
       'percentile_75', 'signal_to_noise', 'coef_of_var', 'autocorrelation',
       'shannon_entropy', 'root_mean_square', 'harmonic_mean',
       'geometric_mean'],
      dtype='object')
Feature selection completed and saved.


Decision tree evaluation using 15 features

In [None]:


import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/15 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "15_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "15_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "15_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters on combined training and validation data
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 91.8966%
Combined Training Confusion Matrix:
 [[7465 1128   77   24]
 [ 763 7717  123   22]
 [  43   56 8235  309]
 [   8   18  237 8427]]
Combined Training Classification Report:
  Class 0: Precision: 0.9017, Recall: 0.8586, F1-Score: 0.8796
  Class 1: Precision: 0.8652, Recall: 0.8947, F1-Score: 0.8797
  Class 2: Precision: 0.9496, Recall: 0.9528, F1-Score: 0.9512
  Class 3: Precision: 0.9596, Recall: 0.9697, F1-Score: 0.9646
  Accuracy: 0.9190
  Class macro avg: Precision: 0.9190, Recall: 0.9190, F1-Score: 0.9188
  Class weighted avg: Precision: 0.9191, Recall: 0.9190, F1-Score: 0.9188

Test Accuracy: 88.4565%
Test Confusion Matrix:
 [[1216  234   42    6]
 [ 167 1282   99   19]
 [   1    0 1537   11]
 [   3    3  121 1375]]
Test Classification Report:
  Class 0: Precision: 0.8767, Recall: 0.8117, F1-Score: 0.8430
  Class 1: Precision: 0.8440, Recall: 0.8181, F1-Score: 0.8308
  Class 2: Precision: 0.8544, Recall: 0.9923, F1-Score: 0.9182
  Class 3: Precis

Rulefit evaluation using 15 best features

In [9]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/15 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "15_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "15_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "15_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_15_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 15)
Validation Features Shape: (6115, 15)
Testing Features Shape: (6116, 15)
Combined Training + Validation Features Shape: (34652, 15)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8721
Confusion Matrix:
[[1155  319   24    0]
 [ 154 1299  111    3]
 [   0    0 1540    9]
 [   0    1  161 1340]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8824    0.7710    0.8229      1498
           1     0.8023    0.8290    0.8154      1567
           2     0.8388    0.9942    0.9099      1549
           3     0.9911    0.8921    0.9390      1502

    accuracy                         0.8721      6116
   macro avg     0.8787    0.8716    0.8718      6116
weighted avg     0.8775    0.8721    0.8716      6116

Top Rules from RuleFit Model:
                                                   rule  type      coef  \
2270  energy <= 3.206506848335266 & coef_of_var <= 0...  rule  0.136075   
731   range > -0.3833502531051636 & mean > 0.8013095...  rule  0.117491   
419   mean > 0.9142641723155975 & contrast > 0.76917...  rule -0.111519   
1000                coef_of_var > 0.0009085230121854693  rule -0.107847   
3

selecting 12 most important features among filtered features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/12 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=12)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "12_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "12_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "12_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['mean', 'skewness', 'kurtosis', 'entropy', 'iqr', 'percentile_75',
       'signal_to_noise', 'coef_of_var', 'autocorrelation', 'root_mean_square',
       'harmonic_mean', 'geometric_mean'],
      dtype='object')
Feature selection completed and saved.


evaluating decision tree on 12 most important features

In [None]:


import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/12 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "12_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "12_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "12_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters on combined training and validation data
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 91.8446%
Combined Training Confusion Matrix:
 [[7473 1120   77   24]
 [ 769 7700  131   25]
 [  44   47 8207  345]
 [   6   16  222 8446]]
Combined Training Classification Report:
  Class 0: Precision: 0.9012, Recall: 0.8596, F1-Score: 0.8799
  Class 1: Precision: 0.8668, Recall: 0.8928, F1-Score: 0.8796
  Class 2: Precision: 0.9502, Recall: 0.9496, F1-Score: 0.9499
  Class 3: Precision: 0.9554, Recall: 0.9719, F1-Score: 0.9636
  Accuracy: 0.9184
  Class macro avg: Precision: 0.9184, Recall: 0.9184, F1-Score: 0.9182
  Class weighted avg: Precision: 0.9185, Recall: 0.9184, F1-Score: 0.9183

Test Accuracy: 88.2276%
Test Confusion Matrix:
 [[1210  241   41    6]
 [ 167 1273  106   21]
 [   1    0 1537   11]
 [   3    2  121 1376]]
Test Classification Report:
  Class 0: Precision: 0.8762, Recall: 0.8077, F1-Score: 0.8406
  Class 1: Precision: 0.8397, Recall: 0.8124, F1-Score: 0.8258
  Class 2: Precision: 0.8515, Recall: 0.9923, F1-Score: 0.9165
  Class 3: Precis

Evalutaion rulefit on 12 most important features

In [10]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/12 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "12_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "12_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "12_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_12_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 12)
Validation Features Shape: (6115, 12)
Testing Features Shape: (6116, 12)
Combined Training + Validation Features Shape: (34652, 12)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8659
Confusion Matrix:
[[1107  363   28    0]
 [ 128 1318  118    3]
 [   0    2 1541    6]
 [   0    0  172 1330]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8964    0.7390    0.8101      1498
           1     0.7831    0.8411    0.8111      1567
           2     0.8289    0.9948    0.9043      1549
           3     0.9933    0.8855    0.9363      1502

    accuracy                         0.8659      6116
   macro avg     0.8754    0.8651    0.8655      6116
weighted avg     0.8741    0.8659    0.8652      6116

Top Rules from RuleFit Model:
                                                  rule  type      coef  \
183  mean > 0.9056200385093689 & skewness <= 3.2065...  rule  0.171093   
998  contrast > -0.0024353615008294582 & energy <= ...  rule -0.146447   
299                         energy > 1.299041211605072  rule  0.097731   
842                        range <= 3.9242721796035767  rule  0.095917   
275  i

Selection 9 best features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/9 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=9)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "9_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "9_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "9_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['skewness', 'kurtosis', 'entropy', 'iqr', 'percentile_75',
       'signal_to_noise', 'coef_of_var', 'harmonic_mean', 'geometric_mean'],
      dtype='object')
Feature selection completed and saved.


Decision tree evaluation on 9 selected features

In [None]:


import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/9 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "9_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "9_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "9_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters on combined training and validation data
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 86.2778%
Combined Training Confusion Matrix:
 [[7295 1242  149    8]
 [2086 6311  208   20]
 [  76  137 7969  461]
 [   5   32  331 8322]]
Combined Training Classification Report:
  Class 0: Precision: 0.7710, Recall: 0.8391, F1-Score: 0.8036
  Class 1: Precision: 0.8173, Recall: 0.7317, F1-Score: 0.7721
  Class 2: Precision: 0.9205, Recall: 0.9220, F1-Score: 0.9213
  Class 3: Precision: 0.9445, Recall: 0.9577, F1-Score: 0.9510
  Accuracy: 0.8628
  Class macro avg: Precision: 0.8633, Recall: 0.8626, F1-Score: 0.8620
  Class weighted avg: Precision: 0.8633, Recall: 0.8628, F1-Score: 0.8621

Test Accuracy: 83.8129%
Test Confusion Matrix:
 [[1251  182   64    1]
 [ 373 1008  168   18]
 [   0    0 1535   14]
 [   1    8  161 1332]]
Test Classification Report:
  Class 0: Precision: 0.7698, Recall: 0.8351, F1-Score: 0.8012
  Class 1: Precision: 0.8414, Recall: 0.6433, F1-Score: 0.7291
  Class 2: Precision: 0.7962, Recall: 0.9910, F1-Score: 0.8829
  Class 3: Precis

Rulefit evaluation using 9 best features

In [11]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/9 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "9_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "9_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "9_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_9_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 9)
Validation Features Shape: (6115, 9)
Testing Features Shape: (6116, 9)
Combined Training + Validation Features Shape: (34652, 9)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.8229
Confusion Matrix:
[[1053  424   21    0]
 [ 234 1216  110    7]
 [   0    3 1521   25]
 [   0    0  259 1243]]
Classification Report:
              precision    recall  f1-score   support

           0     0.8182    0.7029    0.7562      1498
           1     0.7401    0.7760    0.7576      1567
           2     0.7959    0.9819    0.8792      1549
           3     0.9749    0.8276    0.8952      1502

    accuracy                         0.8229      6116
   macro avg     0.8323    0.8221    0.8221      6116
weighted avg     0.8310    0.8229    0.8219      6116

Top Rules from RuleFit Model:
                                                   rule    type  \
0                                                  mean  linear   
7                                                energy  linear   
221   kurtosis <= 0.721629410982132 & contrast <= 0....    rule   
1048  skewness > 2.124513626098633 & variance > 3.90...    rule   
2135  mean > 0.9969204664230347 & contras

Selecting 6 most important features based on mutual importance 

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/6 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=6)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "6_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "6_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "6_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['skewness', 'kurtosis', 'signal_to_noise', 'coef_of_var',
       'harmonic_mean', 'geometric_mean'],
      dtype='object')
Feature selection completed and saved.


Evaluating decision tree on 6 important features

In [None]:


import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/6 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "6_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "6_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "6_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters on combined training and validation data
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 77.3202%
Combined Training Confusion Matrix:
 [[7190 1212  273   19]
 [2879 5349  373   24]
 [ 191  276 6568 1608]
 [   7   27  970 7686]]
Combined Training Classification Report:
  Class 0: Precision: 0.7003, Recall: 0.8270, F1-Score: 0.7584
  Class 1: Precision: 0.7793, Recall: 0.6202, F1-Score: 0.6907
  Class 2: Precision: 0.8025, Recall: 0.7599, F1-Score: 0.7807
  Class 3: Precision: 0.8232, Recall: 0.8845, F1-Score: 0.8527
  Accuracy: 0.7732
  Class macro avg: Precision: 0.7763, Recall: 0.7729, F1-Score: 0.7706
  Class weighted avg: Precision: 0.7763, Recall: 0.7732, F1-Score: 0.7707

Test Accuracy: 74.4441%
Test Confusion Matrix:
 [[1149  209  127   13]
 [ 598  800  159   10]
 [   0    0 1469   80]
 [   4    8  355 1135]]
Test Classification Report:
  Class 0: Precision: 0.6562, Recall: 0.7670, F1-Score: 0.7073
  Class 1: Precision: 0.7866, Recall: 0.5105, F1-Score: 0.6192
  Class 2: Precision: 0.6962, Recall: 0.9484, F1-Score: 0.8030
  Class 3: Precis

Evaluating Rulefit on 6 important features

In [12]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/6 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "6_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "6_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "6_testing_selected_features.csv"))

# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_df.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Evaluate on the combined training set
train_predictions = rf.predict(train_val_features)
train_predictions_discrete = np.round(train_predictions).astype(int)
train_accuracy = accuracy_score(train_val_labels, train_predictions_discrete)

print(f"Combined Training Accuracy: {train_accuracy:.4f}")
train_conf_matrix = confusion_matrix(train_val_labels, train_predictions_discrete)
print("Combined Training Confusion Matrix:")
print(train_conf_matrix)
print("Combined Training Classification Report:")
print(classification_report(train_val_labels, train_predictions_discrete, digits=4))

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_6_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 6)
Validation Features Shape: (6115, 6)
Testing Features Shape: (6116, 6)
Combined Training + Validation Features Shape: (34652, 6)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.7871
Confusion Matrix:
[[1041  430   26    1]
 [ 348 1123   93    3]
 [   0    1 1516   32]
 [   0    5  363 1134]]
Classification Report:
              precision    recall  f1-score   support

           0     0.7495    0.6949    0.7212      1498
           1     0.7203    0.7167    0.7185      1567
           2     0.7588    0.9787    0.8548      1549
           3     0.9692    0.7550    0.8488      1502

    accuracy                         0.7871      6116
   macro avg     0.7994    0.7863    0.7858      6116
weighted avg     0.7983    0.7871    0.7857      6116

Combined Training Accuracy: 0.7368
Combined Training Confusion Matrix:
[[6143 2475   73    3]
 [2163 6211  241   10]
 [   7  891 5671 2074]
 [   0   39 1143 7508]]
Combined Training Classification Report:
              precision    recall  f1-score   support

           0     0.7390    0.7066    0.7224      8694
           1     0.6459    0.7201    0.6810      8625
           2     0.7956    0.6561    0

Selecting 3 most important features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the selected statistical features are saved
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/filtered_statistical_features'

# Define the output directory for saving selected features after CFS
output_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/3 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to apply CFS and save selected features
def apply_mic_and_save(data, selected_columns, output_name):
    # Separate features and labels
    X = data[selected_columns]  # Use only the selected features
    y = data['label']  # Labels
    
    # Add the label back to the selected features
    selected_data = pd.concat([X, y], axis=1)
    
    # Save the selected features with labels
    selected_data.to_csv(os.path.join(output_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Use mutual information on training data to select top 3 features
selector = SelectKBest(score_func=mutual_info_classif, k=3)  # Select top 3 features
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Apply CFS to the training set (using selected features)
apply_mic_and_save(train_data, selected_columns, "3_training_selected_features.csv")

# Apply the same selected features to the validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "3_validation_selected_features.csv")

# Apply the same selected features to the testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "3_testing_selected_features.csv")

print("Feature selection completed and saved.")


Selected features based on training set: Index(['skewness', 'coef_of_var', 'harmonic_mean'], dtype='object')
Feature selection completed and saved.


Decision tree evaluation on 3 most important features

In [None]:

import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/3 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "3_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Train Decision Tree Classifier with specified hyperparameters on combined training and validation data
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2
)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label == 'accuracy':
            print(f"  Accuracy: {metrics:.4f}")
        else:
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
    print()

# Evaluate on the combined training set
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))
print_classification_report("Combined Training", combined_labels, train_predictions)

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))
print_classification_report("Test", test_labels, test_predictions)


Combined Training Accuracy: 71.4533%
Combined Training Confusion Matrix:
 [[7009 1125  510   50]
 [3548 4415  590   72]
 [ 417  179 5782 2265]
 [  32  105  999 7554]]
Combined Training Classification Report:
  Class 0: Precision: 0.6368, Recall: 0.8062, F1-Score: 0.7116
  Class 1: Precision: 0.7581, Recall: 0.5119, F1-Score: 0.6111
  Class 2: Precision: 0.7337, Recall: 0.6690, F1-Score: 0.6998
  Class 3: Precision: 0.7599, Recall: 0.8693, F1-Score: 0.8109
  Accuracy: 0.7145
  Class macro avg: Precision: 0.7221, Recall: 0.7141, F1-Score: 0.7084
  Class weighted avg: Precision: 0.7220, Recall: 0.7145, F1-Score: 0.7086

Test Accuracy: 67.2008%
Test Confusion Matrix:
 [[1025  144  309   20]
 [ 698  609  237   23]
 [   0    0 1307  242]
 [   9   20  304 1169]]
Test Classification Report:
  Class 0: Precision: 0.5918, Recall: 0.6842, F1-Score: 0.6347
  Class 1: Precision: 0.7878, Recall: 0.3886, F1-Score: 0.5205
  Class 2: Precision: 0.6059, Recall: 0.8438, F1-Score: 0.7053
  Class 3: Precis

Rulefit evaluation using 3 most important features

In [15]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/3 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "3_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Define feature names dynamically based on the training data
feature_names = train_data.columns[:-1].tolist()  # Exclude the label column

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=3000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file
output_file_path = 'E:/Abroad period research/new idea implementation codes/Second part of the paper/26 features results/rulefit_rules_on_3_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Training Features Shape: (28537, 3)
Validation Features Shape: (6115, 3)
Testing Features Shape: (6116, 3)
Combined Training + Validation Features Shape: (34652, 3)
Combined Training + Validation Labels Shape: (34652,)


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


Testing Accuracy: 0.6938
Confusion Matrix:
[[ 812  543  138    5]
 [ 428  967  165    7]
 [   0    0 1313  236]
 [   1   16  334 1151]]
Classification Report:
              precision    recall  f1-score   support

           0     0.6543    0.5421    0.5929      1498
           1     0.6337    0.6171    0.6253      1567
           2     0.6733    0.8476    0.7505      1549
           3     0.8227    0.7663    0.7935      1502

    accuracy                         0.6938      6116
   macro avg     0.6960    0.6933    0.6906      6116
weighted avg     0.6952    0.6938    0.6904      6116

Top Rules from RuleFit Model:
                                                  rule    type          coef  \
2                                             variance  linear -1.398644e+06   
73   mean <= 1.0566121339797974 & mean <= 1.0397928...    rule -2.021831e-01   
48   std_dev > 1.0096375346183777 & std_dev <= 1.20...    rule -1.592777e-01   
75                        std_dev > 1.3185044527053833  