# Very Fast Decision Tree

In [2]:
import sys
sys.path.append("../")
from prototypes.xuilvq import XuILVQ
from prototypes_mod import XuILVQ as XuILVQ_mod
from river import forest, tree, linear_model, optim
from utils import read_dataset, evaluate_model_online_learning, calculate_metrics
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, f1_score, precision_score
import pandas as pd


In [3]:
filename = "electricity.csv"
dataset = pd.read_csv("../dataset/" + filename)
dataset.replace({'UP': 1, 'DOWN': 0, 'True': 1, 'False': 0}, inplace=True)
dataset.iloc [:15000]

test_dataset = dataset.iloc[15000:20000]
dataset1 = dataset.iloc[0::3]  
dataset2 = dataset.iloc[1::3]  
dataset3 = dataset.iloc[2::3]  

  dataset.replace({'UP': 1, 'DOWN': 0, 'True': 1, 'False': 0}, inplace=True)


In [5]:
vfdt1 = tree.HoeffdingTreeClassifier()
vfdt2 = tree.HoeffdingTreeClassifier()
vfdt3 = tree.HoeffdingTreeClassifier()



# Initialize storage for true labels and predictions
true_labels = []
predicted_labels_vfdt1 = []
predicted_labels_vfdt2 = []
predicted_labels_vfdt3 = []

# Predict-then-train
for i, (row1, row2, row3) in enumerate(zip(dataset1.iterrows(), dataset2.iterrows(), dataset3.iterrows())):
    # Extract features and labels for each dataset
    sample1, label1 = row1[1][:-1].to_dict(), row1[1][-1]
    sample2, label2 = row2[1][:-1].to_dict(), row2[1][-1]
    sample3, label3 = row3[1][:-1].to_dict(), row3[1][-1]

    # Store true labels
    true_labels.extend([label1, label2, label3])

    # Predict with vfdt1, then train
    pred1 = vfdt1.predict_one(sample1)
    predicted_labels_vfdt1.append(pred1 if pred1 is not None else 0)
    vfdt1.learn_one(sample1, label1)

    # Predict with vfdt2, then train
    pred2 = vfdt2.predict_one(sample2)
    predicted_labels_vfdt2.append(pred2 if pred2 is not None else 0)
    vfdt2.learn_one(sample2, label2)

    # Predict with vfdt3, then train
    pred3 = vfdt3.predict_one(sample3)
    predicted_labels_vfdt3.append(pred3 if pred3 is not None else 0)
    vfdt3.learn_one(sample3, label3)

# Evaluate VFDT1
conf_matrix_vfdt1 = confusion_matrix(true_labels[0::3], predicted_labels_vfdt1)
f1_vfdt1 = f1_score(true_labels[0::3], predicted_labels_vfdt1)
precision_vfdt1 = precision_score(true_labels[0::3], predicted_labels_vfdt1)

print("VFDT1 Metrics:")
print(f"Confusion Matrix:\n{conf_matrix_vfdt1}")
print(f"F1 Score: {f1_vfdt1:.4f}")
print(f"Precision: {precision_vfdt1:.4f}\n")

# Evaluate VFDT2
conf_matrix_vfdt2 = confusion_matrix(true_labels[1::3], predicted_labels_vfdt2)
f1_vfdt2 = f1_score(true_labels[1::3], predicted_labels_vfdt2)
precision_vfdt2 = precision_score(true_labels[1::3], predicted_labels_vfdt2)

print("VFDT2 Metrics:")
print(f"Confusion Matrix:\n{conf_matrix_vfdt2}")
print(f"F1 Score: {f1_vfdt2:.4f}")
print(f"Precision: {precision_vfdt2:.4f}\n")

# Evaluate VFDT3
conf_matrix_vfdt3 = confusion_matrix(true_labels[2::3], predicted_labels_vfdt3)
f1_vfdt3 = f1_score(true_labels[2::3], predicted_labels_vfdt3)
precision_vfdt3 = precision_score(true_labels[2::3], predicted_labels_vfdt3)

print("VFDT3 Metrics:")
print(f"Confusion Matrix:\n{conf_matrix_vfdt3}")
print(f"F1 Score: {f1_vfdt3:.4f}")
print(f"Precision: {precision_vfdt3:.4f}\n")



####################### THE AGGREGATED MODEL ############################

true_labels_test = list(test_dataset.iloc[:, -1])  # Extract true labels from the test set

import pickle

# Probability-based voting function
def probability_vote(serialized_models, sample):
    """
    Perform voting based on predicted probabilities, deserializing models inside the function.
    
    Args:
        serialized_models (list): List of serialized models to aggregate.
        sample (dict): The input features for prediction.

    Returns:
        int: The aggregated prediction (0 or 1).
    """
    total_prob = 0
    for i, serialized_model in enumerate(serialized_models):
        # Deserialize the model
        model = pickle.loads(serialized_model)
        
        # Get the probability of class 1
        prob = model.predict_proba_one(sample).get(1, 0)  # Probability of class 1
        
        # Print the probability predicted by each model
        print(f"Model {i+1} Probability: {prob:.4f}")
        total_prob += prob
    
    # Average the probabilities and decide based on a threshold of 0.5
    avg_prob = total_prob / len(serialized_models)
    return 1 if avg_prob >= 0.5 else 0

# Serialize the models
serialized_vfdt1 = pickle.dumps(vfdt1)
serialized_vfdt2 = pickle.dumps(vfdt2)
serialized_vfdt3 = pickle.dumps(vfdt3)

# List of serialized models
serialized_models = [serialized_vfdt1, serialized_vfdt2, serialized_vfdt3]

# Make predictions using the aggregated model
predicted_labels_ensemble = []
for _, row in test_dataset.iterrows():
    sample = row[:-1].to_dict()  # Extract features
    pred = probability_vote(serialized_models, sample)
    predicted_labels_ensemble.append(pred)

# Evaluate aggregated model on test dataset
conf_matrix_ensemble = confusion_matrix(true_labels_test, predicted_labels_ensemble)
f1_ensemble = f1_score(true_labels_test, predicted_labels_ensemble)
precision_ensemble = precision_score(true_labels_test, predicted_labels_ensemble)

print("Aggregated Model (Probability-Based Voting) Metrics:")
print(f"Confusion Matrix:\n{conf_matrix_ensemble}")
print(f"F1 Score: {f1_ensemble:.4f}")
print(f"Precision: {precision_ensemble:.4f}\n")


  sample1, label1 = row1[1][:-1].to_dict(), row1[1][-1]
  sample2, label2 = row2[1][:-1].to_dict(), row2[1][-1]
  sample3, label3 = row3[1][:-1].to_dict(), row3[1][-1]


VFDT1 Metrics:
Confusion Matrix:
[[7253 1465]
 [1985 4401]]
F1 Score: 0.7184
Precision: 0.7503

VFDT2 Metrics:
Confusion Matrix:
[[7203 1417]
 [1893 4591]]
F1 Score: 0.7350
Precision: 0.7641

VFDT3 Metrics:
Confusion Matrix:
[[7294 1443]
 [1950 4417]]
F1 Score: 0.7225
Precision: 0.7538

Model 1 Probability: 0.3588
Model 2 Probability: 1.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Probability: 1.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Probability: 0.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Probability: 0.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Probability: 0.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Probability: 0.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Probability: 0.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Probability: 0.0000
Model 3 Probability: 0.0004
Model 1 Probability: 0.3588
Model 2 Prob