In [43]:
from river import metrics, naive_bayes, tree, forest
import pandas as pd
from utils import evaluate_model_online_learning, read_dataset, calculate_metrics
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from prototypes_mod.xuilvq import XuILVQ as UpdatedILVQ
import itertools
import multiprocessing

In [44]:
# CSV files from THU-Concept-Drift-Datasets-v1.0, keyed by a short dataset name.
# The paths are relative, so adjust them if the data lives somewhere else.
dataset_paths = {
    "linear_gradual": "../dataset/linear_gradual_rotation_noise_and_redunce.csv",
    "linear_recurrent": "../dataset/linear_recurrent_rotation_noise_and_redunce.csv",
    "linear_sudden": "../dataset/linear_sudden_rotation_noise_and_redunce.csv",
    # Non-linear datasets are currently disabled; uncomment an entry to include it.
    # "cake_gradual": "../dataset/nonlinear_gradual_cakerotation_noise_and_redunce.csv",
    # "cake_sudden": "../dataset/nonlinear_sudden_cakerotation_noise_and_redunce.csv",
    # "cake_recurrent": "../dataset/nonlinear_recurrent_cakerotation_noise_and_redunce.csv",
    # "chocolate_gradual": "../dataset/nonlinear_gradual_chocolaterotation_noise_and_redunce.csv",
    # "chocolate_sudden": "../dataset/nonlinear_sudden_chocolaterotation_noise_and_redunce.csv",
    # "chocolate_recurrent": "../dataset/nonlinear_recurrent_chocolaterotation_noise_and_redunce.csv"
}



# Zero-argument factories keyed by the model's display name. Each evaluation
# calls the factory to build a fresh instance, so no learned state carries over
# from one dataset (or step size) to the next.
model_factories = {
    "Incremental Learning Vector Quantization": UpdatedILVQ,
    "Gaussian Naive Bayes": naive_bayes.GaussianNB,
    "Hoeffding Tree": tree.HoeffdingTreeClassifier,
    "Adaptive Random Forest": forest.ARFClassifier,
}

In [45]:
# Configuration for how each dataset is subsampled before evaluation.

# Earlier per-dataset step configuration, kept for reference only. It is
# superseded by the `step_sizes_values` grid below.
# step_sizes = {
#     "linear_gradual": 50,
#     "linear_recurrent": 50,
#     "linear_sudden": 50, 
#     # "cake_gradual": 1,
#     # "cake_sudden": 1,
#     # "cake_recurrent": 1,
#     # "chocolate_gradual": 1,
#     # "chocolate_sudden": 1,
#     # "chocolate_recurrent": 1
# }

# Step sizes explored by the grid search (distance between sampled rows within a cycle).
step_sizes_values = [5, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000]


# Cycle shift per dataset: how far the starting index moves at each new sampling cycle.
# Datasets without an entry here fall back to a shift of 1 when loaded.
cycle_shifts = {
    "linear_gradual": 5,
    "linear_recurrent": 5,
    "linear_sudden": 5,
    # "cake_gradual": 1,
    # "cake_sudden": 1,
    # "cake_recurrent": 1,
    # "chocolate_gradual": 1,
    # "chocolate_sudden": 1,
    # "chocolate_recurrent": 1
}

num_samples = 1000  # Number of rows to draw from each dataset
output_file = "datasets_grid_search_results.txt"  # Text file the grid search results are written to


In [46]:
# cyclic sampling


def cyclic_sampling_with_shift(df, num_samples, step, cycle_shift):
    """
    Select ``num_samples`` rows from ``df`` by strided, cyclic sampling.

    Within one cycle the rows at positions ``shift``, ``shift + step``,
    ``shift + 2 * step``, ... are taken, ``len(df) // step`` rows per cycle,
    with positions wrapping around modulo ``len(df)``. After each cycle the
    starting offset ``shift`` grows by ``cycle_shift``. Cycles repeat until
    ``num_samples`` rows have been collected, so rows can be selected more
    than once.

    Parameters:
    - 'df': Frame to sample from (indexed positionally via ``.iloc``).
    - 'num_samples': Number of rows to return; values <= 0 yield an empty frame.
    - 'step': Distance between selected samples within a cycle (>= 1 and
      no larger than ``len(df)``).
    - 'cycle_shift': Shift applied to the starting index of each new cycle.

    Returns:
        The selected rows of ``df``, in the order they were sampled.

    Raises:
        ValueError: If ``step`` is smaller than 1, or larger than ``len(df)``
            (which includes an empty ``df``). In those cases a cycle would
            contain no rows and the sampling loop could never finish.
    """
    if num_samples <= 0:
        # Nothing requested: return an empty selection without validating the rest.
        return df.iloc[[]]

    total_samples = len(df)
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step}")
    if step > total_samples:
        raise ValueError(
            f"step ({step}) is larger than the number of rows ({total_samples}); "
            "a sampling cycle would be empty"
        )

    # Loop-invariant: how many rows a single cycle contributes (always >= 1 here).
    max_per_cycle = total_samples // step

    indices = []
    shift = 0  # The first cycle starts at index 0
    while len(indices) < num_samples:
        remaining = num_samples - len(indices)
        # Take a full cycle, or only as many rows as are still needed.
        for i in range(min(max_per_cycle, remaining)):
            indices.append((shift + i * step) % total_samples)  # Wrap around if needed
        shift += cycle_shift  # Move the starting point for the next cycle

    return df.iloc[indices]

In [47]:
# Load datasets with a specific step size
def load_datasets(step_size):
    """
    Read and subsample every dataset listed in ``dataset_paths``.

    Each dataset is read with ``read_dataset`` and reduced with
    ``cyclic_sampling_with_shift``, using ``step_size`` and the dataset's entry
    in ``cycle_shifts`` (1 when it has none). A dataset that fails to load or
    sample is reported on stdout and left out of the result.

    Returns:
        dict mapping dataset name to its sampled data.
    """
    datasets = {}
    for name in dataset_paths:
        try:
            # Datasets without an explicit shift advance by one row per cycle.
            cycle_shift = cycle_shifts.get(name, 1)
            raw = read_dataset(name, dataset_paths)

            # Apply cyclic sampling
            df = cyclic_sampling_with_shift(raw, num_samples, step_size, cycle_shift)

            if len(df) < num_samples:
                print(f"Dataset {name} has only {len(df)} samples after step {step_size}, but {num_samples} were requested.")
            else:
                df = df.iloc[:num_samples]

            datasets[name] = df
            print(f"✅ Loaded {name} (Shape: {df.shape}) - Step {step_size}, Cycle Shift {cycle_shift}, {num_samples} samples taken.")
        except FileNotFoundError:
            print(f"❌ Error: File '{dataset_paths[name]}' not found.")
        except Exception as e:
            print(f"⚠️ Error loading {name}: {e}")
    return datasets

In [48]:
# Evaluate models for a dataset
def evaluate_models_for_dataset(dataset_name, df, step_size):
    """
    Evaluate every model in ``model_factories`` on one sampled dataset.

    A fresh model is built for each entry, evaluated with
    ``evaluate_model_online_learning``, and its confusion matrix is turned into
    precision / recall / F1 by ``calculate_metrics``. Each successful run is
    printed and collected as one formatted line. A failing model is reported
    on stdout and skipped.

    Returns:
        list of result lines (each ending in a newline), one per model that
        was evaluated successfully.
    """
    print(f"\n\n\n🔍 Evaluating dataset: {dataset_name} with step size {step_size}")

    report_lines = []
    for model_name in model_factories:
        # Build a new instance so nothing learned elsewhere leaks into this run.
        model = model_factories[model_name]()
        print(f"🧠 Model: {model_name}")

        try:
            # Evaluate using online learning
            confusion, elapsed_time = evaluate_model_online_learning(model, df)
            metrics_result = calculate_metrics(confusion)

            summary_line = (
                f"Dataset: {dataset_name}, Step Size: {step_size}, Model: {model_name}, "
                f"Precision: {metrics_result['precision']}, Recall: {metrics_result['recall']}, "
                f"F1: {metrics_result['f1']}, Execution Time: {elapsed_time:.4f} sec\n"
            )
            report_lines.append(summary_line)
            print(summary_line)
        except Exception as e:
            print(f"⚠️ An error occurred while processing {dataset_name} with {model_name}: {e}")

    return report_lines

In [49]:
# Grid Search over step sizes
def grid_search():
    """
    Run the evaluation for every step size in ``step_sizes_values``.

    The results file (``output_file``) is first truncated and given a header
    line. Then, for each step size, the datasets are loaded and sampled, every
    model is evaluated on every dataset, and the collected result lines are
    appended to the file followed by a separator.
    """
    # Start from a fresh file containing only the header.
    with open(output_file, "w") as out:
        out.write("Dataset, Step Size, Model, Precision, Recall, F1, Execution Time (sec)\n")

    separator = "\n" + "=" * 80 + "\n\n"  # Separator for readability

    for step_size in step_sizes_values:
        print(f"\n🚀 Running grid search with step size: {step_size}")

        # Load datasets with the current step size
        datasets = load_datasets(step_size)

        # Evaluate all models on each dataset, flattening the per-dataset results.
        all_results = [
            line
            for dataset_name, df in datasets.items()
            for line in evaluate_models_for_dataset(dataset_name, df, step_size)
        ]

        # Append this step size's results to the file.
        with open(output_file, "a") as out:
            out.write("".join(all_results))
            out.write(separator)

# Run the full grid search (this overwrites any existing results file),
# then report where the results were written.
grid_search()
print(f"\n=== Grid Search Completed! Results saved to '{output_file}' ===")


🚀 Running grid search with step size: 5
✅ Loaded linear_gradual (Shape: (1000, 6)) - Step 5, Cycle Shift 5, 1000 samples taken.
✅ Loaded linear_recurrent (Shape: (1000, 6)) - Step 5, Cycle Shift 5, 1000 samples taken.
✅ Loaded linear_sudden (Shape: (1000, 6)) - Step 5, Cycle Shift 5, 1000 samples taken.



🔍 Evaluating dataset: linear_gradual with step size 5
🧠 Model: Incremental Learning Vector Quantization
Dataset: linear_gradual, Step Size: 5, Model: Incremental Learning Vector Quantization, Precision: 0.9, Recall: 0.879, F1: 0.889, Execution Time: 0.3598 sec

🧠 Model: Gaussian Naive Bayes
Dataset: linear_gradual, Step Size: 5, Model: Gaussian Naive Bayes, Precision: 0.929, Recall: 0.949, F1: 0.939, Execution Time: 0.0836 sec

🧠 Model: Hoeffding Tree
Dataset: linear_gradual, Step Size: 5, Model: Hoeffding Tree, Precision: 0.887, Recall: 0.93, F1: 0.908, Execution Time: 0.0863 sec

🧠 Model: Adaptive Random Forest
Dataset: linear_gradual, Step Size: 5, Model: Adaptive Random Forest, 