# Probando regresión logística para ver si es apta para el entorno distribuido

In [1]:
import sys
sys.path.append("../")

from prototypes.xuilvq import XuILVQ
from prototypes_mod import XuILVQ as XuILVQ_mod
from river import forest, tree, linear_model, optim
from utils import read_dataset, evaluate_model_online_learning, calculate_metrics
from tqdm import tqdm


## Vamos a probar los datasets con ARF, Hoeffding Tree e ILVQ (original y modificado) para comparar métricas y coste computacional

In [9]:
# Dataset registry: maps each dataset key to the CSV file string stored for it.
# Every SEA entry follows the same naming pattern, so those entries are generated
# from the (min, max) number pairs that appear in the file names instead of being
# written out one by one. The pair order is kept on purpose: it determines the
# iteration order of the dict.
data_name = {"elec": "electricity.csv"}
data_name.update(
    {
        f"sea_dataset_min{min_value}_max{max_value}": (
            f"sea_datasets/sea_dataset_min{min_value}_max{max_value}.csv"
        )
        for min_value, max_values in (
            (1000, (10000, 1000, 2000, 5000)),
            (100, (10000, 1000, 100, 2000, 200, 5000, 500)),
            (10, (100, 10, 200, 20, 50)),
            (200, (10000, 1000, 2000, 5000, 500)),
            (20, (100, 200, 20, 50)),
            (30, (100, 200, 50)),
            (500, (10000, 1000, 2000, 5000, 500)),
            (50, (10000, 1000, 100, 2000, 200, 5000, 500, 50)),
            (5, (100, 10, 200, 20, 50)),
        )
        for max_value in max_values
    }
)

### Loop para testear cada una de las combinaciones

In [10]:
# Benchmark every model on every dataset and collect the scores in `results`.

# Only the first MAX_ROWS rows of each dataset are evaluated so that all runs
# process the same amount of data.
MAX_ROWS = 5000
# Fixed seed for the ARF ensemble so that re-running the notebook yields the same
# ARF numbers. NOTE(review): whether the ILVQ variants use randomness cannot be
# seen from here - confirm if bit-for-bit reproducibility is required.
SEED = 42

# results[dataset_name][model_name] -> {"precision", "recall", "f1", "elapsed_time"}
results = {}

# Only the keys are iterated: read_dataset receives the key together with the
# whole `data_name` mapping.
for dataset_name in tqdm(data_name, desc="Processing datasets"):

    # Build fresh model instances for each dataset so that nothing learned on a
    # previous dataset carries over into the next evaluation.
    models = {
        "ARF": forest.ARFClassifier(n_models=3, max_size=20, seed=SEED),
        "Hoeffding Tree": tree.HoeffdingTreeClassifier(),
        "ILVQ": XuILVQ(),
        "ILVQ_mod": XuILVQ_mod(),
    }

    # Load the dataset and keep only its first MAX_ROWS rows
    dataset = read_dataset(dataset_name, data_name)
    dataset = dataset.iloc[:MAX_ROWS]

    # Per-dataset container, filled model by model below
    results[dataset_name] = {}

    for model_name, model in models.items():
        # The helper hands back a confusion matrix plus the elapsed time of the
        # run; precision / recall / f1 are then derived from that matrix.
        conf_matrix, elapsed_time = evaluate_model_online_learning(model, dataset)
        metrics = calculate_metrics(conf_matrix)

        results[dataset_name][model_name] = {
            "precision": metrics["precision"],
            "recall": metrics["recall"],
            "f1": metrics["f1"],
            "elapsed_time": elapsed_time,
        }

  dataset.replace({'UP': 1, 'DOWN': 0, 'True': 1, 'False': 0}, inplace=True)
Processing datasets: 100%|██████████| 47/47 [04:44<00:00,  6.05s/it]


In [4]:
# Print the benchmark summary: for each dataset a header line, then one line per
# model with its precision, recall, F1-score and run time.
for dataset_name in results:
    print(f"Results for dataset: {dataset_name}")
    for model_name, metrics in results[dataset_name].items():
        summary = (
            f"{model_name} - "
            f"Precision: {metrics['precision']:.4f}, "
            f"Recall: {metrics['recall']:.4f}, "
            f"F1-score: {metrics['f1']:.4f}, "
            f"Time: {metrics['elapsed_time']:.2f} sec"
        )
        print(summary)
    # A newline string plus print's own newline leaves two blank lines between datasets
    print("\n")


Results for dataset: elec
ARF - Precision: 0.8470, Recall: 0.8020, F1-score: 0.8240, Time: 1.78 sec
Hoeffding Tree - Precision: 0.7900, Recall: 0.8260, F1-score: 0.8080, Time: 0.46 sec
ILVQ - Precision: 0.7420, Recall: 0.6610, F1-score: 0.6990, Time: 2.90 sec
ILVQ_mod - Precision: 0.6880, Recall: 0.7540, F1-score: 0.7190, Time: 4.54 sec


Results for dataset: sea_dataset_min1000_max10000
ARF - Precision: 0.9070, Recall: 0.9450, F1-score: 0.9260, Time: 1.20 sec
Hoeffding Tree - Precision: 0.9080, Recall: 0.9310, F1-score: 0.9190, Time: 0.34 sec
ILVQ - Precision: 0.9220, Recall: 0.9260, F1-score: 0.9240, Time: 1.80 sec
ILVQ_mod - Precision: 0.9080, Recall: 0.9180, F1-score: 0.9130, Time: 3.61 sec


Results for dataset: sea_dataset_min100_max500
ARF - Precision: 0.9040, Recall: 0.9510, F1-score: 0.9270, Time: 1.24 sec
Hoeffding Tree - Precision: 0.9080, Recall: 0.9310, F1-score: 0.9190, Time: 0.34 sec
ILVQ - Precision: 0.9220, Recall: 0.9260, F1-score: 0.9240, Time: 1.78 sec
ILVQ_mod - Pr

In [7]:
# Send the benchmark summary to a plain-text log file.

import logging

# Configure the root logger to write bare messages to model_results.log.
# basicConfig() silently does nothing when the root logger already has handlers,
# which is the case when this cell is re-run in the same kernel or when another
# library configured logging first. force=True removes and closes any existing
# root handlers before applying this configuration, so the file is re-opened
# (and, with filemode="w", truncated) on every run of the cell.
logging.basicConfig(
    filename="model_results.log",  # Log file name
    level=logging.INFO,            # Record INFO messages and above
    format="%(message)s",          # Write only the message text, no timestamp/level prefix
    filemode="w",                  # Overwrite the file if it exists
    force=True,                    # Replace handlers left over from earlier configuration
)

# Logging counterpart of the printed summary
def log_results(results):
    """Emit the benchmark summary through the root logger at INFO level.

    Args:
        results: Mapping of dataset name -> mapping of model name -> dict that
            provides the keys "precision", "recall", "f1" and "elapsed_time"
            (all formatted as floats).

    For every dataset a header record is logged, followed by one record per
    model and finally a record consisting of a single newline as a separator.
    """
    for dataset_name in results:
        logging.info(f"Results for dataset: {dataset_name}")
        for model_name, metrics in results[dataset_name].items():
            summary = (
                f"{model_name} - "
                f"Precision: {metrics['precision']:.4f}, "
                f"Recall: {metrics['recall']:.4f}, "
                f"F1-score: {metrics['f1']:.4f}, "
                f"Time: {metrics['elapsed_time']:.2f} sec"
            )
            logging.info(summary)
        # Separator record between datasets
        logging.info("\n")

# Emit the collected benchmark results (the `results` dict built by the
# evaluation loop) through the logging setup configured in this cell
log_results(results)