# Probando regresión logística para ver si es apta para el entorno distribuido

In [1]:
import sys
sys.path.append("../")
from prototypes.xuilvq import XuILVQ
from prototypes_mod import XuILVQ as XuILVQ_mod
from river import forest, tree, linear_model, optim
from utils import read_dataset, evaluate_model_online_learning, calculate_metrics
from tqdm import tqdm


## Vamos a probar con los datasets con Hoeffding tree, ilvq para comparar métricas y coste computacional

In [6]:
data_name = {
    "lgr": "linear_gradual_rotation_noise_and_redunce.csv",
    # "nlgc": "nonlinear_gradual_cakerotation_noise_and_redunce.csv",

    # "elec": "electricity.csv",
    # "sea_dataset_min1000_max10000": "sea_datasets/sea_dataset_min1000_max10000.csv",
    # "sea_dataset_min1000_max1000": "sea_datasets/sea_dataset_min1000_max1000.csv",
    # "sea_dataset_min1000_max2000": "sea_datasets/sea_dataset_min1000_max2000.csv",
    # "sea_dataset_min1000_max5000": "sea_datasets/sea_dataset_min1000_max5000.csv",
    # "sea_dataset_min100_max10000": "sea_datasets/sea_dataset_min100_max10000.csv",
    # "sea_dataset_min100_max1000": "sea_datasets/sea_dataset_min100_max1000.csv",
    # "sea_dataset_min100_max100": "sea_datasets/sea_dataset_min100_max100.csv",
    # "sea_dataset_min100_max2000": "sea_datasets/sea_dataset_min100_max2000.csv",
    # "sea_dataset_min100_max200": "sea_datasets/sea_dataset_min100_max200.csv",
    # "sea_dataset_min100_max5000": "sea_datasets/sea_dataset_min100_max5000.csv",
    # "sea_dataset_min100_max500": "sea_datasets/sea_dataset_min100_max500.csv",
    # "sea_dataset_min10_max100": "sea_datasets/sea_dataset_min10_max100.csv",
    # "sea_dataset_min10_max10": "sea_datasets/sea_dataset_min10_max10.csv",
    # "sea_dataset_min10_max200": "sea_datasets/sea_dataset_min10_max200.csv",
    # "sea_dataset_min10_max20": "sea_datasets/sea_dataset_min10_max20.csv",
    # "sea_dataset_min10_max50": "sea_datasets/sea_dataset_min10_max50.csv",
    # "sea_dataset_min200_max10000": "sea_datasets/sea_dataset_min200_max10000.csv",
    # "sea_dataset_min200_max1000": "sea_datasets/sea_dataset_min200_max1000.csv",
    # "sea_dataset_min200_max2000": "sea_datasets/sea_dataset_min200_max2000.csv",
    # "sea_dataset_min200_max5000": "sea_datasets/sea_dataset_min200_max5000.csv",
    # "sea_dataset_min200_max500": "sea_datasets/sea_dataset_min200_max500.csv",
    # "sea_dataset_min20_max100": "sea_datasets/sea_dataset_min20_max100.csv",
    # "sea_dataset_min20_max200": "sea_datasets/sea_dataset_min20_max200.csv",
    # "sea_dataset_min20_max20": "sea_datasets/sea_dataset_min20_max20.csv",
    # "sea_dataset_min20_max50": "sea_datasets/sea_dataset_min20_max50.csv",
    # "sea_dataset_min30_max100": "sea_datasets/sea_dataset_min30_max100.csv",
    # "sea_dataset_min30_max200": "sea_datasets/sea_dataset_min30_max200.csv",
    # "sea_dataset_min30_max50": "sea_datasets/sea_dataset_min30_max50.csv",
    # "sea_dataset_min500_max10000": "sea_datasets/sea_dataset_min500_max10000.csv",
    # "sea_dataset_min500_max1000": "sea_datasets/sea_dataset_min500_max1000.csv",
    # "sea_dataset_min500_max2000": "sea_datasets/sea_dataset_min500_max2000.csv",
    # "sea_dataset_min500_max5000": "sea_datasets/sea_dataset_min500_max5000.csv",
    # "sea_dataset_min500_max500": "sea_datasets/sea_dataset_min500_max500.csv",
    # "sea_dataset_min50_max10000": "sea_datasets/sea_dataset_min50_max10000.csv",
    # "sea_dataset_min50_max1000": "sea_datasets/sea_dataset_min50_max1000.csv",
    # "sea_dataset_min50_max100": "sea_datasets/sea_dataset_min50_max100.csv",
    # "sea_dataset_min50_max2000": "sea_datasets/sea_dataset_min50_max2000.csv",
    # "sea_dataset_min50_max200": "sea_datasets/sea_dataset_min50_max200.csv",
    # "sea_dataset_min50_max5000": "sea_datasets/sea_dataset_min50_max5000.csv",
    # "sea_dataset_min50_max500": "sea_datasets/sea_dataset_min50_max500.csv",
    # "sea_dataset_min50_max50": "sea_datasets/sea_dataset_min50_max50.csv",
    # "sea_dataset_min5_max100": "sea_datasets/sea_dataset_min5_max100.csv",
    # "sea_dataset_min5_max10": "sea_datasets/sea_dataset_min5_max10.csv",
    # "sea_dataset_min5_max200": "sea_datasets/sea_dataset_min5_max200.csv",
    # "sea_dataset_min5_max20": "sea_datasets/sea_dataset_min5_max20.csv",
    # "sea_dataset_min5_max50": "sea_datasets/sea_dataset_min5_max50.csv",
}

In [17]:
# Initialize storage for results
results = {}

# Iterate over datasets
for dataset_name, dataset_file in tqdm(data_name.items(), desc="Processing datasets"):
    
    # Load the dataset
    dataset = read_dataset(dataset_name, data_name)
    dataset_size = len(dataset)
    
    # Dictionary to store results for this dataset
    results[dataset_name] = {}
    
    # Test with cyclic sampling
    for step in [1000, 5000, 10000]:  # Example steps
        for start_shift in range(5):  # 5 different starting points to avoid repeating patterns
            
            # Precompute cyclic indices efficiently
            sampled_indices = [
                (start_shift + i * step) % dataset_size
                for i in range(1000)
            ]
            
            # Select rows using precomputed indices
            sampled_dataset = dataset.iloc[sampled_indices]
            
            # Check the label distribution
            print(f"Step: {step}, Start Shift: {start_shift}, Sample Size: {len(sampled_dataset)}")
            print(sampled_dataset.iloc[:, -1].value_counts())
            
            # Initialize models
            models = {
                "Hoeffding Tree": tree.HoeffdingTreeClassifier(),
                "ILVQ": XuILVQ(),
            }
            
            # Evaluate each model
            step_key = f"step_{step}_startshift_{start_shift}"
            results[dataset_name][step_key] = {}
            for model_name, model in models.items():
                conf_matrix, elapsed_time = evaluate_model_online_learning(model, sampled_dataset)
                metrics = calculate_metrics(conf_matrix)
                
                # Store results
                results[dataset_name][step_key][model_name] = {
                    "precision": metrics["precision"],
                    "recall": metrics["recall"],
                    "f1": metrics["f1"],
                    "elapsed_time": elapsed_time
                }

# Display results
for dataset_name, steps_results in results.items():
    print(f"Results for dataset: {dataset_name}")
    for step_key, model_results in steps_results.items():
        print(f"  Sampling strategy: {step_key}")
        for model_name, metrics in model_results.items():
            print(
                f"    {model_name} - "
                f"Precision: {metrics['precision']:.4f}, "
                f"Recall: {metrics['recall']:.4f}, "
                f"F1-score: {metrics['f1']:.4f}, "
                f"Time: {metrics['elapsed_time']:.2f} sec"
            )
        print("\n")


Processing datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Step: 1000, Start Shift: 0, Sample Size: 1000
label
1    550
0    450
Name: count, dtype: int64
Step: 1000, Start Shift: 1, Sample Size: 1000
label
0    520
1    480
Name: count, dtype: int64
Step: 1000, Start Shift: 2, Sample Size: 1000
label
0    510
1    490
Name: count, dtype: int64
Step: 1000, Start Shift: 3, Sample Size: 1000
label
1    520
0    480
Name: count, dtype: int64
Step: 1000, Start Shift: 4, Sample Size: 1000
label
1    650
0    350
Name: count, dtype: int64
Step: 5000, Start Shift: 0, Sample Size: 1000
label
1    750
0    250
Name: count, dtype: int64
Step: 5000, Start Shift: 1, Sample Size: 1000
label
0    550
1    450
Name: count, dtype: int64
Step: 5000, Start Shift: 2, Sample Size: 1000
label
0    650
1    350
Name: count, dtype: int64
Step: 5000, Start Shift: 3, Sample Size: 1000
label
0    600
1    400
Name: count, dtype: int64
Step: 5000, Start Shift: 4, Sample Size: 1000
label
0    500
1    500
Name: count, dtype: int64
Step: 10000, Start Shift: 0, Sample Size

Processing datasets: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]

Step: 10000, Start Shift: 4, Sample Size: 1000
label
0    500
1    500
Name: count, dtype: int64
Results for dataset: lgr
  Sampling strategy: step_1000_startshift_0
    Hoeffding Tree - Precision: 0.7580, Recall: 0.7710, F1-score: 0.7640, Time: 0.08 sec
    ILVQ - Precision: 0.8340, Recall: 0.8840, F1-score: 0.8580, Time: 0.35 sec


  Sampling strategy: step_1000_startshift_1
    Hoeffding Tree - Precision: 0.7410, Recall: 0.6960, F1-score: 0.7180, Time: 0.08 sec
    ILVQ - Precision: 0.7810, Recall: 0.7280, F1-score: 0.7540, Time: 0.29 sec


  Sampling strategy: step_1000_startshift_2
    Hoeffding Tree - Precision: 0.7410, Recall: 0.6800, F1-score: 0.7090, Time: 0.08 sec
    ILVQ - Precision: 0.6990, Recall: 0.7190, F1-score: 0.7090, Time: 0.29 sec


  Sampling strategy: step_1000_startshift_3
    Hoeffding Tree - Precision: 0.7690, Recall: 0.8310, F1-score: 0.7990, Time: 0.08 sec
    ILVQ - Precision: 0.8070, Recall: 0.8120, F1-score: 0.8090, Time: 0.27 sec


  Sampling strategy: s




### Loop para testear cada una de las combinaciones

In [10]:
# Dictionary to store results
results = {}

# Iterate over each dataset with a progress bar
for dataset_name, dataset_file in tqdm(data_name.items(), desc="Processing datasets"):
    
    # Initialize models
    models = {
        "ARF": forest.ARFClassifier(n_models=3, max_size=20),
        "Hoeffding Tree": tree.HoeffdingTreeClassifier(),
        "ILVQ": XuILVQ(),
        "ILVQ_mod": XuILVQ_mod(),
    }
    
    # Load dataset
    dataset = read_dataset(dataset_name, data_name)
    dataset = dataset.iloc[:5000]  # Limit to first 5000 rows for consistency
    
    # Dictionary to store results for this dataset
    results[dataset_name] = {}
    
    # # Calculate the distribution of the last column (label)
    # label_distribution = dataset.iloc[:, -1].value_counts()

    # print("Label Distribution:")
    # print(label_distribution)


    
    # Evaluate each model
    for model_name, model in models.items():
        # Print the current model-dataset iteration
        # tqdm.write(f"[ITERATION] {model_name} - {dataset_name}")
        conf_matrix, elapsed_time = evaluate_model_online_learning(model, dataset)
        metrics = calculate_metrics(conf_matrix)
        
        
        # Store results
        results[dataset_name][model_name] = {
            "precision": metrics["precision"],  # Extract individual metrics
            "recall": metrics["recall"],
            "f1": metrics["f1"],
            "elapsed_time": elapsed_time
        }

  dataset.replace({'UP': 1, 'DOWN': 0, 'True': 1, 'False': 0}, inplace=True)
Processing datasets: 100%|██████████| 47/47 [04:44<00:00,  6.05s/it]


In [4]:
# Display results
# Display results
for dataset_name, model_results in results.items():
    print(f"Results for dataset: {dataset_name}")
    for model_name, metrics in model_results.items():
        print(
            f"{model_name} - "
            f"Precision: {metrics['precision']:.4f}, "
            f"Recall: {metrics['recall']:.4f}, "
            f"F1-score: {metrics['f1']:.4f}, "
            f"Time: {metrics['elapsed_time']:.2f} sec"
        )
    print("\n")


Results for dataset: elec
ARF - Precision: 0.8470, Recall: 0.8020, F1-score: 0.8240, Time: 1.78 sec
Hoeffding Tree - Precision: 0.7900, Recall: 0.8260, F1-score: 0.8080, Time: 0.46 sec
ILVQ - Precision: 0.7420, Recall: 0.6610, F1-score: 0.6990, Time: 2.90 sec
ILVQ_mod - Precision: 0.6880, Recall: 0.7540, F1-score: 0.7190, Time: 4.54 sec


Results for dataset: sea_dataset_min1000_max10000
ARF - Precision: 0.9070, Recall: 0.9450, F1-score: 0.9260, Time: 1.20 sec
Hoeffding Tree - Precision: 0.9080, Recall: 0.9310, F1-score: 0.9190, Time: 0.34 sec
ILVQ - Precision: 0.9220, Recall: 0.9260, F1-score: 0.9240, Time: 1.80 sec
ILVQ_mod - Precision: 0.9080, Recall: 0.9180, F1-score: 0.9130, Time: 3.61 sec


Results for dataset: sea_dataset_min100_max500
ARF - Precision: 0.9040, Recall: 0.9510, F1-score: 0.9270, Time: 1.24 sec
Hoeffding Tree - Precision: 0.9080, Recall: 0.9310, F1-score: 0.9190, Time: 0.34 sec
ILVQ - Precision: 0.9220, Recall: 0.9260, F1-score: 0.9240, Time: 1.78 sec
ILVQ_mod - Pr

In [7]:
# Log the results

import logging

# Configure logging
logging.basicConfig(
    filename="model_results.log",  # Log file name
    level=logging.INFO,            # Logging level
    format="%(message)s",          # Log format (just the message)
    filemode="w"                   # Overwrite the file if it exists
)

# Function to log results
def log_results(results):
    for dataset_name, model_results in results.items():
        logging.info(f"Results for dataset: {dataset_name}")
        for model_name, metrics in model_results.items():
            logging.info(
                f"{model_name} - "
                f"Precision: {metrics['precision']:.4f}, "
                f"Recall: {metrics['recall']:.4f}, "
                f"F1-score: {metrics['f1']:.4f}, "
                f"Time: {metrics['elapsed_time']:.2f} sec"
            )
        logging.info("\n")

# Log the results
log_results(results)