# ARE

In [None]:
import collections
import statistics
import time
import os
import psutil
import csv
import numpy as np
from river import base, metrics, tree, drift, utils, stats, stream

In [None]:
class AdaptiveRegularizedEnsemble(base.Ensemble, base.Classifier):
    """Adaptive Regularized Ensemble (ARE), ported from MOA.

    Each member owns a classifier, a drift detector, per-class counters of
    "correct but not trained" instances and a sliding-window accuracy.

    Training is adaptive: a member trains on an instance when it misclassifies
    it, or after it has correctly classified ``n_rejections`` instances of that
    class without training. When a member trains, the instance is presented
    ``k ~ Poisson(lambd)`` times (online bagging).

    Voting is selective: only members whose sliding-window accuracy is at
    least the ensemble average (computed during the last ``learn_one`` call)
    take part in the vote.

    Parameters
    ----------
    model
        Base classifier cloned for every member. Defaults to
        ``tree.HoeffdingTreeClassifier()``.
    n_models
        Number of ensemble members.
    lambd
        Rate of the Poisson distribution used to weight a training instance.
    drift_detector
        Detector cloned for every member. Defaults to
        ``drift.ADWIN(delta=1e-3)``.
    window_size
        Length of the sliding accuracy window. ``0`` disables the window and
        therefore the member selection: every member always votes.
    n_rejections
        Number of correctly classified instances of a class a member may skip
        before it is forced to train on one. Values ``<= 0`` disable the forced
        training, so the member trains only on its mistakes.
    seed
        Seed of the random generator used for the Poisson draws.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """

    def __init__(
        self,
        model: base.Classifier = None,
        n_models: int = 100,
        lambd: float = 6.0,
        drift_detector: base.DriftDetector = None,
        window_size: int = 1000,
        n_rejections: int = 5,
        seed: int = 1
    ):
        if window_size < 0:
            raise ValueError("window_size must be non-negative")

        # Explicit None checks: truthiness is not a reliable "was it given?"
        # test for estimators (an Ensemble, for instance, is a list subclass).
        self.model = model if model is not None else tree.HoeffdingTreeClassifier()
        self.n_models = n_models
        self.lambd = lambd
        self.drift_detector = (
            drift_detector if drift_detector is not None else drift.ADWIN(delta=1e-3)
        )
        self.window_size = window_size
        self.n_rejections = n_rejections
        self.seed = seed
        self._rng = np.random.RandomState(self.seed)

        # One record per member; 'index' is the member's position in the
        # Ensemble list so that a reset can keep that list in sync.
        self._ensemble_members = [
            {
                'index': idx,
                'model': self.model.clone(),
                'detector': self.drift_detector.clone(),
                'untrained_counts': collections.defaultdict(int),
                'window_acc': self._new_window(),
            }
            for idx in range(self.n_models)
        ]

        # Initialise the Ensemble base class with the bare models.
        super().__init__(models=[m['model'] for m in self._ensemble_members])

        self._avg_window_acc = 0.0

    def _new_window(self):
        """Return a fresh sliding accuracy window, or None when disabled."""
        if self.window_size > 0:
            return utils.Rolling(stats.Mean(), window_size=self.window_size)
        # A zero-length Rolling cannot be updated, and window_size == 0 means
        # "no selection" anyway, so no window is kept at all.
        return None

    def learn_one(self, x, y):
        """Update every member with one labelled instance and return ``self``."""
        window_accs = []

        for m in self._ensemble_members:
            # Local prediction: drives both the training decision and the
            # drift detector.
            y_pred = m['model'].predict_one(x)
            correct = (y == y_pred)

            # Adaptive training: train on a mistake, or once n_rejections
            # correct instances of this class have been skipped.
            will_train = not correct
            if correct:
                m['untrained_counts'][y] += 1
                if self.n_rejections > 0 and m['untrained_counts'][y] >= self.n_rejections:
                    m['untrained_counts'][y] = 0
                    will_train = True

            if will_train:
                # Online bagging: present the instance k ~ Poisson(lambd) times
                # (k == 0 simply means the loop body never runs).
                for _ in range(self._rng.poisson(self.lambd)):
                    m['model'].learn_one(x, y)

            # Feed the detector with the error signal (0 = hit, 1 = miss).
            m['detector'].update(0 if correct else 1)
            if m['detector'].drift_detected:
                self._reset_member(m)

            # Update the sliding accuracy used for vote selection.
            # NOTE(review): after a reset the fresh window receives the outcome
            # produced by the model that was just replaced - confirm this
            # matches the MOA reference implementation.
            if m['window_acc'] is not None:
                m['window_acc'].update(1 if correct else 0)
                window_accs.append(m['window_acc'].get())

        # Ensemble-wide mean of the window accuracies: the eligibility
        # threshold used by the next prediction.
        if window_accs:
            self._avg_window_acc = statistics.mean(window_accs)

        return self

    def predict_proba_one(self, x):
        """Return the class distribution voted by the eligible members.

        Each eligible member contributes its own normalised vote; the combined
        votes are normalised again so that the result sums to 1. An empty
        mapping is returned when no member produces a vote.
        """
        # Selection: only members at or above the ensemble average vote.
        eligible_members = [
            m for m in self._ensemble_members
            if self.window_size == 0 or m['window_acc'].get() >= self._avg_window_acc
        ]

        # If nobody qualifies (start of the stream), fall back to everyone.
        if not eligible_members:
            eligible_members = self._ensemble_members

        combined_votes = collections.Counter()
        for m in eligible_members:
            votes = m['model'].predict_proba_one(x)
            if votes:
                total = sum(votes.values())
                if total > 0:
                    for cls, prob in votes.items():
                        combined_votes[cls] += prob / total

        # Turn the summed votes into a probability distribution; the ranking of
        # the classes (and therefore predict_one) is unaffected.
        vote_mass = sum(combined_votes.values())
        if vote_mass > 0:
            for cls in combined_votes:
                combined_votes[cls] /= vote_mass

        return combined_votes

    def _reset_member(self, m):
        """Replace the member's classifier and statistics after a drift."""
        m['model'] = self.model.clone()
        m['detector'] = self.drift_detector.clone()
        m['untrained_counts'].clear()
        m['window_acc'] = self._new_window()
        # Keep the Ensemble list pointing at the live model rather than the
        # discarded one.
        self[m['index']] = m['model']

In [None]:
def get_target_column_name(file_path):
    """Return the name of the last ``@attribute`` declared in an ARFF header.

    The header is scanned line by line until the ``@data`` marker; keywords are
    matched case-insensitively. A name wrapped in single or double quotes is
    returned without the quotes and may contain whitespace
    (``@attribute 'my class' {a,b}`` -> ``my class``). Escaped quotes inside a
    quoted name are not handled.

    Parameters
    ----------
    file_path
        Path of the ARFF file to inspect.

    Returns
    -------
    The name of the last declared attribute, or ``None`` when the header
    declares no attribute.
    """
    last_attribute = None
    with open(file_path, 'r') as f:
        for line in f:
            stripped = line.strip()
            keyword = stripped.upper()
            if keyword.startswith('@ATTRIBUTE'):
                # Split the keyword from the rest of the declaration only, so
                # that whitespace inside a quoted name is preserved.
                parts = stripped.split(None, 1)
                if len(parts) > 1:
                    rest = parts[1]
                    quote = rest[0]
                    closing = rest.find(quote, 1) if quote in "'\"" else -1
                    if closing != -1:
                        last_attribute = rest[1:closing]
                    else:
                        # Unquoted (or unterminated) name: first token, with
                        # any stray quote characters removed.
                        last_attribute = rest.split()[0].replace("'", "").replace('"', '')
            if keyword.startswith('@DATA'):
                break
    return last_attribute

def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MB."""
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

# 1. Paths and datasets
base_path = os.path.expanduser("~/moa/aldopaim/AdaptiveRegularizedEnsemble/datasets")
output_file = "resultados_are_river.csv"
arff_files = {
    "Airlines": "airlines.arff", "Census": "census.arff", "Connect-4": "connect-4.arff",
    "Covtype": "covtypeNorm.arff", "ElecNorm": "elecNormNew.arff", "GMSC": "GMSC.arff",
    "Keystroke": "keystroke.arff", "NOAA": "NOAA.arff", "Nomao": "nomao.arff",
    "Outdoor": "outdoor.arff", "Ozone": "ozone.arff"
}

# 2. Create (truncate) the results CSV and write its header
header = ['Dataset', 'Seed', 'Acuracia', 'Tempo_s', 'Memoria_MB', 'Instancias']
with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

# 3. Main loop: datasets -> seeds (1 to 5)
print(f"{'Dataset':<12} | {'Seed':<5} | {'Acurácia':<10} | {'Tempo (s)':<10}")
print("-" * 45)

for name, filename in arff_files.items():
    file_path = os.path.join(base_path, filename)
    if not os.path.exists(file_path):
        # Say so instead of skipping silently: a wrong base_path would
        # otherwise produce an empty results file without any hint.
        print(f"Aviso: arquivo não encontrado, dataset ignorado: {file_path}")
        continue

    target_col = get_target_column_name(file_path)
    if target_col is None:
        # Without a target column every label would be None and the
        # reported accuracy would be meaningless.
        print(f"Aviso: nenhum @attribute encontrado, dataset ignorado: {file_path}")
        continue

    # Five runs, each with a different seed
    for s in range(1, 6):
        # Fresh base learner and detector per run so the runs stay independent
        base_tree = tree.HoeffdingTreeClassifier(grace_period=100, delta=0.01)
        adwin_detector = drift.ADWIN(delta=1e-3)

        model = AdaptiveRegularizedEnsemble(
            model=base_tree,
            n_models=100,
            drift_detector=adwin_detector,
            seed=s
        )
        metric = metrics.Accuracy()

        start_time = time.perf_counter()
        count = 0

        try:
            # Test-then-train loop: predict first, then learn from the
            # same instance
            dataset_stream = stream.iter_arff(file_path, target=target_col)
            for x, y in dataset_stream:
                y_pred = model.predict_one(x)
                if y_pred is not None:
                    metric.update(y, y_pred)
                model.learn_one(x, y)
                count += 1

            elapsed = time.perf_counter() - start_time
            # RSS of the whole process at the end of the run, not the size of
            # the model alone.
            mem = get_memory_usage()
            acc = metric.get()

            # Append the row right away so finished runs survive a later failure
            with open(output_file, 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([name, s, f"{acc:.4f}", f"{elapsed:.2f}", f"{mem:.2f}", count])

            print(f"{name:<12} | {s:<5} | {acc:>9.2%} | {elapsed:>9.2f}")

        except Exception as e:
            # Per-run boundary: report the failure and carry on with the
            # remaining seeds and datasets.
            print(f"Erro em {name} (Seed {s}): {e}")

print("-" * 45)
print(f"Experimento concluído! Resultados salvos em: {output_file}")