# In [None]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
from deap import base, creator, tools, algorithms
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix


# Path of the candle dataset; the synthetic fallback is written to the same path.
DATA_FILE = 'candles_previstos_como_1.csv'


def build_example_dataframe(start='2024-01-01', end='2024-06-30'):
    """Return a random hourly dataset that stands in for the real CSV.

    The frame holds a ``date`` column, the six indicator columns filled with
    uniform random values, and two random 0/1 columns: ``ERRORS`` (present in
    the previous example file) and ``trend``, the label column this script
    selects as its target.

    Args:
        start: First timestamp of the hourly range.
        end: Last timestamp of the hourly range.

    Returns:
        A ``pandas.DataFrame`` with one row per hour from ``start`` to ``end``.
    """
    dates = pd.date_range(start=start, end=end, freq='h')
    n_rows = len(dates)
    data = {
        'date': dates,
        'Bands_Norm': np.random.rand(n_rows),
        'NSMA_3': np.random.rand(n_rows),
        'NSMA_5': np.random.rand(n_rows),
        'NSMA_7': np.random.rand(n_rows),
        'NSMA_9': np.random.rand(n_rows),
        'NSMA_11': np.random.rand(n_rows),
        'ERRORS': np.random.randint(0, 2, n_rows),
        # Without this column the fallback data cannot be used with
        # target = 'trend' and the script fails with a KeyError.
        'trend': np.random.randint(0, 2, n_rows),
    }
    return pd.DataFrame(data)


try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    # Report the file that was actually looked up and created.
    print(f"Erro: O arquivo '{DATA_FILE}' não foi encontrado.")
    df = build_example_dataframe()
    df.to_csv(DATA_FILE, index=False)
    print(f"Arquivo '{DATA_FILE}' de exemplo criado.")
else:
    df['date'] = pd.to_datetime(df['date'])

    # Keep the first row per timestamp (per ticker when that column exists).
    dedup_keys = ['date', 'id_ticker'] if 'id_ticker' in df.columns else ['date']
    df.drop_duplicates(subset=dedup_keys, keep='first', inplace=True)

    df.sort_values(by='date', inplace=True)


# Model inputs and the label column.
features = ['Bands_Norm', 'NSMA_3', 'NSMA_5', 'NSMA_7', 'NSMA_9','NSMA_11']
target = 'trend'

# Calendar-day bounds (inclusive) of the training and test periods.
train_start, train_end = '2024-01-01', '2024-03-30'
test_start, test_end = '2024-04-01', '2024-06-30'

# Compare on the calendar day instead of the full timestamp: a bare date
# string is parsed as midnight, so `date <= end` would keep only the 00:00
# row of the last day and drop the rest of its intraday candles.
candle_day = df['date'].dt.normalize()

train_df = df[(candle_day >= train_start) & (candle_day <= train_end)]
test_df = df[(candle_day >= test_start) & (candle_day <= test_end)]

X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]

print(f"Tamanho do conjunto de treino: {len(X_train)} amostras")
print(f"Tamanho do conjunto de teste: {len(X_test)} amostras\n")


# (lower, upper) bound of each hyperparameter explored by the search.
# The insertion order defines the layout of the bit string, so it must stay
# the same in every table below.
param_ranges = dict(
    hidden_layer_sizes=(10, 200),
    activation=(0, 2),
    alpha=(0.0001, 0.1),
    learning_rate_init=(0.0001, 0.1),
    max_iter=(100, 500),
)

# Number of bits that encode each hyperparameter, in the same order.
param_bits = dict(zip(param_ranges, (8, 2, 10, 10, 9)))

# Integer code -> MLP activation name.
activation_map = dict(enumerate(('relu', 'tanh', 'logistic')))

# Length of one individual: the sum of all per-parameter bit widths.
total_bits = sum(param_bits[name] for name in param_bits)

# Single-objective fitness with weight +1.0 (maximised by DEAP), and a
# list-based individual type that carries that fitness.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# A gene is a random bit, an individual is `total_bits` genes, and a
# population is a plain list of individuals (size given at call time).
toolbox.register("attr_bin", random.randint, 0, 1)
toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attr_bin,
    n=total_bits,
)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


def decode_binary(gene, minimo, maximo, n_bits):
    """Scale a bit sequence linearly onto the interval [minimo, maximo].

    The bits are read as an unsigned integer, most significant bit first, and
    divided by the largest value representable with ``n_bits`` bits.

    Args:
        gene: Sequence of 0/1 values.
        minimo: Value returned when every bit is 0.
        maximo: Value returned when the integer equals ``2 ** n_bits - 1``.
        n_bits: Bit width used for the normalisation.

    Returns:
        The decoded value as a float.
    """
    bit_text = ''.join(str(bit) for bit in gene)
    as_int = int(bit_text, 2)
    largest = 2 ** n_bits - 1
    fraction = as_int / largest
    return minimo + fraction * (maximo - minimo)


def decode_individual(individual):
    """Turn a bit-string individual into a dict of hyperparameter values.

    The individual is consumed left to right in the order of ``param_ranges``;
    each parameter takes ``param_bits[name]`` bits and is scaled into its
    range with ``decode_binary``.

    Args:
        individual: Sequence of 0/1 genes.

    Returns:
        Dict keyed by parameter name. ``hidden_layer_sizes`` and ``max_iter``
        are rounded to int, ``activation`` is mapped to its name through
        ``activation_map``, and the remaining parameters stay as floats.
    """
    decoded = {}
    offset = 0
    for name, bounds in param_ranges.items():
        width = param_bits[name]
        lower, upper = bounds
        chunk = individual[offset:offset + width]
        offset += width

        raw = decode_binary(chunk, lower, upper, width)
        if name == 'activation':
            decoded[name] = activation_map[int(round(raw))]
        elif name in ('hidden_layer_sizes', 'max_iter'):
            decoded[name] = int(round(raw))
        else:
            decoded[name] = raw
    return decoded


def calculate_fitness(individual):
    """Score one bit-string individual for the genetic algorithm.

    The individual is decoded into MLP hyperparameters. The score is
    ``0.4 * mean 9-fold CV accuracy on the training set + 0.6 * accuracy on
    the test set`` of a model fitted on the full training set.

    Args:
        individual: Sequence of 0/1 genes understood by ``decode_individual``.

    Returns:
        A one-element tuple ``(fitness,)``, as DEAP expects. The fitness is
        0.0 when the decoded hidden layer is empty or when training or
        evaluation raises; in the latter case a ``RuntimeWarning`` is emitted.
    """
    import warnings  # local import so the block needs no new file-level import

    params = decode_individual(individual)

    # An MLP needs at least one hidden unit.
    if params['hidden_layer_sizes'] < 1:
        return (0.0,)

    model = MLPClassifier(
        hidden_layer_sizes=(params['hidden_layer_sizes'],),
        activation=params['activation'],
        alpha=params['alpha'],
        learning_rate_init=params['learning_rate_init'],
        max_iter=params['max_iter'],
        random_state=42
    )

    try:
        skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
        train_acc = np.mean(cv_scores)

        # NOTE(review): the test period is scored inside the fitness, so the
        # search is tuned against the test data and the final test metrics are
        # optimistic. Kept as is to preserve the existing fitness definition.
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred_test)
    except Exception as exc:
        # Still best-effort (the individual just gets the worst score), but the
        # failure is reported instead of being silently turned into 0.0.
        warnings.warn(
            f"calculate_fitness: avaliação falhou para {params}: {exc!r}",
            RuntimeWarning,
        )
        return (0.0,)

    final_fitness = (0.4 * train_acc) + (0.6 * test_acc)
    return (final_fitness,)


# Genetic operators used by the evolutionary loop.
# Selection: tournaments of 3 individuals.
toolbox.register("select", tools.selTournament, tournsize=3)
# Variation: two-point crossover and bit-flip mutation (5% chance per gene).
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
# Evaluation: fitness of a single individual.
toolbox.register("evaluate", calculate_fitness)


def main(n_gen=100, pop_size=20, cxpb=0.8, mutpb=0.2):
    """Run the genetic search and report the best MLP configuration found.

    Evolves a population with ``algorithms.eaSimple``, prints the
    per-generation fitness statistics, rebuilds the best model, plots its
    confusion matrices (cross-validated training predictions and test-period
    predictions) and prints the best fitness and hyperparameters.

    Args:
        n_gen: Number of generations.
        pop_size: Number of individuals in the population.
        cxpb: Crossover probability passed to ``eaSimple``.
        mutpb: Mutation probability passed to ``eaSimple``.

    Returns:
        Tuple ``(best_individual, best_params, best_fitness, y_pred_cv,
        y_pred_final_test)``.
    """
    population = toolbox.population(n=pop_size)
    hof = tools.HallOfFame(1)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("min", np.min)
    stats.register("mean", np.mean)
    stats.register("max", np.max)

    population, logbook = algorithms.eaSimple(
        population,
        toolbox,
        cxpb=cxpb,
        mutpb=mutpb,
        ngen=n_gen,
        stats=stats,
        halloffame=hof,
        verbose=True
    )

    best_individual = hof[0]
    best_params = decode_individual(best_individual)
    best_fitness = best_individual.fitness.values[0]

    # The logbook is a list with one record per generation (gen, nevals and
    # the registered statistics). `logbook.chapters` is only populated by
    # MultiStatistics, so reading chapters['fitness'] gave an empty table.
    evolution_df = pd.DataFrame(list(logbook))
    evolution_df.set_index('gen', inplace=True)
    print(evolution_df)

    def plot_confusion_matrix(cm, title, class_names=None):
        """Show ``cm`` as an annotated heatmap with the given title."""
        if class_names is None:
            class_names = ['Classe 0', 'Classe 1']
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names)
        plt.title(title)
        plt.ylabel('Verdadeiro')
        plt.xlabel('Predito')
        plt.show()

    best_model = MLPClassifier(
        hidden_layer_sizes=(best_params['hidden_layer_sizes'],),
        activation=best_params['activation'],
        alpha=best_params['alpha'],
        learning_rate_init=best_params['learning_rate_init'],
        max_iter=best_params['max_iter'],
        random_state=42
    )

    # Out-of-fold predictions on the training set, with the same fold setup
    # as the fitness function.
    skf_final = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(best_model, X_train, y_train, cv=skf_final)
    cm_cv = confusion_matrix(y_train, y_pred_cv)
    plot_confusion_matrix(cm_cv, 'Matriz de Confusão - Validação Cruzada (Treino)')

    # Refit on the whole training set and predict the test period.
    best_model.fit(X_train, y_train)
    y_pred_final_test = best_model.predict(X_test)
    cm_test = confusion_matrix(y_test, y_pred_final_test)
    plot_confusion_matrix(cm_test, 'Matriz de Confusão - Período de Teste')

    print(f'Melhor Fitness Encontrado: {best_fitness:.4f}')
    print('Melhores Hiperparâmetros:')
    for k, v in best_params.items():
        print(f"  - {k}: {v}")

    return best_individual, best_params, best_fitness, y_pred_cv, y_pred_final_test


# Script entry point: run the search with 100 generations of 20 individuals
# and keep the results available as module-level names.
if __name__ == '__main__':
    (
        best_individual,
        best_params,
        best_fitness,
        previsoes_treino,
        previsoes_teste,
    ) = main(n_gen=100, pop_size=20)
