## Imports

In [None]:
import os

# The `src` package imported in the following cells is resolved relative to the
# working directory, which is presumably the project root two levels above this
# notebook (TODO confirm the notebook location).
# Only move up when `src` is not already visible, so that re-running this cell
# does not keep climbing the directory tree.
if not os.path.isdir("src"):
    os.chdir("../..")
os.getcwd()

In [None]:
import lightgbm as lgb
import pandas as pd
from src.data.preprocessing import create_dataset
import mlflow
import plotly.graph_objects as go
import numpy as np

from src.data.utils import get_feature_importances_and_print_useless_columns
from src.model.model_selection import split_train_test

In [None]:
# Seed passed as `random_state` to the LightGBM models built in this notebook.
SEED: int = 666
# Flag forwarded to `create_dataset(is_business=...)` when the dataset is loaded.
IS_BUSINESS: bool = True
# Label of the modelled user type; presumably one of "consumer"/"producer" — TODO confirm against `create_dataset`.
MODEL_TYPE: str = "consumer"

In [None]:
def get_learning_curves(model: lgb.LGBMModel) -> go.Figure:
    """Plot the per-iteration MAE recorded while fitting ``model``.

    Args:
        model: Fitted LightGBM model whose ``evals_result_`` holds an ``"l1"``
            history for the ``"train"`` and ``"valid"`` evaluation sets.

    Returns:
        A plotly figure with one line per evaluation set; the x axis is the
        1-based boosting iteration and the y axis is the MAE.
    """
    history = model.evals_result_
    curves = pd.DataFrame({split: history[split]["l1"] for split in ("train", "valid")})
    # The frame index is 0-based; iterations are displayed starting at 1.
    iterations = curves.index + 1

    figure = go.Figure()
    for split_name, values in curves.items():
        figure.add_trace(go.Scatter(x=iterations, y=values, mode="lines", name=f"{split_name}"))

    figure.update_layout(
        title="Learning curves",
        xaxis_title="Iteration",
        yaxis_title="MAE",
        margin={"l": 0, "r": 0, "b": 0, "t": 30},
    )
    return figure

In [24]:
def train_model(x_train: pd.DataFrame, y_train: pd.Series, eval_set: tuple[pd.DataFrame, pd.Series], params: dict) -> lgb.LGBMModel:
    """Fit a LightGBM regressor, monitoring MAE on the train and validation sets.

    Args:
        x_train: Training features.
        y_train: Training target.
        eval_set: ``(features, target)`` pair used as the ``"valid"`` evaluation set.
        params: Keyword arguments forwarded to ``lgb.LGBMRegressor``.

    Returns:
        The fitted regressor. Its evaluation history is available through
        ``model.evals_result_`` under the ``"train"`` and ``"valid"`` keys.
    """
    model = lgb.LGBMRegressor(**params)

    # No explicit `record_evaluation` callback is needed: the scikit-learn wrapper
    # keeps the evaluation history itself and exposes it as `evals_result_`.
    model.fit(
        X=x_train,
        y=y_train,
        eval_set=[(x_train, y_train), eval_set],
        eval_names=["train", "valid"],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(),
            # Stop after 100 rounds without improvement, looking at the first metric only.
            lgb.early_stopping(stopping_rounds=100, first_metric_only=True),
        ],
    )
    return model

In [17]:
# Load the dataset for the configured user type.
# `model_type` receives MODEL_TYPE (the original passed the boolean IS_BUSINESS here,
# leaving MODEL_TYPE unused) — NOTE(review): confirm against `create_dataset`'s signature.
dataset: pd.DataFrame = create_dataset(model_type=MODEL_TYPE, is_business=IS_BUSINESS, columns=[])

# Columns removed before training. Commented-out entries are deliberately kept as features.
columns_to_drop = [
    "client_id",
    "data_block_id",
    # "date",
    "date_client",
    "date_gas",
    "datetime",
    "county",
    "county_name_forecast",
    "latitude_min_forecast",
    "latitude_max_forecast",
    "longitude_min_forecast",
    "longitude_max_forecast",
    # "hour",
    # "quarter",
    # "eic_count",
    "installed_capacity",
    "installed_capacity_eic_count_ratio",
    # "eic_count_installed_capacity_ratio",
    "year",
    "month"
]

# Ratio features must be derived before `installed_capacity` is dropped.
dataset["installed_capacity_eic_count_ratio"] = dataset["installed_capacity"] / dataset["eic_count"]
dataset["eic_count_installed_capacity_ratio"] = dataset["eic_count"] / dataset["installed_capacity"]
dataset = dataset.drop(columns=columns_to_drop)

# "date" is kept until after the split (presumably `split_train_test` relies on it
# to hold out the last 3 months — TODO confirm), then removed from the features.
train_index, test_index = split_train_test(dataset, test_months=3)
dataset = dataset.drop(columns=["date"])

x_train, x_test = dataset.loc[train_index], dataset.loc[test_index]
# `pop` removes the target column from the feature frames in place.
y_train, y_test = x_train.pop("target"), x_test.pop("target")

In [None]:
# dataset[["eic_count", "installed_capacity", "target"]].corr()

In [25]:
import warnings
warnings.filterwarnings("ignore")

# Experiment name is derived from the configuration constants so it cannot drift
# from the data that is actually loaded.
mlflow.set_experiment(f"user type: {MODEL_TYPE} is business: {IS_BUSINESS}")

mlflow.lightgbm.autolog(
    log_input_examples=False,
    log_model_signatures=True,
    log_models=True,
    log_datasets=False,
    disable=False,
    exclusive=False,
    disable_for_unsupported_versions=False,
    silent=False,
    registered_model_name=None,
    extra_tags=None
)

# Keyword arguments forwarded to `lgb.LGBMRegressor` by `train_model`.
params = {
    "boosting_type": "gbdt",
    "colsample_bytree": .7,
    "device":"cpu",
    "learning_rate": 0.0222,
    "linear_tree": True,
    "max_depth": 8,
    "max_bin": 256,
    "min_child_samples": 50,
    "min_child_weight": 2.031,
    "min_split_gain": 0.285,
    "num_leaves": 67,
    "objective": "l2",
    "reg_alpha": 1.319,
    "reg_lambda": 9.0,
    "subsample": .85,
    "subsample_freq": 0,
    "num_boost_round": 20_000,
    "random_state": SEED,
    "verbosity": 1,
    "categorical_feature": "auto",
    "feature_name": "auto",
    "keep_training_booster": False,
    "n_jobs": -1 
}


with mlflow.start_run(log_system_metrics=True) as run:
    # Record which feature columns were used for this run.
    mlflow.log_param("columns", list(x_train.columns))
    # The held-out split doubles as the early-stopping validation set.
    model = train_model(x_train=x_train, y_train=y_train, eval_set=(x_test, y_test), params=params)

    fig = get_learning_curves(model)
    mlflow.log_figure(fig, artifact_file="consumer_model.png")

2024/02/08 16:57:37 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113688 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20099
[LightGBM] [Info] Number of data points in the train set: 440558, number of used features: 94
[LightGBM] [Info] Start training from score 64.311413
[1]	train's l1: 98.3508	train's l2: 75902.9	valid's l1: 201.241	valid's l2: 438266
Training until validation scores don't improve for 100 rounds
[2]	train's l1: 96.3244	train's l2: 73020.9	valid's l1: 197.669	valid's l2: 425446
[3]	train's l1: 94.3636	train's l2: 70299.9	valid's l1: 194.076	valid's l2: 411875
[4]	train's l1: 92.4438	train's l2: 67649.2	valid's l1: 190.502	valid's l2: 398700
[5]	train's l1: 90.5994	train's l2: 65182.3	valid's l1: 187.131	valid's l2: 386928
[6]	train's l1: 88.7743	train's l2: 62753	valid's l1: 183.742	valid's l2: 374891
[7]	train's l1: 86.9974	train's l





2024/02/08 16:57:52 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/02/08 16:57:52 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


## Genetic algorithms

In [None]:
class GeneticAlgorithm:
    def __init__(self, parameter_space: dict[str, int | float | bool | str], population_size: int = 10, max_generations: int = 5, mutation_rate: float = 0.1, crossover_rate: float = 0.5, tournament_size: int = 3):
        self.population_size = population_size
        self.max_generations = int(max_generations)
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.tournament_size = int(tournament_size)
        self.parameter_space = parameter_space
        self.best_solution = [None] * self.max_generations
        self.default_model_parameters = {
            "random_state":SEED,
            "n_jobs":-1,
            "importance_type":"split",
            "linear_tree":True,
            "verbosity":-1,
            "device":"cpu",
            "n_estimators":10_000,
            "subsample_for_bin":200_000,
        }

    def set_parameter_space(self, parameter_space: dict):
        self.parameter_space = parameter_space

    def initialize_population(self, initial_population_sample: dict | None = None) -> list[dict[str, int | float | bool | str]]:
        population = []
        for _ in range(self.population_size):
            chromosome = {param: np.random.choice(values) for param, values in self.parameter_space.items()}
            population.append(chromosome)

        if initial_population_sample:    
            population[np.random.choice(np.arange(self.population_size))] = initial_population_sample

        return population

    def tournament_selection(self, population, fitness_scores):
        selected_parents = []
        for _ in range(len(population)):
            tournament_indices = np.random.choice(len(population), self.tournament_size, replace=False)
            tournament_scores = [fitness_scores[i] for i in tournament_indices]
            winner_index = tournament_indices[np.argmin(tournament_scores)]
            selected_parents.append(population[winner_index])
        return selected_parents

    def _crossover(self, parent_1, parent_2):
        if np.random.random() < self.crossover_rate:
            return parent_1, parent_2

        # create two children
        offsprings = []
        for i in range(2):
            offspring = {}
            # create each children by selecting each gene from the two parents
            for gene in parent_1.keys():
                offspring[gene] = np.random.choice([parent_1[gene], parent_2[gene]])
            offsprings.append(offspring)
        return offsprings
            
    def crossover(self, parents):
        offspring = []
        for i in range(0, len(parents), 2):
            offspring.extend(self._crossover(parents[i], parents[i + 1]))
        return offspring


    def mutate(self, chromosome, parameter_space):
        mutated_chromosome = chromosome.copy()
        for param in parameter_space.keys():
            # print(f"Param: {param}")
            if np.random.rand() < self.mutation_rate:
                mutated_chromosome[param] = np.random.choice(parameter_space[param])
        return mutated_chromosome
        

    def evaluate_model(self, params, x_train: pd.DataFrame, y_train: pd.Series, eval_set: tuple[pd.DataFrame, pd.Series]):
        params = params | self.default_model_parameters
        
        model = lgb.LGBMRegressor(**params)
        model.fit(
            X=x_train,
            y=y_train,
            eval_set=[(x_train, y_train), eval_set],
            eval_names=["train", "valid"],
            eval_metric="mae",
            callbacks=[lgb.early_stopping(stopping_rounds=100, first_metric_only=True)],
        )
        return model.best_score_.get("valid", {}).get("l1", np.inf)


    def fit(self, x_train: pd.DataFrame, y_train: pd.Series, eval_set: tuple[pd.DataFrame, pd.Series], initial_population_sample: dict | None = None):
        # Genetic algorithm main loop
        population = self.initialize_population(initial_population_sample)

        for generation in range(self.max_generations):
            fitness_scores = [self.evaluate_model(chromosome, x_train, y_train, eval_set) for chromosome in population]
            print("Generation:", generation, "Best score:", min(fitness_scores))

            selected_parents = self.tournament_selection(population, fitness_scores)
            offspring = self.crossover(selected_parents)
            mutated_offspring = [self.mutate(chromosome, self.parameter_space) for chromosome in offspring]
            population = mutated_offspring

        # Select best solution
            best_fitness_score_generation = np.argmin(fitness_scores)
            self.best_solution[generation] = (fitness_scores[best_fitness_score_generation], population[best_fitness_score_generation])

    def fit_mlflow(self, x_train: pd.DataFrame, y_train: pd.Series, eval_set: tuple[pd.DataFrame, pd.Series], initial_population_sample: dict | None = None, experiment_name: str = None):
        mlflow.set_experiment(experiment_name=experiment_name)
        population = self.initialize_population(initial_population_sample)

        with mlflow.start_run(log_system_metrics=True):
            for generation in range(self.max_generations):
                fitness_scores = []
                with mlflow.start_run(run_name=f"generation {generation+1}", nested=True, log_system_metrics=True):
                    for i, chromosome in enumerate(population):
                        with mlflow.start_run(run_name=f"population sample {i+1}", nested=True, log_system_metrics=True):
                            valid_score = self.evaluate_model(chromosome, x_train, y_train, eval_set)
                            fitness_scores.append(valid_score)

                print("Generation:", generation, "Best score:", min(fitness_scores))

                selected_parents = self.tournament_selection(population, fitness_scores)
                offspring = self.crossover(selected_parents)
                mutated_offspring = [self.mutate(chromosome, self.parameter_space) for chromosome in offspring]
                population = mutated_offspring

        # Select best solution
                best_fitness_score_generation = np.argmin(fitness_scores)
                self.best_solution[generation] = (fitness_scores[best_fitness_score_generation], population[best_fitness_score_generation])

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Set to True to launch the genetic search; it trains one model per chromosome
# per generation, so it is kept off by default.
RUN_GENETIC_SEARCH = False

# Candidate values for every tunable LightGBM parameter.
parameter_space = {
    "learning_rate": np.arange(start=0.001, stop=0.1, step=0.0001),
    "num_leaves": np.arange(start=5, stop=100),
    "max_depth": np.arange(start=-1, stop=10),    
    "objective": ["l2", "poisson"],
    "min_split_gain": np.arange(start=0., stop=1., step=0.001),
    "min_child_weight": np.arange(start=0.001, stop=10, step=0.01),
    "min_child_samples": np.arange(start=20, stop=100, step=5),
    "subsample": np.arange(start=0.6, stop=1.0, step=0.05),
    "colsample_bytree": np.arange(start=0.6, stop=1.0, step=0.05),
    "reg_alpha": np.arange(start=0.0, stop=10., step=0.001),
    "reg_lambda": np.arange(start=0.0, stop=10., step=0.001)
}

# Known configuration injected into the first generation; set to None to start
# from a purely random population.
initial_population_sample = {
    "learning_rate": 0.04410,
    "num_leaves": 67,
    "max_depth": 8,
    "objective": "l2",
    "min_split_gain": 0.285, 
    "min_child_weight": 2.031, 
    "min_child_samples": 50, 
    "subsample": 0.85,
    "colsample_bytree": 0.7,
    "reg_alpha": 1.319, 
    "reg_lambda": 9.,
}

if RUN_GENETIC_SEARCH:
    mlflow.lightgbm.autolog(
        log_input_examples=False,
        log_model_signatures=True,
        log_models=True,
        log_datasets=False,
        disable=False,
        exclusive=False,
        disable_for_unsupported_versions=False,
        silent=False,
        registered_model_name=None,
        extra_tags=None
    )

    ga = GeneticAlgorithm(parameter_space=parameter_space, population_size=10, max_generations=5, mutation_rate=0.1, crossover_rate=0.8, tournament_size=3)

    ga.fit_mlflow(x_train, y_train, eval_set=(x_test, y_test), initial_population_sample=initial_population_sample, experiment_name=f"user type: {MODEL_TYPE} is business: {IS_BUSINESS}")

    # A bare expression inside an `if` block is not displayed by the notebook, so print it.
    print(ga.best_solution)