## Imports

In [1]:
import os

# Remember the directory the kernel started in. A bare relative
# `os.chdir("../..")` is not idempotent: every re-execution of the cell would
# climb two more levels. Anchoring on the start directory makes re-runs land on
# the same place as the first run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Move two levels up from the notebook's directory (presumably the repository
# root that the `src.*` imports in the next cell rely on).
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir, os.pardir))
os.getcwd()

'/home/paolo/git/enerfit-predict-energy-behaviour'

In [2]:
import lightgbm as lgb
import pandas as pd
from src.data.preprocessing import create_dataset
import mlflow
import plotly.graph_objects as go
import numpy as np

from src.data.utils import get_feature_importances_and_print_useless_columns
from src.data.model_selection import split_train_test

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Random seed; passed as `random_state` to the LightGBM regressor in train_model.
SEED = 666

In [5]:
def get_learning_curves(model: lgb.LGBMModel) -> go.Figure:
    """Plot the recorded `l1` metric of a fitted model, one line per evaluation set.

    Reads `model.evals_result_` for the evaluation sets named "train" and
    "valid" and draws each as a line against a 1-based iteration number.

    Args:
        model: Fitted LightGBM model whose `evals_result_` contains an "l1"
            entry under both "train" and "valid".

    Returns:
        A plotly figure titled 'Learning curves' with one trace per evaluation set.
    """
    history = model.evals_result_
    curves = pd.DataFrame({split: history[split]["l1"] for split in ("train", "valid")})

    # The frame has a 0-based positional index; shift it so iterations start at 1.
    iterations = curves.index + 1

    figure = go.Figure()
    for split in curves.columns:
        trace = go.Scatter(x=iterations, y=curves[split], mode="lines", name=str(split))
        figure.add_trace(trace)

    layout_options = {
        "title": "Learning curves",
        "xaxis_title": "Iteration",
        "yaxis_title": "MAE",
        "margin": {"l": 0, "r": 0, "b": 0, "t": 30},
    }
    figure.update_layout(**layout_options)
    return figure

## Train producer model

In [6]:
# %%time 
# 
# columns_to_drop_producer = ["client_id",
#                    "data_block_id",
#                    # "date",
#                    "date_client",
#                    "date_gas",
#                    "datetime",
#                    "county",
#                    "county_name_forecast",
#                    "latitude_min_forecast",
#                    "latitude_max_forecast",
#                    "longitude_min_forecast",
#                    "longitude_max_forecast",
#                    # "hour",
#                    # "quarter",
#                    "year",
#                    "month"]
# 
# # columns_to_drop = columns_to_drop + ['direct_solar_radiation_last_24_hours',
# # 'cloudcover_low_forecast_last_day',
# # 'winddirection_10m_last_24_hours',
# # 'snowfall_forecast_last_day',
# # '10_metre_u_wind_component',
# # 'product_type',
# # 'sin(day_of_year)',
# # 'rain_last_7_days',
# # 'surface_solar_radiation_downwards_forecast_last_day',
# # 'cloudcover_high_forecast_last_day',
# # 'gas_lowest_price_per_mwh',
# # 'cloudcover_high_last_7_days',
# # 'temperature_last_24_hours',
# # 'cloudcover_low',
# # 'dewpoint_forecast',
# # 'shortwave_radiation',
# # 'cos(day_of_year)',
# # 'dewpoint_forecast_last_day',
# # 'electricity_euros_per_mwh',
# # '10_metre_v_wind_component',
# # 'cos(hour)',
# # 'winddirection_10m',
# # 'longitude_max',
# # 'longitude_min',
# # 'is_holiday',
# # 'shortwave_radiation_last_24_hours',
# # 'cloudcover_low_last_24_hours',
# # 'cloudcover_mid_last_24_hours',
# # 'shortwave_radiation_last_7_days',
# # 'temperature',
# # 'snowfall_last_7_days',
# # 'day',
# # 'sin(hour)',
# # 'temperature_forecast_last_day',
# # 'dewpoint_last_7_days',
# # 'surface_pressure',
# # 'temperature_forecast',
# # 'cloudcover_total_last_24_hours',
# # 'snowfall_last_24_hours',
# # 'rain_last_24_hours',
# # 'diffuse_radiation_last_7_days',
# # 'cloudcover_high',
# # 'winddirection_10m_last_7_days',
# # 'windspeed_10m_last_7_days',
# # 'windspeed_10m',
# # 'direct_solar_radiation',
# # 'temperature_last_7_days',
# # 'diffuse_radiation',
# # 'latitude_min',
# # 'latitude_max',
# # 'dewpoint',
# # 'cloudcover_high_forecast',
# # 'cloudcover_mid_forecast',
# # 'cloudcover_total_forecast',
# # 'cloudcover_high_last_24_hours',
# # 'windspeed_10m_last_24_hours',
# # 'dewpoint_last_24_hours',
# # 'snowfall_forecast',
# # 'gas_mean_price_per_mhw',
# # 'sin(month)',
# # 'rain',
# # 'cos(month)',
# # 'snowfall',
# # 'diffuse_radiation_last_24_hours',
# # 'cloudcover_mid']
# 
# # best model huber = 48.1918, 15 leaves, 10m training
# # best model poisson = 44.699, 15 leaves, 12s training
# producer_model = load_data_and_train_model(model_type="producer", columns_to_drop=columns_to_drop_producer)

In [7]:
# feature_importances_producer = get_feature_importances_and_print_useless_columns(producer_model)
# feature_importances_producer

## Train consumer model

In [8]:
# Candidate column subset (features plus "date" and "target").
# NOTE(review): in the visible code this list is only referenced by a
# commented-out line inside build_dataset, so it currently has no effect on
# which columns reach the model.
keep_cols = ['10_metre_u_wind_component_forecast_last_day',
             '10_metre_v_wind_component',
             'cloudcover_high_forecast_last_day',
             'cloudcover_high_last_7_days',
             'cloudcover_low_forecast',
             'cloudcover_low_forecast_last_day',
             'cloudcover_total_forecast',
             'cloudcover_total_forecast_last_day',
             'cloudcover_total_last_7_days',
             'cos(day_of_year)',
             'cos(hour)',
             'cos(month)',
             'cos(weekday)',
             'county_name',
             'day',
             "date",
             'dewpoint_forecast_last_day',
             'dewpoint_last_24_hours',
             'diffuse_radiation',
             'diffuse_radiation_last_7_days',
             'direct_solar_radiation_forecast',
             'direct_solar_radiation_forecast_last_day',
             'direct_solar_radiation_last_7_days',
             'eic_count',
             'gas_highest_price_per_mwh',
             'hours_ahead',
             'installed_capacity',
             'is_holiday',
             'longitude_min',
             # 'noise',
             'rain_last_7_days',
             'shortwave_radiation',
             'shortwave_radiation_last_7_days',
             'sin(day_of_year)',
             'sin(weekday)',
             'snowfall_forecast',
             'snowfall_last_24_hours',
             'snowfall_last_7_days',
             'surface_solar_radiation_downwards',
             'surface_solar_radiation_downwards_forecast_last_day',
             "target",
             'target_2_days_ago',
             'target_3_days_ago',
             'target_4_days_ago',
             'target_5_days_ago',
             'target_6_days_ago',
             'target_7_days_ago',
             'temperature',
             'temperature_forecast',
             'temperature_forecast_last_day',
             'temperature_last_24_hours',
             'total_precipitation',
             'total_precipitation_forecast_last_day',
             'winddirection_10m_last_7_days',
             'windspeed_10m_last_7_days'
             ]
# Columns removed from the dataset inside build_dataset, after the train/test
# indices have been computed and before the frames are split.
# NOTE(review): despite the `_consumer` suffix, build_dataset applies this list
# for whatever `model_type` it is called with.
columns_to_drop_consumer = [
    "client_id",
    "data_block_id",
    "date",
    "date_client",
    "date_gas",
    "datetime",
    "county",
    "county_name_forecast",
    "latitude_min_forecast",
    "latitude_max_forecast",
    "longitude_min_forecast",
    "longitude_max_forecast",
    "year",
    "month",
    # "noise",
    # "target"
    "installed_capacity",
    "eic_count"
]

def build_dataset(model_type: str, is_business: int):
    """Create the train/test features and targets for one customer segment.

    Steps, in order:
      1. Build the full dataset with `create_dataset` (noise column enabled).
      2. Keep only the rows whose `is_business` equals the given flag and drop
         that column.
      3. Compute the train/test index labels with `split_train_test` on the
         filtered frame (before any other column is dropped).
      4. Drop every column listed in the module-level `columns_to_drop_consumer`.
      5. Split into train/test frames and pop the "target" column out of each.

    Args:
        model_type: Forwarded unchanged to `create_dataset`.
        is_business: Value matched against the `is_business` column.

    Returns:
        Tuple `(x_train, x_test, y_train, y_test)`.
    """
    full_dataset = create_dataset(model_type=model_type, columns=[], add_noise_column=True)
    segment = full_dataset.query("is_business == @is_business")
    segment = segment.drop(columns=["is_business"])

    train_index, test_index = split_train_test(data=segment)

    # Alternative selections kept from the original cell (disabled):
    #   features = segment.drop(columns=["date", "installed_capacity"])
    #   features = segment[keep_cols]
    features = segment.drop(columns=columns_to_drop_consumer)

    def _features_and_target(index_labels):
        # Select the rows, then detach the target column from the selection.
        subset = features.loc[index_labels]
        target = subset.pop("target")
        return subset, target

    x_train, y_train = _features_and_target(train_index)
    x_test, y_test = _features_and_target(test_index)
    return x_train, x_test, y_train, y_test

In [32]:
def train_model(
    x_train: pd.DataFrame,
    y_train: pd.Series,
    eval_set: tuple[pd.DataFrame, pd.Series],
    boosting_type: str = "dart",
) -> lgb.LGBMModel:
    """Fit a LightGBM regressor and track MAE on the train and validation sets.

    The model is evaluated on `(x_train, y_train)` under the name "train" and on
    `eval_set` under the name "valid"; the per-iteration metrics are available
    afterwards through `model.evals_result_`.

    Args:
        x_train: Training features.
        y_train: Training target.
        eval_set: `(features, target)` pair used as the validation set.
        boosting_type: LightGBM boosting type. Defaults to "dart", the value
            this function has always used.

    Returns:
        The fitted `lgb.LGBMRegressor`.

    Notes:
        * LightGBM does not support early stopping in dart mode (the callback
          disables itself with a warning), so with the default boosting type
          all `n_estimators` rounds are trained. The early-stopping callback is
          therefore only attached for other boosting types.
        * `subsample=0.8` has no effect while `subsample_freq=0`: LightGBM only
          performs bagging when the bagging frequency is non-zero. The values
          are kept as they were; set `subsample_freq` to a positive integer to
          actually enable row subsampling.
    """
    model = lgb.LGBMRegressor(
        boosting_type=boosting_type,
        num_leaves=20,
        max_depth=-1,
        learning_rate=0.01,
        n_estimators=10_000,
        subsample_for_bin=200_000,
        objective="l2",
        min_split_gain=0.0,
        min_child_weight=0.001,
        min_child_samples=20,
        subsample=0.8,
        subsample_freq=0,  # 0 disables bagging, so `subsample` above is inert
        colsample_bytree=1.,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=SEED,
        n_jobs=-1,
        importance_type="split",
        linear_tree=True,
        verbosity=1,
        device="cpu",
        max_bin=256
    )

    # The scikit-learn wrapper records evaluation history on its own
    # (`model.evals_result_`), so no explicit record_evaluation callback is needed.
    callbacks = [lgb.log_evaluation()]
    if boosting_type != "dart":
        # Early stopping is unavailable in dart mode; only attach it when it can work.
        callbacks.append(lgb.early_stopping(stopping_rounds=100, first_metric_only=True))

    model.fit(
        X=x_train,
        y=y_train,
        eval_set=[(x_train, y_train), eval_set],
        eval_names=["train", "valid"],
        eval_metric="mae",
        callbacks=callbacks,
    )
    return model

In [ ]:
# Customer segment to model; build_dataset keeps only rows whose `is_business`
# column equals this value.
IS_BUSINESS = 1
# Build the train/test frames once; the training cell below reads these globals.
x_train, x_test, y_train, y_test = build_dataset(model_type="consumer", is_business=IS_BUSINESS)

In [None]:
# Exactly one model is trained per execution: the x_train / x_test frames come
# from the single build_dataset call above. To train the producer model, rebuild
# the dataset with model_type="producer" and change this value to match.
model_type = "consumer"

# Derive the segment label from IS_BUSINESS so the experiment name cannot drift
# from the data that was actually loaded (yields "True" for IS_BUSINESS = 1).
mlflow.set_experiment(f"user type: {model_type} is business: {bool(IS_BUSINESS)}")

mlflow.lightgbm.autolog(
    log_input_examples=False,
    log_model_signatures=True,
    log_models=True,
    log_datasets=False,
    disable=False,
    exclusive=False,
    disable_for_unsupported_versions=False,
    silent=False,
    registered_model_name=None,
    extra_tags=None
)

with mlflow.start_run(log_system_metrics=True) as run:
    # Record which feature columns were used for this run.
    mlflow.log_param("columns", list(x_train.columns))
    model = train_model(x_train=x_train, y_train=y_train, eval_set=(x_test, y_test))

    fig = get_learning_curves(model)
    # Name the artifact after the model type being trained ("consumer_model.png" here).
    mlflow.log_figure(fig, artifact_file=f"{model_type}_model.png")

# feature_importances_consumer = get_feature_importances_and_print_useless_columns(model)
# feature_importances_consumer.query("importance_perc_cumulative >= 100")

2024/02/07 16:06:58 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19671
[LightGBM] [Info] Number of data points in the train set: 365366, number of used features: 92
[LightGBM] [Info] Start training from score 713.439169
[1]	train's l1: 762.165	train's l2: 2.19816e+06	valid's l1: 812.409	valid's l2: 3.12477e+06
[2]	train's l1: 754.888	train's l2: 2.15588e+06	valid's l1: 804.592	valid's l2: 3.06421e+06
[3]	train's l1: 747.684	train's l2: 2.11444e+06	valid's l1: 796.911	valid's l2: 3.00572e+06
[4]	train's l1: 740.558	train's l2: 2.07383e+06	valid's l1: 789.254	valid's l2: 2.94755e+06
[5]	train's l1: 733.502	train's l2: 2.03401e+06	valid's l1: 781.691	valid's l2: 2.89056e+06
[6]	train's l1: 733.573	train's l2: 2.03441e+06	valid's l1: 781.767	valid's l2: 2.89112e+06
[7]	train's l1: 726.59	train's l2: 1.99538e+06	valid's l1: 774.322	valid's l2: 2.83605e+06
[8]	train's

In [11]:
# sorted(feature_importances_consumer.query("importance_perc_cumulative < 100").feature.tolist())

In [12]:
# feature_importances_consumer

## Genetic algorithm

In [13]:
# class GeneticAlgorithm:
#     def __init__(self, parameter_space: dict[str, int | float | bool | str], population_size: int = 10, max_generations: int = 5, mutation_rate: float = 0.1, crossover_rate: float = 0.5, tournament_size: int = 3):
#         self.population_size = population_size
#         self.max_generations = int(max_generations)
#         self.mutation_rate = mutation_rate
#         self.crossover_rate = crossover_rate
#         self.tournament_size = int(tournament_size)
#         self.parameter_space = parameter_space
#         self.best_solution = {}
# 
#     def set_parameter_space(self, parameter_space: dict):
#         self.parameter_space = parameter_space
# 
#     def initialize_population(self, initial_population_sample: dict | None = None) -> list[dict[str, int | float | bool | str]]:
#         population = []
#         for _ in range(self.population_size):
#             chromosome = {param: np.random.choice(values) for param, values in self.parameter_space.items()}
#             population.append(chromosome)
#             
#         if initial_population_sample:    
#             population[np.random.choice(np.arange(self.population_size))] = initial_population_sample
#             
#         return population
# 
#     def tournament_selection(self, population, fitness_scores):
#         selected_parents = []
#         for _ in range(len(population)):
#             tournament_indices = np.random.choice(len(population), self.tournament_size, replace=False)
#             tournament_scores = [fitness_scores[i] for i in tournament_indices]
#             winner_index = tournament_indices[np.argmin(tournament_scores)]
#             selected_parents.append(population[winner_index])
#         return selected_parents
# 
#     def crossover(self, parents):
#         offspring = []
#         for i in range(0, len(parents), 2):
#             parent1, parent2 = parents[i], parents[i + 1]
#             crossover_point = np.random.randint(1, len(parent1))
#             child1 = {**parent1, **{param: parent2[param] for param in parent2.keys()[:crossover_point]}}
#             child2 = {**parent2, **{param: parent1[param] for param in parent1.keys()[:crossover_point]}}
#             offspring.extend([child1, child2])
#         return offspring
# 
# 
#     def mutate(self, chromosome, parameter_space):
#         mutated_chromosome = chromosome.copy()
#         for param in parameter_space.keys():
#             if np.random.rand() < self.mutation_rate:
#                 mutated_chromosome[param] = np.random.choice(parameter_space[param])
#         return mutated_chromosome
#     
#     def evaluate_model(self, params, x_train: pd.DataFrame, y_train: pd.Series, eval_set: tuple[pd.DataFrame, pd.Series]):
#         model = lgb.LGBMRegressor(
#             **params,
#             boosting_type="gbdt",
#             subsample_for_bin=200_000,
#             n_estimators=20_000,
#             importance_type="split",
#             linear_tree=True,
#             verbosity=1,
#             device="cpu",
#             random_state=SEED
#         )
#         model.fit(
#             X=x_train,
#             y=y_train,
#             eval_set=[(x_train, y_train), eval_set],
#             eval_names=["train", "valid"],
#             eval_metric="mae",
#             callbacks=[lgb.early_stopping(stopping_rounds=100, first_metric_only=True)],
#         )
#         return model.best_score_.get("valid", {}).get("l1", np.inf)
#         
#     
#     def fit(self, x_train: pd.DataFrame, y_train: pd.Series, eval_set: tuple[pd.DataFrame, pd.Series], initial_population_sample: dict | None = None):
#         # Genetic algorithm main loop
#         population = self.initialize_population(initial_population_sample)
# 
#         for generation in range(self.max_generations):
#             fitness_scores = [self.evaluate_model(chromosome, x_train, y_train, eval_set) for chromosome in population]
#             print("Generation:", generation, "Best score:", min(fitness_scores))
#         
#             selected_parents = self.tournament_selection(population, fitness_scores)
#             offspring = self.crossover(selected_parents)
#             mutated_offspring = [self.mutate(chromosome, self.parameter_space) for chromosome in offspring]
#         
#             population = mutated_offspring
#         
#         # Select best solution
#         self.best_solution = population[np.argmin(fitness_scores)]
#         
#     def fit_mlflow(self, x_train: pd.DataFrame, y_train: pd.Series, eval_set: tuple[pd.DataFrame, pd.Series], initial_population_sample: dict | None = None, experiment_name: str = None):
#         mlflow.set_experiment(experiment_name=experiment_name)
#         population = self.initialize_population(initial_population_sample)
# 
#         with mlflow.start_run(log_system_metrics=True):
#             for generation in range(self.max_generations):
#                 fitness_scores = []
#                 with mlflow.start_run(run_name=f"generation {generation+1}", nested=True, log_system_metrics=True):
#                     for i, chromosome in enumerate(population):
#                         with mlflow.start_run(run_name=f"population sample {i+1}", nested=True, log_system_metrics=True):
#                             valid_score = self.evaluate_model(chromosome, x_train, y_train, eval_set)
#                             fitness_scores.append(valid_score)
#                 
#                 print("Generation:", generation, "Best score:", min(fitness_scores))
#                 
#                 selected_parents = self.tournament_selection(population, fitness_scores)
#                 offspring = self.crossover(selected_parents)
#                 mutated_offspring = [self.mutate(chromosome, self.parameter_space) for chromosome in offspring]
#                 population = mutated_offspring
# 
#         # Select best solution
#         self.best_solution = population[np.argmin(fitness_scores)]

In [14]:
# parameter_space = {
#     "learning_rate": np.arange(start=0.001, stop=0.1, step=0.0001),
#     "num_leaves": np.arange(start=5, stop=100),
#     "max_depth": np.arange(start=-1, stop=10),    
#     "objective": ["l2", "poisson"],
#     "min_split_gain": np.arange(start=0., stop=1., step=0.001),
#     "min_child_weight": np.arange(start=0.001, stop=10, step=0.01),
#     "min_child_samples": np.arange(start=20, stop=100, step=5),
#     "subsample": np.arange(start=0.2, stop=1.0, step=0.05),
#     "colsample_bytree": np.arange(start=0.2, stop=1.0, step=0.05),
#     "reg_alpha": np.arange(start=0.0, stop=10., step=0.001),
#     "reg_lambda": np.arange(start=0.0, stop=10., step=0.001)
# }
# 
# initial_population_sample = None
# # initial_population_sample = {
# #     
# #     "learning_rate": 0.001,
# #     "num_leaves": 31,
# #     "max_depth": -1,
# #     # "n_estimators": 20_000,
# #     "objective": "poisson",
# #     "min_split_gain": 0., 
# #     "min_child_weight": 0.001, 
# #     "min_child_samples": 20, 
# #     "subsample": 1.0,
# #     "colsample_bytree": 1.0,
# #     "reg_alpha": 0.0, 
# #     "reg_lambda": 0.0,
# # }

In [15]:
# import warnings
# warnings.filterwarnings('ignore')
# 
# mlflow.lightgbm.autolog(
#     log_input_examples=False, 
#     log_model_signatures=True, 
#     log_models=True, 
#     log_datasets=False, 
#     disable=False, 
#     exclusive=False, 
#     disable_for_unsupported_versions=False, 
#     silent=False, 
#     registered_model_name=None, 
#     extra_tags=None
# )
# 
# ga = GeneticAlgorithm(parameter_space=parameter_space, population_size=10, max_generations=5, mutation_rate=0.1, crossover_rate=0.8, tournament_size=3)
# 
# ga.fit_mlflow(x_train, y_train, eval_set=(x_test, y_test), initial_population_sample=initial_population_sample, experiment_name="consumer model not business")