In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir('..')

In [3]:
import optuna
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing, metrics
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
import random
import pygad

from sales_forecasting.utils import timeseries_split
from sales_forecasting.plot import plot_timeseries, plot_feature_importance
from sales_forecasting.features import col_name

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
random.seed(42)
np.random.seed(42)

In [5]:
df = pd.read_parquet(".data/df_agg_monthly_oversampled.parquet")
train_split, valid_split = timeseries_split(df, max_month=33, col='date_block_num', continuous=False)
train_test_split, test_split = timeseries_split(df, max_month=34, col='date_block_num', continuous=False)
target_col = 'item_cnt_month'
train_target, valid_target = train_split[target_col].clip(0, 20), valid_split[target_col].clip(0, 20)
cols_to_drop = [target_col, 'date_block_num', 'shop_id', 'item_id']
X_train, X_valid = train_split.drop(columns=cols_to_drop), valid_split.drop(columns=cols_to_drop)

In [28]:
params = {
    "float": {
        'learning_rate': (1e-5, 1.0),
        'reg_alpha': (1e-3, 10.0),
        'reg_lambda': (1e-3, 10.0),
        'colsample_bytree': (0.3, 1.0),
        'subsample': (0.4, 1.0),
    },
    "int": {
        'num_leaves': (20, 150),
        'max_depth': (1, 50),
        'min_child_samples': (1, 300), 
        'cat_smooth' : (1, 100), 
    }
}

gene_space = [
    *({"low": pv[0], "high": pv[1]} for pv in params["float"].values()),
    *({"low": pv[0], "high": pv[1], "step": 1} for pv in params["int"].values())
]

In [29]:
def fitness_function(ga_instance, solution, solution_idx):
    params = {
        "float": {
            'learning_rate': (1e-5, 1.0),
            'reg_alpha': (1e-3, 10.0),
            'reg_lambda': (1e-3, 10.0),
            'colsample_bytree': (0.3, 1.0),
            'subsample': (0.4, 1.0),
        },
        "int": {
            'num_leaves': (20, 150),
            'max_depth': (1, 50),
            'min_child_samples': (1, 300), 
            'cat_smooth' : (1, 100), 
        }
    }
    keys = list(params["float"]) + list(params["int"])

    params = {k: v for k, v in zip(keys, solution)} | {'n_estimators': 50, 'random_state': 42, 'verbose': -1}

    model = lgb.LGBMRegressor(**params, boosting_type='gbdt', n_jobs=-1, metric='rmse', objective='regression')
    model.fit(X_train, train_target, eval_set=[(X_valid, valid_target)])

    y_valid_pred = model.predict(X_valid)
    rmse = metrics.root_mean_squared_error(valid_target, y_valid_pred)
    return -rmse

In [30]:
num_generations = 5
sol_per_pop = 10
num_genes = len(params["float"]) + len(params["int"])

ga_instance = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=4,
    fitness_func=fitness_function,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_space=gene_space,
    gene_type=(len(params["float"]) * [float]) + (len(params["int"]) * [int]),
    parent_selection_type="sss",
    keep_parents=2,
    crossover_type="single_point",
    mutation_type="random",
    mutation_percent_genes=20
)
ga_instance.run()

solution, solution_fitness, solution_idx = ga_instance.best_solution()
best_hyperparams = {k: v for k, v in zip(list(params["float"]) + list(params["int"]), solution)}

print("\nBest Hyperparameters Found:")
print(best_hyperparams)
print("Best RMSE (Validation):", -solution_fitness)




Best Hyperparameters Found:
{'learning_rate': np.float64(0.041077106092708114), 'reg_alpha': np.float64(3.988810193546501), 'reg_lambda': np.float64(4.33577385506984), 'colsample_bytree': np.float64(0.8208298500993807), 'subsample': 0.4, 'num_leaves': 20, 'max_depth': np.int64(25), 'min_child_samples': 151, 'cat_smooth': np.int64(60)}
Best RMSE (Validation): 0.38464007359680524
