In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir('..')

In [3]:
import optuna
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing, metrics
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
import random
import pygad

from sales_forecasting.utils import timeseries_split
from sales_forecasting.plot import plot_timeseries, plot_feature_importance
from sales_forecasting.features import col_name

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
random.seed(42)
np.random.seed(42)

In [5]:
df = pd.read_parquet(".data/df_agg_monthly_oversampled.parquet")
train_split, valid_split = timeseries_split(df, max_month=33, col='date_block_num', continuous=False)
train_test_split, test_split = timeseries_split(df, max_month=34, col='date_block_num', continuous=False)
target_col = 'item_cnt_month'
train_target, valid_target = train_split[target_col].clip(0, 20), valid_split[target_col].clip(0, 20)
cols_to_drop = [target_col, 'date_block_num', 'shop_id', 'item_id']
X_train, X_valid = train_split.drop(columns=cols_to_drop), valid_split.drop(columns=cols_to_drop)

In [6]:
X_train

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,city_id,item_category_id,general_item_category_id,date_month,month_sin,month_cos,lagged_1,lagged_2,lagged_3,lagged_4,...,rolling_9,avg_shop_item_item_price_lag_1,avg_shop_item_item_cnt_day_lag_1,avg_item_item_price_lag_1,avg_item_item_cnt_day_lag_1,avg_shop_item_category_item_price_lag_1,avg_shop_item_category_item_cnt_day_lag_1,avg_item_category_item_price_lag_1,avg_item_category_item_cnt_day_lag_1,months_since_last_buy
0,0,40,11,8,0.866211,-0.500000,0,0,0,0,...,0.0,169.0,1.0,4488.0,1.000000,270.50,1.163086,264.00,1.080078,0
1,0,40,11,9,0.707031,-0.707031,0,0,0,0,...,0.0,169.0,1.0,4488.0,1.000000,263.50,1.150391,265.50,1.080078,1
2,0,40,11,10,0.500000,-0.866211,0,0,0,0,...,0.0,169.0,1.0,4488.0,1.000000,263.75,1.170898,263.75,1.087891,2
3,0,40,11,11,0.258789,-0.965820,0,0,0,0,...,0.0,169.0,1.0,4488.0,1.000000,262.25,1.138672,264.00,1.089844,3
4,0,40,11,0,0.000000,1.000000,0,0,0,0,...,0.0,169.0,1.0,4488.0,1.000000,262.50,1.228516,262.00,1.131836,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29460948,30,69,14,4,0.866211,0.500000,0,0,0,0,...,0.0,299.0,1.0,169.0,1.069336,1042.00,1.000000,1093.00,1.047852,14
29460949,30,69,14,5,0.965820,0.258789,0,0,0,0,...,0.0,299.0,1.0,169.0,1.069336,489.00,1.000000,1130.00,1.031250,15
29460950,30,69,14,6,1.000000,0.000000,0,0,0,0,...,0.0,299.0,1.0,169.0,1.069336,1080.00,1.125000,938.50,1.047852,16
29460951,30,69,14,7,0.965820,-0.258789,0,0,0,0,...,0.0,299.0,1.0,169.0,1.069336,553.00,1.151367,738.50,1.071289,17


In [9]:
def fitness_function(ga_instance, solution, solution_idx):
    selected_features = [col for col, include in zip(X_train.columns, solution) if include == 1]
    
    # If no features are selected, return a very low fitness value
    if len(selected_features) == 0:
        return 1e-6
    
    X_train_selected = X_train[selected_features]
    X_valid_selected = X_valid[selected_features]
    
    params = {
        'num_leaves': 28,
        'max_depth': 43,
        'learning_rate': 0.07188714405942678,
        'n_estimators': 50,
        'reg_alpha': 1.862771282197631,
        'reg_lambda': 0.6080693179624701,
        'colsample_bytree': 1.0,
        'subsample': 0.7,
        'min_child_samples': 236,
        'min_data_per_groups': 89,
        'random_state': 42,
        'verbose': -1
    }
    model = lgb.LGBMRegressor(**params, boosting_type='gbdt', n_jobs=-1, metric='rmse', objective='regression')
    model.fit(X_train_selected, train_target, eval_set=[(X_valid_selected, valid_target)])

    y_valid_pred = model.predict(X_valid_selected)
    rmse = metrics.root_mean_squared_error(valid_target, y_valid_pred)

    return 1 / (rmse + 1e-6)

In [10]:
num_generations = 5
sol_per_pop = 10
num_genes = X_train.shape[1]

ga_instance = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=4,
    fitness_func=fitness_function,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    init_range_low=0,
    init_range_high=2,
    gene_type=int,
    parent_selection_type="sss",
    crossover_type="single_point",
    mutation_type="random",
    mutation_percent_genes=20
)
ga_instance.run()

solution, solution_fitness, _ = ga_instance.best_solution()
selected_features = [col for col, include in zip(X_train.columns, solution) if include == 1]

print("Best Solution (Feature Mask):", solution)
print("Selected Features:", selected_features)
print("Fitness Score (Inverse RMSE):", solution_fitness)



Best Solution (Feature Mask): [0 0 0 0 0 1 1 0 1 0 0 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0 1]
Selected Features: ['month_cos', 'lagged_1', 'lagged_3', 'lagged_6', 'lagged_7', 'lagged_8', 'lagged_11', 'rolling_6', 'rolling_9', 'avg_shop_item_item_cnt_day_lag_1', 'avg_item_item_price_lag_1', 'avg_shop_item_category_item_price_lag_1', 'avg_shop_item_category_item_cnt_day_lag_1', 'avg_item_category_item_price_lag_1', 'months_since_last_buy']
Fitness Score (Inverse RMSE): 2.6667910197500126


In [12]:
params = {
    'num_leaves': 28,
    'max_depth': 43,
    'learning_rate': 0.07188714405942678,
    'n_estimators': 50,
    'reg_alpha': 1.862771282197631,
    'reg_lambda': 0.6080693179624701,
    'colsample_bytree': 1.0,
    'subsample': 0.7,
    'min_child_samples': 236,
    'min_data_per_groups': 89,
    'random_state': 42,
    'verbose': -1
}
model = lgb.LGBMRegressor(**params, boosting_type='gbdt', n_jobs=-1, metric='rmse', objective='regression')
model.fit(X_train[selected_features], train_target, eval_set=[(X_valid[selected_features], valid_target)])

y_valid_pred = model.predict(X_valid[selected_features])
rmse = metrics.root_mean_squared_error(valid_target, y_valid_pred)

print("Final RMSE with Selected Features:", rmse)

Final RMSE with Selected Features: 0.37498151366308446
