In [1]:
import h2o

In [2]:
import pandas as pd
from h2o.frame import H2OFrame

Подготовим данные

In [3]:
# Загрузка датасета
data = pd.read_csv('Walmart.csv')

data['Date'] = pd.to_datetime(data['Date'], dayfirst=1)


# Определяем времена года
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'autumn'

# Применяем функцию к колонке с датой
data['season'] = data['Date'].apply(get_season)

# Создаем бинарные колонки для каждого времени года
data = pd.get_dummies(data, columns=['season'], prefix='', prefix_sep='')

data.drop('Date', axis=1)

data = data.drop('Date', axis=1) # Удаляем даты



In [5]:
h2o.init() # Инициализация h2o

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.351-b10, mixed mode)
  Starting server from D:\soft\Anaconda\envs\DZ3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\LP\AppData\Local\Temp\tmpnfgldvqq
  JVM stdout: C:\Users\LP\AppData\Local\Temp\tmpnfgldvqq\h2o_LP_started_from_python.out
  JVM stderr: C:\Users\LP\AppData\Local\Temp\tmpnfgldvqq\h2o_LP_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Minsk
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,3 months and 2 days
H2O_cluster_name:,H2O_from_python_LP_6gmop7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.501 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [6]:
# Преобразование в H2OFrame
hf = h2o.H2OFrame(data)

# Разделение на обучающую и тестовую выборки (80% / 20%)
train, test = hf.split_frame(ratios=[0.8], seed=42)

# Указываем целевую переменную
target = "Weekly_Sales"
features = [col for col in hf.columns if col != target]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [33]:
from h2o.estimators import H2OGradientBoostingEstimator

# Создаём модель GBM
gbm = H2OGradientBoostingEstimator(
    ntrees=100,         # Количество деревьев
    max_depth=5,        # Глубина деревьев
    learn_rate=0.1,     # Скорость обучения
    sample_rate=0.8,    # Сэмплирование выборки
    col_sample_rate=0.8 # Сэмплирование признаков
)

# Обучение модели
gbm.train(x=features, y=target, training_frame=train)

# Оценка модели
performance = gbm.model_performance(test_data=test)
print(performance)
r2 = gbm.r2()
print(f'R2-score: {r2}')

gbm Model Build progress: |

██████████████████████████████████████████████████████| (done) 100%
ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 17400852476.775955
RMSE: 131912.29084803263
MAE: 78779.16774965209
RMSLE: 0.11654768753613556
Mean Residual Deviance: 17400852476.775955
R2-score: 0.9561650444449035


In [34]:
# save the model
model_path = h2o.save_model(model=gbm, force=True)

In [36]:
# load the model
saved_model = h2o.load_model(model_path)

In [37]:
saved_model

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,100.0,100.0,39583.0,5.0,5.0,5.0,18.0,32.0,26.83

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2025-02-04 16:07:35,0.009 sec,0.0,565382.7095505,468490.2833379,319657608258.6486816
,2025-02-04 16:07:35,0.028 sec,1.0,532356.5394468,441190.9603023,283403485091.7838135
,2025-02-04 16:07:35,0.032 sec,2.0,505977.3925139,419307.0836770,256013121735.1910095
,2025-02-04 16:07:35,0.035 sec,3.0,472240.3164459,390913.3564550,223010916476.9095764
,2025-02-04 16:07:35,0.038 sec,4.0,444634.0011147,367258.1922545,197699394947.3037109
,2025-02-04 16:07:35,0.041 sec,5.0,424586.4252357,350418.6436178,180273632494.4631042
,2025-02-04 16:07:35,0.045 sec,6.0,402800.5811363,330699.7038497,162248308163.7164917
,2025-02-04 16:07:35,0.049 sec,7.0,381800.7339772,310668.6289540,145771800465.5508118
,2025-02-04 16:07:35,0.052 sec,8.0,364536.0978689,294356.9070808,132886566649.5152893
,2025-02-04 16:07:35,0.056 sec,9.0,346834.2188353,278929.6421987,120293975355.1177979

variable,relative_importance,scaled_importance,percentage
Store,4918055918370816.0,1.0,0.7088157
CPI,1128417026113536.0,0.2294437,0.1626333
Unemployment,650834111102976.0,0.1323356,0.0938016
Temperature,103742552670208.0,0.0210942,0.0149519
Fuel_Price,80122170310656.0,0.0162914,0.0115476
winter,23802845069312.0,0.0048399,0.0034306
autumn,15438037647360.0,0.0031391,0.002225
Holiday_Flag,12612011032576.0,0.0025644,0.0018177
spring,3037430546432.0,0.0006176,0.0004378
summer,2350747484160.0,0.000478,0.0003388


Получены метрики:
MSE: 17400852476.775955
R2-score: 0.9561650444449035