In [None]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

In [11]:
train = pd.read_csv('./train.csv')
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [12]:
test = pd.read_csv('./test.csv')
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
# Удаляем ID
train_id = train['ID']
test_id = test['ID']
train.drop('ID', axis=1, inplace=True)
test.drop('ID', axis=1, inplace=True)

In [14]:
# Целевая переменная
y = train['y']
X = train.drop('y', axis=1)

In [16]:
# Получаем реально существующие категориальные столбцы
existing_categorical_cols = [col for col in [f'X{i}' for i in range(0, 9)] 
                           if col in X.columns]

# Применяем OHE только к существующим столбцам
X = pd.get_dummies(X, columns=existing_categorical_cols, drop_first=True)
test_prepared = pd.get_dummies(test, columns=existing_categorical_cols, drop_first=True)

# Выравниваем столбцы
common_cols = X.columns.intersection(test_prepared.columns)
X = X[common_cols]
test_prepared = test_prepared[common_cols]

In [17]:
X

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,1,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False
1,0,0,0,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False
2,0,0,0,0,0,0,0,1,0,0,...,False,False,False,False,False,False,False,False,True,False
3,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,1,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,False
4205,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4206,0,0,1,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4207,0,0,0,0,1,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False


In [19]:
# Разделим на трейн/валидацию
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Посмотрим на модели из коробки

LinearRegression

In [23]:
lr = LinearRegression()
lr.fit(X_train, y_train)

# Прогноз и оценка на валидации
y_pred_baseline = lr.predict(X_val)
r2_baseline = r2_score(y_val, y_pred_baseline)
rmse_baseline = root_mean_squared_error(y_val, y_pred_baseline)
print(f"Baseline (Linear Regression) - R2: {r2_baseline:.4f}, RMSE: {rmse_baseline:.4f}")

Baseline (Linear Regression) - R2: 0.5446, RMSE: 8.4191


Случайный лес

In [26]:
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)  # n_jobs=-1 использует все ядра
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
r2_rf = r2_score(y_val, y_pred_rf)
rmse_rf = root_mean_squared_error(y_val, y_pred_rf)
print(f"Random Forest - R2: {r2_rf:.4f}, RMSE: {rmse_rf:.4f}")

Random Forest - R2: 0.4711, RMSE: 9.0731


Бустинг

In [28]:
lgbm = LGBMRegressor(n_estimators=1000, learning_rate=0.05, random_state=42, n_jobs=-1)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_val)
r2_lgbm = r2_score(y_val, y_pred_lgbm)
rmse_lgbm = root_mean_squared_error(y_val, y_pred_lgbm)
print(f"LightGBM - R2: {r2_lgbm:.4f}, RMSE: {rmse_lgbm:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 746
[LightGBM] [Info] Number of data points in the train set: 3367, number of used features: 373
[LightGBM] [Info] Start training from score 100.685794
LightGBM - R2: 0.4970, RMSE: 8.8480


# AutoML: H20

In [29]:
import h2o
from h2o.automl import H2OAutoML
h2o.init()  # Запускает локальный сервер H2O

# Конвертируем pandas DataFrame в H2O Frame
train_h2o = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test_h2o = h2o.H2OFrame(X_val)  # Для валидации

# Указываем целевую переменную и предикторы
y_col = 'y'
x_cols = train_h2o.columns
x_cols.remove(y_col)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "21.0.8" 2025-07-15; OpenJDK Runtime Environment (build 21.0.8+9-Ubuntu-0ubuntu124.04.1); OpenJDK 64-Bit Server VM (build 21.0.8+9-Ubuntu-0ubuntu124.04.1, mixed mode, sharing)
  Starting server from /home/nikita/.local/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpmqcon4u3
  JVM stdout: /tmp/tmpmqcon4u3/h2o_nikita_started_from_python.out
  JVM stderr: /tmp/tmpmqcon4u3/h2o_nikita_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Asia/Yekaterinburg
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 6 days
H2O_cluster_name:,H2O_from_python_nikita_m216w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.740 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [30]:
aml = H2OAutoML(max_runtime_secs=300,
                seed=42,
                sort_metric="RMSE")

aml.train(x=x_cols, y=y_col, training_frame=train_h2o)

AutoML progress: |
17:23:02.80: _train param, Dropping bad and constant columns: [X107, X2_j, X297, X330, X233, X288, X93, X289, X235, X268, X347, X11, X290, X293]

███
17:23:12.543: _train param, Dropping bad and constant columns: [X107, X2_j, X297, X330, X233, X288, X93, X289, X235, X268, X347, X11, X290, X293]

█
17:23:17.553: _train param, Dropping bad and constant columns: [X107, X2_j, X297, X330, X233, X288, X93, X289, X235, X268, X347, X11, X290, X293]

█
17:23:23.380: _train param, Dropping unused columns: [X107, X2_j, X297, X330, X233, X288, X93, X289, X235, X268, X347, X11, X290, X293]
17:23:23.712: _train param, Dropping bad and constant columns: [X107, X2_j, X297, X330, X233, X288, X93, X289, X235, X268, X347, X11, X290, X293]

█
17:23:28.950: _train param, Dropping bad and constant columns: [X107, X2_j, X297, X330, X233, X288, X93, X289, X235, X268, X347, X11, X290, X293]

████
17:23:46.44: _train param, Dropping bad and constant columns: [X107, X2_j, X297, X330, X233, X28

key,value
Stacking strategy,cross_validation
Number of base models (used / total),9/48
# GBM base models (used / total),2/21
# XGBoost base models (used / total),5/20
# GLM base models (used / total),0/1
# DeepLearning base models (used / total),2/4
# DRF base models (used / total),0/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,4800.074,258.89215,4856.795,4967.4507,5093.94,4631.558,4450.627
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,5.329521,0.206975,5.4224696,5.326719,5.6260586,5.1384993,5.1338587
mean_residual_deviance,72.27371,20.659159,73.65512,73.91738,104.915825,53.72907,55.151154
mse,72.27371,20.659159,73.65512,73.91738,104.915825,53.72907,55.151154
null_deviance,109142.18,15686.828,117989.125,109789.68,130116.164,93028.07,94787.87
r2,0.5594233,0.0681071,0.5777343,0.534039,0.4538832,0.608837,0.6226231
residual_deviance,48773.688,14350.316,49790.863,51150.824,70923.1,36320.85,35682.797
rmse,8.435802,1.1784204,8.582256,8.597522,10.242843,7.3300114,7.4263825
rmsle,0.0744682,0.0058288,0.0761948,0.0767981,0.0821343,0.0685481,0.0686654


In [31]:
# Лидерборд
lb = aml.leaderboard
print(lb.head())  # Показать топ моделей

# Прогноз лидера на валидации (H2O Frame)
preds_h2o = aml.leader.predict(test_h2o)

# Конвертируем прогнозы обратно в numpy array и оцениваем
y_pred_aml = preds_h2o.as_data_frame().values.flatten()
r2_aml = r2_score(y_val, y_pred_aml)
rmse_aml = root_mean_squared_error(y_val, y_pred_aml)
print(f"H2O AutoML Leader - R2: {r2_aml:.4f}, RMSE: {rmse_aml:.4f}")

model_id                                                    rmse      mse      mae      rmsle    mean_residual_deviance
StackedEnsemble_AllModels_3_AutoML_1_20250802_172301     8.51052  72.429   5.33119  0.0747102                   72.429
StackedEnsemble_BestOfFamily_4_AutoML_1_20250802_172301  8.51947  72.5813  5.34446  0.0747922                   72.5813
XGBoost_grid_1_AutoML_1_20250802_172301_model_4          8.5198   72.587   5.32091  0.0747685                   72.587
GBM_grid_1_AutoML_1_20250802_172301_model_12             8.52041  72.5975  5.36149  0.0748148                   72.5975
StackedEnsemble_BestOfFamily_3_AutoML_1_20250802_172301  8.57344  73.5038  5.38285  0.0753248                   73.5038
StackedEnsemble_AllModels_2_AutoML_1_20250802_172301     8.57563  73.5415  5.38353  0.0753551                   73.5415
GBM_grid_1_AutoML_1_20250802_172301_model_13             8.57735  73.571   5.39054  0.0754492                   73.571
StackedEnsemble_BestOfFamily_2_AutoML_1_202




In [32]:
leader_model = aml.leader  # Получаем лучшую модель

# Основная информация о модели
print(leader_model)  # Название модели и параметры

# Детализированный отчет
print(leader_model.summary())

Model Details
H2OStackedEnsembleEstimator : Stacked Ensemble
Model Key: StackedEnsemble_AllModels_3_AutoML_1_20250802_172301


Model Summary for Stacked Ensemble: 
key                                        value
-----------------------------------------  ----------------
Stacking strategy                          cross_validation
Number of base models (used / total)       9/48
# GBM base models (used / total)           2/21
# XGBoost base models (used / total)       5/20
# GLM base models (used / total)           0/1
# DeepLearning base models (used / total)  2/4
# DRF base models (used / total)           0/2
Metalearner algorithm                      GLM
Metalearner fold assignment scheme         Random
Metalearner nfolds                         5
Metalearner fold_column
Custom metalearner hyperparameters         None

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 61.634316608803466
RMSE: 7.850752614164037
MAE: 4.93463825231696
RMSLE: 0.06836027427799

In [33]:
if "StackedEnsemble" in leader_model.model_id:
    print("Состав ансамбля:")
    print(leader_model.base_models)  # Список базовых моделей
    
    # Детализация мета-модели
    metalearner = leader_model.metalearner()
    print("\nМета-модель:")
    print(metalearner.summary())

Состав ансамбля:
['XGBoost_grid_1_AutoML_1_20250802_172301_model_4', 'GBM_grid_1_AutoML_1_20250802_172301_model_12', 'GBM_grid_1_AutoML_1_20250802_172301_model_13', 'GBM_2_AutoML_1_20250802_172301', 'GBM_3_AutoML_1_20250802_172301', 'GBM_grid_1_AutoML_1_20250802_172301_model_2', 'XGBoost_grid_1_AutoML_1_20250802_172301_model_14', 'GBM_grid_1_AutoML_1_20250802_172301_model_10', 'XGBoost_grid_1_AutoML_1_20250802_172301_model_6', 'GBM_4_AutoML_1_20250802_172301', 'GBM_grid_1_AutoML_1_20250802_172301_model_7', 'GBM_grid_1_AutoML_1_20250802_172301_model_9', 'GBM_grid_1_AutoML_1_20250802_172301_model_4', 'GBM_grid_1_AutoML_1_20250802_172301_model_11', 'GBM_1_AutoML_1_20250802_172301', 'GBM_grid_1_AutoML_1_20250802_172301_model_5', 'GLM_1_AutoML_1_20250802_172301', 'GBM_grid_1_AutoML_1_20250802_172301_model_6', 'GBM_grid_1_AutoML_1_20250802_172301_model_8', 'XGBoost_grid_1_AutoML_1_20250802_172301_model_15', 'DeepLearning_1_AutoML_1_20250802_172301', 'GBM_grid_1_AutoML_1_20250802_172301_model

# Вывод  
Видно, что automl бьет лучший результат (регрессию из коробки), но несильно.  
Решение automl лучше по score, но получившаяся модель очень сложная, возможно это сильно большая плата за такой прирост.  
Из плюсов в копилку automl - он еще посмотрел на фичи и выкинул константные самостоятельно.  