In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import optuna
import pickle

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor

import sys
import os
from tqdm import tqdm

RAND = 10
N_FOLDS = 3

backend_path = os.path.abspath('../backend')
sys.path.append(backend_path)
model_path = os.path.abspath('../models')
sys.path.append(model_path)
from get_metrics import get_metrics_regression
from check_overfitting import check_overfitting

# Import data

In [2]:
# Чтение DataFrame df в файл data/df.csv
df = pd.read_csv('../data/df.csv')

In [3]:
# df = df[-50000:]

In [4]:
df[:3]

Unnamed: 0,milliseconds,place,status,tsunami,significance,data_type,magnitude,country,longitude,latitude,depth,datetime,timezone,magnitude_bins,year,month,day,hour,minute,second
0,1668773163070,"14 km SSE of Eden Roc, Hawaii",automatic,0,58,earthquake,1.94,USA,-155.030334,19.374001,7.1,2022-11-18 12:06:03,+00:00,green,2022,11,18,12,6,3
1,1668773284487,"40 km ESE of Nikolski, Alaska",reviewed,0,62,earthquake,2.0,USA,-168.3108,52.7861,64.9,2022-11-18 12:08:04,+00:00,green,2022,11,18,12,8,4
2,1668773482790,"45 km SW of Howell, Utah",reviewed,0,21,earthquake,1.16,USA,-112.845833,41.512167,5.73,2022-11-18 12:11:22,+00:00,green,2022,11,18,12,11,22


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94354 entries, 0 to 94353
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   milliseconds    94354 non-null  int64  
 1   place           94354 non-null  object 
 2   status          94354 non-null  object 
 3   tsunami         94354 non-null  int64  
 4   significance    94354 non-null  int64  
 5   data_type       94354 non-null  object 
 6   magnitude       94354 non-null  float64
 7   country         94354 non-null  object 
 8   longitude       94354 non-null  float64
 9   latitude        94354 non-null  float64
 10  depth           94354 non-null  float64
 11  datetime        94354 non-null  object 
 12  timezone        94354 non-null  object 
 13  magnitude_bins  94354 non-null  object 
 14  year            94354 non-null  int64  
 15  month           94354 non-null  int64  
 16  day             94354 non-null  int64  
 17  hour            94354 non-null 

In [6]:
# основные описательные статистики для числовых признаков
df.iloc[:, 1:].describe()

Unnamed: 0,tsunami,significance,magnitude,longitude,latitude,depth,year,month,day,hour,minute,second
count,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0
mean,0.000964,67.406363,1.720177,-116.542011,41.560005,26.813648,2022.824915,5.299479,15.482046,11.48601,29.419537,29.46829
std,0.031041,96.398645,1.178204,72.358151,20.35713,56.179701,0.380042,3.429028,8.635858,6.920763,17.35737,17.283395
min,0.0,0.0,0.0,-179.9987,-65.4254,-3.74,2022.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,14.0,0.95,-153.445775,34.018333,3.4,2023.0,3.0,8.0,6.0,14.0,15.0
50%,0.0,30.0,1.4,-122.8525,39.2463,9.2,2023.0,5.0,16.0,11.0,29.0,29.0
75%,0.0,68.0,2.1,-116.717375,58.264,25.6,2023.0,7.0,23.0,17.0,45.0,44.0
max,1.0,2910.0,7.8,179.9994,86.5939,681.238,2023.0,12.0,30.0,23.0,59.0,59.0


In [7]:
# основные описательные статистики для булевых и категориальных признаков
df.describe(include=["object", "bool"])

Unnamed: 0,place,status,data_type,country,datetime,timezone,magnitude_bins
count,94354,94354,94354,94354,94354,94354,94354
unique,34095,3,7,234,93284,1,3
top,"8km NW of The Geysers, CA",reviewed,earthquake,USA,2023-03-02 18:11:07,+00:00,green
freq,973,84325,92210,80057,2,94354,82184


# разделение данных train_test_split

In [8]:
# признаки
X = df[['milliseconds', 'significance', 'country', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']]

# целевые переменные
y = df[['magnitude', 'longitude', 'latitude']]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=RAND)

# LabelEncoder

In [9]:
# создаем LabelEncoder для кодирования категориальных значений
le = LabelEncoder()

# обучаем le на тренировочных и тестовых данных
le.fit(np.concatenate((X_train['country'], X_test['country'])))

# кодирование колонки 'country' для обучения моделей
X_train['country'] = le.transform(X_train['country'])
X_test['country'] = le.transform(X_test['country'])

# StandardScaler

In [10]:
# создаем объект scaler
scaler = StandardScaler()

# масштабируем признаки в X_train
X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.fit_transform(X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

# масштабируем признаки в X_test
X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.transform(X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

# Baseline 

## RandomForestRegressor

In [11]:
# модель RandomForestRegressor
rfr = RandomForestRegressor(random_state=RAND)
# обучаем модель
rfr.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_rfr = rfr.predict(X_test)

In [12]:
# проверка на переобучение модели RandomForestRegressor
check_overfitting(rfr, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 40.778
mean_squared_error test: 270.277
delta = 84.9 %


In [13]:
# просмотр метрик модели RandomForestRegressor обученной на train
metrics = get_metrics_regression(y_test,
                                 y_pred = y_pred_rfr,
                                 X_test = X_test,
                                 name='RandomForestRegressor_Baseline')
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101


## DecisionTreeRegressor

In [14]:
# модель DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=RAND)
# обучаем модель
dtr.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_dtr = dtr.predict(X_test)

In [15]:
# проверка на переобучение модели DecisionTreeRegressor
check_overfitting(dtr, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 0.000
mean_squared_error test: 504.655
delta = 100.0 %


In [16]:
# просмотр метрик модели DecisionTreeRegressor обученной на train
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_dtr,
                           X_test = X_test,
                           name='DecisionTreeRegressor_Baseline')])
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413


## GradientBoostingRegressor

In [17]:
# модель GradientBoostingRegressor
gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=RAND)) 
# обучаем модель
gbr.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_gbr = gbr.predict(X_test)


In [18]:
# проверка на переобучение модели GradientBoostingRegressor
check_overfitting(gbr, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 434.716
mean_squared_error test: 429.898
delta = 1.1 %


In [19]:
# просмотр метрик модели GradientBoostingRegressor обученной на train
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_gbr,
                           X_test = X_test,
                           name='GradientBoostingRegressor_Baseline')])
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055


## KNeighborsRegressor

In [20]:
# модель KNeighborsRegressor
knn = KNeighborsRegressor()
# обучаем модель
knn.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_knn = knn.predict(X_test)

In [21]:
# проверка на переобучение модели KNeighborsRegressor
check_overfitting(knn, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 360.794
mean_squared_error test: 542.268
delta = 33.5 %


In [22]:
# просмотр метрик модели KNeighborsRegressor обученной на train
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_knn,
                           X_test = X_test,
                           name='KNeighborsRegressor')])
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164


# Optuna & KFold подбор гиперпараметров и кросс-валидация

## RandomForestRegressor

In [23]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 3)
    min_weight_fraction_leaf = trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5)
    max_features = trial.suggest_int('max_features', 1, 10)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # RandomForestRegressor с гиперпараметрами
    rfr = RandomForestRegressor(n_estimators=n_estimators,
                                max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,
                                max_features=max_features,
                                bootstrap=bootstrap,
                                random_state=RAND)

    # выполнение кросс-валидации
    scores = cross_val_score(rfr, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=3600)

[I 2024-07-14 03:40:21,166] A new study created in memory with name: no-name-1c0d9d6c-23c0-40a1-8653-4c3aa60f0185
[I 2024-07-14 03:44:37,086] Trial 0 finished with value: -0.633840081589376 and parameters: {'n_estimators': 902, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.025405282755729452, 'max_features': 9, 'bootstrap': True}. Best is trial 0 with value: -0.633840081589376.
[I 2024-07-14 03:45:55,780] Trial 1 finished with value: -0.5393688469107332 and parameters: {'n_estimators': 998, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.06689519450498088, 'max_features': 2, 'bootstrap': True}. Best is trial 0 with value: -0.633840081589376.
[I 2024-07-14 03:46:13,705] Trial 2 finished with value: -0.11837439279206184 and parameters: {'n_estimators': 622, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.46919788975048204, 'max_features': 2, 'bootstrap

In [24]:
# получение лучших гиперпараметров
best_params_rfr = study.best_params

# создание RandomForestRegressor с лучшими гиперпараметрами
rfr_best_params = RandomForestRegressor(**best_params_rfr, random_state=RAND)

# обучение модели
rfr_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_rfr_best_params = rfr_best_params.predict(X_test)

In [25]:
# оценка модели
check_overfitting(rfr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_rfr_best_params,
                           X_test=X_test,
                           name='RandomForestRegressor_best_params_optuna')])
metrics

mean_squared_error train: 569.505
mean_squared_error test: 569.467
delta = 0.0 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,11.096956,569.46714,23.863511,,0.722739


In [26]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'rfr_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(rfr_best_params, f)

## DecisionTreeRegressor

In [27]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    max_depth = trial.suggest_int('max_depth', 5, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 3)
    min_weight_fraction_leaf = trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5)
    max_features = trial.suggest_int('max_features', 1, 10)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 5)

    # DecisionTreeRegressor с гиперпараметрами
    rfr = DecisionTreeRegressor(splitter=splitter,
                                max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,
                                max_features=max_features,
                                max_leaf_nodes=max_leaf_nodes,
                                random_state=RAND)

    # выполнение кросс-валидации
    scores = cross_val_score(rfr, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=3600)

[I 2024-07-14 04:42:57,108] A new study created in memory with name: no-name-4e2686f0-003e-417c-93c2-e471e861a7ed
[I 2024-07-14 04:42:57,440] Trial 0 finished with value: -0.39608056423563087 and parameters: {'splitter': 'best', 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 2, 'min_weight_fraction_leaf': 0.23519959992772949, 'max_features': 9, 'max_leaf_nodes': 5}. Best is trial 0 with value: -0.39608056423563087.
[I 2024-07-14 04:42:57,630] Trial 1 finished with value: -0.00039116178739097133 and parameters: {'splitter': 'random', 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.16416737342291782, 'max_features': 7, 'max_leaf_nodes': 5}. Best is trial 0 with value: -0.39608056423563087.
[I 2024-07-14 04:42:57,731] Trial 2 finished with value: 0.00011229537235276273 and parameters: {'splitter': 'random', 'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 2, 'min_weight_fraction_leaf': 0.4410617418073125, 'max_features'

In [28]:
# получение лучших гиперпараметров
best_params_dtr = study.best_params

# создание RandomForestRegressor с лучшими гиперпараметрами
dtr_best_params = DecisionTreeRegressor(**best_params_dtr, random_state=RAND)

# обучение модели
dtr_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_dtr_best_params = dtr_best_params.predict(X_test)

In [29]:
# оценка модели
check_overfitting(dtr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_dtr_best_params,
                           X_test=X_test,
                           name='DecisionTreeRegressor_best_params_optuna')])
metrics

mean_squared_error train: 884.761
mean_squared_error test: 871.315
delta = 1.5 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,11.096956,569.46714,23.863511,,0.722739
0,DecisionTreeRegressor_best_params_optuna,14.23555,871.314958,29.518045,,0.631328


In [30]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'dtr_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(dtr_best_params, f)

## GradientBoostingRegressor

In [31]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    learning_rate = trial.suggest_float('estimator__learning_rate', 0, 1)
    n_estimators = trial.suggest_int('estimator__n_estimators', 100, 1000)
    subsample = trial.suggest_float('estimator__subsample', 0.1, 1.0)
    min_samples_split = trial.suggest_int('estimator__min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('estimator__min_samples_leaf', 1, 5)
    min_weight_fraction_leaf = trial.suggest_float('estimator__min_weight_fraction_leaf', 0.0, 0.5)
    max_depth = trial.suggest_int('estimator__max_depth', 1, 10)

    # GradientBoostingRegressor с гиперпараметрами
    gbr = MultiOutputRegressor(GradientBoostingRegressor(learning_rate=learning_rate,
                                                         n_estimators=n_estimators,
                                                         subsample=subsample,
                                                         min_samples_split=min_samples_split,
                                                         min_samples_leaf=min_samples_leaf,
                                                         min_weight_fraction_leaf=min_weight_fraction_leaf,
                                                         max_depth=max_depth,
                                                         random_state=RAND))

    # выполнение кросс-валидации
    scores = cross_val_score(gbr, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=3600)

[I 2024-07-14 04:43:27,834] A new study created in memory with name: no-name-4baea1d9-32fa-47df-a863-2e8f595c14ef
[I 2024-07-14 04:51:52,605] Trial 0 finished with value: -0.752756569100946 and parameters: {'estimator__learning_rate': 0.7970102708259263, 'estimator__n_estimators': 914, 'estimator__subsample': 0.5447248933044606, 'estimator__min_samples_split': 7, 'estimator__min_samples_leaf': 4, 'estimator__min_weight_fraction_leaf': 0.11595063764499641, 'estimator__max_depth': 9}. Best is trial 0 with value: -0.752756569100946.
[I 2024-07-14 04:54:14,853] Trial 1 finished with value: -0.7541418724775402 and parameters: {'estimator__learning_rate': 0.3531590254042978, 'estimator__n_estimators': 489, 'estimator__subsample': 0.23768330131532742, 'estimator__min_samples_split': 8, 'estimator__min_samples_leaf': 5, 'estimator__min_weight_fraction_leaf': 0.10039473410087296, 'estimator__max_depth': 5}. Best is trial 1 with value: -0.7541418724775402.
[I 2024-07-14 04:56:45,463] Trial 2 fin

In [32]:
# получение лучших гиперпараметров
best_params_gbr = study.best_params

# создание GradientBoostingRegressor с лучшими гиперпараметрами
gbr_best_params = MultiOutputRegressor(GradientBoostingRegressor(**{k.replace('estimator__', ''): v for k, v in best_params_gbr.items()}, random_state=RAND))

# обучение модели
gbr_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_gbr_best_params = gbr_best_params.predict(X_test)

In [33]:
# оценка модели
check_overfitting(gbr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_gbr_best_params,
                           X_test=X_test,
                           name='GradientBoostingRegressor_best_params_optuna')])
metrics

mean_squared_error train: 345.757
mean_squared_error test: 371.222
delta = 6.9 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,11.096956,569.46714,23.863511,,0.722739
0,DecisionTreeRegressor_best_params_optuna,14.23555,871.314958,29.518045,,0.631328
0,GradientBoostingRegressor_best_params_optuna,8.437457,371.222418,19.267133,,0.859299


In [34]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'gbr_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(gbr_best_params, f)

## KNeighborsRegressor

In [35]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    n_neighbors = trial.suggest_int('n_neighbors', 2, 10)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    algorithm = trial.suggest_categorical('algorithm', ['brute', 'ball_tree', 'kd_tree'])
    leaf_size = trial.suggest_int('leaf_size', 10, 50)
    p = trial.suggest_int('p', 1, 5)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])

    # KNeighborsRegressor с гиперпараметрами
    knn = KNeighborsRegressor(n_neighbors=n_neighbors,
                              weights=weights,
                              algorithm=algorithm,
                              leaf_size=leaf_size,
                              p=p,
                              metric=metric
                              )

    # выполнение кросс-валидации
    scores = cross_val_score(knn, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=3600)

[I 2024-07-14 05:44:42,634] A new study created in memory with name: no-name-9c1d9bdb-3fbc-44ef-9a1c-fcdacc426e9f
[I 2024-07-14 05:44:53,273] Trial 0 finished with value: -0.7875853052208955 and parameters: {'n_neighbors': 6, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 13, 'p': 3, 'metric': 'euclidean'}. Best is trial 0 with value: -0.7875853052208955.
[I 2024-07-14 05:44:58,708] Trial 1 finished with value: -0.7507780817268884 and parameters: {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 33, 'p': 3, 'metric': 'euclidean'}. Best is trial 0 with value: -0.7875853052208955.
[I 2024-07-14 05:45:22,891] Trial 2 finished with value: -0.7938525465114269 and parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 41, 'p': 4, 'metric': 'manhattan'}. Best is trial 2 with value: -0.7938525465114269.
[I 2024-07-14 05:55:03,621] Trial 3 finished with value: -0.7856771870485185 and parameters: {'n_neighbors': 7, 'weights'

In [36]:
# получение лучших гиперпараметров
best_params_knn = study.best_params

# создание RandomForestRegressor с лучшими гиперпараметрами
knn_best_params = KNeighborsRegressor(**best_params_knn)

# обучение модели
knn_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_knn_best_params = knn_best_params.predict(X_test)

In [37]:
# оценка модели
check_overfitting(knn_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_knn_best_params,
                           X_test=X_test,
                           name='KNeighborsRegressor_best_params_optuna')])
metrics

mean_squared_error train: 0.000
mean_squared_error test: 474.034
delta = 100.0 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,11.096956,569.46714,23.863511,,0.722739
0,DecisionTreeRegressor_best_params_optuna,14.23555,871.314958,29.518045,,0.631328
0,GradientBoostingRegressor_best_params_optuna,8.437457,371.222418,19.267133,,0.859299
0,KNeighborsRegressor_best_params_optuna,9.182462,474.034398,21.772331,,0.819349


In [38]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'knn_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(knn_best_params, f)

In [39]:
# сохранение metrics в файл data/metrics.csv
metrics.to_csv('../data/metrics.csv', index=False)

In [40]:
# # Обратное преобразование масштабированных данных в X_train
# X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.inverse_transform(X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

# # Обратное преобразование масштабированных данных в X_test
# X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.inverse_transform(X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

In [41]:
# декодирование колонки 'country'
#X_train['country'] = le.inverse_transform(X_train['country'])
#X_test['country'] = le.inverse_transform(X_test['country'])