In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import optuna
import pickle

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor

import sys
import os
from tqdm import tqdm

RAND = 10
N_FOLDS = 3

backend_path = os.path.abspath('../backend')
sys.path.append(backend_path)
model_path = os.path.abspath('../models')
sys.path.append(model_path)
from get_metrics import get_metrics_regression
from check_overfitting import check_overfitting

# Import data

In [43]:
# Чтение DataFrame df в файл data/df.csv
df = pd.read_csv('../data/df.csv')

In [44]:
# df = df[-50000:]

In [45]:
df[:3]

Unnamed: 0,milliseconds,place,status,tsunami,significance,data_type,magnitude,country,longitude,latitude,depth,datetime,timezone,magnitude_bins,year,month,day,hour,minute,second
0,1668773163070,"14 km SSE of Eden Roc, Hawaii",automatic,0,58,earthquake,1.94,USA,-155.030334,19.374001,7.1,2022-11-18 12:06:03,+00:00,green,2022,11,18,12,6,3
1,1668773284487,"40 km ESE of Nikolski, Alaska",reviewed,0,62,earthquake,2.0,USA,-168.3108,52.7861,64.9,2022-11-18 12:08:04,+00:00,green,2022,11,18,12,8,4
2,1668773482790,"45 km SW of Howell, Utah",reviewed,0,21,earthquake,1.16,USA,-112.845833,41.512167,5.73,2022-11-18 12:11:22,+00:00,green,2022,11,18,12,11,22


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94354 entries, 0 to 94353
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   milliseconds    94354 non-null  int64  
 1   place           94354 non-null  object 
 2   status          94354 non-null  object 
 3   tsunami         94354 non-null  int64  
 4   significance    94354 non-null  int64  
 5   data_type       94354 non-null  object 
 6   magnitude       94354 non-null  float64
 7   country         94354 non-null  object 
 8   longitude       94354 non-null  float64
 9   latitude        94354 non-null  float64
 10  depth           94354 non-null  float64
 11  datetime        94354 non-null  object 
 12  timezone        94354 non-null  object 
 13  magnitude_bins  94354 non-null  object 
 14  year            94354 non-null  int64  
 15  month           94354 non-null  int64  
 16  day             94354 non-null  int64  
 17  hour            94354 non-null 

In [47]:
# основные описательные статистики для числовых признаков
df.iloc[:, 1:].describe()

Unnamed: 0,tsunami,significance,magnitude,longitude,latitude,depth,year,month,day,hour,minute,second
count,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0
mean,0.000964,67.406363,1.720177,-116.542011,41.560005,26.813648,2022.824915,5.299479,15.482046,11.48601,29.419537,29.46829
std,0.031041,96.398645,1.178204,72.358151,20.35713,56.179701,0.380042,3.429028,8.635858,6.920763,17.35737,17.283395
min,0.0,0.0,0.0,-179.9987,-65.4254,-3.74,2022.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,14.0,0.95,-153.445775,34.018333,3.4,2023.0,3.0,8.0,6.0,14.0,15.0
50%,0.0,30.0,1.4,-122.8525,39.2463,9.2,2023.0,5.0,16.0,11.0,29.0,29.0
75%,0.0,68.0,2.1,-116.717375,58.264,25.6,2023.0,7.0,23.0,17.0,45.0,44.0
max,1.0,2910.0,7.8,179.9994,86.5939,681.238,2023.0,12.0,30.0,23.0,59.0,59.0


In [48]:
# основные описательные статистики для булевых и категориальных признаков
df.describe(include=["object", "bool"])

Unnamed: 0,place,status,data_type,country,datetime,timezone,magnitude_bins
count,94354,94354,94354,94354,94354,94354,94354
unique,34095,3,7,234,93284,1,3
top,"8km NW of The Geysers, CA",reviewed,earthquake,USA,2023-03-02 18:11:07,+00:00,green
freq,973,84325,92210,80057,2,94354,82184


# разделение данных train_test_split

In [49]:
# признаки
X = df[['milliseconds', 'significance', 'country', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']]

# целевые переменные
y = df[['magnitude', 'longitude', 'latitude']]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=RAND)

# LabelEncoder

In [50]:
# создаем LabelEncoder для кодирования категориальных значений
le = LabelEncoder()

# обучаем le на тренировочных и тестовых данных
le.fit(np.concatenate((X_train['country'], X_test['country'])))

# кодирование колонки 'country' для обучения моделей
X_train['country'] = le.transform(X_train['country'])
X_test['country'] = le.transform(X_test['country'])

# StandardScaler

In [51]:
# создаем объект scaler
scaler = StandardScaler()

# масштабируем признаки в X_train
X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.fit_transform(X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

# масштабируем признаки в X_test
X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.transform(X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

# Baseline 

## RandomForestRegressor

In [52]:
# модель RandomForestRegressor
rfr = RandomForestRegressor(random_state=RAND)
# обучаем модель
rfr.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_rfr = rfr.predict(X_test)

In [53]:
# проверка на переобучение модели RandomForestRegressor
check_overfitting(rfr, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 40.778
mean_squared_error test: 270.277
delta = 84.9 %


In [54]:
# просмотр метрик модели RandomForestRegressor обученной на train
metrics = get_metrics_regression(y_test,
                                 y_pred = y_pred_rfr,
                                 X_test = X_test,
                                 name='RandomForestRegressor_Baseline')
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101


## DecisionTreeRegressor

In [55]:
# модель DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=RAND)
# обучаем модель
dtr.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_dtr = dtr.predict(X_test)

In [56]:
# проверка на переобучение модели DecisionTreeRegressor
check_overfitting(dtr, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 0.000
mean_squared_error test: 504.655
delta = 100.0 %


In [57]:
# просмотр метрик модели DecisionTreeRegressor обученной на train
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_dtr,
                           X_test = X_test,
                           name='DecisionTreeRegressor_Baseline')])
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413


## GradientBoostingRegressor

In [58]:
# модель GradientBoostingRegressor
gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=RAND)) 
# обучаем модель
gbr.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_gbr = gbr.predict(X_test)


In [59]:
# проверка на переобучение модели GradientBoostingRegressor
check_overfitting(gbr, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 434.716
mean_squared_error test: 429.898
delta = 1.1 %


In [60]:
# просмотр метрик модели GradientBoostingRegressor обученной на train
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_gbr,
                           X_test = X_test,
                           name='GradientBoostingRegressor_Baseline')])
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055


## KNeighborsRegressor

In [61]:
# модель KNeighborsRegressor
knn = KNeighborsRegressor()
# обучаем модель
knn.fit(X_train, y_train)
#  предсказания на тестовой выборке
y_pred_knn = knn.predict(X_test)

In [62]:
# проверка на переобучение модели KNeighborsRegressor
check_overfitting(knn, X_train, y_train, X_test, y_test, mean_squared_error)

mean_squared_error train: 360.794
mean_squared_error test: 542.268
delta = 33.5 %


In [63]:
# просмотр метрик модели KNeighborsRegressor обученной на train
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_knn,
                           X_test = X_test,
                           name='KNeighborsRegressor')])
metrics

Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164


# Optuna & KFold подбор гиперпараметров и кросс-валидация

## RandomForestRegressor

In [64]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 3)
    min_weight_fraction_leaf = trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5)
    max_features = trial.suggest_int('max_features', 1, 10)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # RandomForestRegressor с гиперпараметрами
    rfr = RandomForestRegressor(n_estimators=n_estimators,
                                max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,
                                max_features=max_features,
                                bootstrap=bootstrap,
                                random_state=RAND)

    # выполнение кросс-валидации
    scores = cross_val_score(rfr, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=7200)

[I 2024-07-14 09:10:12,844] A new study created in memory with name: no-name-35672c1f-53ed-4b6f-9102-d4d25077a64e
[I 2024-07-14 09:11:20,926] Trial 0 finished with value: -0.36977154380270955 and parameters: {'n_estimators': 826, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.2690831025880113, 'max_features': 9, 'bootstrap': True}. Best is trial 0 with value: -0.36977154380270955.
[I 2024-07-14 09:17:18,431] Trial 1 finished with value: -0.6685571314586117 and parameters: {'n_estimators': 766, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2, 'min_weight_fraction_leaf': 0.023614757817893295, 'max_features': 10, 'bootstrap': False}. Best is trial 1 with value: -0.6685571314586117.
[I 2024-07-14 09:17:41,171] Trial 2 finished with value: -0.2510848787492726 and parameters: {'n_estimators': 387, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.4428509433376912, 'max_features': 10, 'boo

In [65]:
# получение лучших гиперпараметров
best_params_rfr = study.best_params

# создание RandomForestRegressor с лучшими гиперпараметрами
rfr_best_params = RandomForestRegressor(**best_params_rfr, random_state=RAND)

# обучение модели
rfr_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_rfr_best_params = rfr_best_params.predict(X_test)

In [66]:
# оценка модели
check_overfitting(rfr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_rfr_best_params,
                           X_test=X_test,
                           name='RandomForestRegressor_best_params_optuna')])
metrics

mean_squared_error train: 385.090
mean_squared_error test: 389.014
delta = 1.0 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,8.747466,389.013688,19.72343,,0.813048


In [67]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'rfr_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(rfr_best_params, f)

## DecisionTreeRegressor

In [68]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    max_depth = trial.suggest_int('max_depth', 5, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 3)
    min_weight_fraction_leaf = trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5)
    max_features = trial.suggest_int('max_features', 1, 10)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 5)

    # DecisionTreeRegressor с гиперпараметрами
    rfr = DecisionTreeRegressor(splitter=splitter,
                                max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,
                                max_features=max_features,
                                max_leaf_nodes=max_leaf_nodes,
                                random_state=RAND)

    # выполнение кросс-валидации
    scores = cross_val_score(rfr, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=7200)

[I 2024-07-14 11:12:17,276] A new study created in memory with name: no-name-b86f50fd-ce05-4ab7-ac8f-8a85ee8b9221
[I 2024-07-14 11:12:17,542] Trial 0 finished with value: -0.3219452108460781 and parameters: {'splitter': 'best', 'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.32996747434895346, 'max_features': 10, 'max_leaf_nodes': 4}. Best is trial 0 with value: -0.3219452108460781.
[I 2024-07-14 11:12:17,680] Trial 1 finished with value: -0.0014706921098902716 and parameters: {'splitter': 'best', 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.31040738069715434, 'max_features': 5, 'max_leaf_nodes': 3}. Best is trial 0 with value: -0.3219452108460781.
[I 2024-07-14 11:12:17,784] Trial 2 finished with value: 4.596756787358627e-05 and parameters: {'splitter': 'random', 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.30029742235951207, 'max_features': 10

In [69]:
# получение лучших гиперпараметров
best_params_dtr = study.best_params

# создание RandomForestRegressor с лучшими гиперпараметрами
dtr_best_params = DecisionTreeRegressor(**best_params_dtr, random_state=RAND)

# обучение модели
dtr_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_dtr_best_params = dtr_best_params.predict(X_test)

In [70]:
# оценка модели
check_overfitting(dtr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_dtr_best_params,
                           X_test=X_test,
                           name='DecisionTreeRegressor_best_params_optuna')])
metrics

mean_squared_error train: 884.761
mean_squared_error test: 871.315
delta = 1.5 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,8.747466,389.013688,19.72343,,0.813048
0,DecisionTreeRegressor_best_params_optuna,14.23555,871.314958,29.518045,,0.631328


In [71]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'dtr_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(dtr_best_params, f)

## GradientBoostingRegressor

In [72]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    learning_rate = trial.suggest_float('estimator__learning_rate', 0, 1)
    n_estimators = trial.suggest_int('estimator__n_estimators', 100, 1000)
    subsample = trial.suggest_float('estimator__subsample', 0.1, 1.0)
    min_samples_split = trial.suggest_int('estimator__min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('estimator__min_samples_leaf', 1, 5)
    min_weight_fraction_leaf = trial.suggest_float('estimator__min_weight_fraction_leaf', 0.0, 0.5)
    max_depth = trial.suggest_int('estimator__max_depth', 1, 10)

    # GradientBoostingRegressor с гиперпараметрами
    gbr = MultiOutputRegressor(GradientBoostingRegressor(learning_rate=learning_rate,
                                                         n_estimators=n_estimators,
                                                         subsample=subsample,
                                                         min_samples_split=min_samples_split,
                                                         min_samples_leaf=min_samples_leaf,
                                                         min_weight_fraction_leaf=min_weight_fraction_leaf,
                                                         max_depth=max_depth,
                                                         random_state=RAND))

    # выполнение кросс-валидации
    scores = cross_val_score(gbr, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=7200)

[I 2024-07-14 11:12:40,891] A new study created in memory with name: no-name-9a1b49c9-d0a4-4d89-8eac-dd342619717b
[I 2024-07-14 11:14:40,447] Trial 0 finished with value: -0.24722659602585417 and parameters: {'estimator__learning_rate': 0.9354162881041332, 'estimator__n_estimators': 504, 'estimator__subsample': 0.97951829355189, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 3, 'estimator__min_weight_fraction_leaf': 0.4855684349029239, 'estimator__max_depth': 7}. Best is trial 0 with value: -0.24722659602585417.
[I 2024-07-14 11:19:52,124] Trial 1 finished with value: -0.8693556588320622 and parameters: {'estimator__learning_rate': 0.5588776818275955, 'estimator__n_estimators': 218, 'estimator__subsample': 0.8904292491859971, 'estimator__min_samples_split': 7, 'estimator__min_samples_leaf': 4, 'estimator__min_weight_fraction_leaf': 0.006542360472891695, 'estimator__max_depth': 8}. Best is trial 1 with value: -0.8693556588320622.
[I 2024-07-14 11:20:40,109] Trial 2 fi

In [73]:
# получение лучших гиперпараметров
best_params_gbr = study.best_params

# создание GradientBoostingRegressor с лучшими гиперпараметрами
gbr_best_params = MultiOutputRegressor(GradientBoostingRegressor(**{k.replace('estimator__', ''): v for k, v in best_params_gbr.items()}, random_state=RAND))

# обучение модели
gbr_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_gbr_best_params = gbr_best_params.predict(X_test)

In [74]:
# оценка модели
check_overfitting(gbr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_gbr_best_params,
                           X_test=X_test,
                           name='GradientBoostingRegressor_best_params_optuna')])
metrics

mean_squared_error train: 255.541
mean_squared_error test: 302.703
delta = 15.6 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,8.747466,389.013688,19.72343,,0.813048
0,DecisionTreeRegressor_best_params_optuna,14.23555,871.314958,29.518045,,0.631328
0,GradientBoostingRegressor_best_params_optuna,7.221774,302.703266,17.39837,,0.882389


In [75]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'gbr_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(gbr_best_params, f)

## KNeighborsRegressor

In [76]:
# целевая функция для оптимизации
def objective(trial):
    # гиперпараметры для настройки
    n_neighbors = trial.suggest_int('n_neighbors', 2, 10)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    algorithm = trial.suggest_categorical('algorithm', ['brute', 'ball_tree', 'kd_tree'])
    leaf_size = trial.suggest_int('leaf_size', 10, 50)
    p = trial.suggest_int('p', 1, 5)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])

    # KNeighborsRegressor с гиперпараметрами
    knn = KNeighborsRegressor(n_neighbors=n_neighbors,
                              weights=weights,
                              algorithm=algorithm,
                              leaf_size=leaf_size,
                              p=p,
                              metric=metric
                              )

    # выполнение кросс-валидации
    scores = cross_val_score(knn, X_train, y_train, cv=KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND))
    score = np.mean(scores)

    return -score  # Optuna минимизирует целевую функцию, поэтому мы используем -score

# выполнение настройки гиперпараметров с Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=7200)

[I 2024-07-14 13:16:21,638] A new study created in memory with name: no-name-6144ca43-a7b6-4ee9-93dc-c93a94a446e6
[I 2024-07-14 13:16:31,859] Trial 0 finished with value: -0.7956958176238405 and parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 32, 'p': 3, 'metric': 'euclidean'}. Best is trial 0 with value: -0.7956958176238405.
[I 2024-07-14 13:16:38,712] Trial 1 finished with value: -0.7507780817264469 and parameters: {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 26, 'p': 3, 'metric': 'euclidean'}. Best is trial 0 with value: -0.7956958176238405.
[I 2024-07-14 13:16:43,479] Trial 2 finished with value: -0.7875853052208955 and parameters: {'n_neighbors': 6, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 37, 'p': 2, 'metric': 'minkowski'}. Best is trial 0 with value: -0.7956958176238405.
[I 2024-07-14 13:16:50,035] Trial 3 finished with value: -0.7487095714143487 and parameters: {'n_neighbors': 2, 'wei

In [77]:
# получение лучших гиперпараметров
best_params_knn = study.best_params

# создание RandomForestRegressor с лучшими гиперпараметрами
knn_best_params = KNeighborsRegressor(**best_params_knn)

# обучение модели
knn_best_params.fit(X_train, y_train)

# предсказания на тестовой выборке
y_pred_knn_best_params = knn_best_params.predict(X_test)

In [78]:
# оценка модели
check_overfitting(knn_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# получение метрик
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_knn_best_params,
                           X_test=X_test,
                           name='KNeighborsRegressor_best_params_optuna')])
metrics

mean_squared_error train: 0.000
mean_squared_error test: 474.034
delta = 100.0 %


Unnamed: 0,model,MAE,MSE,RMSE,RMSLE,R2 adjusted
0,RandomForestRegressor_Baseline,6.345105,270.2768,16.440097,,0.888101
0,DecisionTreeRegressor_Baseline,7.334105,504.655449,22.464538,,0.783413
0,GradientBoostingRegressor_Baseline,9.749377,429.898228,20.733987,,0.827055
0,KNeighborsRegressor,9.531598,542.267584,23.28664,,0.797164
0,RandomForestRegressor_best_params_optuna,8.747466,389.013688,19.72343,,0.813048
0,DecisionTreeRegressor_best_params_optuna,14.23555,871.314958,29.518045,,0.631328
0,GradientBoostingRegressor_best_params_optuna,7.221774,302.703266,17.39837,,0.882389
0,KNeighborsRegressor_best_params_optuna,9.182462,474.034398,21.772331,,0.819349


In [79]:
# сохраняем модель в файл
model_file = os.path.join(model_path, 'knn_best_params.pkl')
with open(model_file, 'wb') as f:
    pickle.dump(knn_best_params, f)

In [80]:
# сохранение metrics в файл data/metrics.csv
metrics.to_csv('../data/metrics.csv', index=False)

In [81]:
# # Обратное преобразование масштабированных данных в X_train
# X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.inverse_transform(X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

# # Обратное преобразование масштабированных данных в X_test
# X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.inverse_transform(X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

In [82]:
# декодирование колонки 'country'
#X_train['country'] = le.inverse_transform(X_train['country'])
#X_test['country'] = le.inverse_transform(X_test['country'])