In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import optuna

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor

import sys
import os
from tqdm import tqdm

# Seed passed as random_state to the train/test split, the models and the KFold splitters below.
RAND = 10
# Number of KFold splits used in the cross-validation of the Optuna objectives.
N_FOLDS = 3
# NOTE(review): N_ITER is not referenced anywhere in the visible notebook — confirm it is still needed.
N_ITER = 300

# Put ../backend on sys.path so the project-local helper modules below can be imported.
backend_path = os.path.abspath('../backend')
sys.path.append(backend_path)
from get_metrics import get_metrics_regression
from check_overfitting import check_overfitting

# Import data

In [38]:
# Read the dataset from data/df.csv into the DataFrame df
df = pd.read_csv('../data/df.csv')

In [39]:
#df = df[-10000:]

In [40]:
# preview the first three rows of the dataset
df.head(3)

Unnamed: 0,milliseconds,place,status,tsunami,significance,data_type,magnitude,country,longitude,latitude,depth,datetime,timezone,magnitude_bins,year,month,day,hour,minute,second
0,1668773163070,"14 km SSE of Eden Roc, Hawaii",automatic,0,58,earthquake,1.94,USA,-155.030334,19.374001,7.1,2022-11-18 12:06:03,+00:00,green,2022,11,18,12,6,3
1,1668773284487,"40 km ESE of Nikolski, Alaska",reviewed,0,62,earthquake,2.0,USA,-168.3108,52.7861,64.9,2022-11-18 12:08:04,+00:00,green,2022,11,18,12,8,4
2,1668773482790,"45 km SW of Howell, Utah",reviewed,0,21,earthquake,1.16,USA,-112.845833,41.512167,5.73,2022-11-18 12:11:22,+00:00,green,2022,11,18,12,11,22


In [41]:
# column names, non-null counts and dtypes of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94354 entries, 0 to 94353
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   milliseconds    94354 non-null  int64  
 1   place           94354 non-null  object 
 2   status          94354 non-null  object 
 3   tsunami         94354 non-null  int64  
 4   significance    94354 non-null  int64  
 5   data_type       94354 non-null  object 
 6   magnitude       94354 non-null  float64
 7   country         94354 non-null  object 
 8   longitude       94354 non-null  float64
 9   latitude        94354 non-null  float64
 10  depth           94354 non-null  float64
 11  datetime        94354 non-null  object 
 12  timezone        94354 non-null  object 
 13  magnitude_bins  94354 non-null  object 
 14  year            94354 non-null  int64  
 15  month           94354 non-null  int64  
 16  day             94354 non-null  int64  
 17  hour            94354 non-null 

In [42]:
# basic descriptive statistics for the numeric features;
# iloc[:, 1:] leaves out the first column (milliseconds, per the df.info() listing)
df.iloc[:, 1:].describe()

Unnamed: 0,tsunami,significance,magnitude,longitude,latitude,depth,year,month,day,hour,minute,second
count,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0,94354.0
mean,0.000964,67.406363,1.720177,-116.542011,41.560005,26.813648,2022.824915,5.299479,15.482046,11.48601,29.419537,29.46829
std,0.031041,96.398645,1.178204,72.358151,20.35713,56.179701,0.380042,3.429028,8.635858,6.920763,17.35737,17.283395
min,0.0,0.0,0.0,-179.9987,-65.4254,-3.74,2022.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,14.0,0.95,-153.445775,34.018333,3.4,2023.0,3.0,8.0,6.0,14.0,15.0
50%,0.0,30.0,1.4,-122.8525,39.2463,9.2,2023.0,5.0,16.0,11.0,29.0,29.0
75%,0.0,68.0,2.1,-116.717375,58.264,25.6,2023.0,7.0,23.0,17.0,45.0,44.0
max,1.0,2910.0,7.8,179.9994,86.5939,681.238,2023.0,12.0,30.0,23.0,59.0,59.0


In [43]:
# basic descriptive statistics for the boolean and categorical (object) features
df.describe(include=["object", "bool"])

Unnamed: 0,place,status,data_type,country,datetime,timezone,magnitude_bins
count,94354,94354,94354,94354,94354,94354,94354
unique,34095,3,7,234,93284,1,3
top,"8km NW of The Geysers, CA",reviewed,earthquake,USA,2023-03-02 18:11:07,+00:00,green
freq,973,84325,92210,80057,2,94354,82184


# разделение данных train_test_split

In [44]:
# features: every row, restricted to the model input columns
X = df.loc[:, ['milliseconds', 'significance', 'country', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']]

# targets: the three values predicted jointly
y = df.loc[:, ['magnitude', 'longitude', 'latitude']]

# hold out 20% of the rows as the test split; the split is seeded with RAND
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RAND
)

# LabelEncoder

In [45]:
# LabelEncoder that turns the categorical 'country' values into integer codes
le = LabelEncoder()

# fit on the country values of both splits, so every value seen by
# transform() below is already part of the encoder's classes
le.fit(pd.concat([X_train['country'], X_test['country']]))

# replace the 'country' column with its integer codes in each split
for split in (X_train, X_test):
    split['country'] = le.transform(split['country'])

# StandardScaler

In [46]:
# scaler object
scaler = StandardScaler()

# columns that get standardized ('significance' and 'country' are not in this list)
num_cols = ['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']

# fit the scaler on X_train and scale X_train with it
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

# scale X_test with the statistics learned from X_train
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Baseline 

## RandomForestRegressor

In [47]:
# baseline RandomForestRegressor (only random_state is set), fitted on the train split
rfr = RandomForestRegressor(random_state=RAND).fit(X_train, y_train)
# predictions for the test split
y_pred_rfr = rfr.predict(X_test)

In [None]:
# overfitting check for the baseline RandomForestRegressor: the project helper receives
# the fitted model, both splits and mean_squared_error as the metric
check_overfitting(rfr, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# test-split metrics of the baseline RandomForestRegressor; this call starts the
# `metrics` table that the later cells extend with pd.concat
metrics = get_metrics_regression(y_test,
                                 y_pred = y_pred_rfr,
                                 X_test = X_test,
                                 name='RandomForestRegressor_Baseline')
metrics

## DecisionTreeRegressor

In [None]:
# baseline DecisionTreeRegressor (only random_state is set), fitted on the train split
dtr = DecisionTreeRegressor(random_state=RAND).fit(X_train, y_train)
# predictions for the test split
y_pred_dtr = dtr.predict(X_test)

In [None]:
# overfitting check for the baseline DecisionTreeRegressor (train vs. test, mean_squared_error)
check_overfitting(dtr, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# append the test-split metrics of the baseline DecisionTreeRegressor to the metrics table
# NOTE(review): re-running this cell concatenates the same entry again — restart & run all for a clean table
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_dtr,
                           X_test = X_test,
                           name='DecisionTreeRegressor_Baseline')])
metrics

## GradientBoostingRegressor

In [None]:
# baseline GradientBoostingRegressor wrapped in MultiOutputRegressor (y_train has three
# target columns), fitted on the train split
gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=RAND)).fit(X_train, y_train)
# predictions for the test split
y_pred_gbr = gbr.predict(X_test)


In [None]:
# overfitting check for the baseline GradientBoostingRegressor (train vs. test, mean_squared_error)
check_overfitting(gbr, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# append the test-split metrics of the baseline GradientBoostingRegressor to the metrics table
# NOTE(review): re-running this cell concatenates the same entry again — restart & run all for a clean table
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_gbr,
                           X_test = X_test,
                           name='GradientBoostingRegressor_Baseline')])
metrics

## KNeighborsRegressor

In [None]:
# baseline KNeighborsRegressor with 5 neighbours, fitted on the train split
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
# predictions for the test split
y_pred_knn = knn.predict(X_test)

In [None]:
# overfitting check for the baseline KNeighborsRegressor (train vs. test, mean_squared_error)
check_overfitting(knn, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# append the test-split metrics of the baseline KNeighborsRegressor to the metrics table
# NOTE(review): unlike the other baselines, this entry's name has no '_Baseline' suffix — confirm that is intended
# NOTE(review): re-running this cell concatenates the same entry again — restart & run all for a clean table
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred = y_pred_knn,
                           X_test = X_test,
                           name='KNeighborsRegressor')])
metrics

# Optuna & KFold подбор гиперпараметров и кросс-валидация

## RandomForestRegressor

In [None]:
# # сетка параметров
# param_grid = {
#     'n_estimators': [100, 500, 1000],
#     'max_depth': [None, 5, 10],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 3],
#     'min_weight_fraction_leaf': [0.0, 0.1, 0.5],
#     'max_features': [1 , 5 , 10],
#     'bootstrap': [True, False],
# }

# # расчет количества моделей
# num_models = math.prod(len(values) for values in param_grid.values())

# print(f"Количество моделей для обучения: {num_models}")

In [None]:
# # модель RandomForestRegressor
# rfr = RandomForestRegressor(random_state=RAND)

# # Создание KFold
# kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

# # модель GridSearchCV
# gsCV = GridSearchCV(rfr, param_grid, cv=kf, error_score='raise')

# # производим обучение по сетке
# gsCV.fit(X_train, y_train)

In [None]:
# # просмотр параметров
# best_params_rfr = gsCV.best_params_
# print(best_params_rfr)
# print(gsCV.best_score_)

In [None]:
# # модель RandomForestRegressor с лучшими параметрами
# rfr_best_params = RandomForestRegressor(**best_params_rfr, random_state=RAND)
# # обучаем модель
# rfr_best_params.fit(X_train, y_train)
# # предсказания на test 
# y_pred_rfr_best_params = rfr_best_params.predict(X_test)

In [None]:
# # проверка на переобучение модели RandomForestRegressor с лучшими параметрами
# check_overfitting(rfr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# # просмотр метрик модели RandomForestRegressor обученной на train с лучшими параметрами
# metrics = pd.concat([
#     metrics,
#     get_metrics_regression(y_test,
#                            y_pred = y_pred_rfr_best_params,
#                            X_test = X_test,
#                            name='RandomForestRegressor_best_params')])
# metrics

In [None]:
# objective function optimized by Optuna (RandomForestRegressor)
def objective(trial):
    """Score one RandomForestRegressor hyperparameter combination.

    The hyperparameters are sampled from ``trial``; the model is evaluated with
    shuffled K-fold cross-validation (N_FOLDS splits, seeded with RAND) on the
    training split.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The negated mean cross-validation score. No ``scoring`` is passed to
        ``cross_val_score``, so the estimator's own ``score`` is used; the sign
        is flipped because the study below minimizes.
    """
    # hyperparameters to tune (sampled in the same order as before)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 3),
        # suggest_float replaces the deprecated suggest_uniform; same range
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        'max_features': trial.suggest_int('max_features', 1, 10),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    # RandomForestRegressor with the sampled hyperparameters
    model = RandomForestRegressor(**params, random_state=RAND)

    # cross-validation on the training split
    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
    scores = cross_val_score(model, X_train, y_train, cv=cv)

    return -np.mean(scores)  # the study minimizes, so the score is negated


# run the hyperparameter search; the sampler is seeded with RAND like every other
# random component of the notebook, so the sequence of suggested trials is repeatable
# (the timeout can still cut the run short after a different number of trials)
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=RAND))
study.optimize(objective, n_trials=100, timeout=2400)

In [None]:
# best hyperparameters found by the study
best_params_rfr = study.best_params

# RandomForestRegressor built from the best hyperparameters and fitted on the train split
rfr_best_params = RandomForestRegressor(random_state=RAND, **best_params_rfr).fit(X_train, y_train)

# predictions for the test split
y_pred_rfr_best_params = rfr_best_params.predict(X_test)

In [None]:
# overfitting check for the tuned RandomForestRegressor (train vs. test, mean_squared_error)
check_overfitting(rfr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# append the test-split metrics of the tuned RandomForestRegressor to the metrics table
# NOTE(review): re-running this cell concatenates the same entry again — restart & run all for a clean table
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_rfr_best_params,
                           X_test=X_test,
                           name='RandomForestRegressor_best_params_optuna')])
metrics

## DecisionTreeRegressor

In [None]:
# objective function optimized by Optuna (DecisionTreeRegressor)
def objective(trial):
    """Score one DecisionTreeRegressor hyperparameter combination.

    The hyperparameters are sampled from ``trial``; the model is evaluated with
    shuffled K-fold cross-validation (N_FOLDS splits, seeded with RAND) on the
    training split.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The negated mean cross-validation score. No ``scoring`` is passed to
        ``cross_val_score``, so the estimator's own ``score`` is used; the sign
        is flipped because the study below minimizes.
    """
    # hyperparameters to tune (sampled in the same order as before)
    params = {
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 3),
        # suggest_float replaces the deprecated suggest_uniform; same range
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        'max_features': trial.suggest_int('max_features', 1, 10),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 5),
    }

    # DecisionTreeRegressor with the sampled hyperparameters
    # (the local was previously called `rfr`, which suggested a random forest)
    tree = DecisionTreeRegressor(**params, random_state=RAND)

    # cross-validation on the training split
    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
    scores = cross_val_score(tree, X_train, y_train, cv=cv)

    return -np.mean(scores)  # the study minimizes, so the score is negated


# run the hyperparameter search; the sampler is seeded with RAND like every other
# random component of the notebook, so the sequence of suggested trials is repeatable
# (the timeout can still cut the run short after a different number of trials)
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=RAND))
study.optimize(objective, n_trials=100, timeout=2400)

In [None]:
# best hyperparameters found by the study
best_params_dtr = study.best_params

# DecisionTreeRegressor built from the best hyperparameters and fitted on the train split
dtr_best_params = DecisionTreeRegressor(random_state=RAND, **best_params_dtr).fit(X_train, y_train)

# predictions for the test split
y_pred_dtr_best_params = dtr_best_params.predict(X_test)

In [None]:
# overfitting check for the tuned DecisionTreeRegressor (train vs. test, mean_squared_error)
check_overfitting(dtr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# append the test-split metrics of the tuned DecisionTreeRegressor to the metrics table
# NOTE(review): re-running this cell concatenates the same entry again — restart & run all for a clean table
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_dtr_best_params,
                           X_test=X_test,
                           name='DecisionTreeRegressor_best_params_optuna')])
metrics

In [None]:
# # сетка параметров
# param_grid = {
#     'splitter': ['best', 'random'],
#     'max_depth': [None, 5, 10],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 3],
#     'min_weight_fraction_leaf': [0.0, 0.1, 0.5],
#     'max_features': [1, 5, 10],
#     'max_leaf_nodes':[None, 2, 5]
# }

# # расчет количества моделей
# num_models = math.prod(len(values) for values in param_grid.values())

# print(f"Количество моделей для обучения: {num_models}")

In [None]:
# # модель DecisionTreeRegressor
# dtr = DecisionTreeRegressor(random_state=RAND)

# # Создание KFold
# kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

# # модель GridSearchCV
# gsCV = GridSearchCV(dtr, param_grid, cv=kf, error_score='raise')

# # производим обучение по сетке
# gsCV.fit(X_train, y_train)

In [None]:
# # просмотр параметров
# best_params_dtr = gsCV.best_params_
# print(best_params_dtr)
# print(gsCV.best_score_)

In [None]:
# # модель DecisionTreeRegressor лучшими параметрами
# dtr_best_params = DecisionTreeRegressor(**best_params_dtr, random_state=RAND)
# # обучаем модель
# dtr_best_params.fit(X_train, y_train)
# # предсказания на test 
# y_pred_dtr_best_params = dtr_best_params.predict(X_test)

In [None]:
# # проверка на переобучение модели DecisionTreeRegressor с лучшими параметрами
# check_overfitting(dtr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# # просмотр метрик модели DecisionTreeRegressor обученной на train с лучшими параметрами
# metrics = pd.concat([
#     metrics,
#     get_metrics_regression(y_test,
#                            y_pred = y_pred_dtr_best_params,
#                            X_test = X_test,
#                            name='DecisionTreeRegressor_best_params')])
# metrics

## GradientBoostingRegressor

In [None]:
# objective function optimized by Optuna (GradientBoostingRegressor)
def objective(trial):
    """Score one GradientBoostingRegressor hyperparameter combination.

    The hyperparameters are sampled from ``trial`` under ``estimator__``-prefixed
    names (the cell that rebuilds the best model strips that prefix). The model
    is wrapped in MultiOutputRegressor and evaluated with shuffled K-fold
    cross-validation (N_FOLDS splits, seeded with RAND) on the training split.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The negated mean cross-validation score. No ``scoring`` is passed to
        ``cross_val_score``, so the estimator's own ``score`` is used; the sign
        is flipped because the study below minimizes.
    """
    # hyperparameters to tune (sampled in the same order as before)
    params = {
        # 0 was removed from the choices: with a zero learning rate the boosting
        # stages contribute nothing, so such a trial only burns search budget
        'learning_rate': trial.suggest_categorical('estimator__learning_rate', [0.1, 0.5, 1]),
        'n_estimators': trial.suggest_categorical('estimator__n_estimators', [100, 500, 1000]),
        'subsample': trial.suggest_categorical('estimator__subsample', [0.1, 0.5, 1.0]),
        'min_samples_split': trial.suggest_categorical('estimator__min_samples_split', [2, 5, 10]),
        'min_samples_leaf': trial.suggest_categorical('estimator__min_samples_leaf', [1, 2, 3]),
        'min_weight_fraction_leaf': trial.suggest_categorical('estimator__min_weight_fraction_leaf', [0.0, 0.1, 0.5]),
        'max_depth': trial.suggest_categorical('estimator__max_depth', [None, 5, 10]),
    }

    # one GradientBoostingRegressor per target column, with the sampled hyperparameters
    model = MultiOutputRegressor(GradientBoostingRegressor(**params, random_state=RAND))

    # cross-validation on the training split
    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
    scores = cross_val_score(model, X_train, y_train, cv=cv)

    return -np.mean(scores)  # the study minimizes, so the score is negated


# run the hyperparameter search; the sampler is seeded with RAND like every other
# random component of the notebook, so the sequence of suggested trials is repeatable
# (the timeout can still cut the run short after a different number of trials)
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=RAND))
study.optimize(objective, n_trials=5, timeout=2400)

In [None]:
# best hyperparameters found by the study
best_params_gbr = study.best_params

# GradientBoostingRegressor built from the best hyperparameters (the 'estimator__'
# prefix of the trial parameter names is stripped first), wrapped in
# MultiOutputRegressor and fitted on the train split
gbr_best_params = MultiOutputRegressor(
    GradientBoostingRegressor(
        random_state=RAND,
        **{name.replace('estimator__', ''): value for name, value in best_params_gbr.items()}
    )
).fit(X_train, y_train)

# predictions for the test split
y_pred_gbr_best_params = gbr_best_params.predict(X_test)

In [None]:
# overfitting check for the tuned GradientBoostingRegressor (train vs. test, mean_squared_error)
check_overfitting(gbr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# append the test-split metrics of the tuned GradientBoostingRegressor to the metrics table
# NOTE(review): re-running this cell concatenates the same entry again — restart & run all for a clean table
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_gbr_best_params,
                           X_test=X_test,
                           name='GradientBoostingRegressor_best_params_optuna')])
metrics

In [None]:
# # сетка параметров
# param_grid = {
#     'estimator__learning_rate': [0.1, 0.5, 1],
#     'estimator__n_estimators': [100, 500, 1000],
#     'estimator__subsample': [0.1, 0.5, 1.0],
#     'estimator__min_samples_split': [2, 5, 10],
#     'estimator__min_samples_leaf': [1, 2, 3],
#     'estimator__min_weight_fraction_leaf': [0.0, 0.1, 0.5],
#     'estimator__max_depth': [None, 5, 10],
# }

# # расчет количества моделей
# num_models = math.prod(len(values) for values in param_grid.values())

# print(f"Количество моделей для обучения: {num_models}")

In [None]:
# # модель GradientBoostingRegressor
# gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=RAND))

# # Создание KFold
# kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

# # модель GridSearchCV
# gsCV = GridSearchCV(gbr, param_grid, cv=kf, error_score='raise')

# # производим обучение по сетке
# gsCV.fit(X_train, y_train)

In [None]:
# # просмотр параметров
# best_params_gbr = gsCV.best_params_
# print(best_params_gbr)
# print(gsCV.best_score_)

In [None]:
# # модель GradientBoostingRegressor лучшими параметрами
# gbr_best_params = MultiOutputRegressor(GradientBoostingRegressor(**{k.replace('estimator__', ''): v for k, v in best_params_gbr.items()}, random_state=RAND))
# # обучаем модель
# gbr_best_params.fit(X_train, y_train)
# # предсказания на test 
# y_pred_gbr_best_params = gbr_best_params.predict(X_test)

In [None]:
# # проверка на переобучение модели GradientBoostingRegressor с лучшими параметрами
# check_overfitting(gbr_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# # просмотр метрик модели GradientBoostingRegressor обученной на train с лучшими параметрами
# metrics = pd.concat([
#     metrics,
#     get_metrics_regression(y_test,
#                            y_pred = y_pred_gbr_best_params,
#                            X_test = X_test,
#                            name='GradientBoostingRegressor_best_params')])
# metrics

## KNeighborsRegressor

In [None]:
# objective function optimized by Optuna (KNeighborsRegressor)
def objective(trial):
    """Score one KNeighborsRegressor hyperparameter combination.

    The hyperparameters are sampled from ``trial``; the model is evaluated with
    shuffled K-fold cross-validation (N_FOLDS splits, seeded with RAND) on the
    training split.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The negated mean cross-validation score. No ``scoring`` is passed to
        ``cross_val_score``, so the estimator's own ``score`` is used; the sign
        is flipped because the study below minimizes.
    """
    # hyperparameters to tune (sampled in the same order as before)
    # NOTE(review): `p` presumably only matters when metric='minkowski', and
    # `leaf_size` only for the tree-based algorithms — confirm against the
    # scikit-learn docs before narrowing the search space
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 2, 10),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['brute', 'ball_tree', 'kd_tree']),
        'leaf_size': trial.suggest_int('leaf_size', 10, 50),
        'p': trial.suggest_int('p', 1, 5),
        'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski']),
    }

    # KNeighborsRegressor with the sampled hyperparameters
    model = KNeighborsRegressor(**params)

    # cross-validation on the training split
    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
    scores = cross_val_score(model, X_train, y_train, cv=cv)

    return -np.mean(scores)  # the study minimizes, so the score is negated


# run the hyperparameter search; the sampler is seeded with RAND like every other
# random component of the notebook, so the sequence of suggested trials is repeatable
# (the timeout can still cut the run short after a different number of trials)
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=RAND))
study.optimize(objective, n_trials=100, timeout=2400)

In [None]:
# best hyperparameters found by the study
best_params_knn = study.best_params

# KNeighborsRegressor built from the best hyperparameters and fitted on the train split
knn_best_params = KNeighborsRegressor(**best_params_knn).fit(X_train, y_train)

# predictions for the test split
y_pred_knn_best_params = knn_best_params.predict(X_test)

In [None]:
# overfitting check for the tuned KNeighborsRegressor (train vs. test, mean_squared_error)
check_overfitting(knn_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

# append the test-split metrics of the tuned KNeighborsRegressor to the metrics table
# NOTE(review): re-running this cell concatenates the same entry again — restart & run all for a clean table
metrics = pd.concat([
    metrics,
    get_metrics_regression(y_test,
                           y_pred=y_pred_knn_best_params,
                           X_test=X_test,
                           name='KNeighborsRegressor_best_params_optuna')])
metrics

In [None]:
# # сетка параметров
# param_grid = {
#     'n_neighbors': [2, 5, 10],
#     'weights': ['uniform', 'distance'],
#     'algorithm': ['brute', 'ball_tree', 'kd_tree'],
#     'leaf_size': [10, 30, 50],
#     'p': [1, 2, 5],
#     'metric': ['cosine', 'euclidean', 'manhattan', 'minkowski'],
# }

# # расчет количества моделей
# num_models = math.prod(len(values) for values in param_grid.values())

# print(f"Количество моделей для обучения: {num_models}")

In [None]:
# # модель KNeighborsRegressor
# knn = KNeighborsRegressor()

# # Создание KFold
# kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

# # модель GridSearchCV
# gsCV = GridSearchCV(knn, param_grid, cv=kf, error_score='raise')

# # производим обучение по сетке
# gsCV.fit(X_train, y_train)

In [None]:
# # просмотр параметров
# best_params_knn = gsCV.best_params_
# print(best_params_knn)
# print(gsCV.best_score_)

In [None]:
# # модель KNeighborsRegressor лучшими параметрами
# knn_best_params = KNeighborsRegressor(**best_params_knn)
# # обучаем модель
# knn_best_params.fit(X_train, y_train)
# # предсказания на test 
# y_pred_knn_best_params = knn_best_params.predict(X_test)

In [None]:
# # проверка на переобучение модели KNeighborsRegressor с лучшими параметрами
# check_overfitting(knn_best_params, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# # просмотр метрик модели KNeighborsRegressor обученной на train с лучшими параметрами
# metrics = pd.concat([
#     metrics,
#     get_metrics_regression(y_test,
#                            y_pred = y_pred_knn_best_params,
#                            X_test = X_test,
#                            name='KNeighborsRegressor_best_params')])
# metrics

In [None]:
# # Обратное преобразование масштабированных данных в X_train
# X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.inverse_transform(X_train[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

# # Обратное преобразование масштабированных данных в X_test
# X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']] = scaler.inverse_transform(X_test[['milliseconds', 'depth', 'year', 'month', 'day', 'hour', 'minute', 'second']])

In [None]:
# декодирование колонки 'country'
#X_train['country'] = le.inverse_transform(X_train['country'])
#X_test['country'] = le.inverse_transform(X_test['country'])