In [None]:
import os
import sys

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Seed reused by the train/test split and the tree-based models below so that
# repeated runs give the same results.
random_state = 10

# Make the project's backend helpers importable. The relative path is resolved
# against the current working directory.
backend_path = os.path.abspath('../backend')
# Only register the path once: re-running this cell in the same kernel must not
# pile up duplicate sys.path entries.
if backend_path not in sys.path:
    sys.path.append(backend_path)
from get_metrics import get_metrics_regression, get_metrics_classification
from check_overfitting import check_overfitting

In [None]:
# Load the dataset from data/df.csv into the DataFrame df
df = pd.read_csv(filepath_or_buffer='../data/df.csv')

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Encode the categorical 'country' column as integer labels.
# The fitted encoder is kept in `le` so the codes can be mapped back to the
# original country names later in the notebook.
le = LabelEncoder()
country_codes = le.fit_transform(df['country'])
df['country'] = country_codes

In [None]:
# Feature columns fed to the models
feature_columns = [
    'milliseconds', 'significance', 'country', 'depth',
    'year', 'month', 'day', 'hour', 'minute', 'second',
]
# Target columns; all three are predicted together
target_columns = ['magnitude', 'longitude', 'latitude']

X = df[feature_columns]
y = df[target_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)


In [None]:
# RandomForestRegressor baseline: 100 trees, depth capped at 5.
# fit() returns the estimator itself, so construction and training are chained.
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=random_state,
).fit(X_train, y_train)
# Predictions on the held-out test set
y_pred_rfr = rfr.predict(X_test)

In [None]:
# Overfitting check for the RandomForestRegressor: hands the fitted model, both
# data splits and mean_squared_error to the project helper check_overfitting.
# NOTE(review): presumably the helper compares train vs. test error — confirm
# against backend/check_overfitting.py.
check_overfitting(rfr, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# Test-set metrics for the RandomForestRegressor baseline.
# The predictions already stored in y_pred_rfr are reused here instead of
# running predict() on X_test a second time.
metrics = get_metrics_regression(y_test,
                                 y_pred=y_pred_rfr,
                                 X_test=X_test,
                                 name='RandomForestRegressor_Baseline')
metrics

In [None]:
# DecisionTreeRegressor baseline with depth capped at 5.
# fit() returns the estimator itself, so construction and training are chained.
dtr = DecisionTreeRegressor(
    max_depth=5,
    random_state=random_state,
).fit(X_train, y_train)
# Predictions on the held-out test set
y_pred_dtr = dtr.predict(X_test)

In [None]:
# Overfitting check for the DecisionTreeRegressor: hands the fitted model, both
# data splits and mean_squared_error to the project helper check_overfitting.
# NOTE(review): presumably the helper compares train vs. test error — confirm
# against backend/check_overfitting.py.
check_overfitting(dtr, X_train, y_train, X_test, y_test, mean_squared_error)

In [None]:
# Test-set metrics for the DecisionTreeRegressor baseline, appended to the
# comparison table. The predictions already stored in y_pred_dtr are reused
# instead of running predict() on X_test a second time.
# NOTE: `metrics` is extended from its previous value, so re-running this cell
# in the same kernel appends the same metrics again.
dtr_metrics = get_metrics_regression(y_test,
                                     y_pred=y_pred_dtr,
                                     X_test=X_test,
                                     name='DecisionTreeRegressor_Baseline')
metrics = pd.concat([metrics, dtr_metrics])
metrics

In [None]:
# GradientBoostingRegressor baseline. The booster is wrapped in
# MultiOutputRegressor, which fits one independent regressor per target column.
# Both classes are imported in the notebook's top import cell.
gbr = MultiOutputRegressor(
    GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=random_state,
    )
)
# Train the model
gbr.fit(X_train, y_train)
# Predictions on the held-out test set
y_pred_gbr = gbr.predict(X_test)


In [None]:
# SVR baseline. SVR.fit() accepts only a 1-D target, while y_train holds three
# target columns, so fitting a bare SVR raises a ValueError. Wrapping it in
# MultiOutputRegressor fits one SVR per target column, the same approach used
# for the gradient-boosting model.
svr = MultiOutputRegressor(SVR(kernel='rbf', C=1, epsilon=0.1))
# Train the model
svr.fit(X_train, y_train)
# Predictions on the held-out test set
y_pred_svr = svr.predict(X_test)

In [None]:
# Map the integer country codes back to the original labels using the encoder
# fitted earlier, then preview the first five rows.
decoded_countries = le.inverse_transform(df['country'])
df['country'] = decoded_countries
df.head(5)