In [119]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostRegressor, Pool, metrics, cv

import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

In [120]:
sns.set(rc={'figure.figsize': (12, 9)})

In [121]:
data = pd.read_excel('Статистика_для_графиков_3_д_ТЭЦ2_УРУТ_130.xlsx', sheet_name='Лист1')

In [122]:
data = data.drop(columns=["Дата"]).iloc[:, :-3]

## **Проверим на пустые значения**

In [123]:
data.isna().sum()

Выработка электроэнергии ТА гр.130                        0
Отпуск тепла из ТО ТА гр.130                              0
Удельный расход условного топлива на отпуск э/э гр.130    0
dtype: int64

In [124]:
data.dtypes

Выработка электроэнергии ТА гр.130                        float64
Отпуск тепла из ТО ТА гр.130                              float64
Удельный расход условного топлива на отпуск э/э гр.130    float64
dtype: object

## **Делим на выборки**

In [125]:
X = data.drop(columns=['Удельный расход условного топлива на отпуск э/э гр.130'])
Y = data['Удельный расход условного топлива на отпуск э/э гр.130']

In [126]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=50)

## **Подберём модель по которой можно будет предсказывать**

**Разберём модели CatBoostRegressor и LinearRegression**

### **LinearRegression**

In [127]:
lr = LinearRegression()

lr.fit(X_train, Y_train)

prediction_lr = lr.predict(X_test)

In [128]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_lr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_lr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_lr)}')
print(f'r2_score: {r2_score(Y_test, prediction_lr)}')

mean_squared_error: 29.88321972358573
mean_absolute_error: 22.01786423363523
median_absolute_error: 18.722099663604354
r2_score: 0.7407035000237302


In [129]:
LR_pereb = LinearRegression()

params = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [-1],
    'positive': [True, False]
}

# создадим объект GridSearchCV
search_LR = GridSearchCV(LR_pereb, params, n_jobs=-1, cv=5, refit=True, scoring='neg_mean_absolute_error')

# запустим поиск
search_LR.fit(X_train, Y_train)

# выведем наилучшие параметры
print(search_LR.best_params_)

# применяем параметры
search_LR = search_LR.best_estimator_

prediction_search_LR = search_LR.predict(X_test)

{'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}


In [130]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_search_LR))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_search_LR)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_search_LR)}')
print(f'r2_score: {r2_score(Y_test, prediction_search_LR)}')

mean_squared_error: 29.88321972358573
mean_absolute_error: 22.01786423363523
median_absolute_error: 18.722099663604354
r2_score: 0.7407035000237302


### **CatBoostRegressor**

In [131]:
cbr = CatBoostRegressor()

cbr.fit(X_train, Y_train)

prediction_cbr = cbr.predict(X_test)

Learning rate set to 0.040449
0:	learn: 52.3631128	total: 3.65ms	remaining: 3.65s
1:	learn: 50.8122411	total: 6.41ms	remaining: 3.2s
2:	learn: 49.3171517	total: 9.04ms	remaining: 3s
3:	learn: 47.8398073	total: 11.4ms	remaining: 2.83s
4:	learn: 46.4193129	total: 13.9ms	remaining: 2.77s
5:	learn: 45.1172948	total: 16.5ms	remaining: 2.74s
6:	learn: 43.8351557	total: 19ms	remaining: 2.69s
7:	learn: 42.6134060	total: 20.8ms	remaining: 2.58s
8:	learn: 41.4611769	total: 22.4ms	remaining: 2.47s
9:	learn: 40.3606576	total: 24ms	remaining: 2.38s
10:	learn: 39.3297544	total: 25.9ms	remaining: 2.33s
11:	learn: 38.3507890	total: 27.5ms	remaining: 2.26s
12:	learn: 37.3703947	total: 29.6ms	remaining: 2.24s
13:	learn: 36.4702369	total: 30.2ms	remaining: 2.13s
14:	learn: 35.5792770	total: 33.1ms	remaining: 2.17s
15:	learn: 34.7559275	total: 35.3ms	remaining: 2.17s
16:	learn: 33.9676006	total: 36.9ms	remaining: 2.14s
17:	learn: 33.2093168	total: 38.4ms	remaining: 2.1s
18:	learn: 32.4735457	total: 39.7ms

In [132]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_cbr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_cbr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_cbr)}')
print(f'r2_score: {r2_score(Y_test, prediction_cbr)}')

mean_squared_error: 19.578091947160598
mean_absolute_error: 12.892028216328992
median_absolute_error: 8.496663393355817
r2_score: 0.888703218352646


In [133]:
search_CBR = CatBoostRegressor(iterations=4001,
                                    rsm=1,
                                    early_stopping_rounds=100,
                                    grow_policy='Depthwise',
                                    depth=7,
                                    loss_function='RMSE',
                                    eval_metric='RMSE',
                                    l2_leaf_reg=10,
                                    learning_rate=0.01,
                                    verbose=100,
                                    task_type='GPU'
                                    )

search_CBR.fit(X_train, Y_train)

prediction_search_CBR = search_CBR.predict(X_test)

0:	learn: 53.7549887	total: 16.6ms	remaining: 1m 6s
100:	learn: 35.9549060	total: 754ms	remaining: 29.1s
200:	learn: 26.7761814	total: 1.65s	remaining: 31.3s
300:	learn: 21.4290450	total: 2.55s	remaining: 31.4s
400:	learn: 18.1545646	total: 3.31s	remaining: 29.7s
500:	learn: 16.0791001	total: 4.13s	remaining: 28.9s
600:	learn: 14.6439583	total: 4.93s	remaining: 27.9s
700:	learn: 13.5896675	total: 5.71s	remaining: 26.9s
800:	learn: 12.7786834	total: 6.67s	remaining: 26.6s
900:	learn: 12.1436802	total: 7.48s	remaining: 25.7s
1000:	learn: 11.6666288	total: 8.3s	remaining: 24.9s
1100:	learn: 11.2867710	total: 9.08s	remaining: 23.9s
1200:	learn: 10.9906158	total: 9.86s	remaining: 23s
1300:	learn: 10.7570203	total: 10.7s	remaining: 22.1s
1400:	learn: 10.5614248	total: 11.4s	remaining: 21.2s
1500:	learn: 10.3913761	total: 12.2s	remaining: 20.3s
1600:	learn: 10.2389933	total: 13s	remaining: 19.5s
1700:	learn: 10.0961345	total: 13.8s	remaining: 18.7s
1800:	learn: 9.9679808	total: 14.6s	remainin

In [134]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_search_CBR))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_search_CBR)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_search_CBR)}')
print(f'r2_score: {r2_score(Y_test, prediction_search_CBR)}')

mean_squared_error: 21.93323723145162
mean_absolute_error: 13.334850548549412
median_absolute_error: 7.036916078608726
r2_score: 0.860315778888983


## **Сделаем таблицу ля сравнения результатов**

In [135]:
comparison_table = pd.DataFrame(
    {'models': ['lr', 'search_LR', 'cbr', 'search_CBR'],
     
    'mean_squared_error': [math.sqrt(mean_squared_error(Y_test, prediction_lr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_search_LR)),
                            math.sqrt(mean_squared_error(Y_test, prediction_cbr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_search_CBR))],

    'mean_absolute_error':[mean_absolute_error(Y_test, prediction_lr),
                           mean_absolute_error(Y_test, prediction_search_LR),
                           mean_absolute_error(Y_test, prediction_cbr),
                           mean_absolute_error(Y_test, prediction_search_CBR)],\

    'median_absolute_error': [median_absolute_error(Y_test, prediction_lr),
                              median_absolute_error(Y_test, prediction_search_LR),
                              median_absolute_error(Y_test, prediction_cbr),
                              median_absolute_error(Y_test, prediction_search_CBR)],
                              
    'r2_score': [r2_score(Y_test, prediction_lr), 
                r2_score(Y_test, prediction_search_LR),
                r2_score(Y_test, prediction_cbr),
                r2_score(Y_test, prediction_search_CBR)]
    })

In [136]:
comparison_table

Unnamed: 0,models,mean_squared_error,mean_absolute_error,median_absolute_error,r2_score
0,lr,29.88322,22.017864,18.7221,0.740704
1,search_LR,29.88322,22.017864,18.7221,0.740704
2,cbr,19.578092,12.892028,8.496663,0.888703
3,search_CBR,21.933237,13.334851,7.036916,0.860316


## **Из всего этого хоршо подходит CatBoostRegressor из коробки(cbr)**

### **Обучим модель на всех данных**

In [137]:
cbr.fit(X, Y)

Learning rate set to 0.041504
0:	learn: 53.0297492	total: 2.06ms	remaining: 2.05s
1:	learn: 51.4023598	total: 4.47ms	remaining: 2.23s
2:	learn: 49.8360823	total: 6.56ms	remaining: 2.18s
3:	learn: 48.2961140	total: 8.84ms	remaining: 2.2s
4:	learn: 46.8227465	total: 11ms	remaining: 2.18s
5:	learn: 45.4812126	total: 12.4ms	remaining: 2.06s
6:	learn: 44.1459598	total: 14.1ms	remaining: 1.99s
7:	learn: 42.8902195	total: 16.2ms	remaining: 2.01s
8:	learn: 41.6762099	total: 18ms	remaining: 1.98s
9:	learn: 40.5111491	total: 19.9ms	remaining: 1.97s
10:	learn: 39.4499349	total: 21.9ms	remaining: 1.97s
11:	learn: 38.4107025	total: 24.1ms	remaining: 1.98s
12:	learn: 37.4597035	total: 25.5ms	remaining: 1.94s
13:	learn: 36.5548368	total: 26.1ms	remaining: 1.84s
14:	learn: 35.6171190	total: 27.9ms	remaining: 1.83s
15:	learn: 34.7775885	total: 30ms	remaining: 1.85s
16:	learn: 33.9423239	total: 32ms	remaining: 1.85s
17:	learn: 33.1795272	total: 33.6ms	remaining: 1.83s
18:	learn: 32.4347081	total: 36.3ms

<catboost.core.CatBoostRegressor at 0x1a71dffba60>