In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostRegressor, Pool, metrics, cv

import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

In [2]:
sns.set(rc={'figure.figsize': (12, 9)})

In [3]:
data = pd.read_excel('Статистика_для_графиков_3_д_ТЭЦ2_УРУТ_130.xlsx', sheet_name='Лист1')

In [4]:
data = data.drop(columns=["Дата"]).iloc[:, :-3]

## **Проверим на пустые значения**

In [5]:
data.isna().sum()

Выработка электроэнергии ТА гр.130                        0
Отпуск тепла из ТО ТА гр.130                              0
Удельный расход условного топлива на отпуск э/э гр.130    0
dtype: int64

In [6]:
data.dtypes

Выработка электроэнергии ТА гр.130                        float64
Отпуск тепла из ТО ТА гр.130                              float64
Удельный расход условного топлива на отпуск э/э гр.130    float64
dtype: object

## **Делим на выборки**

In [7]:
X = data.drop(columns=['Удельный расход условного топлива на отпуск э/э гр.130'])
Y = data['Удельный расход условного топлива на отпуск э/э гр.130']

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=50)

## **Подберём модель по которой можно будет предсказывать**

**Разберём модели CatBoostRegressor и LinearRegression**

### **LinearRegression**

In [9]:
lr = LinearRegression()

lr.fit(X_train, Y_train)

prediction_lr = lr.predict(X_test)

In [10]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_lr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_lr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_lr)}')
print(f'r2_score: {r2_score(Y_test, prediction_lr)}')

mean_squared_error: 29.88321972358573
mean_absolute_error: 22.01786423363523
median_absolute_error: 18.722099663604354
r2_score: 0.7407035000237302


In [11]:
LR_pereb = LinearRegression()

params = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [-1],
    'positive': [True, False]
}

# создадим объект GridSearchCV
search_LR = GridSearchCV(LR_pereb, params, n_jobs=-1, cv=5, refit=True, scoring='neg_mean_absolute_error')

# запустим поиск
search_LR.fit(X_train, Y_train)

# выведем наилучшие параметры
print(search_LR.best_params_)

# применяем параметры
search_LR = search_LR.best_estimator_

prediction_search_LR = search_LR.predict(X_test)

{'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}


In [12]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_search_LR))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_search_LR)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_search_LR)}')
print(f'r2_score: {r2_score(Y_test, prediction_search_LR)}')

mean_squared_error: 29.88321972358573
mean_absolute_error: 22.01786423363523
median_absolute_error: 18.722099663604354
r2_score: 0.7407035000237302


### **CatBoostRegressor**

In [13]:
cbr = CatBoostRegressor()

cbr.fit(X_train, Y_train, verbose=500)

prediction_cbr = cbr.predict(X_test)

Learning rate set to 0.040449
0:	learn: 52.3631128	total: 124ms	remaining: 2m 3s
500:	learn: 12.7735390	total: 1.24s	remaining: 1.24s
999:	learn: 10.3427633	total: 2.3s	remaining: 0us


In [14]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_cbr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_cbr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_cbr)}')
print(f'r2_score: {r2_score(Y_test, prediction_cbr)}')

mean_squared_error: 19.578091947160598
mean_absolute_error: 12.892028216328992
median_absolute_error: 8.496663393355817
r2_score: 0.888703218352646


In [15]:
search_CBR = CatBoostRegressor(iterations=4001,
                                    rsm=1,
                                    early_stopping_rounds=100,
                                    grow_policy='Depthwise',
                                    depth=7,
                                    loss_function='RMSE',
                                    eval_metric='RMSE',
                                    l2_leaf_reg=10,
                                    learning_rate=0.01,
                                    verbose=1000,
                                    task_type='GPU'
                                    )

search_CBR.fit(X_train, Y_train)

prediction_search_CBR = search_CBR.predict(X_test)

0:	learn: 53.7549887	total: 10.5ms	remaining: 42.2s
1000:	learn: 11.6666288	total: 7.32s	remaining: 21.9s
2000:	learn: 9.7225391	total: 15.7s	remaining: 15.7s
3000:	learn: 8.7631641	total: 25.8s	remaining: 8.61s
4000:	learn: 7.9460022	total: 36.6s	remaining: 0us


In [16]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_search_CBR))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_search_CBR)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_search_CBR)}')
print(f'r2_score: {r2_score(Y_test, prediction_search_CBR)}')

mean_squared_error: 21.933437407298484
mean_absolute_error: 13.334985030633046
median_absolute_error: 7.036916078608726
r2_score: 0.8603132291937825


## **Сделаем таблицу ля сравнения результатов**

In [17]:
comparison_table = pd.DataFrame(
    {'models': ['lr', 'search_LR', 'cbr', 'search_CBR'],
     
    'mean_squared_error': [math.sqrt(mean_squared_error(Y_test, prediction_lr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_search_LR)),
                            math.sqrt(mean_squared_error(Y_test, prediction_cbr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_search_CBR))],

    'mean_absolute_error':[mean_absolute_error(Y_test, prediction_lr),
                           mean_absolute_error(Y_test, prediction_search_LR),
                           mean_absolute_error(Y_test, prediction_cbr),
                           mean_absolute_error(Y_test, prediction_search_CBR)],\

    'median_absolute_error': [median_absolute_error(Y_test, prediction_lr),
                              median_absolute_error(Y_test, prediction_search_LR),
                              median_absolute_error(Y_test, prediction_cbr),
                              median_absolute_error(Y_test, prediction_search_CBR)],
                              
    'r2_score': [r2_score(Y_test, prediction_lr), 
                r2_score(Y_test, prediction_search_LR),
                r2_score(Y_test, prediction_cbr),
                r2_score(Y_test, prediction_search_CBR)]
    })

In [18]:
comparison_table

Unnamed: 0,models,mean_squared_error,mean_absolute_error,median_absolute_error,r2_score
0,lr,29.88322,22.017864,18.7221,0.740704
1,search_LR,29.88322,22.017864,18.7221,0.740704
2,cbr,19.578092,12.892028,8.496663,0.888703
3,search_CBR,21.933437,13.334985,7.036916,0.860313


## **Из всего этого хоршо подходит CatBoostRegressor из коробки(cbr)**

### **Обучим модель на всех данных**

In [20]:
cbr.fit(X, Y, verbose=500)

prediction_cbr = cbr.predict(X)

Learning rate set to 0.041504
0:	learn: 53.0297492	total: 1.81ms	remaining: 1.81s
500:	learn: 12.9915347	total: 1.01s	remaining: 1.01s
999:	learn: 10.6172882	total: 2.33s	remaining: 0us


In [22]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y, prediction_cbr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y, prediction_cbr)}')
print(f'median_absolute_error: {median_absolute_error(Y, prediction_cbr)}')
print(f'r2_score: {r2_score(Y, prediction_cbr)}')

mean_squared_error: 10.617287934592882
mean_absolute_error: 8.031329901045604
median_absolute_error: 6.3158566975016015
r2_score: 0.9623670510823832


In [23]:
search_CBR.fit(X, Y)

p = search_CBR.predict(X)

print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y, p))}')
print(f'mean_absolute_error: {mean_absolute_error(Y, p)}')
print(f'median_absolute_error: {median_absolute_error(Y, p)}')
print(f'r2_score: {r2_score(Y, p)}')

0:	learn: 54.4605593	total: 11.5ms	remaining: 46.1s
1000:	learn: 11.9433931	total: 8.9s	remaining: 26.7s
2000:	learn: 10.0631596	total: 17.4s	remaining: 17.4s
3000:	learn: 9.2106337	total: 25.5s	remaining: 8.51s
4000:	learn: 8.4801483	total: 33.8s	remaining: 0us
mean_squared_error: 8.480151354375472
mean_absolute_error: 5.957715642034898
median_absolute_error: 4.366988567292054
r2_score: 0.9759924264746485
