In [52]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostRegressor, Pool, metrics, cv

import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

In [53]:
sns.set(rc={'figure.figsize': (12, 9)})

In [54]:
data = pd.read_excel('Статистика_для_графиков_3_д_ТЭЦ2_УРУТ_130.xlsx', sheet_name='Лист1')

In [55]:
data = data.drop(columns=["Дата"]).iloc[:, :-3]

## **Проверим на пустые значения**

In [56]:
data.isna().sum()

Выработка электроэнергии ТА гр.130                        0
Отпуск тепла из ТО ТА гр.130                              0
Удельный расход условного топлива на отпуск э/э гр.130    0
dtype: int64

In [57]:
data.dtypes

Выработка электроэнергии ТА гр.130                        float64
Отпуск тепла из ТО ТА гр.130                              float64
Удельный расход условного топлива на отпуск э/э гр.130    float64
dtype: object

## **Делим на выборки**

In [58]:
X = data.drop(columns=['Удельный расход условного топлива на отпуск э/э гр.130'])
Y = data['Удельный расход условного топлива на отпуск э/э гр.130']

In [81]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

## **Подберём модель по которой можно будет предсказывать**

**Разберём модели CatBoostRegressor и LinearRegression**

### **LinearRegression**

In [86]:
lr = LinearRegression()

lr.fit(X_train, Y_train)

prediction_lr = lr.predict(X_test)

In [87]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_lr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_lr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_lr)}')
print(f'r2_score: {r2_score(Y_test, prediction_lr)}')

mean_squared_error: 30.334219784204844
mean_absolute_error: 24.082724049740875
median_absolute_error: 20.374034095981244
r2_score: 0.7161998602530684


In [88]:
LR_pereb = LinearRegression()

params = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [-1],
    'positive': [True, False]
}

# создадим объект GridSearchCV
search_LR = GridSearchCV(LR_pereb, params, n_jobs=-1, cv=5, refit=True, scoring='neg_mean_absolute_error')

# запустим поиск
search_LR.fit(X_train, Y_train)

# выведем наилучшие параметры
print(search_LR.best_params_)

# применяем параметры
search_LR = search_LR.best_estimator_

prediction_search_LR = search_LR.predict(X_test)

{'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}


In [89]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_search_LR))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_search_LR)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_search_LR)}')
print(f'r2_score: {r2_score(Y_test, prediction_search_LR)}')

mean_squared_error: 30.334219784204844
mean_absolute_error: 24.082724049740875
median_absolute_error: 20.374034095981244
r2_score: 0.7161998602530684


### **RandomForestRegressor**

In [90]:
rfr = RandomForestRegressor()

rfr.fit(X_train, Y_train)

prediction_rfr = rfr.predict(X_test)

In [91]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_rfr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_rfr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_rfr)}')
print(f'r2_score: {r2_score(Y_test, prediction_rfr)}')

mean_squared_error: 16.936414198137136
mean_absolute_error: 11.634547504085354
median_absolute_error: 7.540839864999725
r2_score: 0.9115312523818145


In [92]:
rfr_pereb = RandomForestRegressor()

params = {
    'n_estimators': [100, 200, 300],
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_depth': [3, 4, 6, 10],
    'min_samples_split': [2, 4, 5, 6],
    'min_samples_leaf': [1, 2, 4, 6]
}

# создадим объект GridSearchCV
search_rfr = GridSearchCV(rfr_pereb, params, n_jobs=-1, cv=5, refit=True, scoring='neg_mean_absolute_error')

# запустим поиск
search_rfr.fit(X_train, Y_train)

# выведем наилучшие параметры
print(search_rfr.best_params_)

# применяем параметры
search_rfr = search_rfr.best_estimator_

prediction_search_rfr = search_rfr.predict(X_test)

{'criterion': 'friedman_mse', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [93]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_search_rfr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_search_rfr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_search_rfr)}')
print(f'r2_score: {r2_score(Y_test, prediction_search_rfr)}')

mean_squared_error: 17.011550707234832
mean_absolute_error: 11.691540507374791
median_absolute_error: 7.9696331560739395
r2_score: 0.9107445477541116


### **CatBoostRegressor**

In [94]:
cbr = CatBoostRegressor()

cbr.fit(X_train, Y_train, verbose=500)

prediction_cbr = cbr.predict(X_test)

Learning rate set to 0.040449
0:	learn: 52.6780644	total: 1.22ms	remaining: 1.21s
500:	learn: 12.7150954	total: 473ms	remaining: 471ms
999:	learn: 10.3321747	total: 946ms	remaining: 0us


In [95]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_cbr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_cbr)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_cbr)}')
print(f'r2_score: {r2_score(Y_test, prediction_cbr)}')

mean_squared_error: 17.064845516334813
mean_absolute_error: 11.83644302898369
median_absolute_error: 8.636002482898704
r2_score: 0.9101844220316506


In [96]:
cbr_litlleBit_params = CatBoostRegressor(iterations=5001,
                                         depth=7,
                                         rsm=1,
                                         grow_policy='Depthwise',
                                         verbose=1000,
                                         task_type='GPU'
                                        )

cbr_litlleBit_params.fit(X_train, Y_train)

prediction_cbr_litlleBit_params = cbr_litlleBit_params.predict(X_test)

Learning rate set to 0.015297
0:	learn: 53.7887838	total: 7.43ms	remaining: 37.1s
1000:	learn: 8.4166479	total: 3.92s	remaining: 15.7s
2000:	learn: 6.2242053	total: 8.26s	remaining: 12.4s
3000:	learn: 5.2737594	total: 12.8s	remaining: 8.55s
4000:	learn: 4.8257370	total: 17.5s	remaining: 4.37s
5000:	learn: 4.5870797	total: 22.2s	remaining: 0us


In [97]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_cbr_litlleBit_params))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_cbr_litlleBit_params)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_cbr_litlleBit_params)}')
print(f'r2_score: {r2_score(Y_test, prediction_cbr_litlleBit_params)}')

mean_squared_error: 19.06027160403372
mean_absolute_error: 12.68418627652424
median_absolute_error: 8.587262758994399
r2_score: 0.8879517400236807


In [98]:
search_CBR = CatBoostRegressor(iterations=5001,
                                    rsm=1,
                                    early_stopping_rounds=100,
                                    grow_policy='Depthwise',
                                    depth=7,
                                    loss_function='RMSE',
                                    eval_metric='RMSE',
                                    l2_leaf_reg=10,
                                    learning_rate=0.01,
                                    verbose=1000,
                                    task_type='GPU'
                                    )

search_CBR.fit(X_train, Y_train)

prediction_search_CBR = search_CBR.predict(X_test)

0:	learn: 54.0892741	total: 7.45ms	remaining: 37.3s
1000:	learn: 12.1033936	total: 4.06s	remaining: 16.2s
2000:	learn: 9.7791996	total: 8.16s	remaining: 12.2s
3000:	learn: 8.7232786	total: 12.6s	remaining: 8.43s
4000:	learn: 7.8588536	total: 17.4s	remaining: 4.34s
5000:	learn: 7.1429142	total: 22.2s	remaining: 0us


In [99]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y_test, prediction_search_CBR))}')
print(f'mean_absolute_error: {mean_absolute_error(Y_test, prediction_search_CBR)}')
print(f'median_absolute_error: {median_absolute_error(Y_test, prediction_search_CBR)}')
print(f'r2_score: {r2_score(Y_test, prediction_search_CBR)}')

mean_squared_error: 17.79852001665036
mean_absolute_error: 11.731893549442056
median_absolute_error: 7.527233677762638
r2_score: 0.9022954633558516


## **Сделаем таблицу для сравнения результатов**

In [100]:
comparison_table = pd.DataFrame(
    {'models': ['lr', 'search_LR', 'rfr', 'search_rfr', 'cbr', 'cbr_litlleBit_params', 'search_CBR'],
     
    'mean_squared_error': [math.sqrt(mean_squared_error(Y_test, prediction_lr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_search_LR)),
                            math.sqrt(mean_squared_error(Y_test, prediction_rfr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_search_rfr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_cbr)),
                            math.sqrt(mean_squared_error(Y_test, prediction_cbr_litlleBit_params)),
                            math.sqrt(mean_squared_error(Y_test, prediction_search_CBR))],

    'mean_absolute_error':[mean_absolute_error(Y_test, prediction_lr),
                           mean_absolute_error(Y_test, prediction_search_LR),
                           mean_absolute_error(Y_test, prediction_rfr),
                           mean_absolute_error(Y_test, prediction_search_rfr),
                           mean_absolute_error(Y_test, prediction_cbr),
                           mean_absolute_error(Y_test, prediction_cbr_litlleBit_params),
                           mean_absolute_error(Y_test, prediction_search_CBR)],\

    'median_absolute_error': [median_absolute_error(Y_test, prediction_lr),
                              median_absolute_error(Y_test, prediction_search_LR),
                              median_absolute_error(Y_test, prediction_rfr),
                              median_absolute_error(Y_test, prediction_search_rfr),
                              median_absolute_error(Y_test, prediction_cbr),
                              median_absolute_error(Y_test, prediction_cbr_litlleBit_params),
                              median_absolute_error(Y_test, prediction_search_CBR)],
                              
    'r2_score': [r2_score(Y_test, prediction_lr), 
                r2_score(Y_test, prediction_search_LR),
                r2_score(Y_test, prediction_rfr),
                r2_score(Y_test, prediction_search_rfr),
                r2_score(Y_test, prediction_cbr),
                r2_score(Y_test, prediction_cbr_litlleBit_params),
                r2_score(Y_test, prediction_search_CBR)]
    })

In [101]:
comparison_table

Unnamed: 0,models,mean_squared_error,mean_absolute_error,median_absolute_error,r2_score
0,lr,30.33422,24.082724,20.374034,0.7162
1,search_LR,30.33422,24.082724,20.374034,0.7162
2,rfr,16.936414,11.634548,7.54084,0.911531
3,search_rfr,17.011551,11.691541,7.969633,0.910745
4,cbr,17.064846,11.836443,8.636002,0.910184
5,cbr_litlleBit_params,19.060272,12.684186,8.587263,0.887952
6,search_CBR,17.79852,11.731894,7.527234,0.902295


## **Из всего этого хоршо подходит CatBoostRegressor из коробки(cbr), но CatBoostRegressor с подкручеными параметрами я бы доверял больше поэтому оставим его тоже**

### **Обучим модели на всех данных**

In [22]:
cbr.fit(X, Y, verbose=500)

prediction_cbr = cbr.predict(X)

Learning rate set to 0.041504
0:	learn: 53.0297492	total: 3.65ms	remaining: 3.65s
500:	learn: 12.9915347	total: 1.94s	remaining: 1.93s
999:	learn: 10.6172882	total: 3.62s	remaining: 0us


In [23]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y, prediction_cbr))}')
print(f'mean_absolute_error: {mean_absolute_error(Y, prediction_cbr)}')
print(f'median_absolute_error: {median_absolute_error(Y, prediction_cbr)}')
print(f'r2_score: {r2_score(Y, prediction_cbr)}')

mean_squared_error: 10.617287934592882
mean_absolute_error: 8.031329901045604
median_absolute_error: 6.3158566975016015
r2_score: 0.9623670510823832


In [24]:
search_CBR.fit(X, Y)

p = search_CBR.predict(X)

0:	learn: 54.4605593	total: 24.3ms	remaining: 2m 1s
1000:	learn: 11.9433931	total: 13.5s	remaining: 53.9s
2000:	learn: 10.0631596	total: 26s	remaining: 39s
3000:	learn: 9.2106337	total: 38.8s	remaining: 25.8s
4000:	learn: 8.4801483	total: 53s	remaining: 13.2s
5000:	learn: 7.8934117	total: 1m 4s	remaining: 0us


In [25]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y, p))}')
print(f'mean_absolute_error: {mean_absolute_error(Y, p)}')
print(f'median_absolute_error: {median_absolute_error(Y, p)}')
print(f'r2_score: {r2_score(Y, p)}')

mean_squared_error: 7.8934187861997955
mean_absolute_error: 5.420095459179627
median_absolute_error: 3.9967100214070683
r2_score: 0.9791996161152253


In [26]:
cbr_litlleBit_params.fit(X, Y)

Learning rate set to 0.015627
0:	learn: 54.1521133	total: 13.6ms	remaining: 1m 8s
500:	learn: 10.4931945	total: 4.18s	remaining: 37.6s
1000:	learn: 8.9197147	total: 8.96s	remaining: 35.8s
1500:	learn: 7.8137219	total: 13.3s	remaining: 31.1s
2000:	learn: 7.1169246	total: 17.4s	remaining: 26.1s
2500:	learn: 6.6773309	total: 21.5s	remaining: 21.5s
3000:	learn: 6.3843942	total: 25.7s	remaining: 17.1s
3500:	learn: 6.1775073	total: 29.9s	remaining: 12.8s
4000:	learn: 6.0349473	total: 34.1s	remaining: 8.52s
4500:	learn: 5.9285093	total: 38.7s	remaining: 4.29s
5000:	learn: 5.8436646	total: 43.2s	remaining: 0us
0.9885998141934244


In [50]:
print(f'mean_squared_error: {math.sqrt(mean_squared_error(Y, p))}')
print(f'mean_absolute_error: {mean_absolute_error(Y, p)}')
print(f'median_absolute_error: {median_absolute_error(Y, p)}')
print(f'r2_score: {r2_score(Y, p)}')

mean_squared_error: 7.8934187861997955
mean_absolute_error: 5.420095459179627
median_absolute_error: 3.9967100214070683
r2_score: 0.9791996161152253


# **Сделаем финальное испытание для моделей**

## **Разделим датасет на фичи дат: год, месяц и день**