In [80]:
import pandas as pd
import glob
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import pacf

In [81]:
from pmdarima.arima import auto_arima

In [82]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

In [83]:
data = pd.read_csv('consumo_mensal_energia.csv', sep=",", encoding='UTF-8', low_memory=False)
data.shape

(379870, 6)

In [84]:
data.head()

Unnamed: 0,id,referencia,cliente_id,tipo_cliente,consumo_kwh,Estado_Sigla
0,1264499,2021-08-01,21824,PJ,390.0,MG
1,1253013,2021-09-01,21824,PJ,339.0,MG
2,1089580,2021-10-01,21824,PJ,513.0,MG
3,1043501,2021-11-01,21824,PJ,500.0,MG
4,964456,2021-12-01,21824,PJ,521.0,MG


In [85]:
data['referencia'] = pd.to_datetime(data['referencia'].str.strip(), format='%Y-%m-%d')

In [86]:
data.dtypes

id                       int64
referencia      datetime64[ns]
cliente_id               int64
tipo_cliente            object
consumo_kwh            float64
Estado_Sigla            object
dtype: object

In [87]:
data.head()

Unnamed: 0,id,referencia,cliente_id,tipo_cliente,consumo_kwh,Estado_Sigla
0,1264499,2021-08-01,21824,PJ,390.0,MG
1,1253013,2021-09-01,21824,PJ,339.0,MG
2,1089580,2021-10-01,21824,PJ,513.0,MG
3,1043501,2021-11-01,21824,PJ,500.0,MG
4,964456,2021-12-01,21824,PJ,521.0,MG


In [88]:
data.columns

Index(['id', 'referencia', 'cliente_id', 'tipo_cliente', 'consumo_kwh',
       'Estado_Sigla'],
      dtype='object')

In [89]:
data = data[['referencia', 'cliente_id', 'tipo_cliente', 'consumo_kwh']]
data = data[data['consumo_kwh'].notna()]
data.head()

Unnamed: 0,referencia,cliente_id,tipo_cliente,consumo_kwh
0,2021-08-01,21824,PJ,390.0
1,2021-09-01,21824,PJ,339.0
2,2021-10-01,21824,PJ,513.0
3,2021-11-01,21824,PJ,500.0
4,2021-12-01,21824,PJ,521.0


In [90]:
a = data.groupby(['cliente_id']).agg(N = ('referencia', 'count')).reset_index()
a.head()

Unnamed: 0,cliente_id,N
0,10,22
1,46,22
2,62,8
3,180,22
4,186,22


In [91]:
a = a.sample(frac = 0.01)
a = a[(a['N'] >= 6)]
a = a.reset_index()
a = a.drop('index', axis=1)

In [92]:
a.head()

Unnamed: 0,cliente_id,N
0,73650,20
1,79651,18
2,82084,19
3,43945,22
4,110999,15


In [93]:
a['N'].describe()

count    205.000000
mean      18.000000
std        4.030521
min        7.000000
25%       15.000000
50%       19.000000
75%       22.000000
max       25.000000
Name: N, dtype: float64

In [94]:
a.shape

(205, 2)

In [95]:
data = data.merge(a, on=['cliente_id'], how='left')
data = data[data['N'].notna()]
data.shape

(3690, 5)

In [96]:
data.head()

Unnamed: 0,referencia,cliente_id,tipo_cliente,consumo_kwh,N
3411,2022-02-01,35118,PJ,211.0,16.0
3412,2022-03-01,35118,PJ,233.0,16.0
3413,2022-04-01,35118,PJ,211.0,16.0
3414,2022-05-01,35118,PJ,211.0,16.0
3415,2022-06-01,35118,PJ,229.0,16.0


In [97]:
data.tail()

Unnamed: 0,referencia,cliente_id,tipo_cliente,consumo_kwh,N
379217,2023-01-01,18556,PJ,856.0,22.0
379218,2023-02-01,18556,PJ,917.0,22.0
379219,2023-03-01,18556,PJ,895.0,22.0
379220,2023-04-01,18556,PJ,876.0,22.0
379221,2023-05-01,18556,PJ,974.0,22.0


In [98]:
w = len(a)
w

205

In [None]:
for j in range(0, w):
    data24 = data[(data['cliente_id'] == a.loc[j, 'cliente_id'])]
    uc_cliente = data24['cliente_id'].unique()
    data24 = data24[['referencia', 'consumo_kwh']]
    data24 = data24.set_index(['referencia'])

    TEST_SIZE = 1
    train, test = data24.iloc[:-TEST_SIZE], data24.iloc[-TEST_SIZE:]
    x_train, x_test = np.array(range(train.shape[0])), np.array(range(train.shape[0], data24.shape[0]))
    train.shape, x_train.shape, test.shape, x_test.shape

    data24 = data24.reset_index()
    data24['Tipo'] = 'Real'
    data24['Var_pct'] = 0.0
    data24['MAPE'] = 0.0
    data24['RMSE'] = 0.0
    data24['MAE'] = 0.0
    data24['uc'] = uc_cliente[0]

    model = auto_arima(train, start_p=1, start_q=1,
                        test='adf',
                        max_p=5, max_q=5,
                        m=1,             
                        d=1,          
                        seasonal=False,   
                        start_P=0, 
                        D=None, 
                        trace=True,
                        error_action='ignore',  
                        suppress_warnings=True, 
                        stepwise=True)

    prediction = model.predict(n_periods=TEST_SIZE)

    b = prediction.reset_index()

    b['Tipo'] = 'Previsto'
    b['Var_pct'] = ((b[0] - data24.loc[(data24.shape[0]-1), 'consumo_kwh']) / data24.loc[(data24.shape[0]-1), 'consumo_kwh']) * 100
    b['MAPE'] = mean_absolute_percentage_error(test.consumo_kwh, prediction)
    b['RMSE'] = mean_squared_error(test.consumo_kwh, prediction, squared=False)
    b['MAE'] = mean_absolute_error(test.consumo_kwh, prediction)
    b['uc'] = uc_cliente[0]
    b = b.rename(columns={'index': 'referencia', 0: 'consumo_kwh'})

    data24 = pd.concat([data24, b])

    if j == 0:
       df_A = data24
    else:
       df_A = pd.concat([df_A, data24])



In [104]:
df_A.shape

(3895, 8)

In [105]:
df_A.head()

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc
0,2021-10-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
1,2021-11-01 00:00:00,208.0,Real,0.0,0.0,0.0,0.0,73650
2,2021-12-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
3,2022-01-01 00:00:00,243.0,Real,0.0,0.0,0.0,0.0,73650
4,2022-02-01 00:00:00,207.0,Real,0.0,0.0,0.0,0.0,73650


In [106]:
df_A.tail()

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc
14,2023-02-01 00:00:00,164.0,Real,0.0,0.0,0.0,0.0,95890
15,2023-03-01 00:00:00,226.0,Real,0.0,0.0,0.0,0.0,95890
16,2023-04-01 00:00:00,158.0,Real,0.0,0.0,0.0,0.0,95890
17,2023-05-01 00:00:00,15.0,Real,0.0,0.0,0.0,0.0,95890
0,2023-05-01 00:00:00,158.0,Previsto,953.333333,9.533333,143.0,143.0,95890


In [107]:
Previsto = df_A[(df_A['Tipo'] == 'Previsto')]
Previsto.shape

(205, 8)

In [None]:
Previsto['Real'] = Previsto['consumo_kwh'] - (Previsto['consumo_kwh'] * Previsto['MAPE'])
Previsto['Real'] = round(Previsto['Real'], 0)

In [109]:
Previsto.tail()

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-05-01 00:00:00,209.682405,Previsto,16.490225,0.164902,29.682405,29.682405,62230,175.0
0,2023-05-01 00:00:00,208.318111,Previsto,61.486908,0.614869,79.318111,79.318111,158221,80.0
0,2023-05-01 00:00:00,222.0,Previsto,6.220096,0.062201,13.0,13.0,95219,208.0
0,2023-05-01 00:00:00,370.898051,Previsto,7.819201,0.078192,26.898051,26.898051,76059,342.0
0,2023-05-01 00:00:00,158.0,Previsto,953.333333,9.533333,143.0,143.0,95890,-1348.0


In [111]:
Previsto.iloc[:, 0:9].isnull().sum()

referencia     0
consumo_kwh    0
Tipo           0
Var_pct        1
MAPE           0
RMSE           0
MAE            0
uc             0
Real           0
dtype: int64

In [112]:
#filtered_df = df[df['var2'].isnull()]
Previsto[Previsto['Var_pct'].isnull()]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-05-01 00:00:00,0.0,Previsto,,0.0,0.0,0.0,4560,0.0


In [113]:
Previsto[Previsto.Real == Previsto.Real.max()]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-05-01 00:00:00,9800.0,Previsto,3.813559,0.038136,360.0,360.0,6808,9426.0


In [119]:
df_A.head()

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc
0,2021-10-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
1,2021-11-01 00:00:00,208.0,Real,0.0,0.0,0.0,0.0,73650
2,2021-12-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
3,2022-01-01 00:00:00,243.0,Real,0.0,0.0,0.0,0.0,73650
4,2022-02-01 00:00:00,207.0,Real,0.0,0.0,0.0,0.0,73650


In [120]:
df_A[(df_A['uc'] == 73650)]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc
0,2021-10-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
1,2021-11-01 00:00:00,208.0,Real,0.0,0.0,0.0,0.0,73650
2,2021-12-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
3,2022-01-01 00:00:00,243.0,Real,0.0,0.0,0.0,0.0,73650
4,2022-02-01 00:00:00,207.0,Real,0.0,0.0,0.0,0.0,73650
5,2022-03-01 00:00:00,221.0,Real,0.0,0.0,0.0,0.0,73650
6,2022-04-01 00:00:00,211.0,Real,0.0,0.0,0.0,0.0,73650
7,2022-05-01 00:00:00,219.0,Real,0.0,0.0,0.0,0.0,73650
8,2022-06-01 00:00:00,223.0,Real,0.0,0.0,0.0,0.0,73650
9,2022-07-01 00:00:00,257.0,Real,0.0,0.0,0.0,0.0,73650


In [121]:
Previsto[Previsto.Real == Previsto.Real.min()]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-04-01 00:00:00,665.0,Previsto,4056.25,40.5625,649.0,649.0,97430,-26309.0


In [122]:
df_A[(df_A['uc'] == 73650)]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc
0,2021-10-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
1,2021-11-01 00:00:00,208.0,Real,0.0,0.0,0.0,0.0,73650
2,2021-12-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
3,2022-01-01 00:00:00,243.0,Real,0.0,0.0,0.0,0.0,73650
4,2022-02-01 00:00:00,207.0,Real,0.0,0.0,0.0,0.0,73650
5,2022-03-01 00:00:00,221.0,Real,0.0,0.0,0.0,0.0,73650
6,2022-04-01 00:00:00,211.0,Real,0.0,0.0,0.0,0.0,73650
7,2022-05-01 00:00:00,219.0,Real,0.0,0.0,0.0,0.0,73650
8,2022-06-01 00:00:00,223.0,Real,0.0,0.0,0.0,0.0,73650
9,2022-07-01 00:00:00,257.0,Real,0.0,0.0,0.0,0.0,73650


In [123]:
Previsto[(Previsto['uc'] == 73650)]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-05-01 00:00:00,230.0,Previsto,1.769912,0.017699,4.0,4.0,73650,226.0


In [125]:
Previsto2 = Previsto[Previsto['Var_pct'].notnull()]

In [126]:
Previsto2 = Previsto2.replace([np.inf, -np.inf], np.nan)

In [127]:
Previsto2.describe()

Unnamed: 0,consumo_kwh,Var_pct,MAPE,RMSE,MAE,uc,Real
count,204.0,203.0,204.0,204.0,204.0,204.0,204.0
mean,713.90507,31.187508,0.757234,145.282631,145.282631,76387.421569,440.686275
std,1523.595593,294.508788,5.322518,459.061463,459.061463,49383.117486,2267.553209
min,0.0,-100.0,0.0,0.0,0.0,2570.0,-26309.0
25%,140.160822,-7.136454,0.052468,10.715761,10.715761,37562.75,116.5
50%,212.575747,3.298887,0.121181,26.949026,26.949026,69399.5,182.5
75%,384.25,15.420796,0.22228,68.131582,68.131582,105303.25,316.5
max,9800.0,4056.25,64.0,4085.0,4085.0,234264.0,9426.0


In [128]:
Previsto2.head(10)

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-05-01 00:00:00,230.0,Previsto,1.769912,0.017699,4.0,4.0,73650,226.0
0,2023-05-01 00:00:00,206.045088,Previsto,7.31515,0.073151,14.045088,14.045088,79651,191.0
0,2023-05-01 00:00:00,223.485229,Previsto,17.623805,0.176238,33.485229,33.485229,82084,184.0
0,2023-05-01 00:00:00,169.0,Previsto,11.184211,0.111842,17.0,17.0,43945,150.0
0,2023-05-01 00:00:00,272.0,Previsto,-4.225352,0.042254,12.0,12.0,110999,261.0
0,2023-05-01 00:00:00,76.0,Previsto,4.109589,0.041096,3.0,3.0,121514,73.0
0,2023-05-01 00:00:00,104.0,Previsto,-17.460317,0.174603,22.0,22.0,85230,86.0
0,2023-05-01 00:00:00,415.329623,Previsto,16.33883,0.163388,58.329623,58.329623,46476,347.0
0,2023-05-01 00:00:00,142.0,Previsto,-2.739726,0.027397,4.0,4.0,84388,138.0
0,2023-05-01 00:00:00,170.124174,Previsto,9.053958,0.09054,14.124174,14.124174,74845,155.0


In [129]:
df_A[(df_A['uc'] == 73650)]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc
0,2021-10-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
1,2021-11-01 00:00:00,208.0,Real,0.0,0.0,0.0,0.0,73650
2,2021-12-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
3,2022-01-01 00:00:00,243.0,Real,0.0,0.0,0.0,0.0,73650
4,2022-02-01 00:00:00,207.0,Real,0.0,0.0,0.0,0.0,73650
5,2022-03-01 00:00:00,221.0,Real,0.0,0.0,0.0,0.0,73650
6,2022-04-01 00:00:00,211.0,Real,0.0,0.0,0.0,0.0,73650
7,2022-05-01 00:00:00,219.0,Real,0.0,0.0,0.0,0.0,73650
8,2022-06-01 00:00:00,223.0,Real,0.0,0.0,0.0,0.0,73650
9,2022-07-01 00:00:00,257.0,Real,0.0,0.0,0.0,0.0,73650


In [130]:
Previsto3 = Previsto2[(Previsto2['Var_pct'] > 22.565129)]
Previsto3.shape

(32, 9)

In [131]:
Previsto3.head(10)

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-05-01 00:00:00,925.0,Previsto,71.933086,0.719331,387.0,387.0,11096,260.0
0,2023-05-01 00:00:00,616.0,Previsto,34.204793,0.342048,157.0,157.0,51229,405.0
0,2023-05-01 00:00:00,532.875,Previsto,74.142157,0.741422,226.875,226.875,206865,138.0
0,2023-05-01 00:00:00,95.0,Previsto,37.681159,0.376812,26.0,26.0,103051,59.0
0,2023-05-01 00:00:00,383.0,Previsto,22.75641,0.227564,71.0,71.0,49771,296.0
0,2023-05-01 00:00:00,7680.0,Previsto,50.0,0.5,2560.0,2560.0,8691,3840.0
0,2023-05-01 00:00:00,106.0,Previsto,51.428571,0.514286,36.0,36.0,130279,51.0
0,2023-05-01 00:00:00,32.0,Previsto,23.076923,0.230769,6.0,6.0,54974,25.0
0,2023-05-01 00:00:00,991.0,Previsto,24.030038,0.2403,192.0,192.0,89369,753.0
0,2023-05-01 00:00:00,7.0,Previsto,133.333333,1.333333,4.0,4.0,32596,-2.0


In [132]:
Previsto4 = Previsto2[(Previsto2['Var_pct'] < -22.565129)]
Previsto4.shape

(17, 9)

In [133]:
Previsto4.head(10)

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc,Real
0,2023-05-01 00:00:00,77.6436,Previsto,-37.384193,0.373842,46.3564,46.3564,83545,49.0
0,2023-04-01 00:00:00,206.0,Previsto,-46.354167,0.463542,178.0,178.0,234264,111.0
0,2023-05-01 00:00:00,115.176384,Previsto,-22.700413,0.227004,33.823616,33.823616,106327,89.0
0,2023-05-01 00:00:00,975.0,Previsto,-66.075157,0.660752,1899.0,1899.0,201348,331.0
0,2023-05-01 00:00:00,4680.0,Previsto,-39.378238,0.393782,3040.0,3040.0,46565,2837.0
0,2023-03-01 00:00:00,229.0,Previsto,-32.844575,0.328446,112.0,112.0,39527,154.0
0,2023-05-01 00:00:00,5382.0,Previsto,-24.685139,0.246851,1764.0,1764.0,14636,4053.0
0,2023-05-01 00:00:00,213.151493,Previsto,-25.731187,0.257312,73.848507,73.848507,40345,158.0
0,2023-05-01 00:00:00,35.048214,Previsto,-32.599589,0.325996,16.951786,16.951786,86465,24.0
0,2023-05-01 00:00:00,8.0,Previsto,-95.180723,0.951807,158.0,158.0,78597,0.0


In [134]:
df_A[(df_A['uc'] == 73650)]

Unnamed: 0,referencia,consumo_kwh,Tipo,Var_pct,MAPE,RMSE,MAE,uc
0,2021-10-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
1,2021-11-01 00:00:00,208.0,Real,0.0,0.0,0.0,0.0,73650
2,2021-12-01 00:00:00,206.0,Real,0.0,0.0,0.0,0.0,73650
3,2022-01-01 00:00:00,243.0,Real,0.0,0.0,0.0,0.0,73650
4,2022-02-01 00:00:00,207.0,Real,0.0,0.0,0.0,0.0,73650
5,2022-03-01 00:00:00,221.0,Real,0.0,0.0,0.0,0.0,73650
6,2022-04-01 00:00:00,211.0,Real,0.0,0.0,0.0,0.0,73650
7,2022-05-01 00:00:00,219.0,Real,0.0,0.0,0.0,0.0,73650
8,2022-06-01 00:00:00,223.0,Real,0.0,0.0,0.0,0.0,73650
9,2022-07-01 00:00:00,257.0,Real,0.0,0.0,0.0,0.0,73650
