In [None]:
import pandas as pd
import requests
import datetime as dt
from typing import List

In [None]:
class ons_data:
    def __init__(self, freq: str, ano_inicio: int, ano_fim: int, idreg: str=None):
        self.freq = freq
        self.ano_inicio = ano_inicio
        self.ano_fim = ano_fim
        self.idreg = idreg
        self.data = pd.DataFrame()
        self.missing_dates = []
        self.data_dt_inserted = pd.DataFrame()
        self.data_dir = "../../../data/"

    def read(self) -> pd.DataFrame:
        """Função para ler arquivos "csv" já presentes no diretório de dados.

        Args:
            idreg (str): sub-região. ['N', 'NE', 'S', 'SE']

        Returns:
            pd.DataFrame: série de carga elétrica no período entre ano_inicio e ano_fim.
        """
        if self.freq == "hourly":
            path = "".join([self.data_dir,"hourly_load.csv"])
        elif self.freq == "daily":
            path = "".join([self.data_dir,"daily_load.csv"])
        df = pd.read_csv(path, sep=";", decimal=",")
        if not self.idreg:
            idreg = df["id_reg"].unique()
        else:
            idreg = [self.idreg]
        df = df[df["id_reg"].isin(idreg)]
        df.set_index("date", inplace=True)
        self.data = df
        return df

    def update(self, printer=False, write=False):

        if self.freq == "hourly":
            url = "https://ons-dl-prod-opendata.s3.amazonaws.com/dataset/curva-carga-ho/CURVA_CARGA_{}.csv"
            date_format = "%Y-%m-%d %H:%M:%S"
        elif self.freq == "daily":
            url = "https://ons-dl-prod-opendata.s3.amazonaws.com/dataset/carga_energia_di/CARGA_ENERGIA_{}.csv"
            date_format = "%Y-%m-%d"
        else:
            raise Exception("Frequência não reconhecida. Utilize 'hourly' ou 'daily'.")
        get0 = requests.get(url.format(self.ano_inicio)).status_code # verify = False (autenticação)
        getn = requests.get(url.format(self.ano_fim)).status_code 
        if (get0 == 200) and (getn == 200): # 200: página (ano) disponível
            # concatenar arquivos de cada ano em um único dataframe
            df = pd.DataFrame()
            for ano in range(self.ano_inicio, self.ano_fim + 1):
                if printer:
                    print(f"Lendo ano {ano}...")
                df2 = pd.read_csv(url.format(ano), sep = ";")
                df = pd.concat([df, df2])
            df.columns = ["id_reg", "desc_reg", "date", "load_mwmed"]
            df.loc[:, "date"] = pd.to_datetime(df.loc[:, "date"], format = date_format)
            df.sort_values(by = "date", inplace = True)
            df.set_index("date", inplace=True)
            if write:
                full_path = "".join([self.data_dir,f"{self.freq}_load.csv"])
                df.to_csv(full_path, sep=";", decimal=",")
            self.data = df
            return df
        else:
            print("Ano não disponível.")
    
    def check_date_column(self, _freq: str, printer=False) -> List[dt.datetime]:
        """Verifica datas faltantes no intervalo

        Args:
            _freq (str): frequência da série
            printer (bool, optional): informa as datas faltantes em tela. Defaults to False.

        Returns:
            List[dt.datetime]: lista de datas faltantes
        """
        date_col = self.data.reset_index()["date"]
        missing_dates = pd.date_range(date_col.min(), date_col.max(), freq=_freq).difference(date_col)
        missing_list = missing_dates.to_list()
        if printer:
            print("Datas faltantes:\n", missing_list)
        self.missing_dates = missing_list
        return missing_list
    
    def insert_missing_dates(self, printer=False):
        y = self.data.reset_index()
        missing = pd.DataFrame(self.missing_dates, columns=["date"])
        y = pd.concat([y, missing], ignore_index=True)
        y.loc[:,"date"] = pd.to_datetime(y.loc[:,"date"])
        y.set_index("date", inplace=True)
        y.sort_index(inplace=True)
        faltantes = check_date_column(y.index, _freq='h')
        if printer:
            print("Datas faltantes após transformação:", faltantes)
        self.data_dt_inserted = y
        return y

In [None]:
data = ons_data('daily', 2000, 2023, idreg="S")

In [8]:
df = data.update(write=True, printer=True)

Lendo ano 2000...
Lendo ano 2001...
Lendo ano 2002...
Lendo ano 2003...
Lendo ano 2004...
Lendo ano 2005...
Lendo ano 2006...
Lendo ano 2007...
Lendo ano 2008...
Lendo ano 2009...
Lendo ano 2010...


: 

In [61]:
df2 = data.check_date_column(_freq='d', printer=True)

Datas faltantes:
 [Timestamp('2000-10-08 00:00:00'), Timestamp('2001-10-14 00:00:00'), Timestamp('2002-11-03 00:00:00'), Timestamp('2003-10-19 00:00:00'), Timestamp('2004-11-02 00:00:00'), Timestamp('2005-10-16 00:00:00'), Timestamp('2006-11-05 00:00:00'), Timestamp('2007-10-14 00:00:00'), Timestamp('2008-10-19 00:00:00'), Timestamp('2009-10-18 00:00:00'), Timestamp('2010-10-17 00:00:00'), Timestamp('2011-10-16 00:00:00'), Timestamp('2012-10-21 00:00:00'), Timestamp('2013-10-20 00:00:00')]


In [62]:
df3 = data.insert_missing_dates()

  y.loc[:,"date"] = pd.to_datetime(y.loc[:,"date"])


In [63]:
data.missing_dates

[Timestamp('2000-10-08 00:00:00'),
 Timestamp('2001-10-14 00:00:00'),
 Timestamp('2002-11-03 00:00:00'),
 Timestamp('2003-10-19 00:00:00'),
 Timestamp('2004-11-02 00:00:00'),
 Timestamp('2005-10-16 00:00:00'),
 Timestamp('2006-11-05 00:00:00'),
 Timestamp('2007-10-14 00:00:00'),
 Timestamp('2008-10-19 00:00:00'),
 Timestamp('2009-10-18 00:00:00'),
 Timestamp('2010-10-17 00:00:00'),
 Timestamp('2011-10-16 00:00:00'),
 Timestamp('2012-10-21 00:00:00'),
 Timestamp('2013-10-20 00:00:00')]

In [7]:
def check_date_column(date_col: List[dt.datetime], _freq: str, printer=False) -> print:

    missing_dates = pd.date_range(date_col.min(), date_col.max(), freq=_freq).difference(date_col)
    missing_list = missing_dates.to_list()
    if printer:
        print("Datas faltantes:\n", missing_list)
    return missing_list

In [12]:
missing_dates = check_date_column(df.index, _freq="h", printer=True)

Datas faltantes:
 [Timestamp('2000-10-08 00:00:00'), Timestamp('2001-10-14 00:00:00'), Timestamp('2002-11-03 00:00:00'), Timestamp('2003-10-19 00:00:00'), Timestamp('2004-11-02 00:00:00'), Timestamp('2005-10-16 00:00:00'), Timestamp('2006-11-05 00:00:00'), Timestamp('2007-10-14 00:00:00'), Timestamp('2008-10-19 00:00:00'), Timestamp('2009-10-18 00:00:00'), Timestamp('2010-10-17 00:00:00'), Timestamp('2011-10-16 00:00:00'), Timestamp('2012-10-21 00:00:00'), Timestamp('2013-10-20 00:00:00')]


In [13]:
def insert_missing_dates(x: pd.DataFrame, missing_dates: list, date_column_name: str):
    y = x.reset_index()
    missing = pd.DataFrame(missing_dates, columns=[date_column_name])
    y = pd.concat([y, missing], ignore_index=True)
    y.loc[:,"date"] = pd.to_datetime(y.loc[:,"date"])
    y.set_index("date", inplace=True)
    y.sort_index(inplace=True)
    faltantes = check_date_column(y.index, _freq='h')
    print("Datas faltantes após transformação:", faltantes)
    return y

In [14]:
df2 = insert_missing_dates(df, missing_dates=missing_dates, date_column_name="date")

Datas faltantes após transformação: []


  y.loc[:,"date"] = pd.to_datetime(y.loc[:,"date"])
