# Bibliotecas

In [13]:
import holidays
import pandas as pd
import plotly.express as px
import warnings

# Silence every FutureWarning for the remainder of the session.
# NOTE(review): this blanket filter also hides library deprecation notices — confirm that is intended.
warnings.filterwarnings("ignore", category=FutureWarning)

# 1. Leitura do dataframe original

In [2]:
# Load the raw demand spreadsheet, skipping the first row of the sheet
df = pd.read_excel(
    io='../TCC/datasets/demanda_original.xlsx',
    skiprows=1,
)
# Render the loaded dataframe
display(df)

Unnamed: 0,Data,Dia,Postos horários,kVA fornecido,kW fornecido,kvar indutivo,kvar capacitivo,kW recebido,kvar indutivo recebido,kvar capacitivo recebido
0,2022-01-01 00:15:00,Sábado,Fora Ponta,2400.614321,2211.84,933.12,0.0,0.0,0,0.0
1,2022-01-01 00:30:00,Sábado,Fora Ponta,2384.702608,2194.56,933.12,0.0,0.0,0,0.0
2,2022-01-01 00:45:00,Sábado,Fora Ponta,2371.392769,2194.56,898.56,0.0,0.0,0,0.0
3,2022-01-01 01:00:00,Sábado,Fora Ponta,2439.174680,2246.40,950.40,0.0,0.0,0,0.0
4,2022-01-01 01:15:00,Sábado,Fora Ponta,2448.460757,2263.68,933.12,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
40699,2023-02-28 23:00:00,Terça,Fora Ponta,2618.418962,2505.60,760.32,0.0,0.0,0,0.0
40700,2023-02-28 23:15:00,Terça,Fora Ponta,2580.338568,2471.04,743.04,0.0,0.0,0,0.0
40701,2023-02-28 23:30:00,Terça,Fora Ponta,2558.840698,2453.76,725.76,0.0,0.0,0,0.0
40702,2023-02-28 23:45:00,Terça,Fora Ponta,2509.172625,2401.92,725.76,0.0,0.0,0,0.0


# 2. Exploração e Limpeza de Dados

## 2.1. Transformar dados no formato 'object' em numéricos

In [3]:
# Inspect the dtype and non-null count of every column in the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40704 entries, 0 to 40703
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Data                      40704 non-null  datetime64[ns]
 1   Dia                       40704 non-null  object        
 2   Postos horários           40704 non-null  object        
 3   kVA fornecido             40704 non-null  float64       
 4   kW fornecido              40704 non-null  float64       
 5   kvar indutivo             40704 non-null  float64       
 6   kvar capacitivo           40704 non-null  float64       
 7   kW recebido               40704 non-null  float64       
 8   kvar indutivo recebido    40704 non-null  int64         
 9   kvar capacitivo recebido  40704 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(2)
memory usage: 3.1+ MB


In [4]:
# Show the distinct weekday labels present in the column. display() is required
# here: a bare expression that is not the last line of a cell is silently discarded.
display(df['Dia'].unique())

# Integer code for each weekday name: Monday = 0 ... Sunday = 6
dia_para_codigo = {
    'Segunda': 0,
    'Terça': 1,
    'Quarta': 2,
    'Quinta': 3,
    'Sexta': 4,
    'Sábado': 5,
    'Domingo': 6,
}

# replace() leaves values that are already encoded untouched, so re-running the
# cell is harmless. The explicit cast guarantees an integer column instead of
# relying on pandas' deprecated silent downcast of object results, and it raises
# if a label that cannot be converted to an integer is still present.
df['Dia'] = df['Dia'].replace(dia_para_codigo).astype('int64')

# Render the dataframe
display(df)

Unnamed: 0,Data,Dia,Postos horários,kVA fornecido,kW fornecido,kvar indutivo,kvar capacitivo,kW recebido,kvar indutivo recebido,kvar capacitivo recebido
0,2022-01-01 00:15:00,5,Fora Ponta,2400.614321,2211.84,933.12,0.0,0.0,0,0.0
1,2022-01-01 00:30:00,5,Fora Ponta,2384.702608,2194.56,933.12,0.0,0.0,0,0.0
2,2022-01-01 00:45:00,5,Fora Ponta,2371.392769,2194.56,898.56,0.0,0.0,0,0.0
3,2022-01-01 01:00:00,5,Fora Ponta,2439.174680,2246.40,950.40,0.0,0.0,0,0.0
4,2022-01-01 01:15:00,5,Fora Ponta,2448.460757,2263.68,933.12,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
40699,2023-02-28 23:00:00,1,Fora Ponta,2618.418962,2505.60,760.32,0.0,0.0,0,0.0
40700,2023-02-28 23:15:00,1,Fora Ponta,2580.338568,2471.04,743.04,0.0,0.0,0,0.0
40701,2023-02-28 23:30:00,1,Fora Ponta,2558.840698,2453.76,725.76,0.0,0.0,0,0.0
40702,2023-02-28 23:45:00,1,Fora Ponta,2509.172625,2401.92,725.76,0.0,0.0,0,0.0


In [5]:
# Show the distinct tariff-period labels present in the column. display() is
# required here: a bare expression that is not the last line of a cell is discarded.
display(df['Postos horários'].unique())

# Integer code for each label of the 'Postos horários' column
posto_para_codigo = {'Fora Ponta': 0, 'Ponta': 1}

# replace() leaves values that are already encoded untouched, so re-running the
# cell is harmless. The explicit cast guarantees an integer column instead of
# relying on pandas' deprecated silent downcast of object results, and it raises
# if a label that cannot be converted to an integer is still present.
df['Postos horários'] = df['Postos horários'].replace(posto_para_codigo).astype('int64')

# Render the dataframe
display(df)

Unnamed: 0,Data,Dia,Postos horários,kVA fornecido,kW fornecido,kvar indutivo,kvar capacitivo,kW recebido,kvar indutivo recebido,kvar capacitivo recebido
0,2022-01-01 00:15:00,5,0,2400.614321,2211.84,933.12,0.0,0.0,0,0.0
1,2022-01-01 00:30:00,5,0,2384.702608,2194.56,933.12,0.0,0.0,0,0.0
2,2022-01-01 00:45:00,5,0,2371.392769,2194.56,898.56,0.0,0.0,0,0.0
3,2022-01-01 01:00:00,5,0,2439.174680,2246.40,950.40,0.0,0.0,0,0.0
4,2022-01-01 01:15:00,5,0,2448.460757,2263.68,933.12,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
40699,2023-02-28 23:00:00,1,0,2618.418962,2505.60,760.32,0.0,0.0,0,0.0
40700,2023-02-28 23:15:00,1,0,2580.338568,2471.04,743.04,0.0,0.0,0,0.0
40701,2023-02-28 23:30:00,1,0,2558.840698,2453.76,725.76,0.0,0.0,0,0.0
40702,2023-02-28 23:45:00,1,0,2509.172625,2401.92,725.76,0.0,0.0,0,0.0


## 2.2. Buscar dados inválidos como NaNs e colunas constituídas de zeros

In [6]:
# Number of missing values in each column
df.isna().sum(axis=0)

Data                        0
Dia                         0
Postos horários             0
kVA fornecido               0
kW fornecido                0
kvar indutivo               0
kvar capacitivo             0
kW recebido                 0
kvar indutivo recebido      0
kvar capacitivo recebido    0
dtype: int64

In [7]:
# Summary statistics rounded to two decimals, one row per column
df.describe().round(2).T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Data,40704.0,2022-08-01 00:07:30,2022-01-01 00:15:00,2022-04-17 00:11:15,2022-08-01 00:07:30,2022-11-15 00:03:45,2023-03-01 00:00:00,
Dia,40704.0,3.0,0.0,1.0,3.0,5.0,6.0,2.01
Postos horários,40704.0,0.09,0.0,0.0,0.0,0.0,1.0,0.28
kVA fornecido,40704.0,3314.73,0.0,2530.5,3322.44,4066.72,13138.4,1252.73
kW fornecido,40704.0,3064.68,0.0,2263.68,3110.4,3853.44,12666.24,1282.27
kvar indutivo,40704.0,1185.97,0.0,1019.52,1192.32,1382.4,3697.92,337.32
kvar capacitivo,40704.0,0.01,0.0,0.0,0.0,0.0,120.96,0.9
kW recebido,40704.0,2.21,0.0,0.0,0.0,0.0,1416.96,34.89
kvar indutivo recebido,40704.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kvar capacitivo recebido,40704.0,5.43,0.0,0.0,0.0,0.0,1399.68,66.62


In [8]:
# 'kvar indutivo recebido' contains only zeros, so it is removed
df = df.drop(labels='kvar indutivo recebido', axis='columns')

## 2.3. Trabalhando o index

In [9]:
# Rename 'Data' to 'Timestamp', make sure it is datetime-typed, and use it as the index
df = (
    df.rename(columns={'Data': 'Timestamp'})
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
)

# Render the dataframe
display(df)

Unnamed: 0_level_0,Dia,Postos horários,kVA fornecido,kW fornecido,kvar indutivo,kvar capacitivo,kW recebido,kvar capacitivo recebido
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-01-01 00:15:00,5,0,2400.614321,2211.84,933.12,0.0,0.0,0.0
2022-01-01 00:30:00,5,0,2384.702608,2194.56,933.12,0.0,0.0,0.0
2022-01-01 00:45:00,5,0,2371.392769,2194.56,898.56,0.0,0.0,0.0
2022-01-01 01:00:00,5,0,2439.174680,2246.40,950.40,0.0,0.0,0.0
2022-01-01 01:15:00,5,0,2448.460757,2263.68,933.12,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2023-02-28 23:00:00,1,0,2618.418962,2505.60,760.32,0.0,0.0,0.0
2023-02-28 23:15:00,1,0,2580.338568,2471.04,743.04,0.0,0.0,0.0
2023-02-28 23:30:00,1,0,2558.840698,2453.76,725.76,0.0,0.0,0.0
2023-02-28 23:45:00,1,0,2509.172625,2401.92,725.76,0.0,0.0,0.0


In [11]:
# Aggregate the readings into hourly bins, taking the median of every column per bin.
# The median is also applied to the coded 'Dia' and 'Postos horários' columns,
# which come out as floats.
# NOTE(review): with the default bin edges a reading stamped hh:00 falls into the
# bin that starts at hh:00 — confirm this matches how the source timestamps its intervals.
df = df.resample(rule='H').median()

# Render the resulting dataframe
display(df)

Unnamed: 0_level_0,Dia,Postos horários,kVA fornecido,kW fornecido,kvar indutivo,kvar capacitivo,kW recebido,kvar capacitivo recebido
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-01-01 00:00:00,5.0,0.0,2384.702608,2194.56,933.12,0.0,0.0,0.0
2022-01-01 01:00:00,5.0,0.0,2448.460757,2263.68,941.76,0.0,0.0,0.0
2022-01-01 02:00:00,5.0,0.0,2650.551481,2453.76,1002.24,0.0,0.0,0.0
2022-01-01 03:00:00,5.0,0.0,2607.329616,2410.56,993.60,0.0,0.0,0.0
2022-01-01 04:00:00,5.0,0.0,2560.792438,2367.36,984.96,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2023-02-28 20:00:00,1.0,1.0,2514.990271,2410.56,725.76,0.0,0.0,0.0
2023-02-28 21:00:00,1.0,0.0,2600.296644,2496.96,725.76,0.0,0.0,0.0
2023-02-28 22:00:00,1.0,0.0,2638.315348,2531.52,743.04,0.0,0.0,0.0
2023-02-28 23:00:00,1.0,0.0,2569.589633,2462.40,734.40,0.0,0.0,0.0


In [12]:
# Line chart of the three supplied-demand columns against the timestamp index.
# NOTE(review): the title mentions only kVA although kW and kvar series are plotted too.
fig = px.line(
    data_frame=df,
    x=df.index,
    y=['kVA fornecido', 'kW fornecido', 'kvar indutivo'],
    title='Demanda solicitada (kVA)',
)

# Render the figure
fig.show()

# 3. Engenharia de Características (Feature Engineering)

## 3.1. Criar colunas de feriados

In [15]:
# Build the holidays.BR calendar for every year present in the index.
# range() excludes its stop value, so "+ 1" is required to include the last year
# (and to avoid an empty range when all the data falls within a single year).
# NOTE(review): the holidays library may add missing years lazily on lookup;
# the explicit range no longer depends on that behaviour.
feriados = holidays.BR(
    years=range(df.index.year.min(), df.index.year.max() + 1)
)
# Flag each timestamp with 1 when its calendar date is in the holiday calendar, else 0
df['Holiday'] = df.index.map(lambda x: 1 if x.date() in feriados else 0)
# Render the resulting dataframe
display(df)

Unnamed: 0_level_0,Dia,Postos horários,kVA fornecido,kW fornecido,kvar indutivo,kvar capacitivo,kW recebido,kvar capacitivo recebido,Holiday
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-01 00:00:00,5.0,0.0,2384.702608,2194.56,933.12,0.0,0.0,0.0,1
2022-01-01 01:00:00,5.0,0.0,2448.460757,2263.68,941.76,0.0,0.0,0.0,1
2022-01-01 02:00:00,5.0,0.0,2650.551481,2453.76,1002.24,0.0,0.0,0.0,1
2022-01-01 03:00:00,5.0,0.0,2607.329616,2410.56,993.60,0.0,0.0,0.0,1
2022-01-01 04:00:00,5.0,0.0,2560.792438,2367.36,984.96,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
2023-02-28 20:00:00,1.0,1.0,2514.990271,2410.56,725.76,0.0,0.0,0.0,0
2023-02-28 21:00:00,1.0,0.0,2600.296644,2496.96,725.76,0.0,0.0,0.0,0
2023-02-28 22:00:00,1.0,0.0,2638.315348,2531.52,743.04,0.0,0.0,0.0,0
2023-02-28 23:00:00,1.0,0.0,2569.589633,2462.40,734.40,0.0,0.0,0.0,0


# 4. Salvar o dataframe em um arquivo .csv

In [17]:
# Keep only the columns below, in this order; every column not listed is dropped
df = df[
    [
        'kVA fornecido',
        'kW fornecido',
        'kvar indutivo',
        'Dia',
        'Postos horários',
        'Holiday',
    ]
]

In [18]:
# Write the prepared dataframe to a CSV file
df.to_csv(path_or_buf='../TCC/datasets/demanda.csv')