In [434]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.subplots as sp

In [435]:
DATAPATH2023 = 'data/INMET_S_RS_B807_PORTO ALEGRE- BELEM NOVO_01-01-2023_A_31-12-2023.CSV'
DATAPATH2024 = 'data/INMET_S_RS_B807_PORTO ALEGRE- BELEM NOVO_01-01-2024_A_29-02-2024.CSV'
DATACLEANED =  'data/Data_cleaned.csv'

In [436]:
COLUMN_MAPPING = {
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'total_precip_mm',
    'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)': 'station_pressure_mb',
    'PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)': 'max_pressure_last_hour_mb',
    'PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)': 'min_pressure_last_hour_mb',
    'RADIACAO GLOBAL (Kj/m²)': 'global_radiation_kjm2',
    'TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)': 'air_temp_c',
    'TEMPERATURA DO PONTO DE ORVALHO (°C)': 'dew_point_temp_c',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'max_temp_last_hour_c',
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'min_temp_last_hour_c',
    'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)': 'max_dew_point_last_hour_c',
    'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)': 'min_dew_point_last_hour_c',
    'UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)': 'max_humidity_last_hour_pct',
    'UMIDADE REL. MIN. NA HORA ANT. (AUT) (%)': 'min_humidity_last_hour_pct',
    'UMIDADE RELATIVA DO AR, HORARIA (%)': 'relative_humidity_pct',
    'VENTO, DIREÇÃO HORARIA (gr) (° (gr))': 'wind_direction_deg',
    'VENTO, RAJADA MAXIMA (m/s)': 'max_wind_gust_ms',
    'VENTO, VELOCIDADE HORARIA (m/s)': 'wind_speed_ms'
}

In [437]:
def preProcess(path):
    df = pd.read_csv(path, delimiter=';', skiprows=8, encoding='latin1', decimal=',')
    df.drop(df.columns[-1], axis=1, inplace=True) # Last column is empty due to ";" at the end of each line
    df['Hora UTC'] = df['Hora UTC'].apply(lambda x: datetime.strptime(x, '%H%M %Z')) 
    df['Data'] = pd.to_datetime(df['Data'], format='%Y/%m/%d')
    df['hour'] = df['Hora UTC'].dt.hour
    df['day'] = df['Data'].dt.day
    df['month'] = df['Data'].dt.month
    df['year'] = df['Data'].dt.year
    df.rename(columns=COLUMN_MAPPING, inplace=True)
    df.drop(columns=['Data', 'Hora UTC'], axis=1, inplace=True)

    return df

In [438]:
df23, df24 = preProcess(DATAPATH2023), preProcess(DATAPATH2024)

In [439]:
df23.head()

Unnamed: 0,total_precip_mm,station_pressure_mb,max_pressure_last_hour_mb,min_pressure_last_hour_mb,global_radiation_kjm2,air_temp_c,dew_point_temp_c,max_temp_last_hour_c,min_temp_last_hour_c,max_dew_point_last_hour_c,...,max_humidity_last_hour_pct,min_humidity_last_hour_pct,relative_humidity_pct,wind_direction_deg,max_wind_gust_ms,wind_speed_ms,hour,day,month,year
0,0.0,1011.5,1011.5,1011.2,0.0,26.5,16.3,27.4,26.2,17.8,...,57,54.0,54,65.0,6.2,2.6,0,1,1,2023
1,0.0,1011.8,1011.8,1011.5,0.0,26.6,15.0,26.7,26.2,16.3,...,54,49.0,49,50.0,7.4,2.3,1,1,1,2023
2,0.0,1011.7,1011.8,1011.7,0.0,26.2,14.6,27.0,26.2,15.0,...,49,45.0,49,32.0,6.1,2.4,2,1,1,2023
3,0.0,1011.4,1011.7,1011.4,0.0,24.4,16.6,26.2,24.3,16.6,...,62,49.0,62,5.0,5.8,1.6,3,1,1,2023
4,0.0,1011.2,1011.4,1011.1,0.0,23.1,17.1,24.4,22.6,17.3,...,71,62.0,69,54.0,3.8,1.7,4,1,1,2023


In [440]:
df24.head()

Unnamed: 0,total_precip_mm,station_pressure_mb,max_pressure_last_hour_mb,min_pressure_last_hour_mb,global_radiation_kjm2,air_temp_c,dew_point_temp_c,max_temp_last_hour_c,min_temp_last_hour_c,max_dew_point_last_hour_c,...,max_humidity_last_hour_pct,min_humidity_last_hour_pct,relative_humidity_pct,wind_direction_deg,max_wind_gust_ms,wind_speed_ms,hour,day,month,year
0,0.0,1015.9,1015.9,1015.5,0.0,18.8,14.8,20.0,18.8,14.8,...,78,71,78,101,5.3,2.7,0,1,1,2024
1,0.0,1016.5,1016.5,1015.9,0.0,18.2,14.7,19.0,18.1,14.8,...,80,77,80,105,4.1,2.1,1,1,1,2024
2,0.0,1016.4,1016.6,1016.4,0.0,17.8,14.7,18.4,17.8,14.8,...,82,79,82,109,3.2,2.1,2,1,1,2024
3,0.0,1015.8,1016.4,1015.7,0.0,18.5,15.1,18.5,17.7,15.1,...,83,81,81,114,4.0,2.1,3,1,1,2024
4,0.0,1015.4,1015.8,1015.4,0.0,18.6,15.5,18.8,18.2,15.6,...,83,80,82,120,4.3,2.0,4,1,1,2024


In [441]:
df_23_24_concat = pd.concat([df23, df24], axis=0)

In [442]:
df_23_24_concat['year'].value_counts()

2023    8760
2024    1440
Name: year, dtype: int64

In [443]:
cleaned_df = pd.read_csv(DATACLEANED) 

In [444]:
cleaned_df.head()

Unnamed: 0,timestamp,flow_in,reservatory_percentage,pressure,gmb_1,gmb_2,volume,volume_diff,time_diff,flow_out
0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,298600.0,,,
1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,60000.0,3710.0,49.877493
2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,3000.0,150.0,45.64
3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,3400.0,150.0,42.973333
4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,3000.0,150.0,45.64


In [445]:
cleaned_df['timestamp'] = pd.to_datetime(cleaned_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
cleaned_df['hour'] = cleaned_df['timestamp'].dt.hour
cleaned_df['day'] = cleaned_df['timestamp'].dt.day
cleaned_df['month'] = cleaned_df['timestamp'].dt.month
cleaned_df['year'] = cleaned_df['timestamp'].dt.year

In [446]:
cleaned_df.head()

Unnamed: 0,timestamp,flow_in,reservatory_percentage,pressure,gmb_1,gmb_2,volume,volume_diff,time_diff,flow_out,hour,day,month,year
0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,298600.0,,,,11,17,3,2023
1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,60000.0,3710.0,49.877493,12,17,3,2023
2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,3000.0,150.0,45.64,12,17,3,2023
3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,3400.0,150.0,42.973333,12,17,3,2023
4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,3000.0,150.0,45.64,12,17,3,2023


In [447]:
merged_df = cleaned_df.merge(df_23_24_concat, on=['hour', 'day', 'year', 'month'], how='left') 
# There is no temperature data available in some dates for the og data.

In [448]:
merged_df.head()

Unnamed: 0,timestamp,flow_in,reservatory_percentage,pressure,gmb_1,gmb_2,volume,volume_diff,time_diff,flow_out,...,max_temp_last_hour_c,min_temp_last_hour_c,max_dew_point_last_hour_c,min_dew_point_last_hour_c,max_humidity_last_hour_pct,min_humidity_last_hour_pct,relative_humidity_pct,wind_direction_deg,max_wind_gust_ms,wind_speed_ms
0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,298600.0,,,,...,26.0,22.4,22.6,21.1,92.0,81.0,82.0,93.0,1.8,0.4
1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,60000.0,3710.0,49.877493,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,3000.0,150.0,45.64,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,3400.0,150.0,42.973333,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,3000.0,150.0,45.64,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8


In [449]:
merged_df.isnull().sum()    

timestamp                        0
flow_in                          0
reservatory_percentage           0
pressure                         0
gmb_1                            0
gmb_2                            0
volume                           0
volume_diff                      1
time_diff                        1
flow_out                         2
hour                             0
day                              0
month                            0
year                             0
total_precip_mm               3209
station_pressure_mb           3209
max_pressure_last_hour_mb     3209
min_pressure_last_hour_mb     3209
global_radiation_kjm2         3209
air_temp_c                    3209
dew_point_temp_c              3209
max_temp_last_hour_c          3209
min_temp_last_hour_c          3209
max_dew_point_last_hour_c     3209
min_dew_point_last_hour_c     3209
max_humidity_last_hour_pct    3209
min_humidity_last_hour_pct    3209
relative_humidity_pct         3209
wind_direction_deg  

In [450]:
merged_df[['timestamp', 'hour', 'day', 'year', 'month']].sample(5)

Unnamed: 0,timestamp,hour,day,year,month
17553,2023-05-06 17:21:00,17,6,2023,5
20643,2023-05-12 15:13:15,15,12,2023,5
19984,2023-05-11 10:55:43,10,11,2023,5
39027,2023-06-18 20:06:59,20,18,2023,6
73395,2023-10-11 05:42:35,5,11,2023,10


In [451]:
print(f'merged_df shape: {merged_df.shape}\ncleaned_df shape: {cleaned_df.shape}\ndf_23_24_concat shape: {df_23_24_concat.shape}')

merged_df shape: (125073, 31)
cleaned_df shape: (125073, 14)
df_23_24_concat shape: (10200, 21)


In [452]:
nulls_df = merged_df[merged_df.isnull().any(axis=1)]
nulls_df[['timestamp', 'hour', 'day', 'year', 'month']]

Unnamed: 0,timestamp,hour,day,year,month
0,2023-03-17 11:27:06,11,17,2023,3
1514,2023-03-20 17:06:06,17,20,2023,3
1515,2023-03-20 17:08:36,17,20,2023,3
1516,2023-03-20 17:11:06,17,20,2023,3
1517,2023-03-20 17:13:36,17,20,2023,3
...,...,...,...,...,...
125068,2024-03-11 07:56:32,7,11,2024,3
125069,2024-03-11 08:06:32,8,11,2024,3
125070,2024-03-11 08:14:02,8,11,2024,3
125071,2024-03-11 08:16:32,8,11,2024,3


In [453]:
nulls_df.year.value_counts()

2024    3209
2023     168
Name: year, dtype: int64

In [454]:
nulls2023 = nulls_df[nulls_df['year'] == 2023]
nulls2023.shape

(168, 31)

In [455]:
nulls2023.day.value_counts()

14    58
6     45
13    23
20    20
12    20
17     1
29     1
Name: day, dtype: int64

In [456]:
nulls2024 = nulls_df[nulls_df['year'] == 2024]
nulls2024.shape

(3209, 31)

In [457]:
nulls2024.day.value_counts()

9     500
1     495
8     484
7     470
6     455
10    439
11    156
5     145
2      63
3       1
4       1
Name: day, dtype: int64

In [458]:
fig = sp.make_subplots(rows=2, cols=1)

trace1 = go.Scatter(x=nulls2023['day'], y=nulls2023['month'], mode='markers', name='nulls 2023')
trace2 = go.Scatter(x=nulls2024['day'], y=nulls2024['month'], mode='markers', name='nulls 2024')

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=2, col=1)

fig.update_layout(height=600, width=600, title_text="Day and Month from nulls 2023 and nulls 2024")
fig.update_xaxes(title_text="Day", row=1, col=1)
fig.update_yaxes(title_text="Month", row=1, col=1)
fig.update_xaxes(title_text="Day", row=2, col=1)
fig.update_yaxes(title_text="Month", row=2, col=1)

fig.show()

### We do not have temperature data from 24/03/01 to 24/03/11 from the provided datasets

Found the complementary data but it is measured by "PORTO ALEGRE - JARDIM BOTANICO" station instead of "PORTO ALEGRE- BELEM NOVO" from the provided dataset

https://portal.inmet.gov.br/dadoshistoricos

In [459]:
DATAPATH2024_COMP = 'data/weather_2024_complementary.CSV'
comp_data = preProcess(DATAPATH2024_COMP)

In [460]:
filtered_comp_data = comp_data.query('1 <= day <= 11 and month == 3')


In [461]:

merged_df_indexed = merged_df.set_index(['day', 'month', 'hour', 'year'])
filtered_comp_data_indexed = filtered_comp_data.set_index(['day', 'month', 'hour', 'year'])

merged_df_indexed.update(filtered_comp_data_indexed)

full_df = merged_df_indexed.reset_index()

In [462]:
full_df.isnull().sum()

day                              0
month                            0
hour                             0
year                             0
timestamp                        0
flow_in                          0
reservatory_percentage           0
pressure                         0
gmb_1                            0
gmb_2                            0
volume                           0
volume_diff                      1
time_diff                        1
flow_out                         2
total_precip_mm                 57
station_pressure_mb             57
max_pressure_last_hour_mb       57
min_pressure_last_hour_mb       57
global_radiation_kjm2         1637
air_temp_c                      57
dew_point_temp_c                57
max_temp_last_hour_c            57
min_temp_last_hour_c            57
max_dew_point_last_hour_c       57
min_dew_point_last_hour_c       57
max_humidity_last_hour_pct      57
min_humidity_last_hour_pct      57
relative_humidity_pct           57
wind_direction_deg  

In [463]:
full_df['global_radiation_kjm2'] = full_df['global_radiation_kjm2'].fillna(0)
full_df = full_df.ffill()

In [465]:
full_df.isnull().sum()

day                           0
month                         0
hour                          0
year                          0
timestamp                     0
flow_in                       0
reservatory_percentage        0
pressure                      0
gmb_1                         0
gmb_2                         0
volume                        0
volume_diff                   1
time_diff                     1
flow_out                      1
total_precip_mm               0
station_pressure_mb           0
max_pressure_last_hour_mb     0
min_pressure_last_hour_mb     0
global_radiation_kjm2         0
air_temp_c                    0
dew_point_temp_c              0
max_temp_last_hour_c          0
min_temp_last_hour_c          0
max_dew_point_last_hour_c     0
min_dew_point_last_hour_c     0
max_humidity_last_hour_pct    0
min_humidity_last_hour_pct    0
relative_humidity_pct         0
wind_direction_deg            0
max_wind_gust_ms              0
wind_speed_ms                 0
dtype: i

In [470]:
cleaned_df['flow_out']

0               NaN
1         49.877493
2         45.640000
3         42.973333
4         45.640000
            ...    
125068    22.460000
125069    35.460000
125070    41.924444
125071    69.480000
125072    24.460000
Name: flow_out, Length: 125073, dtype: float64