In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.distance import distance

import os

In [6]:
# Peek at the data folder; sorted because os.listdir returns entries in
# arbitrary order, which would make this cell's output differ between runs.
sorted(os.listdir("./../../data/"))

['.DS_Store', 'raw_data', 'automatic_stations_catalog.xlsx']

# Data Catalog Ingestion and Preprocessing

In [34]:
# Load the station catalog, keeping only the columns used in this notebook.
df_automatic_stations = pd.read_excel('./../../data/automatic_stations_catalog.xlsx')[[
    'DC_NOME', 'SG_ESTADO', 'CD_SITUACAO',
    'VL_LATITUDE', 'VL_LONGITUDE', 'VL_ALTITUDE',
    'DT_INICIO_OPERACAO', 'CD_ESTACAO']]

# Keep stations in the selected states and drop rows whose CD_SITUACAO is 'Pane'.
# .copy() detaches the filtered result from the full catalog so that columns can
# be added to it later without raising SettingWithCopyWarning.
df_automatic_stations = (
    df_automatic_stations
    .query("SG_ESTADO in ('BA','AL','PE','SP','PI') and CD_SITUACAO != 'Pane'")
    .copy()
)
df_automatic_stations

Unnamed: 0,DC_NOME,SG_ESTADO,CD_SITUACAO,VL_LATITUDE,VL_LONGITUDE,VL_ALTITUDE,DT_INICIO_OPERACAO,CD_ESTACAO
19,SALVADOR,BA,Operante,-13.005515,-38.505760,47.56,2000-05-12,A401
21,BARREIRAS,BA,Operante,-12.124722,-45.026944,474.17,2001-12-19,A402
28,LUIZ EDUARDO MAGALHAES,BA,Operante,-12.152500,-45.829722,760.68,2002-04-17,A404
37,CRUZ DAS ALMAS,BA,Operante,-12.675556,-39.089444,219.76,2003-01-18,A406
40,ARIRANHA,SP,Operante,-21.133056,-48.840556,525.44,2007-11-12,A736
...,...,...,...,...,...,...,...,...
541,TUPA,SP,Operante,-21.927251,-50.490251,498.00,2017-05-11,A768
543,CORRENTE,PI,Operante,-10.429167,-45.173056,452.00,2018-03-27,A374
555,RIBEIRA DO AMPARO,BA,Operante,-11.058611,-38.444167,182.00,2018-09-19,A458
557,VALPARAISO,SP,Operante,-21.319167,-50.930278,381.90,2007-08-29,A734


### Jeremoabo is our region of interest

In [35]:
# Show the catalog row(s) whose station name is exactly 'JEREMOABO'.
df_automatic_stations.loc[df_automatic_stations['DC_NOME'] == 'JEREMOABO']

Unnamed: 0,DC_NOME,SG_ESTADO,CD_SITUACAO,VL_LATITUDE,VL_LONGITUDE,VL_ALTITUDE,DT_INICIO_OPERACAO,CD_ESTACAO
458,JEREMOABO,BA,Operante,-10.080833,-38.345833,261.0,2015-08-12,A450


### Calculating the closest stations to Jeremoabo

In [36]:
# Reference point: coordinates of the Jeremoabo station (A450) in decimal
# degrees, as shown in the catalog row above.
jeremoabo_lat = -10.080833
jeremoabo_long = -38.345833

# Distance from every catalog station to Jeremoabo, in kilometres (`.km`).
distances_km = [
    distance((jeremoabo_lat, jeremoabo_long), (station_lat, station_long)).km
    for station_lat, station_long in zip(
        df_automatic_stations['VL_LATITUDE'].to_numpy(),
        df_automatic_stations['VL_LONGITUDE'].to_numpy(),
    )
]

# assign() builds a new frame instead of writing into a filtered one, so this
# cell is safe to re-run and does not raise SettingWithCopyWarning.
# NOTE: the column name spelling ('JEREMOABE') is kept as-is because the
# sorting cell below references it.
df_automatic_stations = df_automatic_stations.assign(DIST_JEREMOABE_KM=distances_km)

In [37]:
# Five stations nearest to Jeremoabo (sort_values is ascending by default).
(
    df_automatic_stations
    .sort_values(by='DIST_JEREMOABE_KM')
    .head(5)
)

Unnamed: 0,DC_NOME,SG_ESTADO,CD_SITUACAO,VL_LATITUDE,VL_LONGITUDE,VL_ALTITUDE,DT_INICIO_OPERACAO,CD_ESTACAO,DIST_JEREMOABE_KM
458,JEREMOABO,BA,Operante,-10.080833,-38.345833,261.0,2015-08-12,A450,5.1e-05
506,PIRANHAS,AL,Operante,-9.622222,-37.767222,187.0,2017-09-19,A371,81.247291
248,EUCLIDES DA CUNHA,BA,Operante,-10.537233,-38.996605,431.96,2008-03-30,A442,87.347165
555,RIBEIRA DO AMPARO,BA,Operante,-11.058611,-38.444167,182.0,2018-09-19,A458,108.687776
361,QUEIMADAS,BA,Operante,-10.984722,-39.616944,310.11,2008-05-22,A436,171.327221


Let us pick data from Jeremoabo (the main target) and its three nearest stations: Piranhas, Euclides da Cunha, and Ribeira do Amparo.

## Data preprocessing of Jeremoabo

In [None]:
# Build the list of available station identifiers from the raw file names:
# drop everything from the first '.' onward and strip the 'dados_' prefix.
# Hidden entries (e.g. '.DS_Store') are skipped because they would otherwise
# become empty strings; the result is sorted because os.listdir returns
# entries in arbitrary order.
northeast_stations = sorted(
    file_name.split('.')[0].replace('dados_', '')
    for file_name in os.listdir('./../../data/raw_data')
    if not file_name.startswith('.')
)

In [4]:
# Read the raw file for station A450 (Jeremoabo).
# The file is ';'-separated and its column names sit on row index 9
# (the lines before it are not part of the table).
df_jeremoabo = pd.read_csv(
    './../../data/raw_data/dados_A450_H_2015-08-12_2022-04-10.csv',
    sep=';',
    header=9,
)
df_jeremoabo.head()


Unnamed: 0,Data Medicao,Hora Medicao,"PRECIPITACAO TOTAL, HORARIO(mm)","PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA(mB)","PRESSAO ATMOSFERICA REDUZIDA NIVEL DO MAR, AUT(mB)",PRESSAO ATMOSFERICA MAX.NA HORA ANT. (AUT)(mB),PRESSAO ATMOSFERICA MIN. NA HORA ANT. (AUT)(mB),RADIACAO GLOBAL(Kj/m²),TEMPERATURA DA CPU DA ESTACAO(°C),"TEMPERATURA DO AR - BULBO SECO, HORARIA(°C)",...,TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT)(°C),TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT)(°C),TENSAO DA BATERIA DA ESTACAO(V),UMIDADE REL. MAX. NA HORA ANT. (AUT)(%),UMIDADE REL. MIN. NA HORA ANT. (AUT)(%),"UMIDADE RELATIVA DO AR, HORARIA(%)","VENTO, DIRECAO HORARIA (gr)(° (gr))","VENTO, RAJADA MAXIMA(m/s)","VENTO, VELOCIDADE HORARIA(m/s)",Unnamed: 22
0,2015-08-12,0,,,,,,,,,...,,,,,,,,,,
1,2015-08-12,100,,,,,,,,,...,,,,,,,,,,
2,2015-08-12,200,,,,,,,,,...,,,,,,,,,,
3,2015-08-12,300,,,,,,,,,...,,,,,,,,,,
4,2015-08-12,400,,,,,,,,,...,,,,,,,,,,


In [37]:
# Dimensions of the raw Jeremoabo frame as (rows, columns).
df_jeremoabo.shape

(58416, 23)

In [38]:
# Number of missing values in each column of the raw Jeremoabo frame.
df_jeremoabo.isna().sum(axis="index")

Data Medicao                                                0
Hora Medicao                                                0
PRECIPITACAO TOTAL, HORARIO(mm)                          3710
PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA(mB)     2712
PRESSAO ATMOSFERICA REDUZIDA NIVEL DO MAR, AUT(mB)       4186
PRESSAO ATMOSFERICA MAX.NA HORA ANT. (AUT)(mB)           2714
PRESSAO ATMOSFERICA MIN. NA HORA ANT. (AUT)(mB)          2714
RADIACAO GLOBAL(Kj/m²)                                   2712
TEMPERATURA DA CPU DA ESTACAO(°C)                        2712
TEMPERATURA DO AR - BULBO SECO, HORARIA(°C)              2712
TEMPERATURA DO PONTO DE ORVALHO(°C)                      4162
TEMPERATURA MAXIMA NA HORA ANT. (AUT)(°C)                2714
TEMPERATURA MINIMA NA HORA ANT. (AUT)(°C)                2714
TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT)(°C)          4265
TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT)(°C)          4265
TENSAO DA BATERIA DA ESTACAO(V)                          2712
UMIDADE 