In [1]:
import pandas as pd
import os

In [26]:
basedir = '../data/raw/inmet_dados_hist'
os.makedirs(basedir)

In [2]:
import requests
import zipfile

In [28]:
def download(url, basedir):
    file_name = url.split('/')[-1:][0]
    file_path = f'{basedir}/{file_name}'
    r = requests.get(url)
    with open(file_path, 'wb') as file:
        file.write(r.content)

In [29]:
baseurl = 'https://portal.inmet.gov.br/uploads/dadoshistoricos/'
for year in range(2007,2023):
    url = f'{baseurl}/{year}.zip'
    download(url=url, basedir=basedir)
    

In [30]:
def unzip(file):
    file_name = os.path.abspath(file) 
    zip_ref = zipfile.ZipFile(file_name) 
    zip_ref.extractall(basedir) 
    zip_ref.close() 
    os.remove(file_name) 

In [31]:
for file in os.listdir(basedir):
    if file.endswith('.zip'):
        print(file)
        unzip(f'{basedir}/{file}')

2007.zip
2008.zip
2009.zip
2010.zip
2011.zip
2012.zip
2013.zip
2014.zip
2015.zip
2016.zip
2017.zip
2018.zip
2019.zip
2020.zip
2021.zip
2022.zip


In [37]:
for item in os.listdir(basedir):
    if os.path.isdir(f'{basedir}/{item}'):
        for file in os.listdir(f'{basedir}/{item}'):
            os.rename(f'{basedir}/{item}/{file}', f'{basedir}/{file}')
        os.removedirs(f'{basedir}/{item}')

In [96]:
columns = {
    'DATA (YYYY-MM-DD)': 'data', 
    'HORA (UTC)': 'hora', 
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'prec',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'temp_max',
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'temp_min',
}
columns2 = {
    'Data': 'data', 
    'Hora UTC': 'hora', 
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'prec',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'temp_max',
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'temp_min',
}

In [110]:
df_final = None
for file in os.listdir(basedir):
    try:
        _, regiao, uf, _, municipio, _, _, final = file.split('_')

        if uf != 'RS':
            continue
        ano = int(final.split('.')[0][-4:])
        df = pd.read_csv(f'{basedir}/{file}',
                        sep=';',
                        encoding='ISO-8859-1',
                        skiprows=8)
        if ano < 2019: # a partir de 2019 mudou o padrão do cabeçalho
            df = df[columns.keys()]
            df.columns = columns.values()
        else:
            df = df[columns2.keys()]
            df.columns = columns2.values()
        df['regiao'] = regiao
        df['uf'] = uf
        df['municipio'] = municipio

        df_final = pd.concat([df_final, df], ignore_index=True)
    except:
        print(f'Erro ao processar arquivo {file}')
        
    

Erro ao processar arquivo INMET_SE_MG_S122_EB_PEF_BONFIM_01-01-2017_A_31-12-2017.CSV
Erro ao processar arquivo INMET_SE_MG_S122_EB_PEF_BONFIM_01-01-2018_A_31-12-2018.CSV
Erro ao processar arquivo INMET_SE_MG_S122_EB_PEF_BONFIM_01-12-2016_A_31-12-2016.CSV


In [116]:
os.makedirs('../data/processed/inmet')

In [117]:
df_final.to_csv('../data/processed/inmet/rs.csv', sep=';', index=False)

In [112]:
df_final.head()

Unnamed: 0,data,hora,prec,temp_max,temp_min,regiao,uf,municipio
0,2022/01/01,0000 UTC,0,264,25,S,RS,PORTO ALEGRE - JARDIM BOTANICO
1,2022/01/01,0100 UTC,0,25,241,S,RS,PORTO ALEGRE - JARDIM BOTANICO
2,2022/01/01,0200 UTC,0,241,236,S,RS,PORTO ALEGRE - JARDIM BOTANICO
3,2022/01/01,0300 UTC,0,238,233,S,RS,PORTO ALEGRE - JARDIM BOTANICO
4,2022/01/01,0400 UTC,0,236,225,S,RS,PORTO ALEGRE - JARDIM BOTANICO


In [48]:
df = pd.read_csv('../data/processed/inmet/rs.csv', sep=';', decimal=',')

In [74]:
df.head()

Unnamed: 0,data,hora,prec,temp_max,temp_min,regiao,uf,municipio,ano
0,2022-01-01,0000 UTC,0.0,26.4,25.0,S,RS,PORTO ALEGRE - JARDIM BOTANICO,2022
1,2022-01-01,0100 UTC,0.0,25.0,24.1,S,RS,PORTO ALEGRE - JARDIM BOTANICO,2022
2,2022-01-01,0200 UTC,0.0,24.1,23.6,S,RS,PORTO ALEGRE - JARDIM BOTANICO,2022
3,2022-01-01,0300 UTC,0.0,23.8,23.3,S,RS,PORTO ALEGRE - JARDIM BOTANICO,2022
4,2022-01-01,0400 UTC,0.0,23.6,22.5,S,RS,PORTO ALEGRE - JARDIM BOTANICO,2022


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5307000 entries, 0 to 5306999
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   data       object 
 1   hora       object 
 2   prec       float64
 3   temp_max   float64
 4   temp_min   float64
 5   regiao     object 
 6   uf         object 
 7   municipio  object 
dtypes: float64(3), object(5)
memory usage: 323.9+ MB


In [51]:
df['data'] = df['data'].str.replace('/', '-')
df['data'] = pd.to_datetime(df['data'], format='%Y-%m-%d')
df['prec'] = df['prec'].astype('float')
df['temp_max'] = df['temp_max'].astype('float')
df['temp_min'] = df['temp_min'].astype('float')

In [52]:
df['ano'] = df['data'].dt.year

In [97]:
df = df.drop(df.query('prec==-9999 or temp_max==-9999 or temp_min==-9999').index)


In [136]:
df_consolidado = pd.DataFrame()
df_consolidado['ano'] = df['ano'].sort_values().unique()
df_consolidado.index = df_consolidado['ano']
df_consolidado['temp_max_MAX'] = list(df.groupby('ano')['temp_max'].max())
df_consolidado['temp_max_MIN'] = list(df.groupby('ano')['temp_max'].min())
df_consolidado['temp_min_MAX'] = list(df.groupby('ano')['temp_min'].max())
df_consolidado['temp_min_MIN'] = list(df.groupby('ano')['temp_min'].min())
df_consolidado['prec_MEAN'] = list(df.groupby('ano')['prec'].mean())
df_consolidado

Unnamed: 0_level_0,ano,temp_max_MAX,temp_max_MIN,temp_min_MAX,temp_min_MIN,prec_MEAN
ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007,2007,38.2,-4.1,36.6,-4.6,0.193293
2008,2008,39.7,-2.2,38.2,-2.8,0.163809
2009,2009,39.3,-4.8,38.2,-6.3,0.207996
2010,2010,39.7,-2.7,37.8,-3.5,0.178997
2011,2011,40.9,-4.0,38.7,-5.0,0.172825
2012,2012,41.2,-5.5,38.7,-6.3,0.153323
2013,2013,40.9,-3.6,39.1,-3.9,0.18401
2014,2014,41.0,-3.5,39.9,-4.0,0.229152
2015,2015,37.7,-1.8,36.5,-2.9,0.240876
2016,2016,39.5,-4.1,38.2,-5.1,0.185942


In [None]:
df_consolidado.to_csv()