In [101]:
import pandas as pd
import os

In [26]:
basedir = '../data/raw/inmet_dados_hist'
os.makedirs(basedir)

In [27]:
import requests
import zipfile

In [28]:
def download(url, basedir):
    file_name = url.split('/')[-1:][0]
    file_path = f'{basedir}/{file_name}'
    r = requests.get(url)
    with open(file_path, 'wb') as file:
        file.write(r.content)

In [29]:
baseurl = 'https://portal.inmet.gov.br/uploads/dadoshistoricos/'
for year in range(2007,2023):
    url = f'{baseurl}/{year}.zip'
    download(url=url, basedir=basedir)
    

In [30]:
def unzip(file):
    file_name = os.path.abspath(file) 
    zip_ref = zipfile.ZipFile(file_name) 
    zip_ref.extractall(basedir) 
    zip_ref.close() 
    os.remove(file_name) 

In [31]:
for file in os.listdir(basedir):
    if file.endswith('.zip'):
        print(file)
        unzip(f'{basedir}/{file}')

2007.zip
2008.zip
2009.zip
2010.zip
2011.zip
2012.zip
2013.zip
2014.zip
2015.zip
2016.zip
2017.zip
2018.zip
2019.zip
2020.zip
2021.zip
2022.zip


In [37]:
for item in os.listdir(basedir):
    if os.path.isdir(f'{basedir}/{item}'):
        for file in os.listdir(f'{basedir}/{item}'):
            os.rename(f'{basedir}/{item}/{file}', f'{basedir}/{file}')
        os.removedirs(f'{basedir}/{item}')

In [96]:
columns = {
    'DATA (YYYY-MM-DD)': 'data', 
    'HORA (UTC)': 'hora', 
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'prec',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'temp_max',
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'temp_min',
}
columns2 = {
    'Data': 'data', 
    'Hora UTC': 'hora', 
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'prec',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'temp_max',
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'temp_min',
}

In [110]:
df_final = None
for file in os.listdir(basedir):
    try:
        _, regiao, uf, _, municipio, _, _, final = file.split('_')

        if uf != 'RS':
            continue
        ano = int(final.split('.')[0][-4:])
        df = pd.read_csv(f'{basedir}/{file}',
                        sep=';',
                        encoding='ISO-8859-1',
                        skiprows=8)
        if ano < 2019: # a partir de 2019 mudou o padrão do cabeçalho
            df = df[columns.keys()]
            df.columns = columns.values()
        else:
            df = df[columns2.keys()]
            df.columns = columns2.values()
        df['regiao'] = regiao
        df['uf'] = uf
        df['municipio'] = municipio

        df_final = pd.concat([df_final, df], ignore_index=True)
    except:
        print(f'Erro ao processar arquivo {file}')
        
    

Erro ao processar arquivo INMET_SE_MG_S122_EB_PEF_BONFIM_01-01-2017_A_31-12-2017.CSV
Erro ao processar arquivo INMET_SE_MG_S122_EB_PEF_BONFIM_01-01-2018_A_31-12-2018.CSV
Erro ao processar arquivo INMET_SE_MG_S122_EB_PEF_BONFIM_01-12-2016_A_31-12-2016.CSV


In [113]:
os.makedirs('../data/processed/inmet')

In [114]:
df_final.to_csv('../data/processed/inmet/rs.csv', sep=';')

In [112]:
df_final.head()

Unnamed: 0,data,hora,prec,temp_max,temp_min,regiao,uf,municipio
0,2022/01/01,0000 UTC,0,264,25,S,RS,PORTO ALEGRE - JARDIM BOTANICO
1,2022/01/01,0100 UTC,0,25,241,S,RS,PORTO ALEGRE - JARDIM BOTANICO
2,2022/01/01,0200 UTC,0,241,236,S,RS,PORTO ALEGRE - JARDIM BOTANICO
3,2022/01/01,0300 UTC,0,238,233,S,RS,PORTO ALEGRE - JARDIM BOTANICO
4,2022/01/01,0400 UTC,0,236,225,S,RS,PORTO ALEGRE - JARDIM BOTANICO
