In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [20]:
pd.set_option('display.max_rows', None)

In [None]:
basedir = '../data/raw/inmet_dados_hist'
os.makedirs(basedir)

In [2]:
import requests
import zipfile

In [None]:
def download(url, basedir):
    file_name = url.split('/')[-1:][0]
    file_path = f'{basedir}/{file_name}'
    r = requests.get(url)
    with open(file_path, 'wb') as file:
        file.write(r.content)

In [None]:
baseurl = 'https://portal.inmet.gov.br/uploads/dadoshistoricos/'
for year in range(2005,2023):
    url = f'{baseurl}/{year}.zip'
    download(url=url, basedir=basedir)
    

In [None]:
def unzip(file):
    file_name = os.path.abspath(file) 
    zip_ref = zipfile.ZipFile(file_name) 
    zip_ref.extractall(basedir) 
    zip_ref.close() 
    os.remove(file_name) 

In [None]:
for file in os.listdir(basedir):
    if file.endswith('.zip'):
        print(file)
        unzip(f'{basedir}/{file}')

In [None]:
for item in os.listdir(basedir):
    try:
        if os.path.isdir(f'{basedir}/{item}'):
            for file in os.listdir(f'{basedir}/{item}'):
                os.rename(f'{basedir}/{item}/{file}', f'{basedir}/{file}')
            os.removedirs(f'{basedir}/{item}')
    except:
        continue

In [None]:
columns = {
    'DATA (YYYY-MM-DD)': 'data', 
    'HORA (UTC)': 'hora', 
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'prec',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'temp_max',
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'temp_min',
}
columns2 = {
    'Data': 'data', 
    'Hora UTC': 'hora', 
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'prec',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'temp_max',
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'temp_min',
}

In [None]:
df_final = None
for file in os.listdir(basedir):
    try:
        _, regiao, uf, _, municipio, _, _, final = file.split('_')

        if uf != 'RS':
            continue
        ano = int(final.split('.')[0][-4:])
        df = pd.read_csv(f'{basedir}/{file}',
                        sep=';',
                        encoding='ISO-8859-1',
                        skiprows=8)
        if ano < 2019: # a partir de 2019 mudou o padrão do cabeçalho
            df = df[columns.keys()]
            df.columns = columns.values()
        else:
            df = df[columns2.keys()]
            df.columns = columns2.values()
        df['regiao'] = regiao
        df['uf'] = uf
        df['municipio'] = municipio

        df_final = pd.concat([df_final, df], ignore_index=True)
    except:
        print(f'Erro ao processar arquivo {file}')
        
    

In [None]:
os.makedirs('../data/processed/inmet')

In [None]:
df_final.to_csv('../data/processed/inmet/rs.csv', sep=';', decimal=',', index=False)

In [None]:
df_final.head()

In [43]:
df = pd.read_csv('../data/processed/inmet/rs.csv', sep=';', decimal=',')

In [44]:
df.head()

Unnamed: 0,data,hora,prec,temp_max,temp_min,regiao,uf,municipio
0,2022/01/01,0000 UTC,0.0,26.4,25.0,S,RS,PORTO ALEGRE - JARDIM BOTANICO
1,2022/01/01,0100 UTC,0.0,25.0,24.1,S,RS,PORTO ALEGRE - JARDIM BOTANICO
2,2022/01/01,0200 UTC,0.0,24.1,23.6,S,RS,PORTO ALEGRE - JARDIM BOTANICO
3,2022/01/01,0300 UTC,0.0,23.8,23.3,S,RS,PORTO ALEGRE - JARDIM BOTANICO
4,2022/01/01,0400 UTC,0.0,23.6,22.5,S,RS,PORTO ALEGRE - JARDIM BOTANICO


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5416848 entries, 0 to 5416847
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   data       object 
 1   hora       object 
 2   prec       float64
 3   temp_max   float64
 4   temp_min   float64
 5   regiao     object 
 6   uf         object 
 7   municipio  object 
dtypes: float64(3), object(5)
memory usage: 330.6+ MB


In [46]:
df['data'] = df['data'].str.replace('/', '-')
df['data'] = pd.to_datetime(df['data'], format='%Y-%m-%d')
df['prec'] = df['prec'].astype('float')
df['temp_max'] = df['temp_max'].astype('float')
df['temp_min'] = df['temp_min'].astype('float')

In [47]:
df['ano'] = df['data'].dt.year

In [48]:
df = df.drop(df.query('prec==-9999 or temp_max==-9999 or temp_min==-9999').index)


In [49]:
df_consolidado = pd.DataFrame()
df_consolidado['ano'] = df['ano'].sort_values().unique()
df_consolidado.index = df_consolidado['ano']
df_consolidado['temp_max_MAX'] = list(df.groupby('ano')['temp_max'].max())
df_consolidado['temp_max_MIN'] = list(df.groupby('ano')['temp_max'].min())
df_consolidado['temp_max_MEAN'] = list(df.groupby('ano')['temp_max'].mean())
df_consolidado['temp_max_MEDIAN'] = list(df.groupby('ano')['temp_max'].median())
df_consolidado['temp_min_MAX'] = list(df.groupby('ano')['temp_min'].max())
df_consolidado['temp_min_MIN'] = list(df.groupby('ano')['temp_min'].min())
df_consolidado['temp_min_MEAN'] = list(df.groupby('ano')['temp_min'].mean())
df_consolidado['temp_min_MEDIAN'] = list(df.groupby('ano')['temp_min'].median())
df_consolidado['prec_MAX'] = list(df.groupby('ano')['prec'].sum())
df_consolidado['prec_MIN'] = 0
df_consolidado['prec_MEAN'] = list(df.groupby('ano')['prec'].mean() * 30)
df_consolidado['prec_MEDIAN'] = list(df.groupby('ano')['prec'].median())
df_consolidado

Unnamed: 0_level_0,ano,temp_max_MAX,temp_max_MIN,temp_max_MEAN,temp_max_MEDIAN,temp_min_MAX,temp_min_MIN,temp_min_MEAN,temp_min_MEDIAN,prec_MAX,prec_MIN,prec_MEAN,prec_MEDIAN
ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2005,2005,39.4,0.6,19.213431,19.1,38.3,-0.2,18.141995,18.2,5529.6,0,4.911853,0.0
2006,2006,38.7,-0.3,19.879059,19.9,37.4,-0.9,18.772309,18.9,9796.2,0,4.723488,0.0
2007,2007,38.2,-4.1,18.728664,19.1,36.6,-4.6,17.59849,18.2,35475.8,0,5.798784,0.0
2008,2008,39.7,-2.2,18.557284,18.4,38.2,-2.8,17.380722,17.4,43218.6,0,4.914257,0.0
2009,2009,39.3,-4.8,18.737948,19.0,38.2,-6.3,17.570511,18.0,60612.0,0,6.239868,0.0
2010,2010,39.7,-2.7,18.64386,18.6,37.8,-3.5,17.504551,17.6,53581.8,0,5.369904,0.0
2011,2011,40.9,-4.0,18.543017,18.6,38.7,-5.0,17.391125,17.6,51225.0,0,5.184752,0.0
2012,2012,41.2,-5.5,19.416325,19.5,38.7,-6.3,18.203728,18.4,44675.6,0,4.599678,0.0
2013,2013,40.9,-3.6,18.506437,18.6,39.1,-3.9,17.338627,17.5,58007.8,0,5.520294,0.0
2014,2014,41.0,-3.5,19.683009,19.6,39.9,-4.0,18.554779,18.6,68118.4,0,6.874559,0.0


In [32]:
columns = ['year', 'value_max', 'value_min', 'value_mean', 'value_median', 'stat']

df_temp_max = df_consolidado[['ano', 'temp_max_MAX', 'temp_max_MIN', 'temp_max_MEAN', 'temp_max_MEDIAN']]
df_temp_max.loc[df_temp_max.index, 'stat'] = 'TMAX'
df_temp_max = df_temp_max.reset_index(drop=True)
df_temp_max.columns = columns
df_temp_min = df_consolidado[['ano', 'temp_min_MAX', 'temp_min_MIN', 'temp_min_MEAN', 'temp_min_MEDIAN']]
df_temp_min.loc[df_temp_min.index, 'stat'] = 'TMIN'
df_temp_min = df_temp_min.reset_index(drop=True)
df_temp_min.columns = columns
df_prec = df_consolidado[['ano', 'prec_MAX', 'prec_MIN', 'prec_MEAN', 'prec_MEDIAN']]
df_prec.loc[df_prec.index, 'stat'] = 'PRCP'
df_prec = df_prec.reset_index(drop=True)
df_prec.columns = columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp_max.loc[df_temp_max.index, 'stat'] = 'TMAX'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp_min.loc[df_temp_min.index, 'stat'] = 'TMIN'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prec.loc[df_prec.index, 'stat'] = 'PRCP'


In [33]:
df_final = pd.concat([df_temp_max, df_temp_min, df_prec])

In [34]:
df_final = df_final.sort_values(by=['year'])
df_final.head()    

Unnamed: 0,year,value_max,value_min,value_mean,value_median,stat
0,2005,39.4,0.6,19.213431,19.1,TMAX
0,2005,38.3,-0.2,18.141995,18.2,TMIN
0,2005,5529.6,0.0,4.911853,0.0,PRCP
1,2006,38.7,-0.3,19.879059,19.9,TMAX
1,2006,37.4,-0.9,18.772309,18.9,TMIN


In [35]:
df_final.loc[df_final.index, 'country_code'] = 'BR-RS'

In [36]:
df_final.head()

Unnamed: 0,year,value_max,value_min,value_mean,value_median,stat,country_code
0,2005,39.4,0.6,19.213431,19.1,TMAX,BR-RS
0,2005,38.3,-0.2,18.141995,18.2,TMIN,BR-RS
0,2005,5529.6,0.0,4.911853,0.0,PRCP,BR-RS
1,2006,38.7,-0.3,19.879059,19.9,TMAX,BR-RS
1,2006,37.4,-0.9,18.772309,18.9,TMIN,BR-RS


In [37]:
df_noaa_global = pd.read_csv('../data/processed/noaa_global/noaa_global_final.csv', sep=';', decimal=',')

In [38]:
df_noaa_global.loc[df_noaa_global['country_code'].isna(), 'country_code'] = 'NA'

In [39]:
df = pd.concat([df_final, df_noaa_global])

In [40]:
# df = df.drop(df.query('year < 2004 or year > 2019').index)
df = df.drop(df.query('stat=="TAVG"').index)

In [41]:
df.query('stat=="PRCP"').sort_values(by=['year', 'value_mean'], ascending=False)

Unnamed: 0,year,value_max,value_min,value_mean,value_median,stat,country_code
17,2022,55392.2,0.0,4.924583,0.0,PRCP,BR-RS
16,2021,48339.2,0.0,4.575222,0.0,PRCP,BR-RS
15,2020,50494.4,0.0,4.344738,0.0,PRCP,BR-RS
17828,2019,1938.0,0.0,155.761364,36.25,PRCP,TV
17790,2019,2080.0,0.0,138.066923,73.0,PRCP,ID
17762,2019,470.0,0.0,127.541176,14.0,PRCP,CD
17777,2019,879.0,0.0,120.892173,3.0,PRCP,EE
17815,2019,1930.0,0.0,118.330807,28.75,PRCP,PY
17833,2019,3899.0,0.0,116.982687,20.0,PRCP,VU
17829,2019,1671.0,0.0,116.432499,20.0,PRCP,TZ


In [None]:
df_tmax = df.query('stat=="TMAX"')
perc75 = df_tmax['value_median'].quantile(0.75)
perc25 = df_tmax['value_median'].quantile(0.25)

In [None]:
x = df_tmax[(df_tmax['value_median'] < perc75) & (df_tmax['value_median'] > perc25)]

In [None]:
x['value_median'].plot(kind='box')

In [None]:
def plota_grafico_stat(df, stat='PRCP', low=0.25, high=0.75):
    df_stat = df.query(f'stat=="{stat}"')
    perc_low = df_stat['value_median'].quantile(low)
    perc_high = df_stat['value_median'].quantile(high)
    df_stat = df_stat[(df_stat['value_median'] > perc_low) & (df_stat['value_median'] < perc_high)]
    df_stat['value_median'].plot(kind='box', title=stat)
    

In [None]:
for stat in df.stat.unique():
    plota_grafico_stat(df, stat=stat)
    plt.show()

In [None]:
x.query('country_code=="BR-US"')

In [None]:
df.to_csv('../data/processed/inmet/consolidado.csv', sep=';', index=False)