# 1. Setup

## 1.1. Bibliotecas

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
import pandas as pd
import numpy as np
import itertools
import gc
import polars as pl

## 1.2. Funções

In [2]:
def build_coord(lat_min,lat_max,lon_min,lon_max,step,precision):
    """Build the '(lat lon)' labels of a regular latitude/longitude grid.

    Args:
        lat_min: First latitude of the grid.
        lat_max: Last latitude of the grid (included when the span is a
            multiple of ``step``).
        lon_min: First longitude of the grid.
        lon_max: Last longitude of the grid (same rule as ``lat_max``).
        step: Spacing between consecutive points, used for both axes.
        precision: Number of decimal places used to format each coordinate.

    Returns:
        A list of strings formatted as ``'(lat lon)'``, ordered with the
        latitude varying slowest (cartesian product of latitudes x longitudes).
    """
    def axis_labels(start, stop):
        # np.arange(start, stop + step, step) derives its length from
        # (stop + step - start) / step; with a fractional step, rounding noise
        # in that quotient can append a spurious extra point. Compute the
        # number of points explicitly, with a small tolerance, instead.
        n_points = max(int(np.ceil((stop - start) / step + 1 - 1e-9)), 0)
        return [
            format(float(start + i * step), f'.{precision}f')
            for i in range(n_points)
        ]

    latitudes = axis_labels(lat_min, lat_max)
    longitudes = axis_labels(lon_min, lon_max)
    return [f'({lat} {lon})' for lat, lon in itertools.product(latitudes, longitudes)]

def concat_tables_satelites(source,data,verbose=False):
    """Union, with Spark, the per-coordinate CSV files of one source/measure.

    Reads one CSV per entry of ``coordinates[source]`` (file name built from
    ``path_dict[source][data]`` plus the matching ``string_coordinates`` label),
    tags each table with literal ``lat``/``lon`` columns and unions them.

    Args:
        source: Key of ``path_dict`` / ``coordinates`` / ``string_coordinates``.
        data: Measure key inside ``path_dict[source]``.
        verbose: When true, print a progress line (carriage-return terminated).

    Returns:
        The unioned Spark DataFrame, or ``None`` when the source has no
        coordinates.
    """
    final_df = None
    for i_coord, coordinate in enumerate(coordinates[source]):
        csv_path = f'{path_dict[source][data]}{string_coordinates[source][i_coord]}.csv'
        table = spark.read.csv(csv_path, header=True, inferSchema=True)
        if verbose:
            print(f'{coordinate} ({i_coord+1}/{len(coordinates[source])})',end='\r')
        table = table.withColumn('lat', F.lit(coordinate['lat']))
        table = table.withColumn('lon', F.lit(coordinate['lon']))
        # First table seeds the result; later ones are appended by position.
        final_df = table if final_df is None else final_df.union(table)

    return final_df

def polars_concat_tables_satelites(source, data, verbose=False):
    """Stack, with Polars, the per-coordinate CSV files of one source/measure.

    Reads one CSV per entry of ``coordinates[source]`` (file name built from
    ``path_dict[source][data]`` plus the matching ``string_coordinates`` label),
    adds literal ``lat``/``lon`` columns to each table and stacks them
    vertically.

    Args:
        source: Key of ``path_dict`` / ``coordinates`` / ``string_coordinates``.
        data: Measure key inside ``path_dict[source]``.
        verbose: When true, print a progress line (carriage-return terminated).

    Returns:
        The stacked ``pl.DataFrame``, or ``None`` when the source has no
        coordinates.
    """
    # Loop invariants, looked up once.
    file_prefix = path_dict[source][data]
    labels = string_coordinates[source]
    source_coordinates = coordinates[source]
    total = len(source_coordinates)

    frames = []
    for i_coord, coordinate in enumerate(source_coordinates):
        # Path of the CSV file that holds this grid point
        file_path = f'{file_prefix}{labels[i_coord]}.csv'

        # Read the CSV and tag every row with the grid point it belongs to
        frames.append(
            pl.read_csv(file_path).with_columns([
                pl.lit(coordinate['lat']).alias('lat'),
                pl.lit(coordinate['lon']).alias('lon')
            ])
        )

        if verbose:
            print(f'{coordinate} ({i_coord+1}/{total})', end='\r')

    if not frames:
        return None

    # A single concat replaces thousands of incremental vstack calls;
    # rechunk=False keeps the per-file chunks, as vstack did.
    return pl.concat(frames, how='vertical', rechunk=False)

def satelites_join_medidas(df_dict):
    """Outer-join every table of ``df_dict`` on ('data', 'lat', 'lon').

    Args:
        df_dict: Mapping whose values are the tables to combine, joined in
            the mapping's iteration order.

    Returns:
        The accumulated join, the single table when there is only one, or
        ``None`` when ``df_dict`` is empty.
    """
    join_keys = ['data','lat','lon']
    joined_df = None
    for table in df_dict.values():
        joined_df = table if joined_df is None else joined_df.join(table, join_keys, 'outer')
    return joined_df

In [3]:
# Spark session shared by the Spark-based readers and joins in this notebook.
# Settings are applied in the order listed.
_spark_settings = {
    "spark.sql.shuffle.partitions": "100",
    "spark.sql.files.maxPartitionBytes": "128MB",
    "spark.sql.files.openCostInBytes": "4MB",
    "spark.driver.maxResultSize": "4g",
    "spark.driver.memory": "16g",
    "spark.executor.cores": "4",
    "spark.executor.memory": "8g",
    "spark.sql.execution.arrow.enabled": "true",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "1",
    "spark.dynamicAllocation.maxExecutors": "4",
}

_builder = SparkSession.builder.appName("Mateus")
for _setting, _value in _spark_settings.items():
    _builder = _builder.config(_setting, _value)
spark = _builder.getOrCreate()


# 2. Satélites

## 2.1. Obtendo metadados

In [4]:
# Root folder of the data set and the folder that receives the bronze tables.
initial_path = 'C:/Users/Mateus Santos Rochas/Desktop/Estudos/07. Doutorado - Matemática aplicada/Pesquisa/Dados R2'
bronze_path = os.path.join(initial_path,'1. bronze')

# source -> (prefix used in the raw file names, measure keys in processing order)
_raw_layout = {
    'AgCFSR': ('agcfsr', ['prate', 'rhstmax', 'srad', 'tavg', 'tmax', 'tmin', 'wndspd']),
    'AgMERRA': ('agmerra', ['prate', 'rhstmax', 'srad', 'tavg', 'tmax', 'tmin', 'wndspd']),
    'CHIRPS': ('chirps', ['precip']),
    'CPC': ('cpc', ['precip', 'tmax', 'tmin']),
    'GL': ('gl', ['irradiancia']),
    'GPM Final Run': ('gpm-final-run', ['precipitation']),
    'GPM Late Run': ('gpm-late-run', ['precipitation']),
    'PERSIANN-CDR': ('persiann', ['precipitation']),
    'POWER': ('power', [
        'allsky_sfc_sw_dwn', 'prectotcorr', 'ps', 'rh2m', 't2m', 't2m_max', 't2m_min',
        't2mdew', 'wd2m', 'wd10m', 'ws2m', 'ws2m_max', 'ws10m', 'ws10m_max',
    ]),
    'TRMM': ('trmm', ['precipitation']),
}

# Measure keys whose name inside the raw folder/file names differs from the key.
_raw_measure_names = {('CHIRPS', 'precip'): 'precipitation'}


def _raw_csv_prefix(source, key, file_prefix):
    """Return the path prefix shared by all per-coordinate CSVs of a measure."""
    measure = _raw_measure_names.get((source, key), key)
    return os.path.join(
        initial_path,
        rf'0. bruto\{source}_{measure}\{source}\{file_prefix}_{measure}_',
    )


# path_dict[source][measure key] -> prefix to which '<coordinate label>.csv' is appended.
path_dict = {
    source: {key: _raw_csv_prefix(source, key, file_prefix) for key in keys}
    for source, (file_prefix, keys) in _raw_layout.items()
}

# Grid of each source as (lat_min, lat_max, lon_min, lon_max, step, precision),
# i.e. the positional arguments of build_coord.
_grid_specs = {
    'AgCFSR': (-24, -17, -53, -38, 0.25, 2),
    'AgMERRA': (-24, -17, -53, -38, 0.25, 2),
    'CHIRPS': (-24, -17, -53, -38, 0.05, 2),
    'CPC': (-24, -17, -53, -38, 0.05, 2),
    'GL': (-24, -17.1, -53, -38, 0.1, 1),
    'GPM Final Run': (-24, -17.1, -53, -38, 0.1, 1),
    'GPM Late Run': (-24, -17.1, -53, -38, 0.1, 1),
    'PERSIANN-CDR': (-24, -17, -53, -38, 0.25, 2),
    'POWER': (-24, -17, -53, -38, 0.5, 1),
    'TRMM': (-24, -17, -53, -38, 0.25, 2),
}

# One independent list of '(lat lon)' labels per source.
string_coordinates = {
    grid_source: build_coord(*grid_spec)
    for grid_source, grid_spec in _grid_specs.items()
}

# Sources in declaration order and, for each one, its measure keys.
sources = list(path_dict)
datas = {source: list(path_dict[source]) for source in sources}


def _parse_coordinate(string_coordinate):
    """Turn a '(lat lon)' label into {'lat': float, 'lon': float}."""
    pieces = string_coordinate.split(' ')
    # pieces[0] starts with '(' and pieces[1] ends with ')'
    return {'lat': float(pieces[0][1:]), 'lon': float(pieces[1][:-1])}


# Numeric counterpart of string_coordinates, index-aligned with it.
coordinates = {}
for source in sources:
    coordinates[source] = [_parse_coordinate(label) for label in string_coordinates[source]]


## 2.2. Obtendo as tabelas e concatenando

In [4]:
# Batch 1: for every measure of AgCFSR and AgMERRA, stack the per-coordinate
# CSVs and write one bronze CSV named '<source>_<measure>.csv'.
sources_batch_1 = ['AgCFSR', 'AgMERRA']
for i_s, source in enumerate(sources_batch_1):
    print(f'Source: {source} ({i_s+1}/{len(sources_batch_1)})')
    for i_d, data in enumerate(datas[source]):
        print(f'    Data: {data} ({i_d+1}/{len(datas[source])})')
        bronze_file = os.path.join(bronze_path, f'{source}_{data}.csv')
        df = polars_concat_tables_satelites(source, data, verbose=True)
        df.write_csv(bronze_file)
        # Release the stacked table before loading the next measure
        del df
        gc.collect()

        # TODO: try splitting the files into blocks of 500/1000 before joining them

Source: AgCFSR (1/10)
    Data: prate (1/7)
    Data: rhstmax (2/7)38.0} (1769/1769)))
    Data: srad (3/7): -38.0} (1769/1769)))
    Data: tavg (4/7): -38.0} (1769/1769)))
    Data: tmax (5/7): -38.0} (1769/1769)))
    Data: tmin (6/7): -38.0} (1769/1769)))
    Data: wndspd (7/7)-38.0} (1769/1769)))
Source: AgMERRA (2/10)-38.0} (1769/1769)))
    Data: prate (1/7)
    Data: rhstmax (2/7)38.0} (1769/1769)))
    Data: srad (3/7): -38.0} (1769/1769)))
    Data: tavg (4/7): -38.0} (1769/1769)))
    Data: tmax (5/7): -38.0} (1769/1769)))
    Data: tmin (6/7): -38.0} (1769/1769)))
    Data: wndspd (7/7)-38.0} (1769/1769)))
{'lat': -17.0, 'lon': -38.0} (1769/1769)))

In [5]:
# Batch 2: stack the per-coordinate CSVs of each CHIRPS measure and write one
# bronze CSV named '<source>_<measure>.csv'.
sources_batch_2 = ['CHIRPS']
for i_s, source in enumerate(sources_batch_2):
    print(f'Source: {source} ({i_s+1}/{len(sources_batch_2)})')
    for i_d, data in enumerate(datas[source]):
        print(f'    Data: {data} ({i_d+1}/{len(datas[source])})')
        bronze_file = os.path.join(bronze_path, f'{source}_{data}.csv')
        df = polars_concat_tables_satelites(source, data, verbose=True)
        df.write_csv(bronze_file)
        # Release the stacked table before loading the next measure
        del df
        gc.collect()

Source: CHIRPS (1/10)
    Data: precip (1/1)
{'lat': -17.0, 'lon': -38.0} (42441/42441)))

In [13]:
# Batch 3: stack the per-coordinate CSVs of each measure and write one bronze
# CSV named '<source>_<measure>.csv'.
# Earlier version of the list: ['CPC','GL','GPM Final Run','GPM Late Run'];
# only the two GPM products are processed here.
sources_batch_3 = ['GPM Final Run', 'GPM Late Run']
for i_s, source in enumerate(sources_batch_3):
    print(f'Source: {source} ({i_s+1}/{len(sources_batch_3)})')
    for i_d, data in enumerate(datas[source]):
        print(f'    Data: {data} ({i_d+1}/{len(datas[source])})')
        bronze_file = os.path.join(bronze_path, f'{source}_{data}.csv')
        df = polars_concat_tables_satelites(source, data, verbose=True)
        df.write_csv(bronze_file)
        # Release the stacked table before loading the next measure
        del df
        gc.collect()

Source: GPM Final Run (1/10)
    Data: precipitation (1/1)
Source: GPM Late Run (2/10)} (10570/10570)
    Data: precipitation (1/1)
{'lat': -17.1, 'lon': -38.0} (10570/10570)

In [16]:
# Batch 4: for every measure of PERSIANN-CDR, POWER and TRMM, stack the
# per-coordinate CSVs and write one bronze CSV named '<source>_<measure>.csv'.
sources_batch_4 = ['PERSIANN-CDR', 'POWER', 'TRMM']
for i_s, source in enumerate(sources_batch_4):
    print(f'Source: {source} ({i_s+1}/{len(sources_batch_4)})')
    for i_d, data in enumerate(datas[source]):
        print(f'    Data: {data} ({i_d+1}/{len(datas[source])})')
        bronze_file = os.path.join(bronze_path, f'{source}_{data}.csv')
        df = polars_concat_tables_satelites(source, data, verbose=True)
        df.write_csv(bronze_file)
        # Release the stacked table before loading the next measure
        del df
        gc.collect()

Source: PERSIANN-CDR (1/10)
    Data: precipitation (1/1)
Source: POWER (2/10): -38.0} (1769/1769)))
    Data: allsky_sfc_sw_dwn (1/14)
    Data: prectotcorr (2/14) (465/465)
    Data: ps (3/14)': -38.0} (465/465)
    Data: rh2m (4/14) -38.0} (465/465)
    Data: t2m (5/14): -38.0} (465/465)
    Data: t2m_max (6/14)8.0} (465/465)
    Data: t2m_min (7/14)8.0} (465/465)
    Data: t2mdew (8/14)38.0} (465/465)
    Data: wd2m (9/14) -38.0} (465/465)
    Data: wd10m (10/14)38.0} (465/465)
    Data: ws2m (11/14)-38.0} (465/465)
    Data: ws2m_max (12/14)0} (465/465)
    Data: ws10m (13/14)38.0} (465/465)
    Data: ws10m_max (14/14)} (465/465)
Source: TRMM (3/10)': -38.0} (465/465)
    Data: precipitation (1/1)
{'lat': -17.0, 'lon': -38.0} (1769/1769)))

## 2.3. Unindo as tabelas de cada fonte

In [5]:
# Measures to load for each satellite source; each becomes a 'vl_<measure>' column.
medidas = {
    'AgCFSR':['prate','rhstmax','srad','tavg','tmax','tmin','wndspd'],
    'AgMERRA':['prate','rhstmax','srad','tavg','tmax','tmin','wndspd'],
    'CHIRPS':['precipitation'],
    'CPC':['precipitation','tmax','tmin'],
    'GL':['irradiancia'],
    'GPM Final Run':['precipitation'],
    'GPM Late Run':['precipitation'],
    'PERSIANN-CDR':['precipitation'],
    'POWER':['allsky_sfc_sw_dwn','prectotcorr','ps','rh2m','t2m_max','t2m_min','t2m','t2mdew','wd2m','wd10m','ws2m_max','ws2m','ws10m_max','ws10m'],
    'TRMM':['precipitation']
}

# Section 2.2 names each bronze file after the measure key of `path_dict`, and
# for CHIRPS and CPC that key is 'precip', not 'precipitation'. Without this
# mapping the reader below would look for files that were never written.
_bronze_file_measure = {
    ('CHIRPS', 'precipitation'): 'precip',
    ('CPC', 'precipitation'): 'precip',
}


def _bronze_measure_path(satelite, medida):
    """Return the bronze CSV path that holds `medida` of `satelite`.

    The name built from `medida` is preferred when that file exists (e.g. it
    was renamed by hand); otherwise the name written by section 2.2 is used.
    """
    expected = os.path.join(bronze_path, f'{satelite}_{medida}.csv')
    written_as = _bronze_file_measure.get((satelite, medida))
    if written_as is None or os.path.exists(expected):
        return expected
    return os.path.join(bronze_path, f'{satelite}_{written_as}.csv')


# satelites_dict[source][measure] -> Spark DataFrame whose 'valor' column is
# renamed to 'vl_<measure>' so that measures can later be joined side by side.
satelites_dict = {
    satelite: {
        medida: spark.read.csv(
            _bronze_measure_path(satelite, medida), header=True, inferSchema=True
        ).withColumnRenamed('valor', f'vl_{medida}')
        for medida in medidas_satelite
    }
    for satelite, medidas_satelite in medidas.items()
}

In [6]:
# Build, for each satellite, the table that results from joining all of its
# measure tables (the tables are written to disk in a later step).
satelites = {
    satelite: satelites_join_medidas(satelites_dict[satelite])
    for satelite in satelites_dict
}

In [7]:
# Write each joined satellite table as comma-delimited CSV with a header,
# replacing any previous output at 'full_<satellite>.csv'.
for satelite, df in satelites.items():
    csv_writer = df.write.options(header=True, delimiter=",").mode("overwrite")
    csv_writer.csv(os.path.join(initial_path, '1. bronze', f'full_{satelite}.csv'))

# 3. Estações

In [6]:
# Raw station extracts are read from '0. bruto/estacoes'; the cleaned station
# tables are written to the '1. bronze' folder.
estacoes_path = os.path.join(initial_path, '0. bruto', 'estacoes')
estacoes_bronze_path = os.path.join(initial_path, '1. bronze')

## 3.1. cemaden

In [7]:
# Load the pipe-separated extract (dropping its first and last rows), apply the
# standard column names and turn whitespace-only cells into NaN.
cemaden = (
    pd.read_csv(os.path.join(estacoes_path, 'cemaden.csv'), sep='|')
    .iloc[1:-1]
    .set_axis(
        ['id_estacao','lat','lon','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
        axis=1,
    )
    .replace(r'^\s+$', np.nan, regex=True)
)

# Write the result to the bronze folder
cemaden.to_csv(os.path.join(estacoes_bronze_path, 'cemaden.csv'), index=False)

  cemaden = cemaden.replace(r'^\s+$', np.nan, regex=True)


## 3.2. cemig

In [8]:
# Load the semicolon-separated extract (dropping its last row) and apply the
# standard column names.
cemig = (
    pd.read_csv(os.path.join(estacoes_path, 'CEMIG.csv'), sep=';')
    .iloc[:-1]
    .set_axis(
        ['id_estacao','lat','lon','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
        axis=1,
    )
)

# Write the result to the bronze folder
cemig.to_csv(os.path.join(estacoes_bronze_path, 'cemig.csv'), index=False)

  cemig = pd.read_csv(os.path.join(estacoes_path,'CEMIG.csv'),sep=';').iloc[:-1]


## 3.3. iac

In [9]:
# Load the pipe-separated extract (dropping its first and last rows), apply the
# standard column names and turn whitespace-only cells into NaN.
iac = (
    pd.read_csv(os.path.join(estacoes_path, 'IAC.csv'), sep='|')
    .iloc[1:-1]
    .set_axis(
        ['id_estacao','lat','lon','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
        axis=1,
    )
    .replace(r'^\s+$', np.nan, regex=True)
)

# Write the result to the bronze folder
iac.to_csv(os.path.join(estacoes_bronze_path, 'iac.csv'), index=False)

  iac = pd.read_csv(os.path.join(estacoes_path,'IAC.csv'),sep='|').iloc[1:-1]
  iac = iac.replace(r'^\s+$', np.nan, regex=True)


## 3.4. inmet_agri

In [10]:
# Load the pipe-separated extract (dropping its first and last rows), apply the
# standard column names and turn whitespace-only cells into NaN.
inmet_agri = (
    pd.read_csv(os.path.join(estacoes_path, 'INMET_AGRI.csv'), sep='|')
    .iloc[1:-1]
    .set_axis(
        ['id_estacao','lat','lon','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
        axis=1,
    )
    .replace(r'^\s+$', np.nan, regex=True)
)

# Write the result to the bronze folder
inmet_agri.to_csv(os.path.join(estacoes_bronze_path, 'inmet_agri.csv'), index=False)


  inmet_agri = pd.read_csv(os.path.join(estacoes_path,'INMET_AGRI.csv'),sep='|').iloc[1:-1]
  inmet_agri = inmet_agri.replace(r'^\s+$', np.nan, regex=True)


## 3.5. inmet

In [11]:
# Load the semicolon-separated extract (dropping its last row) and apply the
# standard column names.
inmet = (
    pd.read_csv(os.path.join(estacoes_path, 'INMET.csv'), sep=';')
    .iloc[:-1]
    .set_axis(
        ['id_estacao','lat','lon','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
        axis=1,
    )
)

# Write the result to the bronze folder
inmet.to_csv(os.path.join(estacoes_bronze_path, 'inmet.csv'), index=False)

  inmet = pd.read_csv(os.path.join(estacoes_path,'INMET.csv'),sep=';').iloc[:-1]


## 3.6. unesp

In [12]:
# Load the pipe-separated extract (dropping its first and last rows), apply the
# standard column names and turn whitespace-only cells into NaN.
unesp = (
    pd.read_csv(os.path.join(estacoes_path, 'unesp.csv'), sep='|')
    .iloc[1:-1]
    .set_axis(
        ['id_estacao','lat','lon','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
        axis=1,
    )
    .replace(r'^\s+$', np.nan, regex=True)
)

# Write the result to the bronze folder
unesp.to_csv(os.path.join(estacoes_bronze_path, 'unesp.csv'), index=False)

  unesp = pd.read_csv(os.path.join(estacoes_path,'unesp.csv'),sep='|').iloc[1:-1]
  unesp = unesp.replace(r'^\s+$', np.nan, regex=True)


## 3.7. z_ana_hidro_old

In [13]:
# Load the semicolon-separated extract (dropping its last row) and apply the
# standard column names.
ana = (
    pd.read_csv(os.path.join(estacoes_path, 'z_ana_hidro_old.csv'), sep=';')
    .iloc[:-1]
    .set_axis(
        ['id_estacao','lat','lon','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
        axis=1,
    )
)

# Write the result to the bronze folder
ana.to_csv(os.path.join(estacoes_bronze_path, 'ana.csv'), index=False)