In [1]:
import pandas as pd
from pyspark.sql import SparkSession,Window
import pyspark.sql.functions as F
import seaborn as sns
import os
import duckdb
from pathlib import Path

In [2]:
# Spark settings applied to the session builder, in the order listed here.
spark_settings = {
    "spark.sql.shuffle.partitions": "100",
    "spark.sql.files.maxPartitionBytes": "128MB",
    "spark.sql.files.openCostInBytes": "4MB",
    "spark.driver.maxResultSize": "4g",
    "spark.driver.memory": "16g",
    "spark.executor.cores": "4",
    "spark.executor.memory": "8g",
    "spark.sql.execution.arrow.enabled": "true",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "1",
    "spark.dynamicAllocation.maxExecutors": "4",
}

session_builder = SparkSession.builder.appName("Mateus")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

# Reuses an already running session when there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [None]:
# Root folder that contains the '1. bronze' and '2. prata' layer folders, read
# from a text file in the working directory.
with open('initial_path.txt','r') as f:
    # strip(): a trailing newline in the file would otherwise become part of
    # every path built from initial_path.
    initial_path = f.read().strip()

bronze_path = os.path.join(initial_path,'1. bronze')
prata_path = os.path.join(initial_path,'2. prata')
# DuckDB database file, opened relative to the working directory.
conn = duckdb.connect('r2_prata.duckdb')

In [4]:
def compute_lat_delta(df):
    """Return the average spacing between the distinct latitudes of ``df``.

    The spacing is ``(max(lat) - min(lat)) / (n - 1)``, where ``n`` is the
    number of distinct non-null ``lat`` values. For evenly spaced latitudes
    this is the grid step.

    Args:
        df: Spark DataFrame with a numeric ``lat`` column.

    Returns:
        The spacing, or ``None`` when ``df`` has fewer than two distinct
        non-null latitudes (the spacing is undefined in that case).
    """
    # A single aggregation job instead of three separate passes over the data.
    # countDistinct ignores nulls, so a missing latitude is not counted as an
    # extra grid point.
    stats = df.agg(
        F.min('lat').alias('lat_min'),
        F.max('lat').alias('lat_max'),
        F.countDistinct('lat').alias('lat_n'),
    ).first()
    lat_min, lat_max, lat_n = stats[0], stats[1], stats[2]
    if lat_min is None or lat_n < 2:
        # Empty input or a single latitude: avoid None arithmetic / division by zero.
        return None
    return (lat_max - lat_min) / (lat_n - 1)

def compute_lon_delta(df):
    """Return the average spacing between the distinct longitudes of ``df``.

    The spacing is ``(max(lon) - min(lon)) / (n - 1)``, where ``n`` is the
    number of distinct non-null ``lon`` values. For evenly spaced longitudes
    this is the grid step.

    Args:
        df: Spark DataFrame with a numeric ``lon`` column.

    Returns:
        The spacing, or ``None`` when ``df`` has fewer than two distinct
        non-null longitudes (the spacing is undefined in that case).
    """
    # A single aggregation job instead of three separate passes over the data.
    # countDistinct ignores nulls, so a missing longitude is not counted as an
    # extra grid point.
    stats = df.agg(
        F.min('lon').alias('lon_min'),
        F.max('lon').alias('lon_max'),
        F.countDistinct('lon').alias('lon_n'),
    ).first()
    lon_min, lon_max, lon_n = stats[0], stats[1], stats[2]
    if lon_min is None or lon_n < 2:
        # Empty input or a single longitude: avoid None arithmetic / division by zero.
        return None
    return (lon_max - lon_min) / (lon_n - 1)

# 1. Separando tabelas dim e tabelas fato

## dim_estacoes

In [4]:
# Station networks; each one is read from '<name>.csv' in the bronze folder.
grupo_estacao = ['ana','cemaden','cemig','iac','inmet_agri','inmet','unesp']

# Raw readings per network, keyed by the upper-cased network name.
estacao = {
    nome.upper():
    pd.read_csv(os.path.join(bronze_path,f'{nome}.csv'))
    for nome in grupo_estacao
}

# Distinct station ids per network (the keys of `estacao` are already upper case).
id_estacao = {
    nome:
    list(df['id_estacao'].unique())
    for nome,df in estacao.items()
}

# Measurement columns that get a 0/1 availability flag (fl_<column>).
variaveis = ['tmin','tmed','tmax','prec','urmin','urmed','urmax']

dim_estacoes_dict = {}
for nome,df in estacao.items():
    # One row per (station, lat, lon): first and last reading date plus the
    # maximum of every measurement column. The maximum is NaN only when the
    # group has no value at all for that column.
    # NOTE: groupby drops rows whose key (id_estacao/lat/lon) is NaN by default.
    dim_df = df \
        .groupby(['id_estacao','lat','lon'],as_index=False).agg(
            dt_min = ('data','min'),
            dt_max = ('data','max'),
            **{f'fl_{var}':(var,'max') for var in variaveis}
            )
    # Turn each maximum into a flag: 1 = at least one reading, 0 = none.
    for col in [c for c in dim_df.columns if c.startswith('fl_')]:
        dim_df[col] = dim_df[col].notnull().astype(int)
    dim_df['nm_grupo_estacao'] = nome
    dim_estacoes_dict[nome] = dim_df

dim_estacoes  = pd.concat(dim_estacoes_dict.values()).reset_index(drop=True)

# Rebuild the table on every run so it always reflects the DataFrame computed
# above (CREATE TABLE IF NOT EXISTS would silently keep a stale table).
# The DataFrame is registered under a name different from the table: in
# "SELECT * FROM dim_estacoes" an existing dim_estacoes table would be resolved
# before the Python variable of the same name.
conn.register('dim_estacoes_df', dim_estacoes)
conn.execute("CREATE OR REPLACE TABLE dim_estacoes AS SELECT * FROM dim_estacoes_df")
conn.unregister('dim_estacoes_df')

<duckdb.duckdb.DuckDBPyConnection at 0x1895f003870>

In [5]:
# List the tables currently present in the DuckDB database.
conn.execute("SHOW TABLES").fetchall()

[('dim_estacoes',)]

## fato_satelites

In [5]:
# Loading the satellite CSVs and renaming their columns
satelites_nomes = ['AgCFSR','AgMERRA','CHIRPS','CPC','GL','GPM Final Run','GPM Late Run','PERSIANN-CDR','POWER','TRMM']

# One Spark DataFrame per product, read from 'full_<name>.csv' in the bronze folder.
satelite_dict = {}
for nome in satelites_nomes:
    caminho_csv = os.path.join(bronze_path,f'full_{nome}.csv')
    satelite_dict[nome] = spark.read.csv(caminho_csv,header=True,inferSchema=True)

# Rename tables (original column -> standardised column), shared where products
# use the same source column names.
renomear_ag = {
    'data':'dt_medicao',
    'vl_prate':'vl_precipitacao',
    'vl_rhstmax':'vl_umidade_relativa_maxima',
    'vl_srad':'vl_radiacao_solar',
    'vl_tavg':'vl_temperatura_media',
    'vl_tmax':'vl_temperatura_maxima',
    'vl_tmin':'vl_temperatura_minima',
    'vl_wndspd':'vl_vento_velocidade',
}

renomear_precipitacao = {
    'data':'dt_medicao',
    'vl_precipitation':'vl_precipitacao',
}

renomear_por_satelite = {
    'AgCFSR':renomear_ag,
    'AgMERRA':renomear_ag,
    'CHIRPS':renomear_precipitacao,
    'CPC':{
        'data':'dt_medicao',
        'vl_precipitation':'vl_precipitacao',
        'vl_tmax':'vl_temperatura_maxima',
        'vl_tmin':'vl_temperatura_minima',
    },
    'GL':{
        'data':'dt_medicao',
    },
    'GPM Final Run':renomear_precipitacao,
    'GPM Late Run':renomear_precipitacao,
    'PERSIANN-CDR':renomear_precipitacao,
    'POWER':{
        'data':'dt_medicao',
        'vl_allsky_sfc_sw_dwn':'vl_radiacao_solar_ceu_total',
        'vl_prectotcorr':'vl_precipitacao_total_corrigido',
        'vl_ps':'vl_pressao_superficie',
        'vl_rh2m':'vl_umidade_relativa_2m',
        'vl_t2m_max':'vl_temperatura_maxima_2m',
        'vl_t2m_min':'vl_temperatura_minima_2m',
        'vl_t2m':'vl_temperatura_media_2m',
        'vl_t2mdew':'vl_temperatura_orvalho_2m',
        'vl_wd2m':'vl_direcao_vento_2m',
        'vl_wd10m':'vl_direcao_vento_10m',
        'vl_ws2m_max':'vl_maxima_vento_2m',
        'vl_ws10m_max':'vl_maxima_vento_10m',
        'vl_ws10m':'vl_vento_10m',
    },
    'TRMM':renomear_precipitacao,
}

# Apply the renames one column at a time, in the order listed in each table.
for nome_satelite, renomear in renomear_por_satelite.items():
    df_renomeado = satelite_dict[nome_satelite]
    for coluna_antiga, coluna_nova in renomear.items():
        df_renomeado = df_renomeado.withColumnRenamed(coluna_antiga, coluna_nova)
    satelite_dict[nome_satelite] = df_renomeado

In [6]:
# Satellite product -> suffix used in the output name 'prata_fato_<suffix>.csv',
# in the order the products are written.
# NOTE(review): 'POWER' is loaded and renamed earlier in the notebook but is not
# written here — confirm that this is intended.
sufixo_saida = {
    'AgCFSR':'AgCFSR',
    'AgMERRA':'AgMERRA',
    'CHIRPS':'CHIRPS',
    'CPC':'CPC',
    'GL':'GL',
    'GPM Final Run':'GPM_Final_Run',
    'GPM Late Run':'GPM_Late_Run',
    'PERSIANN-CDR':'PERSIANN_CDR',
    'TRMM':'TRMM',
}

for nome_satelite, sufixo in sufixo_saida.items():
    # Progress marker: one line per product before its write starts.
    print(nome_satelite)
    destino = os.path.join(prata_path, f'prata_fato_{sufixo}.csv')
    # coalesce(1) brings the data into a single partition before writing.
    escritor = satelite_dict[nome_satelite].coalesce(1).write
    escritor = escritor.option("header",True).option("delimiter",",").mode("overwrite")
    escritor.csv(destino)

AgCFSR
AgMERRA
CHIRPS
CPC
GL
GPM Final Run
GPM Late Run
PERSIANN-CDR
TRMM


In [None]:
# Saving to the database
def get_single_csv(satelite, base_dir=None):
    """Return the absolute path of the CSV file inside a satellite's output folder.

    Looks in ``<base_dir>/prata_fato_<satelite>.csv`` (a folder, despite the
    ``.csv`` suffix) for files whose name ends with ``.csv`` and returns the
    first one in alphabetical order.

    Args:
        satelite: Suffix used in the output folder name, e.g. ``'GPM_Final_Run'``.
        base_dir: Folder that contains the output folders. Defaults to the
            notebook-level ``prata_path``, the same folder the CSVs are written to.

    Returns:
        The resolved, absolute path of the CSV file as a string.

    Raises:
        FileNotFoundError: If the output folder does not exist or contains no
            ``.csv`` file.
    """
    if base_dir is None:
        base_dir = prata_path
    output_dir = os.path.join(base_dir, f'prata_fato_{satelite}.csv')
    # sorted() makes the choice deterministic if more than one CSV is present.
    part_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.csv'))
    if not part_files:
        raise FileNotFoundError(f"no .csv file found in {output_dir!r}")
    return str(Path(output_dir, part_files[0]).resolve())

# Satellite products whose CSV output is loaded into DuckDB, one table each.
satelites = ['AgCFSR','AgMERRA','CHIRPS','CPC','GL','GPM_Final_Run','GPM_Late_Run','PERSIANN_CDR','TRMM']
for satelite_name in satelites:
    # Absolute path of the CSV file for this product.
    csv_file = get_single_csv(satelite_name)
    # The table is replaced if it already exists.
    load_query = f"""
        CREATE OR REPLACE TABLE
                    fato_satelite_{satelite_name} AS 
        SELECT * FROM read_csv_auto(
        '{csv_file}'
        )"""
    conn.execute(load_query)

In [8]:
# Flush what has been written so far to the database file, but keep the
# connection open: the cells below still use `conn`, so closing it here makes
# them fail when the notebook is run from top to bottom. The connection is
# closed in the last cell of the notebook.
conn.execute("CHECKPOINT")

## fato_estacoes

In [6]:
# Loading the station DataFrames
# Paths are built from bronze_path with os.path.join instead of hard-coded
# '1. bronze\...' strings, so they work on any operating system and from any
# working directory.
estacoes_path = {
    nome.upper():os.path.join(bronze_path,f'{nome}.csv')
    for nome in ['ana','cemaden','cemig','iac','inmet_agri','inmet','unesp']
}

# One pandas DataFrame per station network, keyed by the upper-cased network name.
estacoes_dict = {
    estacao:pd.read_csv(path) for estacao,path in estacoes_path.items()
}

In [7]:
# Renaming the columns
# A single mapping is shared by all networks so the same measurement always gets
# the same name. In particular 'prec' becomes 'vl_precipitacao' for every
# network, not only for ANA and CEMADEN.
renomear_colunas = {
    'data':'dt_medicao',
    'tmin':'vl_temperatura_minima',
    'tmed':'vl_temperatura_media',
    'tmax':'vl_temperatura_maxima',
    'prec':'vl_precipitacao',
    'urmin':'vl_umidade_relativa_minima',
    'urmed':'vl_umidade_relativa_media',
    'urmax':'vl_umidade_relativa_maxima',
}

# Columns kept for each network, in output order.
colunas_por_estacao = {
    'ANA':['id_estacao','data','prec'],
    'CEMADEN':['id_estacao','data','prec'],
    'CEMIG':['id_estacao','data','tmin','tmed','tmax','prec','urmin','urmax'],
    'IAC':['id_estacao','data','tmin','tmed','tmax','prec'],
    'INMET_AGRI':['id_estacao','data','tmin','tmed','tmax','prec','urmin','urmax'],
    'INMET':['id_estacao','data','tmin','tmed','tmax','prec','urmin','urmed','urmax'],
    'UNESP':['id_estacao','data','tmin','tmed','tmax','prec','urmin','urmax'],
}

# NOTE: this cell overwrites estacoes_dict, so it can only be run once per load:
# a second run fails because the original column names are gone.
for nome_estacao, colunas in colunas_por_estacao.items():
    # rename() ignores mapping keys that are not among the selected columns.
    estacoes_dict[nome_estacao] = estacoes_dict[nome_estacao][colunas] \
        .rename(columns=renomear_colunas)

In [124]:
# Store each station network in its own fato_estacao_<NETWORK> table.
# The table is rebuilt on every run (CREATE TABLE IF NOT EXISTS would silently
# keep a stale table), matching how the fato_satelite_* tables are loaded.
# The loop variables are named so they do not overwrite the `estacao` dict
# defined earlier in the notebook.
for nome_estacao, df_estacao in estacoes_dict.items():
    # Expose the DataFrame to DuckDB under an explicit view name.
    conn.register('fato_estacao_df', df_estacao)
    conn.execute(f"CREATE OR REPLACE TABLE fato_estacao_{nome_estacao} AS SELECT * FROM fato_estacao_df")
    conn.unregister('fato_estacao_df')

In [126]:
# Drop any table named after the bare network (e.g. "ANA"), if one exists.
# The fato_estacao_<NETWORK> tables are not affected.
for estacao, df in estacoes_dict.items():
    drop_sql = f"DROP TABLE IF EXISTS {estacao}"
    conn.execute(drop_sql)

In [4]:
# List the tables in the DuckDB database as a pandas DataFrame.
conn.execute('SHOW tables').fetch_df()

Unnamed: 0,name
0,dim_estacoes
1,fato_estacao_ANA
2,fato_estacao_CEMADEN
3,fato_estacao_CEMIG
4,fato_estacao_IAC
5,fato_estacao_INMET
6,fato_estacao_INMET_AGRI
7,fato_estacao_UNESP
8,fato_satelite_AgCFSR
9,fato_satelite_AgMERRA


In [10]:
# Read the whole dim_estacoes table back as a pandas DataFrame for inspection.
conn.execute('SELECT * FROM dim_estacoes').fetch_df()

Unnamed: 0,id_estacao,lat,lon,dt_min,dt_max,fl_tmin,fl_tmed,fl_tmax,fl_prec,fl_urmin,fl_urmed,fl_urmax,nm_grupo_estacao
0,574937,-22.5283,-43.2029,2021-11-19,2023-11-01,0,0,0,1,0,0,0,ANA
1,575953,-22.4280,-43.0580,2013-10-05,2023-03-02,0,0,0,1,0,0,0,ANA
2,577468,-22.9281,-44.3972,2019-02-22,2024-02-20,0,0,0,1,0,0,0,ANA
3,574937,-22.5283,-43.2029,2021-11-19,2023-11-01,0,0,0,1,0,0,0,CEMADEN
4,575953,-22.4280,-43.0580,2013-10-05,2023-03-02,0,0,0,1,0,0,0,CEMADEN
...,...,...,...,...,...,...,...,...,...,...,...,...,...
850,1364,-22.3000,-49.0500,1994-11-01,2018-03-21,1,1,1,1,1,0,0,UNESP
851,1365,-22.8500,-48.4500,1961-01-01,2017-02-18,1,1,1,1,0,0,0,UNESP
852,1366,-20.4000,-51.3200,1991-08-20,2017-02-18,1,1,1,1,1,0,1,UNESP
853,1367,-22.1000,-51.3700,1961-01-01,2017-02-18,1,1,1,1,1,0,1,UNESP


## dim_satelites

In [None]:
def _resumir_satelite(df):
    """Summarise one satellite DataFrame: date range, lat/lon bounds and grid steps.

    Args:
        df: Spark DataFrame with ``dt_medicao``, ``lat`` and ``lon`` columns.
            ``dt_medicao`` must be a date/timestamp column, since its minimum
            and maximum are formatted with ``strftime``.

    Returns:
        dict with the keys ``dt_min``, ``dt_max``, ``min_lat``, ``max_lat``,
        ``delta_lat``, ``min_lon``, ``max_lon`` and ``delta_lon``.
    """
    # All minima and maxima in a single aggregation job instead of six.
    stats = df.agg(
        F.min('dt_medicao').alias('dt_min'),
        F.max('dt_medicao').alias('dt_max'),
        F.min('lat').alias('min_lat'),
        F.max('lat').alias('max_lat'),
        F.min('lon').alias('min_lon'),
        F.max('lon').alias('max_lon'),
    ).first()
    return {
        'dt_min':stats['dt_min'].strftime('%Y-%m-%d'),
        'dt_max':stats['dt_max'].strftime('%Y-%m-%d'),
        'min_lat':stats['min_lat'],
        'max_lat':stats['max_lat'],
        'delta_lat':compute_lat_delta(df),
        # Stored with its real sign, like max_lon (previously the value was negated).
        'min_lon':stats['min_lon'],
        'max_lon':stats['max_lon'],
        'delta_lon':compute_lon_delta(df),
    }

# One summary per satellite product, keyed by product name.
table_data = {
    satelite:_resumir_satelite(df_satelite)
    for satelite,df_satelite in satelite_dict.items()}

In [12]:
# One row per satellite, with the product name in the 'nm_satelite' column.
# from_dict(orient='index') builds the rows directly, so numeric columns keep a
# numeric dtype; transposing a frame that mixes strings and numbers would turn
# every column into object dtype.
dim_satelites = pd.DataFrame.from_dict(table_data, orient='index') \
    .reset_index(names=['nm_satelite'])

In [14]:
# Rebuild the table on every run so it always reflects the current DataFrame
# (CREATE TABLE IF NOT EXISTS would silently keep a stale table).
# The DataFrame is registered under a name different from the table: in
# "SELECT * FROM dim_satelites" an existing dim_satelites table would be
# resolved before the Python variable of the same name.
conn.register('dim_satelites_df', dim_satelites)
conn.execute("CREATE OR REPLACE TABLE dim_satelites AS SELECT * FROM dim_satelites_df")
conn.unregister('dim_satelites_df')

<duckdb.duckdb.DuckDBPyConnection at 0x1816652b170>

In [17]:
# Close the DuckDB connection now that all tables have been written.
conn.close()