In [None]:
# default_exp backfill

# Pipeline for backfilling / batching

Dagster can only run one pipeline per module, and `05_pipeline.ipynb` (i.e. `mario.py`) already defines one: the pipeline for continuous, linear retrieval, in which the steps take place one after the other. The backfilling / batching pipeline is therefore defined here, in a separate module.



In [None]:
#exports
import pandas as pd
import xarray as xr
import os
import glob
import dotenv
import warnings
from dagster import execute_pipeline, pipeline, solid, Field, OutputDefinition, DagsterType, Output

from IPython.display import JSON

from satip import eumetsat, reproj, io, gcp_helpers
from satip.mario import (df_metadata_to_dt_to_fp_map, 
                         reproject_datasets, 
                         save_metadata, 
                         compress_and_save_datasets, 
                         compress_export_then_delete_raw)

In [None]:
# Filter some warnings
#exports
# Each message below is registered with an 'ignore' filter, in the order listed.
for _ignored_message in (
    'divide by zero encountered in true_divide',
    'invalid value encountered in sin',
    'invalid value encountered in cos',
    'invalid value encountered in subtract',
    'You will likely lose important projection information when converting to a PROJ string from another format. See: https://proj.org/faq.html#what-is-the-best-format-for-describing-coordinate-reference-systems',
):
    warnings.filterwarnings('ignore', message=_ignored_message)
del _ignored_message  # keep the loop variable out of the exported module namespace

In [None]:
# Example: query a one-hour window and display the result as JSON
example_start, example_end = "2020-01-01 00:00", "2020-01-01 01:00"
missing_datasets = io.identifying_missing_datasets(example_start, example_end)
JSON(missing_datasets)

Earliest 2020-01-01 00:00, latest 2020-01-01 01:00


Found 12 results


<IPython.core.display.JSON object>

In [None]:
#exports

# Dagster type definition for pandas DataFrames, used to annotate solid outputs
def _is_pandas_dataframe(_context, value):
    """Type-check function for the Dagster `DataFrame` type: True only for `pd.DataFrame` instances."""
    return isinstance(value, pd.DataFrame)


DataFrame = DagsterType(type_check_fn=_is_pandas_dataframe, name="DataFrame")

@solid(output_defs=[OutputDefinition(name='df_new_metadata', dagster_type=DataFrame, is_required=False)])
def download_missing_eumetsat_files(context, env_vars_fp: str, data_dir: str, metadata_db_fp: str, debug_fp: str, table_id: str, project_id: str, start_date: str='', end_date: str=''):
    """Download the EUMETSAT datasets reported as missing for a date range.

    Loads environment variables from `env_vars_fp` with dotenv, builds an
    `eumetsat.DownloadManager` from the `USER_KEY`, `USER_SECRET`,
    `SLACK_WEBHOOK_URL` and `SLACK_ID` environment variables, calls
    `io.identifying_missing_datasets(start_date, end_date)` and hands the
    result to `DownloadManager.download_datasets`.

    Yields the returned metadata on the `df_new_metadata` output. That output
    is declared with `is_required=False`, and nothing is yielded when
    `download_datasets` returns None.

    `table_id` and `project_id` are accepted as inputs but are not used in
    this body.
    """
    dotenv.load_dotenv(env_vars_fp)
    env = os.environ
    download_manager = eumetsat.DownloadManager(
        env.get('USER_KEY'),
        env.get('USER_SECRET'),
        data_dir,
        metadata_db_fp,
        debug_fp,
        slack_webhook_url=env.get('SLACK_WEBHOOK_URL'),
        slack_id=env.get('SLACK_ID'),
    )

    datasets_to_fetch = io.identifying_missing_datasets(start_date, end_date)
    context.log.info(f"Missing data: {len(datasets_to_fetch)}")

    new_metadata = download_manager.download_datasets(datasets_to_fetch)

    if new_metadata is not None:
        yield Output(new_metadata, 'df_new_metadata')
    else:
        # No output is emitted here, so the pipeline will skip subsequent solids
        banner = "*******************"
        for log_line in (banner, "Files already in zarr. Exiting.", banner):
            context.log.info(log_line)

In [None]:
#exports
@pipeline
def download_missing_data_pipeline():
    """Wire the backfill solids together.

    The metadata output of `download_missing_eumetsat_files` feeds
    `df_metadata_to_dt_to_fp_map` -> `reproject_datasets` ->
    `compress_and_save_datasets`. `save_metadata` then receives both the
    compressed result and the metadata, and its output is passed to
    `compress_export_then_delete_raw`.
    """
    # Retrieve the missing data
    new_metadata = download_missing_eumetsat_files()

    # Reproject, then compress and save
    compressed = compress_and_save_datasets(
        reproject_datasets(df_metadata_to_dt_to_fp_map(new_metadata))
    )

    # Save the metadata, then hand its output to the final solid
    compress_export_then_delete_raw(save_metadata(compressed, new_metadata))

Test the configuration and execute the pipeline:

In [None]:
#hide
# Values shared by more than one solid
RAW_DATA_DIR = "../data/raw"
METADATA_TABLE_ID = "eumetsat.metadata"
GCP_PROJECT_ID = "solar-pv-nowcasting"

# Inputs for each solid, keyed by solid name
solid_inputs = {
    'download_missing_eumetsat_files': {
        'env_vars_fp': "../.env",
        'data_dir': RAW_DATA_DIR,
        'metadata_db_fp': "../data/EUMETSAT_metadata.db",
        'debug_fp': "../logs/EUMETSAT_download.txt",
        'table_id': METADATA_TABLE_ID,
        'project_id': GCP_PROJECT_ID,
        'start_date': "2019-01-01T00:00:00",
        'end_date': "2019-01-01T01:00:00",
    },
    'df_metadata_to_dt_to_fp_map': {
        'data_dir': RAW_DATA_DIR,
    },
    'reproject_datasets': {
        'new_coords_fp': "../data/intermediate/reproj_coords_TM_4km.csv",
        'new_grid_fp': "../data/intermediate/new_grid_4km_TM.json",
    },
    'compress_and_save_datasets': {
        'zarr_bucket': "solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/full_extent_TM_int16",
        'var_name': "stacked_eumetsat_data",
    },
    'save_metadata': {
        'table_id': METADATA_TABLE_ID,
        'project_id': GCP_PROJECT_ID,
    },
    'compress_export_then_delete_raw': {
        'data_dir': RAW_DATA_DIR,
        'compressed_dir': "../data/compressed",
        'BUCKET_NAME': "solar-pv-nowcasting-data",
        'PREFIX': "satellite/EUMETSAT/SEVIRI_RSS/native/",
        # NOTE(review): `download_missing_data_pipeline` also passes the output of
        # `save_metadata` into this solid - confirm a configured value is still wanted here.
        'ready_to_delete': True,
    },
}

# Wrap each solid's inputs in the {'inputs': ...} layer under the top-level 'solids' key
run_config = {
    'solids': {
        solid_name: {'inputs': inputs}
        for solid_name, inputs in solid_inputs.items()
    }
}

In [None]:
#hide
# Disabled helper: renames a 'download_eumetsat_files' entry in run_config to
# 'download_missing_eumetsat_files', the solid name used by this pipeline.
# if 'download_eumetsat_files' in run_config['solids'].keys():
#     run_config['solids']['download_missing_eumetsat_files'] = run_config['solids']['download_eumetsat_files']
#     run_config['solids'].pop('download_eumetsat_files')

# Run the pipeline with the configuration above and display the result object
pipeline_result = execute_pipeline(download_missing_data_pipeline, run_config=run_config)
pipeline_result

2021-03-06 11:01:11 - dagster - DEBUG - download_missing_data_pipeline - 4ebd32c0-8f9e-4827-a673-29bee44ede05 - 54936 - ENGINE_EVENT - Starting initialization of resources [asset_store].
2021-03-06 11:01:11 - dagster - DEBUG - download_missing_data_pipeline - 4ebd32c0-8f9e-4827-a673-29bee44ede05 - 54936 - ENGINE_EVENT - Finished initialization of resources [asset_store].
2021-03-06 11:01:11 - dagster - DEBUG - download_missing_data_pipeline - 4ebd32c0-8f9e-4827-a673-29bee44ede05 - 54936 - PIPELINE_START - Started execution of pipeline "download_missing_data_pipeline".
2021-03-06 11:01:11 - dagster - DEBUG - download_missing_data_pipeline - 4ebd32c0-8f9e-4827-a673-29bee44ede05 - 54936 - ENGINE_EVENT - Executing steps in process (pid: 54936)
2021-03-06 11:01:11 - dagster - DEBUG - download_missing_data_pipeline - 4ebd32c0-8f9e-4827-a673-29bee44ede05 - 54936 - download_missing_eumetsat_files.compute - STEP_START - Started execution of step "download_missing_eumetsat_files.compute".
2021-0

Earliest 2019-01-01T00:00:00, latest 2019-01-01T01:00:00


Found 12 results


2021-03-06 11:01:15 - dagster - INFO - system - 4ebd32c0-8f9e-4827-a673-29bee44ede05 - download_missing_eumetsat_files.compute - Missing data: 0
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 found in ../data/raw, 0 to download.
2021-03-06 11:01:15,518 - INFO - 0 files queried, 0 fo

<dagster.core.execution.results.PipelineExecutionResult at 0x7fa0424948e0>

In [None]:
#hide
# Export the cells flagged for export in this notebook to the library module.
# Import only the name that is used rather than `*`, so the notebook namespace
# is not filled with every public name from nbdev.export.
from nbdev.export import notebook2script

notebook2script('05a_pipeline_batch.ipynb')