# Example workflow
Example "recipe" for downloading data from various sources and executing a range of ML methods with it. 

```yaml
time:
  default: 2000, 2001
  ancient: 1980, 1990
  modern: 2016, 2021

geometry: 
  sites:
    site_a: 40, -10
    site_b: 30, 0
    site_c: 20, 10
    site_d: 10, 20

  areas:
    europe: -10, 40, 40, 70
    usa: ...
    name_x: ...

datasets:
  ppo:
    engine: springtime  # or rppo, ...
    period: default  # or e.g. [ancient, modern]
    area: europe  # or e.g. "from_sites, radius=10, agg=mean"
    genus: Syringa
    source: PEP725
    termID: "obo:PPO_0002313"
    filter:
      minimum_availability: 80  # percent?
  daymet:
    engine: daymetr  # or REST api, ... (see https://daymet.ornl.gov/getdata)
    variables: [tmin, tmax, sunshine_duration]
  modis:
    sites: all  # or [site_a, site_b], ... OR: from_dataset: ppo (will extract unique locations from result of ppo and download those)
    variables: NDVI and EVI  # vegetation indices
    engine: modistools
    ...
  pyphenology:
    period: ...  # if not given, use the first option from the time section above
    species: vaccinium  # or aspen
    phenophase: budburst  # or flowers

cross_validation:
  train_test_strategy: ShuffleSplit
  metric_name: RMSE

models:
  target: day_of_first_bloom
  sklearn: 
    model: sklearn.linear_model
    options: ...
  pyphenology:
    model: pyPhenology.primary_model
    options: ...
  merf: ...  # https://github.com/manifoldai/merf
  interpretml: ...  # https://github.com/interpretml/interpret/
  statsmodels: ... # https://www.statsmodels.org/stable/mixed_linear.html 
  

```

# Dataset interface

In [None]:
from datetime import date
from pathlib import Path
from typing import List, Literal, Protocol, Tuple, Union

from shapely import Point, Polygon


class Dataset(Protocol):
    """Interface for working with phenology datasets.

    Attributes:
        name: Identifier of the dataset; one of the names used in the
            ``datasets`` section of the recipe.
        period: Start and end of the requested time span.
        geometry: Location (point) or region (polygon) the data covers.
    """

    name: Literal["ppo", "daymet", "modis", "pyphenology"]
    # datetime exposes the calendar-date type as lowercase ``date``.
    period: Tuple[date, date]
    geometry: Union[Point, Polygon]  # might need to convert bbox to polygon in parsing

    def get_files(self) -> List[Path]:
        """Show filename(s) that this dataset would have on disk.

        Should use a generic data reference syntax combined with a local
        filesystem configuration.
        """

    def exists_locally(self) -> bool:
        """Tell if the data is already present on disk."""

    def download(self) -> None:
        """Download the data."""

    def load(self):
        """Load the dataset from disk into memory.

        This may include pre-processing operations as specified by the
        context, e.g. filter certain variables, remove data points with too
        many NaNs, reshape data.
        """

In [None]:
class Model(Protocol):
    """Interface for working with various ML models.

    Declares the calls the workflow makes on a model: ``fit`` and ``score``
    are used by ``Workflow.run_experiments``; ``predict`` is available for
    applying a fitted model to new inputs.
    """

    def fit(self, data):
        """Fit model to data."""

    def predict(self, new_x):
        """Make a prediction for new data."""

    def score(self):
        """Return the evaluation score of the fitted model.

        The zero-argument signature mirrors the ``model.score()`` call made
        in ``Workflow.run_experiments``.
        """
        

class CV(Protocol):
    """Structural interface a cross-validation strategy has to provide."""

    def split(self, data):
        """Partition ``data`` into training and test sets."""
        ...

    def search(self):
        """Run a parameter search (e.g. a grid search)."""
        ...

# Workflow execution

In [None]:
import pandas as pd
import yaml
# Default location of the user-level settings file ("springtime", not "sprintime").
DEFAULT_CONFIG = "~/.config/springtime.yaml"

class Config:
    """Settings for springtime."""

    # Read by Workflow.download_data: when true, datasets are downloaded
    # again even if they already exist locally.
    force_override: bool = False


class Session:
    """Execution context for one run of a workflow."""

    # Directory into which the workflow writes its output files.
    output_dir: Path


class Workflow:
    """Run a recipe end to end: fetch data, merge it, fit and score models.

    Outputs are written to the session's output directory under distinct
    names: a copy of the recipe, ``data.csv`` (merged input data) and
    ``scores.csv`` (one row per model).
    """

    config: Config
    recipe: Path
    datasets: List[Dataset]
    cross_validation: CV
    models: List[Model]

    @classmethod
    def from_recipe(cls, recipe: Path):
        """Build a workflow from a YAML recipe file.

        Args:
            recipe: Location of the YAML recipe.

        Returns:
            The workflow built from the recipe's top-level keys, with
            ``recipe`` set to the path that was read.
        """
        # safe_load: a recipe is plain data, and PyYAML >= 6 rejects a bare
        # yaml.load() call that does not name a Loader.
        with open(recipe, "r", encoding="utf-8") as raw_recipe:
            options = yaml.safe_load(raw_recipe) or {}  # empty file -> None

        # NOTE(review): no __init__ is defined on this class; constructing
        # from the recipe keys needs one that accepts them -- TODO confirm.
        workflow = cls(**options)
        # Remember where the recipe came from so create_session can archive it.
        workflow.recipe = Path(recipe)
        return workflow

    def execute(self):
        """(Down)load data, pre-process, run models, evaluate."""
        self.create_session()
        self.autocomplete()
        self.download_data()
        self.load_data()
        self.run_experiments()

    def create_session(self):
        """Create the session and archive the recipe in its output directory."""
        import shutil

        # NOTE(review): ``...`` is a placeholder argument; Session declares
        # no constructor parameters yet -- TODO pass real settings.
        self.session = Session(...)

        # Keep the recipe under its own file name; "data.csv" is reserved for
        # the merged dataset written by load_data.
        recipe = Path(self.recipe)
        shutil.copy(recipe, self.session.output_dir / recipe.name)

    def autocomplete(self):
        """Substitute time and area in datasets and model function mappings."""

    def download_data(self):
        """Download every dataset that is missing locally.

        When the configuration sets ``force_override`` all datasets are
        downloaded again; a configuration without that setting counts as
        ``False``.
        """
        force = getattr(self.config, "force_override", False)
        for dataset in self.datasets:
            if force or not dataset.exists_locally():
                dataset.download()

    def load_data(self):
        """Load all datasets, join them column-wise and store the result.

        The merged frame is kept as ``self.df`` and written to ``data.csv``
        in the session's output directory.
        """
        data = []
        for dataset in self.datasets:
            # Guard for callers that skip download_data.
            if not dataset.exists_locally():
                dataset.download()

            data.append(dataset.load())

        df = pd.concat(data, axis=1)
        df.to_csv(self.session.output_dir / "data.csv")
        self.df = df

    def run_experiments(self):
        """Fit and score every model, then write the scores to ``scores.csv``.

        The resulting table (columns ``model`` and ``score``) is also kept as
        ``self.scores``.
        """
        rows = []
        for model in self.models:
            # NOTE(review): the CV protocol declares split/search but no
            # context-manager methods, and the model never receives the
            # strategy -- TODO confirm the intended interaction.
            with self.cross_validation:
                model.fit(self.df)
                rows.append(
                    {"model": type(model).__name__, "score": model.score()}
                )

        # A list of rows (rather than a dict keyed by model) keeps two models
        # of the same class apart and does not require models to be hashable.
        scores = pd.DataFrame(rows, columns=["model", "score"])
        scores.to_csv(self.session.output_dir / "scores.csv", index=False)
        self.scores = scores


if __name__ == "__main__":

    import sys

    # sys.argv[0] is the script itself; the recipe is the first real argument.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} RECIPE.yaml")

    recipe_file = sys.argv[1]
    Workflow.from_recipe(recipe_file).execute()