# Homework Assignment 2

In [68]:
from datetime import date, datetime, timedelta
from pathlib import Path
import random
import os
import cdsapi

## Task 1

### Date Logic

In [72]:
# Lazily walks the calendar from start to end, one day per step (both ends included)
def daterange(start, end):
    """Yield every day from ``start`` through ``end``, inclusive.

    Args:
        start: First day to yield.
        end: Last day to yield.

    Yields:
        ``start``, then each following day, up to and including ``end``.
        Nothing is yielded when ``start`` lies after ``end``.
    """
    one_day = timedelta(days=1)
    day = start
    while True:
        if day > end:
            return
        yield day
        day = day + one_day

### Status Check

In [100]:
# Root folder that holds one sub-folder per day (named with the day's ISO date).
# It is created on first run: `exist_ok=True` makes the cell safe to re-run, and
# `parents=True` keeps it working if the path is ever changed to a nested location.
BASE_DIR = Path("processed_data")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Checks whether a date has already been processed
def is_day_processed(day: date, base_dir=None) -> bool:
    """Return True if ``day`` has been completely processed.

    A day counts as processed only when its folder contains a completion
    marker file. The folder alone is not enough: it may have been created
    by a run that failed before finishing, and such a day must stay eligible
    for another attempt.

    Args:
        day: The calendar day to check.
        base_dir: Directory containing the per-day folders. Defaults to
            ``BASE_DIR`` when omitted.

    Returns:
        True if ``<base_dir>/<day in ISO format>/`` holds a marker file,
        otherwise False.
    """
    root = BASE_DIR if base_dir is None else Path(base_dir)
    day_dir = root / day.isoformat()
    # Both capitalisations are accepted because marker files written with
    # either spelling may exist on disk (matters on case-sensitive file systems).
    return any((day_dir / marker).is_file() for marker in ("Done.txt", "done.txt"))

### Mock-Processing

In [40]:
# Mock-processing that simulates download, processing and archiving of daily files
def mock_process_day(day, fail_probability=0.2, base_dir=None):
    """Simulate processing one day and flag it as done.

    Args:
        day: The calendar day to process.
        fail_probability: Chance (0..1) that the simulated download fails.
        base_dir: Directory in which the day folder is created. Defaults to
            ``BASE_DIR`` when omitted.

    Raises:
        RuntimeError: When the simulated download fails. In that case no
            folder and no marker file are created.
    """
    print(f"→ Processing {day}")

    # Simulating a random error during download
    if random.random() < fail_probability:
        raise RuntimeError("Mock download failed")

    # Save the daily mock file
    root = BASE_DIR if base_dir is None else Path(base_dir)
    day_dir = root / day.isoformat()
    day_dir.mkdir(parents=True, exist_ok=True)

    # "Done.txt" (capital D) is the marker name that download_era5_humidity
    # writes and find_oldest_processed_day looks for; the lowercase spelling
    # used before is a different file on case-sensitive file systems.
    marker = day_dir / "Done.txt"
    marker.write_text(f"Processed on {datetime.now()}\n", encoding="utf-8")

    print(f"✓ {day} successfully processed")

### Find oldest missing day

In [17]:
# Earliest day of the given range that still needs processing
def find_oldest_missing_day(start, end) -> date | None:
    """Return the first unprocessed day between ``start`` and ``end``.

    The range is scanned in chronological order (both ends included) and the
    scan stops at the first day for which ``is_day_processed`` is false.

    Returns:
        That day, or ``None`` when every day in the range is processed.
    """
    unprocessed = (day for day in daterange(start, end) if not is_day_processed(day))
    return next(unprocessed, None)

### Find oldest processed day

In [20]:
# Find the oldest fully processed day in the directory
def find_oldest_processed_day(processed_dir, date_format):
    """Return the earliest day whose folder carries a completion marker.

    Args:
        processed_dir: Directory that contains one sub-folder per day.
        date_format: ``strptime`` format of the folder names, e.g. "%Y-%m-%d".

    Returns:
        The oldest fully processed day as a ``date``, or ``None`` when the
        directory does not exist or holds no fully processed day.
    """
    root = Path(processed_dir)
    if not root.is_dir():
        # Nothing has been written yet, so there is no processed day
        return None

    # Either capitalisation of the marker counts; on case-sensitive file
    # systems "done.txt" and "Done.txt" are two different names.
    marker_names = ("Done.txt", "done.txt")

    finished_days = []
    for entry in root.iterdir():
        if not entry.is_dir():
            continue  # Skip files, only day folders are of interest

        try:
            folder_date = datetime.strptime(entry.name, date_format).date()
        except ValueError:
            continue  # Folder name is not a date in the given format

        if any((entry / name).is_file() for name in marker_names):
            finished_days.append(folder_date)

    return min(finished_days, default=None)

### Central Workflow

In [23]:
# Central control flow that handles the daily data
def run_daily_workflow(
    start_date=None,
    end_date=None,
    target_date=None,
):
    """Process a single day or every missing day of a date range.

    All arguments are optional so the workflow also runs when called
    without arguments.

    Args:
        start_date: First day of the range. Defaults to the oldest fully
            processed day found in ``BASE_DIR``; if there is none yet, the
            range starts at ``end_date``.
        end_date: Last day of the range. Defaults to today.
        target_date: If given, only this day is processed and
            ``start_date`` / ``end_date`` are ignored.

    Each missing day is attempted at most once per call. A failure is
    reported and the workflow moves on to the following day, so a day that
    fails persistently cannot block the run.
    """
    # Single-day mode: the range is irrelevant, so handle it before any lookup
    if target_date is not None:
        print(f"Start workflow for individual target date: {target_date}")
        try:
            mock_process_day(target_date)
        except Exception as e:
            print(f"✗ Error on {target_date}: {e}")
        return

    # If the function is called without end_date, find a suitable end date
    if end_date is None:
        end_date = date.today()

    # If the function is called without start_date, find a suitable start date
    if start_date is None:
        start_date = find_oldest_processed_day(BASE_DIR, "%Y-%m-%d")
    if start_date is None:
        # Nothing has been processed yet: there is no reference day, so the
        # range collapses to the end date instead of failing on None.
        start_date = end_date

    print("Start workflow for all missing days")

    failed_days = []
    search_from = start_date
    while True:
        missing_day = find_oldest_missing_day(search_from, end_date)
        if missing_day is None:
            break

        try:
            mock_process_day(missing_day)
        except Exception as e:
            print(f"✗ Error on {missing_day}: {e}")
            print("→ Error logged, next day will be processed")
            failed_days.append(missing_day)

        # Continue behind the day just attempted; without this a failed day
        # would be found again immediately and retried forever.
        search_from = missing_day + timedelta(days=1)

    if failed_days:
        failed_list = ", ".join(d.isoformat() for d in failed_days)
        print(f"✗ {len(failed_days)} day(s) could not be processed: {failed_list}")
    else:
        print("✓ All days are processed")

## Task 2

### ERA5 configuration dictionary

In [85]:
# Central settings for the ERA5 download.
# Adapting the download routine to other parameter values (as required by the
# assignment) should only need changes here, not in the core workflow.

ERA5_CONFIG = {
    # Dataset identifier
    "dataset": "reanalysis-era5-pressure-levels",
    # Output file format
    "format": "netcdf",
    # Requested variable(s)
    "variable": [
        "specific_humidity",
    ],
    # Requested pressure levels
    "pressure_levels": [
        975,
        900,
        800,
        500,
        300,
    ],
    # Requested times of day
    "times": [
        "00:00",
        "06:00",
        "12:00",
        "18:00",
    ],
    # Name of the file stored for a day
    "filename": "era5_humidity.nc",
}

### Download ERA5 data

In [93]:
# Downloads ERA5 humidity data for a single given day
def download_era5_humidity(day):
    """Download the ERA5 data of one day and flag the day as processed.

    Every request setting is read from ``ERA5_CONFIG``, so changing the
    dataset, variables, levels, times, format or file name needs no change
    in this function.

    Args:
        day: The calendar day to download.

    Side effects:
        Creates ``BASE_DIR/<day in ISO format>/``, stores the download there
        under ``ERA5_CONFIG["filename"]`` and, only after ``retrieve``
        returned without raising, writes the marker file ``Done.txt``.

    Any exception raised by the CDS client propagates to the caller; in that
    case no marker file is written.
    """
    # Save the daily data
    day_dir = BASE_DIR / day.isoformat()
    day_dir.mkdir(parents=True, exist_ok=True)

    target_file = day_dir / ERA5_CONFIG["filename"]

    # CDS API request
    c = cdsapi.Client()
    request = {
        # Not part of the original config; an optional override is honoured
        "product_type": ERA5_CONFIG.get("product_type", "reanalysis"),
        "variable": ERA5_CONFIG["variable"],
        # Levels are kept as numbers in the config and sent as strings
        "pressure_level": [str(p) for p in ERA5_CONFIG["pressure_levels"]],
        "year": day.strftime("%Y"),
        "month": day.strftime("%m"),
        "day": day.strftime("%d"),
        "time": ERA5_CONFIG["times"],
        "format": ERA5_CONFIG["format"],
    }

    print(f"↓ Download ERA5 humidity data for {day}")
    c.retrieve(
        ERA5_CONFIG["dataset"],
        request,
        str(target_file),
    )

    # Flag as processed
    (day_dir / "Done.txt").write_text("OK\n")

### Central workflow with ERA5 download

In [96]:
# Central control flow that handles the daily ERA5 data
def run_daily_era5_workflow(
    start_date=None,
    end_date=None,
    target_date=None,
):
    """Download ERA5 data for a single day or every missing day of a range.

    All arguments are optional so the workflow also runs when called
    without arguments.

    Args:
        start_date: First day of the range. Defaults to the oldest fully
            processed day found in ``BASE_DIR``; if there is none yet, the
            range starts at ``end_date``.
        end_date: Last day of the range. Defaults to today.
        target_date: If given, only this day is downloaded and
            ``start_date`` / ``end_date`` are ignored.

    Each missing day is attempted at most once per call. A failure is
    reported and the workflow moves on to the following day, so a day that
    fails persistently cannot block the run.
    """
    # Single-day mode: the range is irrelevant, so handle it before any lookup
    if target_date is not None:
        print(f"Start workflow for individual day: {target_date}")
        try:
            download_era5_humidity(target_date)
        except Exception as e:
            print(f"✗ Error on {target_date}: {e}")
        return

    # If the function is called without end_date, find a suitable end date
    if end_date is None:
        end_date = date.today()

    # If the function is called without start_date, find a suitable start date
    if start_date is None:
        start_date = find_oldest_processed_day(BASE_DIR, "%Y-%m-%d")
    if start_date is None:
        # Nothing has been processed yet: there is no reference day, so the
        # range collapses to the end date instead of failing on None.
        start_date = end_date

    print("Start workflow for all missing days")

    failed_days = []
    search_from = start_date
    while True:
        missing_day = find_oldest_missing_day(search_from, end_date)
        if missing_day is None:
            break

        try:
            download_era5_humidity(missing_day)
        except Exception as e:
            print(f"✗ Error on {missing_day}: {e}")
            print("→ Error logged, next day will be processed")
            failed_days.append(missing_day)

        # Continue behind the day just attempted; without this a failed day
        # would be found again immediately and retried forever.
        search_from = missing_day + timedelta(days=1)

    if failed_days:
        failed_list = ", ".join(d.isoformat() for d in failed_days)
        print(f"✗ {len(failed_days)} day(s) could not be processed: {failed_list}")
    else:
        print("✓ All days are processed")

## Testing

In [102]:
# Test the ERA5 download routine with explicit arguments.
# NOTE: because target_date is given, the workflow handles only that single
# day and returns; start_date and end_date have no effect in this call.

run_daily_era5_workflow(
    start_date=date(2024, 12, 1),
    end_date=date(2024, 12, 5),
    target_date=date(2024, 12, 2),
)

Start workflow for individual day: 2024-12-02


2026-01-18 14:51:44,801 INFO Request ID is 08fd94f5-be4a-4f0c-845c-0ed7967d0e16


↓ Download ERA5 humidity data for 2024-12-02


2026-01-18 14:51:44,857 INFO status has been updated to accepted
2026-01-18 14:52:05,969 INFO status has been updated to running
2026-01-18 14:52:17,400 INFO status has been updated to successful


791846caa92b6f74f6d77e87ce88151f.nc:   0%|          | 0.00/33.3M [00:00<?, ?B/s]