# Build Climatology Tutorial

As a baseline method, we build a climatology for FMC based on the historical average at a particular hour of the day. This method is inspired by Schreck et al. (2023). This notebook relies on the retrieval and filtering of 10-h dead FMC data from a RAWS stash maintained by Angel Farguell, as demonstrated in the notebook `ingest_fm10_stash_tutorial`.

Main processes:
- `build_climatology`: this function retrieves RAWS data from a stash for a given time period and spatial domain. The data is saved to a local directory for potential reuse, such as repeatedly calculating forecast error for cross validation. Parameters for this process are stored in `etc/params_models.yaml`.
- `get_climatology_forecasts`: this function returns FMC forecasts

## Setup

In [None]:
import os.path as osp
from dateutil.relativedelta import relativedelta
import json
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
sys.path.append('../src')
from utils import Dict, read_yml, read_pkl, str2time, print_dict_summary, time_range
# import ingest.retrieve_raws_api as rr
# import ingest.retrieve_raws_stash as rrs
import models.moisture_models as mm

In [None]:
# Read the training-data configuration and wrap it in the project's `Dict`,
# which the rest of the notebook accesses by attribute (e.g. `config.bbox`).
with open("../etc/training_data_config.json", "r") as json_file:
    config = Dict(json.load(json_file))

In [None]:
# Print a summary of the loaded configuration.
print_dict_summary(config)

## Climatology

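For each forecast hour, the climatology forecast at a station is the average of the FMC observed at that station at the same hour of day in previous years (`nyears` years back), taken over a window of ±`ndays` days around the target date. A station only receives a forecast when at least `min_years` distinct years of data are available; otherwise the forecast is NaN.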

6 years


In [None]:
# Forecast window: begins at the configured start time and spans 12 hours.
start = str2time(config.start_time)
# end = start + relativedelta(hours=48)
end = start + relativedelta(hours = 12)

In [None]:
# Reload the moisture-models module so that edits made to it on disk are
# picked up without restarting the kernel, then rebind the `mm` alias.
import importlib
import models.moisture_models
importlib.reload(models.moisture_models)
import models.moisture_models as mm

In [None]:
# Build the climatology for the forecast window over the configured bounding box.
clim_dict = mm.build_climatology(
    start,
    end,
    config.bbox
)

In [None]:
# Turn the climatology data into FMC forecasts.
clim_df = mm.get_climatology_forecasts(clim_dict)

In [None]:
# Display the forecast table.
clim_df

In [None]:
# Plot the forecast series for the first five rows of `clim_df`.
# Transposing puts one series per column, with the original columns as the x-axis.

df_subset = clim_df.head(5).T
plt.figure(figsize=(10, 6))
for stid, series in df_subset.items():
    plt.plot(series.index, series, label=str(stid))

plt.xlabel('Time')
plt.ylabel('Forecasted FMC (%)')
plt.title('Forecasted FMC over Time')
plt.legend(title='STID')
plt.grid(True)
plt.show()

In [None]:
import os
import pickle
import pandas as pd
from multiprocessing import Pool

from ingest.RAWS import get_file_paths, get_stations
from utils import Dict, read_pkl

In [None]:
# Climatology parameters exposed by the moisture-models module
# (`nyears`, `ndays` and `min_years` are read from it below).
clim_params = mm.clim_params
clim_params

In [None]:
def build_climatology(start, end, bbox, nyears=clim_params.nyears, ndays=clim_params.ndays, min_years = clim_params.min_years):
    """
    Notebook work-in-progress version of the climatology builder.

    At this stage it only prints the requested period and the parameters in
    effect; it returns None and does not use `bbox` yet.

    Parameters:
    - start: Start of the forecast period (echoed in the log line).
    - end: End of the forecast period (echoed in the log line).
    - bbox: Spatial bounding box (currently unused).
    - nyears: Number of years to look back.
    - ndays: Number of days bracketing the target hour.
    - min_years: Required number of years of data.

    Note: the defaults are read from `clim_params` once, when the function is
    defined; later changes to `clim_params` do not affect them.
    """

    def count_years(values_df, times_df):
        """
        For each column of values_df, select the rows of times_df where that
        column of values_df is non-NaN and count the distinct entries in the
        same-named column of times_df. Returns the counts as a pd.Series
        keyed by column name. (Not called yet.)
        """
        per_column = {}
        for col in values_df.columns:
            has_value = values_df[col].notna()
            per_column[col] = times_df[has_value][col].nunique()
        return pd.Series(per_column)

    # Retrieve data
    ## Note, many station IDs will be empty, the list of stids was for the entire bbox region in history
    report_lines = (
        f"Retrieving climatology data from {start} to {end}",
        "Params for Climatology:",
        f"    Number of years to look back: {nyears}",
        f"    Number of days to bracked target hour: {ndays}",
        f"    Required number of years of data: {min_years}",
    )
    for line in report_lines:
        print(line)

In [None]:
# Run the notebook-local `build_climatology`; it prints the period and parameters.
build_climatology(start, end, config.bbox)

In [None]:
# Stations for the configured bounding box, and their IDs as a plain list
# used to filter the RAWS files in later cells.
sts_df = get_stations(config.bbox)
# The IDs must be read from `sts_df`: `sts` does not exist until this line runs.
sts = list(sts_df["stid"])

In [None]:
# Forecast times: the sequence of times from `start` to `end`
ftimes = time_range(start, end)

In [None]:
# Earliest time needed for any forecast time: go back `clim_params.nyears`
# years from the earliest forecast time, then a further `clim_params.ndays`
# days to cover the day window around the target hour.
# The latest time needed is the last forecast time itself.
t0 = ftimes.min() - relativedelta(years=clim_params.nyears) - relativedelta(days = clim_params.ndays)
t1 = ftimes.max()

# Every time between t0 and t1 that may need to be read from the stash.
all_times = time_range(t0, t1)
print(f"Total hours to retrieve for climatology: {len(all_times)}")

In [None]:
# Candidate file paths for every time in the climatology period, keeping only
# the ones that actually exist on disk.
raws_files = [path for path in get_file_paths(all_times) if os.path.exists(path)]
print(f"Existing RAWS Files: {len(raws_files)}")

In [None]:
# Load a single file for manual inspection.
# NOTE(review): the hard-coded index raises IndexError unless more than
# 10000 files were found above.
di = pd.read_pickle(raws_files[10000])

In [None]:
def load_and_filter_pickle(file_path, sts):
    """
    Load one pickled DataFrame and keep only the rows for the requested stations.

    Column names are lower-cased before filtering, so a file whose station
    column is spelled 'STID' is treated the same as one spelled 'stid'.

    Parameters:
    - file_path (str): Path to a pickle file readable by `pd.read_pickle`.
    - sts (list-like): Station IDs to keep.

    Returns:
    - pd.DataFrame or None: Rows whose 'stid' is in `sts`. None when the file
      cannot be read, does not contain a DataFrame, or has no 'stid' column.
    """
    try:
        # NOTE: unpickling can execute arbitrary code; only use on trusted files.
        df = pd.read_pickle(file_path)

        # Check the type before touching `.columns`; otherwise a non-DataFrame
        # payload is misreported below as a read error.
        if not isinstance(df, pd.DataFrame):
            return None

        df.columns = df.columns.str.lower()
        if "stid" in df.columns:
            return df[df["stid"].isin(sts)]  # Filter rows
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a bulk load.
        print(f"Error reading {file_path}: {e}")
    return None

In [None]:
# Try the loader on a single file.
# NOTE(review): the hard-coded index raises IndexError unless more than
# 80000 files were found above.
load_and_filter_pickle(raws_files[80000], sts)

In [None]:
# def parallel_load_pickles(file_list, sts, num_workers=8):
#     """Parallel loading and filtering of pickle files using pd.read_pickle."""
#     with Pool(num_workers) as pool:
#         results = pool.starmap(load_and_filter_pickle, [(f, sts) for f in file_list])
    
#     # Concatenate all filtered DataFrames, ignoring None values
#     return pd.concat([df for df in results if df is not None], ignore_index=True)

from joblib import Parallel, delayed
def parallel_load_pickles(file_list, sts, num_workers=8):
    """
    Load and filter many pickle files in parallel with joblib, then stack them.

    Parameters:
    - file_list (iterable of str): Paths passed one by one to `load_and_filter_pickle`.
    - sts (list-like): Station IDs to keep in each file.
    - num_workers (int): Number of joblib jobs (`n_jobs`).

    Returns:
    - pd.DataFrame: All non-None per-file results concatenated with a fresh
      index. An empty DataFrame when no file produced a result.
    """
    results = Parallel(n_jobs=num_workers)(
        delayed(load_and_filter_pickle)(f, sts) for f in file_list
    )
    frames = [df for df in results if df is not None]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


In [None]:
# Load every existing file, filtered to the station list, into one table.
clim_data = parallel_load_pickles(raws_files, sts)

In [None]:
# Display the combined table.
clim_data

In [None]:
# Rows and columns in the combined table.
clim_data.shape

In [None]:
# Output directory and base file name (no extension) for the saved
# climatology data and its metadata file.
clim_data_dir = "../data/climatology"
filename = "test_climatology_data"

In [None]:
# Create the output directory if it does not exist yet.
os.makedirs(clim_data_dir, exist_ok=True)

In [None]:
# Save the combined table as <clim_data_dir>/<filename>.pkl.
with open(osp.join(clim_data_dir, f"{filename}.pkl"), 'wb') as handle:
    pickle.dump(clim_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Settings used for this run: forecast window, bounding box and climatology
# parameters. Times are stored as "YYYY-MM-DD HH:MM:SS" strings.
clim_metadata = {
    'start': start.strftime("%Y-%m-%d %H:%M:%S"),
    'end': end.strftime("%Y-%m-%d %H:%M:%S"),
    'bbox': config.bbox,
    'nyears': clim_params.nyears,
    'min_years': clim_params.min_years,
    'ndays (+/-)': clim_params.ndays
}
clim_metadata

In [None]:
# Write the metadata to <clim_data_dir>/<filename>_metadata.txt,
# one "key: value" pair per line.
with open(osp.join(clim_data_dir, f"{filename}_metadata.txt"), "w") as f:
    f.writelines(f"{key}: {value}\n" for key, value in clim_metadata.items())

In [None]:
# Rows and columns in the combined table.
clim_data.shape

In [None]:
# Display the forecast times.
ftimes

In [None]:
# Display the combined table.
clim_data

In [None]:
# Climatology times associated with the first forecast time.
ti = mm.time_to_climtimes(ftimes[0], nyears = clim_params.nyears, ndays=clim_params.ndays)

In [None]:
# Check what kind of object `time_to_climtimes` returns.
type(ti)

In [None]:
# Check how many climatology times were generated.
ti.shape

In [None]:
def filter_clim_data(clim_data, clim_times):
    """
    Keep the rows of clim_data whose 'datetime' falls in the same calendar hour
    (same year, month, day and hour) as at least one entry of clim_times.
    Minutes and seconds are ignored on both sides.

    Parameters:
    - clim_data (pd.DataFrame): Table with a datetime-typed 'datetime' column.
    - clim_times (iterable): Datetime-like objects exposing .year, .month,
      .day and .hour.

    Returns:
    - pd.DataFrame: Matching rows with the original columns, in their original
      relative order and with a fresh 0..n-1 index (a side effect of the merge).
    """
    key_cols = ["year", "month", "day", "hour"]

    # One row per distinct target hour, so the merge cannot duplicate data rows.
    wanted_hours = pd.DataFrame(
        [(t.year, t.month, t.day, t.hour) for t in clim_times],
        columns=key_cols,
    ).drop_duplicates()

    # Temporarily tag every observation with its own calendar-hour key.
    stamps = clim_data["datetime"].dt
    keyed = clim_data.assign(
        year=stamps.year,
        month=stamps.month,
        day=stamps.day,
        hour=stamps.hour,
    )

    # Inner join keeps only observations whose key is among the target hours.
    matched = keyed.merge(wanted_hours, on=key_cols, how="inner")
    return matched.drop(columns=key_cols)

In [None]:
# Observations that fall on the climatology times of the first forecast time.
dfi = filter_clim_data(clim_data, ti)
dfi

In [None]:
import pandas as pd
import numpy as np

def average_fm10_by_stid(filtered_df, min_years):
    """
    Computes the average fm10 grouped by 'stid', returning NaN for any station
    whose valid observations span fewer than min_years distinct years.

    Only rows with a non-NaN fm10 count toward a station's years: the mean
    ignores NaN values, so a year made up solely of missing readings adds
    nothing to the average and must not help meet the threshold.

    Parameters:
    - filtered_df (pd.DataFrame): DataFrame containing 'stid', 'datetime', and 'fm10'.
    - min_years (int): Minimum number of distinct years with valid fm10 required per 'stid'.

    Returns:
    - pd.Series: Averaged fm10 per 'stid', indexed by 'stid' and named 'fm10'
      (NaN where the number of valid years < min_years).
    """
    # Mean fm10 per station; NaN readings are skipped by the mean.
    fm10_avg = filtered_df.groupby("stid")["fm10"].mean()

    # Distinct years per station, counted over valid readings only.
    valid = filtered_df[filtered_df["fm10"].notna()]
    year_counts = (
        valid["datetime"].dt.year
        .groupby(valid["stid"])
        .nunique()
        # Stations with no valid reading at all have zero usable years.
        .reindex(fm10_avg.index, fill_value=0)
    )

    # Blank out stations that do not meet the threshold.
    return fm10_avg.where(year_counts >= min_years)


In [None]:
# Per-station average fm10 for the first forecast time.
y = average_fm10_by_stid(dfi, min_years = clim_params.min_years)

In [None]:
def process_ftimes(ftimes, clim_data, clim_params):
    """
    Build the station-by-forecast-time table of climatology forecasts.

    For every forecast time, the matching climatology times are generated with
    `mm.time_to_climtimes`, `clim_data` is restricted to those times, and the
    per-station average fm10 is computed. The per-time results are stacked and
    reshaped into a wide table.

    Parameters:
    - ftimes (iterable): Forecast times to process.
    - clim_data (pd.DataFrame): DataFrame containing 'stid', 'datetime', and 'fm10'.
    - clim_params: Object providing `nyears`, `ndays` and `min_years`.

    Returns:
    - pd.DataFrame: One row per 'stid', one column per forecast time, holding
      the average fm10.
    """

    def forecast_for(ftime):
        # Long-format result for one forecast time: stid, fm10, forecast_time.
        clim_times = mm.time_to_climtimes(
            ftime, nyears=clim_params.nyears, ndays=clim_params.ndays
        )
        matching = filter_clim_data(clim_data, clim_times)
        averages = average_fm10_by_stid(matching, min_years=clim_params.min_years)
        return averages.reset_index().assign(forecast_time=ftime)

    # Stack all forecast times, then spread them out into columns.
    long_df = pd.concat([forecast_for(ftime) for ftime in ftimes], ignore_index=True)
    return long_df.pivot(index="stid", columns="forecast_time", values="fm10")



In [None]:
# Climatology forecasts for every forecast time (stations x forecast times).
x = process_ftimes(ftimes, clim_data, clim_params)

In [None]:
# Stations whose forecast is NaN at every forecast time: record their IDs,
# remove them from the table, and report which ones were dropped.
dropped_stids = x.index[x.isna().all(axis=1)].tolist()
x = x.dropna(how="all")
print(dropped_stids)

In [None]:
# Display the final forecast table.
x