# Analyze Project Results


This notebook is intended to analyze and visualize the accuracy of the FMC models.

## Setup

In [None]:
import itertools
import os
import os.path as osp
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import mean_squared_error

sys.path.append('../src')  # must run before the project-local imports below
from utils import Dict, read_yml, read_pkl, print_dict_summary
from data_funcs import flag_lag_stretches, get_sts_and_times

In [None]:
# Directory holding the ML forecast-analysis outputs (ml_data.pkl plus the per-period pickles)
ml_forecast_dir = "../outputs/forecast_analysis_test"
# Pickle file with the climatology forecast outputs
clim_path = "../data/rocky_2024_climatology_forecasts.pkl"

### Read Results


In [None]:
# Forecast-period pickles are named by their start time (YYYYmmdd_HH.pkl).
# Non-pickle files and the ml_-prefixed files are left out of this list.
forecast_files = [
    fname
    for fname in os.listdir(ml_forecast_dir)
    if fname.endswith('.pkl') and not fname.startswith('ml_')
]
forecast_starts = np.array(
    [datetime.strptime(fname.split(".")[0], "%Y%m%d_%H") for fname in forecast_files]
)

# Apply one chronological ordering to both the file names and the start times
chrono_order = forecast_starts.argsort()
forecast_files = np.array(forecast_files)[chrono_order]
forecast_starts = forecast_starts[chrono_order]

ml_data = read_pkl(osp.join(ml_forecast_dir, "ml_data.pkl"))
clim = read_pkl(clim_path)
ml_results = [read_pkl(osp.join(ml_forecast_dir, fname)) for fname in forecast_files]

## Calculate Accuracy for Climatology

NOTE: As of Feb 25, the test forecast analysis ran in 2023, and climatology only exists for 2024, so the two can't be combined for now.

The climatology method used in this project produces forecasts for all stations. Note that climatology forecasts are generated using observed data from RAWS, whereas the ML models generate forecasts without using any observed data from the test RAWS stations. So the climatology method has an advantage relative to the ML models.

In [None]:
# NOTE(review): every line of this cell is commented out, so running it does nothing.
# Presumably it is disabled until climatology and ML forecasts cover the same year
# (see the NOTE in the markdown above) — confirm before re-enabling.
# # Extract stations used in each forecast period
# # Additionally, perform some checks to make sure data looks right
# sts = []
# for i, fperiod in enumerate(ml_results):
#     stids = fperiod['stids']
#     times = fperiod['times']
#     times.sort()
#     # Check times match, num stations matches
#     assert pd.Timestamp(forecast_starts[i], tz="UTC") == times[0], "Time array from ML output dict doesn't match target file time"
#     for mod in ['RNN']:
#         assert len(fperiod[mod]['loc_rmse']) == len(stids), "Mismatch between number of stations and number of RMSE per station"

#     sts.append(stids)
#     print('~'*75)
#     print(f"Analyzing Forecast Period {i}")
#     print(f"Forecast Start Time: {times.min()}")
#     print(f"Forecast End Time: {times.max()}")    
#     print(f"Test Stations: {stids}")

#     # Extract test station observed FMC data
#     obs = get_sts_and_times(ml_data, stids, times)
#     assert [*obs.keys()] == stids, f"Retrieved observed data from ml_data doesn't match test stids: {[*obs.keys()]}, {stids=}"
#     obs_fm = np.stack([v["data"]["fm"].values[:, np.newaxis] for v in obs.values()]) # Get 3d array, (n_loc, 48, 1)
#     assert obs_fm.shape == (len(stids), 48, 1), f"Observed FMC data unexpected shape. Expected {(len(stids), 48, 1)}, received {obs_fm.shape}"

#     # Extract climatology forecasts for given times and stids
    

## Compare Models

In [None]:
# Step 1: sanity-check every forecast period against the start time parsed from its file name.
for i, fperiod in enumerate(ml_results):
    stids = fperiod['stids']
    times = fperiod['times']
    times.sort()  # result is not reassigned, so this relies on an in-place sort
    expected_start = pd.Timestamp(forecast_starts[i], tz="UTC")
    assert expected_start == times[0], "Time array from ML output dict doesn't match target file time"
    # Only the RNN entry is checked for one per-station RMSE per test station
    for mod in ['RNN']:
        n_loc_rmse = len(fperiod[mod]['loc_rmse'])
        assert n_loc_rmse == len(stids), "Mismatch between number of stations and number of RMSE per station"

# Step 2: gather the overall RMSE of each model, one value per forecast period.
ode_errs = [period['ODE']['rmse'] for period in ml_results]
xgb_errs = [period['XGB']['rmse'] for period in ml_results]
rnn_errs = [period['RNN']['rmse'] for period in ml_results]

In [None]:
# Inspect the ODE entry of the last forecast period. Indexing ml_results directly
# gives the same object as the leftover loop variable `fperiod`, without depending
# on a name that only exists because an earlier loop leaked it.
ml_results[-1]['ODE']

In [None]:
# One row per forecast period (indexed by its start time), one overall-RMSE column per model
df = pd.DataFrame(
    {'ODE': ode_errs, 'XGB': xgb_errs, 'RNN': rnn_errs},
    index=forecast_starts,
)

df

In [None]:
# Per-model mean and standard deviation of RMSE across forecast periods
means = df.mean()
stds = df.std()

# Side-by-side summary table: one row per model
overall_errs_df = pd.concat(
    [means.rename("Mean RMSE"), stds.rename("(Std)")],
    axis=1,
)
overall_errs_df

## T Tests

In [None]:
# Every unordered pair of model columns
col_pairs = [*itertools.combinations(df.columns, 2)]

# Paired t-test (stats.ttest_rel) on each pair of columns, keyed by the (col1, col2) tuple
ttests = {}
for pair in col_pairs:
    first, second = pair
    ttests[pair] = stats.ttest_rel(df[first], df[second])

In [None]:
# Display the t-test results, keyed by (col1, col2) column pair
ttests

In [None]:
# Bonferroni correction: divide each significance threshold by the number of tests run
print(
    f"Number of ttests run: {len(col_pairs)}",
    f"Bonferroni Corrected Thresholds:",
    f"    Threshold 0.05 :  Corrected {0.05/len(col_pairs)}",
    f"    Threshold 0.01 :  Corrected {0.01/len(col_pairs)}",
    sep="\n",
)

## Skill Scores

In [None]:
# Mean RMSE across forecast periods for the RNN model and the ODE baseline,
# looked up by row and column label.
rmse_model = overall_errs_df.loc['RNN', 'Mean RMSE']
rmse_baseline1 = overall_errs_df.loc['ODE', 'Mean RMSE']

# Skill score = 1 - (model error / baseline error).
# The MSE variant squares the mean RMSE values computed above.
print(
    f"RMSE Skill Score (ODE Baseline): ",
    f"    {1-rmse_model/rmse_baseline1}",
    "",
    f"MSE Skill Score (ODE Baseline): ",
    f"    {1-rmse_model**2/rmse_baseline1**2}",
    sep="\n",
)