# Analyze Project Results


This notebook is intended to analyze and visualize the accuracy of the FMC models.

## Setup

In [None]:
import os.path as osp
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

sys.path.append('../src')
from utils import Dict, read_yml, read_pkl, print_dict_summary
from data_funcs import flag_lag_stretches

## Read Results


In [None]:
# Input files, relative to the notebook's working directory.
raws_path = "../data/raws_rocky_2024.pkl"
clim_path = "../data/rocky_2024_climatology_forecasts.pkl"

# raws24 is used below as a mapping: station id -> {"RAWS": DataFrame with
# `date_time` and `fm` columns}.
raws24 = read_pkl(raws_path)

# clim is used below as a DataFrame with one row per station (the index) and one
# column per forecast time.
clim = read_pkl(clim_path)

## Clean RAWS

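Filter long lag stretches: split each station's record into fixed-length periods and set `fm` to NA for every period that `flag_lag_stretches` flags as containing a linear stretch of data longer than the allowed maximum.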

In [None]:
# Filter settings. Rows are binned into consecutive blocks of `hours` rows
# (assumes one row per hour -- TODO confirm); `max_linear_time` is the threshold
# handed to flag_lag_stretches.
hours = 72
max_linear_time = 10

for st in raws24:
    print("~"*50)
    print(f"Processing station {st}")
    df = raws24[st]["RAWS"]

    # Label every row with the index of the block it falls in. This adds a
    # column to the station's DataFrame in place.
    df['st_period'] = np.arange(len(df)) // hours

    # One flag per block: the result of flag_lag_stretches on that block's fm values.
    period_flags = df.groupby('st_period')['fm'].apply(
        lambda period: flag_lag_stretches(period, max_linear_time, lag=2)
    )
    # Keep only the labels of the blocks whose flag is True.
    flagged = period_flags[period_flags].index

    if flagged.size > 0:
        print(f"Setting period to NA: {flagged} due to linear period of data longer than {max_linear_time}")

    # Blank out fm for every row of a flagged block (no-op when nothing is flagged).
    df.loc[df['st_period'].isin(flagged), "fm"] = np.nan

## Compare

### RMSE by Location

Calculate the RMSE separately for each location, then average the per-location values.

In [None]:
# Climatology RMSE for each station that has both a forecast row in `clim` and
# 2024 observations in `raws24`. One value is appended to `rmses` per station
# that has at least one hour where both observation and forecast are present.
rmses = []

for st in clim.index:
    print("~"*50)
    print(f"RAWS: {st}")

    if st not in raws24:
        print(f"No 2024 data for RAWS {st}, skipping")
        continue

    # Double check the dates match. The lengths are compared first because an
    # element-wise pandas comparison of different-length inputs raises a
    # ValueError, which would hide the assertion message.
    obs_times = raws24[st]["RAWS"].date_time
    assert (
        len(obs_times) == len(clim.columns)
        and (obs_times == clim.columns).all()
    ), "Dates don't match"

    # Put both series on a plain 0..n-1 index so they pair up by position.
    obs = raws24[st]["RAWS"].fm.reset_index(drop=True)
    pred = clim.loc[st].reset_index(drop=True)

    # Ignore hours where either the observation or the forecast is NA.
    valid = obs.notna() & pred.notna()
    obs = obs[valid]
    pred = pred[valid]

    if obs.shape[0] == 0:
        print(f"No observed data for forecasted hours for {st}, skipping")
        continue

    print(f"Comparing {obs.shape[0]} forecast hours")
    rmse = np.sqrt(mean_squared_error(obs, pred))
    rmses.append(rmse)
    print(f"Climatology RMSE: {rmse}")

In [None]:
# Unweighted mean of the per-station RMSEs: every station counts equally,
# regardless of how many forecast hours it contributed.
np.mean(rmses)

### Overall RMSE

Pool the observed/forecast pairs from every location, then calculate a single RMSE over all of them.

In [None]:
# Pool every (observation, forecast) pair across stations so that a single
# RMSE can be computed over all of them afterwards.
all_obs = []
all_pred = []

for st in clim.index:
    print("~"*50)
    print(f"RAWS: {st}")

    if st not in raws24:
        print(f"No 2024 data for RAWS {st}, skipping")
        continue

    # Double check the dates match. The lengths are compared first because an
    # element-wise pandas comparison of different-length inputs raises a
    # ValueError, which would hide the assertion message.
    obs_times = raws24[st]["RAWS"].date_time
    assert (
        len(obs_times) == len(clim.columns)
        and (obs_times == clim.columns).all()
    ), "Dates don't match"

    # Put both series on a plain 0..n-1 index so they pair up by position.
    obs = raws24[st]["RAWS"].fm.reset_index(drop=True)
    pred = clim.loc[st].reset_index(drop=True)

    # Ignore hours where either the observation or the forecast is NA.
    valid = obs.notna() & pred.notna()

    all_obs.extend(obs[valid])
    all_pred.extend(pred[valid])

In [None]:
# RMSE over the pooled pairs from all stations. Arguments follow the
# (y_true, y_pred) order of mean_squared_error; the value is the same either
# way because squared error is symmetric.
np.sqrt(mean_squared_error(np.array(all_obs), np.array(all_pred)))