In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from tabulate import tabulate

import os, shutil, time, pdb, random
import scipy.stats as stats 
import scipy

from math import pi
from datetime import datetime
from collections import OrderedDict
import pickle

import torch
from torch.utils.data import TensorDataset, DataLoader

from importlib import reload
from models import *
from utils import *
from runmanager import *
from experiment import *
from plot_utils import *
from preprocessing_utils import *
from analysis_seasonal import * 

from sklearn.metrics import mean_squared_error as mse

import matplotlib
matplotlib.rc_file_defaults()
%matplotlib inline

import CONFIG

pd.options.display.max_columns = None

# Seed every random number generator imported above, not only NumPy: the models
# are trained with torch, so seeding NumPy alone leaves weight initialisation and
# any torch-side sampling different on every Restart & Run All.
SEED = 4
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

%load_ext autoreload
%autoreload 2

device = CONFIG.device

### Import data

In [5]:
# Build the preprocessing object from the training file and date range held in CONFIG.
# NOTE(review): flag meanings below are read off their names only — confirm against
# DataPreprocessing: no basin filter, incomplete years kept, non-bias-corrected
# stations included, split performed per station.
data = DataPreprocessing(
    train_path=CONFIG.TRAIN_PATH,
    start=CONFIG.start,
    end=CONFIG.end,
    add_yesterday=False,
    basin_filter=None,
    split_bias_corrected_only=False,
    filter_incomplete_years=False,
    include_non_bc_stations=True,
    split_by='station',
)

In [6]:
# Split the loaded data by station (the object was built with split_by='station').
# NOTE(review): presumably this creates the held-out sets used for K-fold
# cross-validation — confirm in DataPreprocessing.split_stations.
data.split_stations()

### Split data into held-out sets for K-fold cross-validation

### Prepare data

In [8]:
# Optional extra predictor, currently disabled.
# NOTE(review): if re-enabled, `.append` mutates CONFIG.predictors in place, so
# re-running this cell would add 'obs_yesterday' again on each execution.
# CONFIG.predictors.append('obs_yesterday')

# Hand the predictor and predictand names from CONFIG to the data object; quantile sorting is off.
data.input_data(CONFIG.predictors, CONFIG.predictand, sort_by_quantile=False)

## Multi-Run: Train model with different hyperparameters

### Model run

In [9]:
# Train one model per hyperparameter setting in CONFIG.params for CONFIG.epochs epochs.
# `predictions` is consumed below as a two-level mapping (outer key -> inner key -> table);
# `st_test` is not used again in the visible cells.
# NOTE(review): the logged output shows runs 1-40 across k=0-9, i.e. presumably
# 4 configurations x 10 folds — confirm in multirun.
st_test, predictions = multirun(data, CONFIG.predictors, CONFIG.params, CONFIG.epochs, split_by='station',
                                sequential_samples=False)

Unnamed: 0,run,epoch,loss,valid_loss,test_loss,decision_loss,epoch duration,run duration,lr,batch_size,likelihood_fn,hidden_channels,dropout_rate,linear_model,k
0,1,1,1.440937,1.627235,1.615651,1.627235,1.839906,1.846211,0.005,128,bgmm,[50],0,False,0
1,1,2,1.309601,1.610923,1.626454,1.610923,1.580163,3.491868,0.005,128,bgmm,[50],0,False,0
2,1,3,1.286462,1.590729,1.617588,1.590729,1.725834,5.265925,0.005,128,bgmm,[50],0,False,0
3,1,4,1.282247,1.583095,1.643042,1.583095,1.891429,7.203906,0.005,128,bgmm,[50],0,False,0
4,1,5,1.273739,1.614470,1.602651,1.614470,1.768460,9.016692,0.005,128,bgmm,[50],0,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,40,1,1.161262,1.180428,1.173741,1.180428,1.781906,1.786032,0.005,128,bernoulli_loggaussian,[50],0,True,9
196,40,2,1.082393,1.164141,1.159340,1.164141,1.751311,3.573295,0.005,128,bernoulli_loggaussian,[50],0,True,9
197,40,3,1.065277,1.159186,1.136308,1.159186,1.738073,5.345760,0.005,128,bernoulli_loggaussian,[50],0,True,9
198,40,4,1.057828,1.165236,1.133787,1.165236,1.733022,7.114684,0.005,128,bernoulli_loggaussian,[50],0,True,9


In [21]:
# Names of the per-sample prediction columns: sample_0 ... sample_{n_samples - 1}.
sample_cols = [
    f'sample_{sample_idx}'
    for sample_idx in range(CONFIG.n_samples)
]

# No additional columns are passed to the table helpers.
add_cols = list()

# Reference column plus the three baseline columns reported in the tables.
# NOTE(review): 'Prec' scores 0.00 against itself in the smape table, so it is
# presumably the observed precipitation — confirm.
columns = [
    'Prec',
    'wrf_prcp',
    'wrf_bc_prcp',
    'precip_norris',
]

In [22]:
# Overwrite both WRF baseline columns with the 'precip_norris' column in every
# prediction table. This mutates `predictions` in place and is why the three
# baseline rows in the tables below are identical.
for run_tables in predictions.values():
    for table in run_tables.values():
        norris = table['precip_norris']
        table['wrf_prcp'] = norris
        table['wrf_bc_prcp'] = norris

In [23]:
# Print the per-season mean/median table for the baseline `columns` and the model
# sample columns. NOTE(review): "KS test" is taken from the helper's name; the exact
# statistic is defined in the helper, not here.
table_of_predictions_ks_test(predictions, CONFIG.seasons, columns, sample_cols, add_cols)

bgmm_[50]_NL_B=128_D=0
bgmm_[50]_L_B=128_D=0
bernoulli_loggaussian_[50]_NL_B=128_D=0
bernoulli_loggaussian_[50]_L_B=128_D=0
Model                                    JFM mean    JFM median    AM mean    AM median    JJAS mean    JJAS median    OND mean    OND median
---------------------------------------  ----------  ------------  ---------  -----------  -----------  -------------  ----------  ------------
Bann                                     0.1649      0.1291        0.2319     0.1673       0.3091       0.1297         0.1811      0.1259
BannCorr                                 0.1649      0.1291        0.2319     0.1673       0.3091       0.1297         0.1811      0.1259
Norr                                     0.1649      0.1291        0.2319     0.1673       0.3091       0.1297         0.1811      0.1259
bgmm_[50]_NL_B=128_D=0                   0.2895      0.1194        0.2822     0.1559       0.1443       0.1113         0.4073      0.1141
bgmm_[50]_L_B=128_D=0                 

In [24]:
# Per-season mean/median table for metric='smape'
# (presumably symmetric mean absolute percentage error — confirm in the helper).
table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'smape', prefix='smape')

Model                                    JFM mean    JFM median    AM mean    AM median    JJAS mean    JJAS median    OND mean    OND median
---------------------------------------  ----------  ------------  ---------  -----------  -----------  -------------  ----------  ------------
Prec                                     0.00        0.00          0.00       0.00         0.00         0.00           0.00        0.00
wrf_prcp                                 0.69        0.72          0.63       0.62         0.62         0.64           0.67        0.78
wrf_bc_prcp                              0.69        0.72          0.63       0.62         0.62         0.64           0.67        0.78
precip_norris                            0.69        0.72          0.63       0.62         0.62         0.64           0.67        0.78
bgmm_[50]_NL_B=128_D=0                   0.50        0.41          0.34       0.30         0.22         0.12           0.53        0.52
bgmm_[50]_L_B=128_D=0             

In [None]:
# Same seasonal table for metric='edd'; this cell has no saved output in this copy.
# NOTE(review): the meaning of 'edd' is not visible here — confirm in the helper.
table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'edd', prefix='edd')

In [None]:
# Same seasonal table for metric='ae' (presumably absolute error — confirm in the helper);
# this cell has no saved output in this copy.
table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'ae', prefix='ae')

In [None]:
# Same seasonal table for metric='se' (presumably squared error — confirm in the helper);
# this cell has no saved output in this copy.
table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'se', prefix='se')

In [None]:
# Training log read back from disk.
# NOTE(review): presumably the per-epoch table written by the multi-run cell — confirm.
a = pd.read_csv('results.csv')

# Hyperparameter columns are carried through each aggregation with 'first'.
hyperparams = [
    'hidden_channels',
    'likelihood_fn',
    'lr',
    'batch_size',
    'dropout_rate',
    'linear_model',
]
carry_first = {name: 'first' for name in hyperparams}

# Lowest validation loss reached over the epochs of each (k, run) pair.
b = a.groupby(['k', 'run']).agg({'valid_loss': 'min', **carry_first})

# Mean of those minima per run, best (lowest) first.
c = (
    b.groupby(['run'])
    .agg({'valid_loss': 'mean', **carry_first})
    .sort_values('valid_loss')
    .reset_index()
)

# Mean validation loss per hyperparameter combination, best first; displayed as the cell output.
(
    c.groupby(hyperparams)
    .agg({'valid_loss': 'mean'})
    .sort_values('valid_loss')
    .reset_index()
)

In [None]:
# Display the raw results frame read from 'results.csv' in the previous cell.
a