In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from tabulate import tabulate

import os, shutil, time, pdb, random
import scipy.stats as stats 
import scipy

from math import pi
from datetime import datetime
from collections import OrderedDict
import pickle

import torch
from torch.utils.data import TensorDataset, DataLoader

from importlib import reload
from models import *
from utils import *
from runmanager import *
from experiment import *
from plot_utils import *
from preprocessing_utils import *
from analysis_seasonal import * 

from sklearn.metrics import mean_squared_error as mse

import matplotlib
matplotlib.rc_file_defaults()
%matplotlib inline

import CONFIG

pd.options.display.max_columns = None

np.random.seed(4)

%load_ext autoreload
%autoreload 2

device = CONFIG.device



### Import data

In [2]:
data = DataPreprocessing(train_path=CONFIG.TRAIN_PATH, start=CONFIG.start, end=CONFIG.end, 
                add_yesterday = False, 
                basin_filter = None, 
                split_bias_corrected_only = False, 
                filter_incomplete_years = False, 
                include_non_bc_stations = True, 
                split_by = 'station')

### Split data into held out sets for K-fold cross validation

In [3]:
data.split_stations()

### Prepare data

In [4]:
# CONFIG.predictors.append('obs_yesterday')
data.input_data(CONFIG.predictors, CONFIG.predictand, sort_by_quantile=False)

## Multi-Run: Train model with different hyperparameters

### Model run

In [None]:
st_test, predictions = multirun(data, CONFIG.predictors, CONFIG.params, CONFIG.epochs, 
                                split_by = 'station', sequential_samples = False, load_run = None)

Unnamed: 0,run,epoch,loss,valid_loss,test_loss,decision_loss,epoch duration,run duration,lr,batch_size,likelihood_fn,dropout_rate,k,model_arch
0,1,1,1.265911,1.172906,1.242172,1.172906,5.860004,5.867386,0.005,128,bgmm,0,0,"(VGLM, [])"
1,1,2,1.206676,1.173375,1.242178,1.173375,5.791552,11.717118,0.005,128,bgmm,0,0,"(VGLM, [])"
2,1,3,1.206488,1.173694,1.240549,1.173694,5.769708,17.528613,0.005,128,bgmm,0,0,"(VGLM, [])"
3,1,4,1.206122,1.171363,1.246699,1.171363,6.008654,23.582618,0.005,128,bgmm,0,0,"(VGLM, [])"
4,1,5,1.205902,1.169312,1.238578,1.169312,5.705728,29.326298,0.005,128,bgmm,0,0,"(VGLM, [])"
5,1,6,1.205763,1.16897,1.240158,1.16897,6.367269,35.738897,0.005,128,bgmm,0,0,"(VGLM, [])"
6,1,7,1.205577,1.172987,1.240681,1.172987,5.758368,41.546102,0.005,128,bgmm,0,0,"(VGLM, [])"
7,1,8,1.205447,1.172107,1.243065,1.172107,6.412961,48.029615,0.005,128,bgmm,0,0,"(VGLM, [])"
8,1,9,1.205291,1.175139,1.239771,1.175139,5.891509,53.968575,0.005,128,bgmm,0,0,"(VGLM, [])"
9,1,10,1.205385,1.175429,1.240272,1.175429,5.724869,59.760617,0.005,128,bgmm,0,0,"(VGLM, [])"


In [7]:
# # Create predictions for k_all
# for run in predictions.keys():
#     for i in range(len(CONFIG.params['k'])):
#         predictions[run][f'k{i}']['k_fold'] = i
#         if i == 0:
#             predictions[run]['k_all'] = predictions[run][f'k{i}']
#         else:
#             predictions[run]['k_all'] = predictions[run]['k_all'].append(predictions[run][f'k{i}'])

In [8]:
sample_cols = [f'sample_{i}' for i in range(CONFIG.n_samples)]
add_cols = []
columns = ['Prec','wrf_prcp','wrf_bc_prcp','precip_norris']

In [9]:
# with open('_experiments/None/predictions.pkl', 'rb') as handle:
#     b = pickle.load(handle)

# with open('_experiments/magali/predictions.pkl', 'rb') as handle:
#     a = pickle.load(handle)
    
# a.update(b)
# a.update(predictions)

# predictions = a.copy()

In [10]:
for p in predictions.keys(): 
    for k,v in predictions[p].items():
        v['wrf_prcp'] = v['precip_norris'] 
        v['wrf_bc_prcp'] = v['precip_norris'] 

In [11]:
for k,v in predictions.items():
    print(f"{k : <40} {v['k_all'].BS.mean():.3f}")

VGLM_[]_bgmm_B=128_D=0                   0.574
MLP_[10]_bgmm_B=128_D=0                  0.589
SimpleRNN_[10]_bgmm_B=128_D=0            0.427


In [12]:
from utils import QS

In [13]:
def compute_QS(predictions, quantile = 0.5): 
    for k,v in predictions.items(): 
        p = v['k_all']
        p[f'QS_quantile_{quantile}'] = quantile
        likelihood = k.split('_')[2]
        p[f'QS_sample_{quantile}'] = p.apply(sample_apply, axis=1, args=(likelihood, 10000, f'QS_quantile_{quantile}'))
        p[f'QS_{quantile}'] = p.apply(QS, axis=1, args=('QS_sample', 'Prec', quantile))
        print(f'Completed {k} {quantile}')

In [14]:
compute_QS(predictions, quantile = 0.10)
compute_QS(predictions, quantile = 0.25)
compute_QS(predictions, quantile = 0.50)
compute_QS(predictions, quantile = 0.75)
compute_QS(predictions, quantile = 0.90)
compute_QS(predictions, quantile = 0.95)

Completed VGLM_[]_bgmm_B=128_D=0 0.1
Completed MLP_[10]_bgmm_B=128_D=0 0.1
Completed SimpleRNN_[10]_bgmm_B=128_D=0 0.1
Completed VGLM_[]_bgmm_B=128_D=0 0.25
Completed MLP_[10]_bgmm_B=128_D=0 0.25
Completed SimpleRNN_[10]_bgmm_B=128_D=0 0.25
Completed VGLM_[]_bgmm_B=128_D=0 0.5
Completed MLP_[10]_bgmm_B=128_D=0 0.5
Completed SimpleRNN_[10]_bgmm_B=128_D=0 0.5
Completed VGLM_[]_bgmm_B=128_D=0 0.75
Completed MLP_[10]_bgmm_B=128_D=0 0.75
Completed SimpleRNN_[10]_bgmm_B=128_D=0 0.75
Completed VGLM_[]_bgmm_B=128_D=0 0.9
Completed MLP_[10]_bgmm_B=128_D=0 0.9
Completed SimpleRNN_[10]_bgmm_B=128_D=0 0.9
Completed VGLM_[]_bgmm_B=128_D=0 0.95
Completed MLP_[10]_bgmm_B=128_D=0 0.95
Completed SimpleRNN_[10]_bgmm_B=128_D=0 0.95


In [15]:
for k,v in predictions.items():
    print(f"{k : <40} {v['k_all']['QS_0.1'].mean():.3f}")

VGLM_[]_bgmm_B=128_D=0                   5.644
MLP_[10]_bgmm_B=128_D=0                  5.758
SimpleRNN_[10]_bgmm_B=128_D=0            4.390


In [16]:
for k,v in predictions.items():
    print(f"{k : <40} {v['k_all']['QS_0.25'].mean():.3f}")

VGLM_[]_bgmm_B=128_D=0                   4.880
MLP_[10]_bgmm_B=128_D=0                  4.961
SimpleRNN_[10]_bgmm_B=128_D=0            3.940


In [17]:
for k,v in predictions.items():
    print(f"{k : <40} {v['k_all']['QS_0.5'].mean():.3f}")

VGLM_[]_bgmm_B=128_D=0                   3.607
MLP_[10]_bgmm_B=128_D=0                  3.633
SimpleRNN_[10]_bgmm_B=128_D=0            3.189


In [18]:
for k,v in predictions.items():
    print(f"{k : <40} {v['k_all']['QS_0.75'].mean():.3f}")

VGLM_[]_bgmm_B=128_D=0                   2.334
MLP_[10]_bgmm_B=128_D=0                  2.305
SimpleRNN_[10]_bgmm_B=128_D=0            2.438


In [19]:
for k,v in predictions.items():
    print(f"{k : <40} {v['k_all']['QS_0.9'].mean():.3f}")

VGLM_[]_bgmm_B=128_D=0                   1.570
MLP_[10]_bgmm_B=128_D=0                  1.508
SimpleRNN_[10]_bgmm_B=128_D=0            1.988


In [20]:
for k,v in predictions.items():
    print(f"{k : <40} {v['k_all']['QS_0.95'].mean():.3f}")

VGLM_[]_bgmm_B=128_D=0                   1.315
MLP_[10]_bgmm_B=128_D=0                  1.242
SimpleRNN_[10]_bgmm_B=128_D=0            1.838


In [21]:
table_of_predictions_ks_test(predictions, CONFIG.seasons, columns, sample_cols, add_cols)

VGLM_[]_bgmm_B=128_D=0
MLP_[10]_bgmm_B=128_D=0
SimpleRNN_[10]_bgmm_B=128_D=0
Model                          JFM mean    JFM median    AM mean    AM median    JJAS mean    JJAS median    OND mean    OND median
-----------------------------  ----------  ------------  ---------  -----------  -----------  -------------  ----------  ------------
Bann                           0.0757      0.0573        0.1029     0.0567       0.1522       0.0601         0.1348      0.0584
BannCorr                       0.0757      0.0573        0.1029     0.0567       0.1522       0.0601         0.1348      0.0584
Norr                           0.0757      0.0573        0.1029     0.0567       0.1522       0.0601         0.1348      0.0584
VGLM_[]_bgmm_B=128_D=0         0.2198      0.0585        0.1215     0.0920       0.0566       0.0509         0.1680      0.0583
MLP_[10]_bgmm_B=128_D=0        0.2107      0.0607        0.2306     0.0692       0.0548       0.0246         0.1978      0.0492
SimpleRNN_[10]_bg

In [22]:
table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'smape', prefix='smape')

Model                          JFM mean    JFM median    AM mean    AM median    JJAS mean    JJAS median    OND mean    OND median
-----------------------------  ----------  ------------  ---------  -----------  -----------  -------------  ----------  ------------
Prec                           0.00        0.00          0.00       0.00         0.00         0.00           0.00        0.00
wrf_prcp                       0.23        0.18          0.38       0.35         0.21         0.18           0.39        0.34
wrf_bc_prcp                    0.23        0.18          0.38       0.35         0.21         0.18           0.39        0.34
precip_norris                  0.23        0.18          0.38       0.35         0.21         0.18           0.39        0.34
VGLM_[]_bgmm_B=128_D=0         0.25        0.21          0.36       0.32         0.20         0.15           0.40        0.30
MLP_[10]_bgmm_B=128_D=0        0.24        0.18          0.29       0.25         0.20         0.16      

In [23]:
table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'edd', prefix='edd')

Model                          JFM mean    JFM median    AM mean    AM median    JJAS mean    JJAS median    OND mean    OND median
-----------------------------  ----------  ------------  ---------  -----------  -----------  -------------  ----------  ------------
Prec                           0.00        0.00          0.00       0.00         0.00         0.00           0.00        0.00
wrf_prcp                       14.21       10.00         12.79      12.00        32.95        34.00          7.52        5.50
wrf_bc_prcp                    14.21       10.00         12.79      12.00        32.95        34.00          7.52        5.50
precip_norris                  14.21       10.00         12.79      12.00        32.95        34.00          7.52        5.50
VGLM_[]_bgmm_B=128_D=0         5.74        4.60          5.85       4.30         10.32        7.70           5.16        4.30
MLP_[10]_bgmm_B=128_D=0        5.86        4.80          4.77       3.70         10.84        8.80      

In [24]:
table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'ae', prefix='ae')

Model                          JFM mean    JFM median    AM mean    AM median    JJAS mean    JJAS median    OND mean    OND median
-----------------------------  ----------  ------------  ---------  -----------  -----------  -------------  ----------  ------------
Prec                           0.00        0.00          0.00       0.00         0.00         0.00           0.00        0.00
wrf_prcp                       74.51       55.94         53.95      44.58        293.28       191.28         33.85       19.84
wrf_bc_prcp                    74.51       55.94         53.95      44.58        293.28       191.28         33.85       19.84
precip_norris                  74.51       55.94         53.95      44.58        293.28       191.28         33.85       19.84
VGLM_[]_bgmm_B=128_D=0         84.24       63.04         51.28      37.37        273.26       203.65         44.41       37.07
MLP_[10]_bgmm_B=128_D=0        86.69       62.62         45.33      37.92        276.16       187.20

In [49]:
# table_of_predictions_for_metric(predictions, CONFIG.seasons, columns, CONFIG.n_samples, sample_cols, add_cols, metric = 'se', prefix='se')

In [None]:
a = pd.read_csv('results.csv')

b = a.groupby(['k','run']).agg({'valid_loss': 'min', 
                                 'model_arch': 'first', 
                                 'likelihood_fn': 'first',
                                 'lr':'first',
                                 'batch_size':'first',
                                 'dropout_rate':'first',
                               })

c = b.groupby(['run']).agg({'valid_loss': 'mean', 
                        'model_arch': 'first', 
                        'likelihood_fn': 'first',
                        'lr':'first',
                        'batch_size':'first',
                        'dropout_rate':'first',
                         }
                   ).sort_values('valid_loss').reset_index()

c.groupby(['model_arch',
           'likelihood_fn',
           'lr',
           'batch_size',
           'dropout_rate',
           ]).agg({'valid_loss': 'mean'}
                   ).sort_values('valid_loss').reset_index()