# NNPP
This notebook reimplements the method from the RL18 paper so that its results can be compared with those of the PyTorch Geometric model.
It should achieve a CRPS of about 0.78.

In [2]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('default')

In [3]:
def load_data(indexed: bool = True,
              path: str = "/Users/moritzfeik/Developer/BA/data_RL18.feather") -> pd.DataFrame:
    """
    Load the data from a feather file and preprocess it.

    Preprocessing steps, in order: the ``station`` column is converted to a
    numeric type and shifted by -1 (0-based ids), rows are sorted by ``date``
    and then ``station``, and a ``doy`` column holding a sine transform of the
    day of the year is added.

    :param indexed: Whether to add a DateTimeIndex to the DataFrame. Defaults to True.
    :type indexed: bool, optional
    :param path: Location of the feather file. Defaults to the location that used to be
        hard-coded, so existing calls keep working; pass another path to run elsewhere.
    :type path: str, optional
    :return: The preprocessed DataFrame.
    :rtype: pd.DataFrame
    """
    df = pd.read_feather(path)
    # convert station to integer and subtract 1 to make it 0-based
    df["station"] = pd.to_numeric(df["station"], downcast='integer') - 1
    df = df.sort_values(by=['date', 'station'])  # sort by date and station
    # Sin transformed day of year, evaluated on the whole column at once
    # instead of calling a Python lambda for every row
    df["doy"] = np.sin(((df["date"].dt.dayofyear - 105) / 366) * 2 * np.pi)
    if indexed:
        df.index = df.date  # add DatetimeIndex
        df.index = df.index.tz_convert(None)  # remove timezone
    return df

def clean_data(df: pd.DataFrame, max_missing: int = 121, max_alt: float = 1000.0) -> pd.DataFrame:
    """
    Filter the DataFrame by station altitude and by the amount of missing data per station.

    Three filters are applied in sequence: rows whose ``alt`` is not strictly below
    ``max_alt`` are removed, then every station with more than ``max_missing`` rows
    lacking ``sm_mean`` is removed entirely, and finally every row that still
    contains a missing value in any column is removed.

    :param df: The DataFrame to be cleaned. It is not modified.
    :type df: pd.DataFrame
    :param max_missing: Largest number of rows with a missing ``sm_mean`` a station may have
        and still be kept. Defaults to 121.
    :type max_missing: int, optional
    :param max_alt: Altitude limit; only rows with ``alt`` strictly below it are kept. Defaults to 1000.0.
    :type max_alt: float, optional
    :return: The cleaned DataFrame.
    :rtype: pd.DataFrame
    """
    # altitude filter (strict comparison: alt == max_alt is removed as well)
    below_limit = df.loc[df['alt'].lt(max_alt)]

    # station id of every remaining row that has no sm_mean value
    has_gap = below_limit['sm_mean'].isna().to_numpy()
    gap_station_ids = below_limit['station'].to_numpy()[has_gap]

    # how often each of those stations occurs, i.e. its number of gaps
    station_ids, gap_counts = np.unique(gap_station_ids, return_counts=True)
    sparse_stations = station_ids[gap_counts > max_missing]

    # remove the sparse stations entirely, then any row that is still incomplete
    dense_only = below_limit.loc[~below_limit['station'].isin(sparse_stations)]
    return dense_only.dropna()

In [4]:
# read data (can be downloaded from https://doi.org/10.6084/m9.figshare.13516301.v1)
# indexed=False leaves `date` as an ordinary column, so the positional column
# slices used further down see date / station / obs as the leading columns
print("Loading Data ...")
data = load_data(indexed=False)

# keep rows with alt strictly below 1000.0, drop stations with more than 121
# rows lacking sm_mean, then drop every row that still has a missing value
print("Cleaning Data ...")
data = clean_data(data, max_missing=121, max_alt=1000.0)

Loading Data ...
Cleaning Data ...


In [5]:
# Display the cleaned frame; the rendered output abbreviates the middle rows and columns
data

Unnamed: 0,date,station,obs,t2m_mean,t2m_var,cape_mean,cape_var,sp_mean,sp_var,tcc_mean,...,str_var,d2m_mean,d2m_var,sm_mean,sm_var,lat,lon,alt,orog,doy
0,2007-01-03 00:00:00+00:00,0,5.5,3.616448,0.079733,11.480126,164.398999,101263.773906,17346.641356,46.793524,...,2.810124e+11,275.956692,0.151394,318.990796,6.543392,50.782700,6.0941,202.0,107.439461,-0.983798
1,2007-01-03 00:00:00+00:00,1,2.9,4.601281,0.107129,22.207007,207.708022,101463.529063,18411.594667,48.161629,...,2.771202e+11,275.529611,0.152089,325.635452,8.776818,52.485298,7.9126,65.0,47.632523,-0.983798
2,2007-01-03 00:00:00+00:00,2,3.3,2.873910,0.078148,44.308516,1438.915507,97942.360781,20106.971594,63.223506,...,4.909704e+11,275.008204,0.075718,336.861672,5.635509,50.744598,9.3450,300.0,348.869904,-0.983798
4,2007-01-03 00:00:00+00:00,4,3.4,2.718213,0.198263,96.170580,2550.754359,98045.333437,19122.337645,73.738330,...,7.963386e+11,274.732042,0.186014,324.187435,13.339768,51.088100,12.9326,296.0,296.839203,-0.983798
5,2007-01-03 00:00:00+00:00,5,1.8,1.375332,0.149906,123.618650,1533.808489,96637.034219,16800.454370,91.842461,...,1.033478e+12,274.021844,0.207122,315.014949,20.406193,48.405998,11.3117,510.0,461.575287,-0.983798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808680,2016-12-31 00:00:00+00:00,530,-0.6,-0.155651,0.978417,0.201357,0.141017,102826.477656,9997.686018,49.609571,...,1.132820e+12,271.254705,1.372146,245.919016,17.199589,52.715599,7.3176,19.0,36.652340,-0.973264
1808940,2016-12-31 00:00:00+00:00,531,-5.3,-3.497557,0.172615,0.003596,0.000317,96513.425000,4798.146479,0.186227,...,1.159146e+11,266.253950,0.827402,395.544220,22.223388,48.441799,9.9216,593.0,532.696167,-0.973264
1808533,2016-12-31 00:00:00+00:00,532,-2.7,-1.661223,0.163165,0.021574,0.006121,102359.503125,8667.898487,7.431432,...,1.425973e+11,269.773757,0.461191,253.349395,13.011937,51.841801,8.0607,104.0,95.691666,-0.973264
1808941,2016-12-31 00:00:00+00:00,533,-1.5,-5.979924,0.349577,0.000000,0.000000,95704.712813,4314.376743,1.796408,...,6.356090e+10,263.174088,0.838257,353.039061,22.348963,47.876099,10.5848,816.0,671.567078,-0.973264


In [6]:
# split into train and test data
# Both constants are row POSITIONS in the cleaned frame, which is sorted by date and station.
# NOTE(review): the positions are hard-coded, so they only mark the intended date
# boundary for this exact cleaned dataset — confirm after any change to load_data/clean_data.
eval_start = 1626724  # first evaluation row
train_end = 1626723  # last training row (inclusive) # 2016-12-01

# iloc slices exclude their stop position, so the training slice must stop at
# train_end + 1; stopping at train_end left row `train_end` in neither set.
# Columns by position: 0 = date, 1 = station, 2 = obs, 3 and up = features.
train_features_raw = data.iloc[:train_end + 1, 3:].to_numpy()
train_targets = data.iloc[:train_end + 1, 2].to_numpy()
train_IDs = data.iloc[:train_end + 1, 1].to_numpy()

test_features_raw = data.iloc[eval_start:, 3:].to_numpy()
test_targets = data.iloc[eval_start:, 2].to_numpy()
test_IDs = data.iloc[eval_start:, 1].to_numpy()

# every row must land in exactly one of the two sets
assert len(train_targets) + len(test_targets) == len(data), "train/test split does not partition the data"

In [7]:
# normalize data
def normalize(data, method=None, shift=None, scale=None):
    """
    Shift and scale each column of a 2-D array: ``(data - shift) / scale``.

    :param data: 2-D array with one column per feature. It is not modified.
    :param method: ``"MAX"`` derives the constants from ``data`` itself
        (``scale`` = column-wise maximum, ``shift`` = 0) and ignores any
        ``shift``/``scale`` passed in. Any other value uses the supplied constants.
    :param shift: Per-column offsets, one entry per column. When omitted
        (and ``method`` is not ``"MAX"``) no shift is applied.
    :param scale: Per-column divisors, one entry per column. When omitted
        (and ``method`` is not ``"MAX"``) no scaling is applied.
    :return: Tuple ``(result, shift, scale)``: the normalized float64 array and the
        constants that were used, so they can be re-applied to other data.
    """
    data = np.asarray(data)
    n_columns = data.shape[1]
    if method == "MAX":
        scale = np.max(data, axis=0)
        shift = np.zeros(scale.shape)
    else:
        # A missing constant used to crash on ``None[index]``; fall back to the
        # neutral element so that e.g. scale-only normalization works.
        if shift is None:
            shift = np.zeros(n_columns)
        if scale is None:
            scale = np.ones(n_columns)
    # Broadcasting handles all columns at once; same arithmetic as a per-column loop.
    # NOTE: a zero entry in ``scale`` still yields inf/nan, exactly as before.
    result = ((data - shift) / scale).astype(np.float64)
    return result, shift, scale

# The scaling constants are derived from the training rows ...
train_features, train_shift, train_scale = normalize(train_features_raw, method="MAX")

# ... and re-used unchanged for the test rows; only the normalized array is kept
test_features = normalize(test_features_raw, shift=train_shift, scale=train_scale)[0]

## TensorFlow

In [18]:
import tensorflow as tf

from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model
#from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import Adam  # better for M1/M2 Mac
from tensorflow.keras.backend import clear_session
from tqdm import tqdm

In [9]:
def crps_cost_function(y_true, y_pred, theano=False):
    """Mean CRPS of normal-distribution forecasts, written as a loss function.

    Each row of ``y_pred`` parameterises one normal distribution; the score of
    that distribution against the matching entry of ``y_true`` is computed in
    closed form and averaged over all rows.

    Code inspired by Kai Polsterer (HITS).

    Args:
        y_true: True values
        y_pred: Tensor containing predictions: [mean, std]
        theano: Set to true if using this with pure theano; ``y_true`` is then
            used as passed instead of taking its column 0.

    Returns:
        mean_crps: Scalar with mean CRPS over batch
    """
    # Column 0 is the predicted mean, column 1 the raw spread output
    mean_pred, raw_std = y_pred[:, 0], y_pred[:, 1]

    # Outside of pure theano the targets carry an extra axis 1; take its first
    # column so the targets line up with the per-row predictions
    if not theano:
        y_true = y_true[:, 0]

    # Squaring and taking the root again keeps the spread from being negative
    # even when the raw network output is
    std = tf.sqrt(raw_std ** 2)

    # Standardised error plus the standard normal density / distribution at it
    z = (y_true - mean_pred) / std
    pdf_z = 1.0 / tf.sqrt(2.0 * np.pi) * tf.exp(-z ** 2 / 2.0)
    cdf_z = 0.5 * (1.0 + tf.math.erf(z / tf.sqrt(2.0)))

    # One CRPS value per input/target pair ...
    per_sample = std * (z * (2. * cdf_z - 1.) + 2 * pdf_z - 1. / tf.sqrt(np.pi))
    # ... reduced to a single scalar cost
    return tf.math.reduce_mean(per_sample)

In [19]:
# training multiple models in a loop
# Each repetition builds a fresh network from scratch, trains it, and records its
# train score, test score and test predictions for the ensemble evaluation.
# NOTE(review): no random seed is set in the visible cells, so individual runs are
# presumably not reproducible — confirm whether that is intended.

emb_size = 2  # length of the learned vector per station id
max_id = int(tf.math.reduce_max([train_IDs.max(), test_IDs.max()]))  # largest station id in either split
n_features = train_features.shape[1]
n_outputs = 2  # the loss reads output column 0 as mean and column 1 as std

nreps = 10  # number of independently trained models
trn_scores = []
test_scores = []
preds = []

for i in tqdm(range(nreps)):
    # reset the Keras backend state before building the next model
    clear_session()

    # Two inputs: the normalized feature vector and the station id
    features_in = Input(shape=(n_features,))
    id_in = Input(shape=(1,))
    # ids run from 0 to max_id, hence max_id + 1 embedding rows
    emb = Embedding(max_id + 1, emb_size)(id_in)
    emb = Flatten()(emb)
    # station embedding is appended to the features, followed by one hidden layer
    x = Concatenate()([features_in, emb])
    x = Dense(512, activation='relu')(x)
    # linear output: the second column may be negative; the loss squares it before use
    x = Dense(n_outputs, activation='linear')(x)
    nn_aux_emb = Model(inputs=[features_in, id_in], outputs=x)

    opt = Adam(learning_rate=0.002)
    nn_aux_emb.compile(optimizer=opt, loss=crps_cost_function)

    nn_aux_emb.fit([train_features, train_IDs], train_targets, epochs=15, batch_size=4096, verbose=0)

    # the positional 4096 is presumably the batch size (as in the fit call) — verify against the Keras signature
    trn_scores.append(nn_aux_emb.evaluate([train_features, train_IDs], train_targets, 4096, verbose=0))
    test_scores.append(nn_aux_emb.evaluate([test_features, test_IDs], test_targets, 4096, verbose=0))
    preds.append(nn_aux_emb.predict([test_features, test_IDs], 4096, verbose=0))

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [10:42<00:00, 64.23s/it]


In [20]:
# Test loss (mean CRPS) of each individually trained model, one entry per repetition
test_scores

[3.139328718185425,
 1.8778923749923706,
 1.2039612531661987,
 1.5441023111343384,
 1.929037094116211,
 2.4476447105407715,
 1.4293427467346191,
 1.0186914205551147,
 1.2177531719207764,
 1.1318036317825317]

In [21]:
# evaluate ensemble of models

from scipy.stats import norm

def crps_normal(mu, sigma, y):
    """
    Closed-form CRPS of a Gaussian forecast against an observation.

    Works element-wise on scalars or arrays. The sign of ``sigma`` is
    ignored: its absolute value is used as the standard deviation.

    :param mu: Mean(s) of the forecast distribution.
    :param sigma: Standard deviation(s) of the forecast distribution.
    :param y: Observed value(s).
    :return: CRPS value(s), one per forecast/observation pair.
    """
    # a negative spread is folded onto its positive counterpart
    spread = np.abs(sigma)
    # standardised forecast error
    z = (y - mu) / spread
    cdf_term = z * (2 * norm.cdf(z) - 1)
    pdf_term = 2 * norm.pdf(z)
    offset = 1. / np.sqrt(np.pi)
    return spread * (cdf_term + pdf_term - offset)

# Stack the per-model predictions; axis 0 is the model, the last axis holds [mean, std]
preds = np.array(preds)
# The network output for the std is unconstrained, so take its magnitude
preds[..., 1] = np.abs(preds[..., 1])
# Ensemble forecast: average mean and std over the models
mean_preds = preds.mean(axis=0)
ens_score = np.mean(crps_normal(mean_preds[:, 0], mean_preds[:, 1], test_targets))
print(f'Ensemble test score = {ens_score}')

Ensemble test score = 1.339218302750267
