# NNPP
This notebook reimplements the method of the RL18 paper in order to compare its results to the PyTorch Geometric model.
It should achieve a CRPS score of around 0.78.

In [2]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('default')

In [3]:
def load_data(
    indexed: bool = True,
    path: str = "/Users/moritzfeik/Developer/BA/data_RL18.feather",
) -> pd.DataFrame:
    """
    Load the feather file and preprocess it.

    Preprocessing: station ids are converted to 0-based integers, rows are sorted
    by date and station, and a sine-transformed day-of-year column ``doy`` is added.

    :param indexed: Whether to use the ``date`` column (made timezone-naive) as the index. Defaults to True.
    :type indexed: bool, optional
    :param path: Location of the feather file. Defaults to the location this notebook was developed with.
    :type path: str, optional
    :return: The preprocessed DataFrame.
    :rtype: pd.DataFrame
    """
    df = pd.read_feather(path)
    # convert station to integer and subtract 1 to make it 0-based
    df["station"] = pd.to_numeric(df["station"], downcast='integer') - 1
    df = df.sort_values(by=['date', 'station'])  # sort by date and station
    # Sine-transformed day of year, evaluated on the whole column at once
    # instead of calling a Python lambda for every row.
    df["doy"] = np.sin(((df["date"].dt.dayofyear - 105) / 366) * 2 * np.pi)
    if indexed:
        df.index = df["date"]  # add DatetimeIndex
        # tz_convert raises on timezone-naive data, so only strip a timezone that exists
        if df.index.tz is not None:
            df.index = df.index.tz_convert(None)
    return df

def clean_data(df: pd.DataFrame, max_missing: int = 121, max_alt: float = 1000.0) -> pd.DataFrame:
    """
    Filter the DataFrame down to low-altitude stations with mostly complete records.

    Three steps are applied in order: rows are restricted to altitudes strictly
    below ``max_alt``; stations with more than ``max_missing`` rows lacking
    ``sm_mean`` are removed entirely; finally every row that still contains a
    missing value is dropped.

    :param df: The DataFrame to be cleaned. It is not modified.
    :type df: pd.DataFrame
    :param max_missing: Largest number of rows with missing ``sm_mean`` a station may have and still be kept. Defaults to 121.
    :type max_missing: int, optional
    :param max_alt: Altitude limit; only rows with ``alt`` strictly below it are kept. Defaults to 1000.0.
    :type max_alt: float, optional
    :return: The cleaned DataFrame.
    :rtype: pd.DataFrame
    """
    # altitude filter (strict comparison: a station exactly at max_alt is removed)
    below_limit = df.loc[df['alt'] < max_alt]

    # count, per station, the rows in which sm_mean is missing
    ids_with_gap = below_limit.station[below_limit.sm_mean.isna()].to_numpy()
    station_ids, n_missing = np.unique(ids_with_gap, return_counts=True)
    too_sparse = station_ids[n_missing > max_missing]

    # remove those stations completely, then any remaining incomplete rows
    keep_row = ~below_limit['station'].isin(too_sparse)
    return below_limit[keep_row].dropna()

In [4]:
# The data set can be downloaded from https://doi.org/10.6084/m9.figshare.13516301.v1
print("Loading Data ...")
data = load_data(indexed=False)  # `date` stays a regular column; it is used for the year split below

print("Cleaning Data ...")
data = clean_data(df=data, max_alt=1000.0, max_missing=121)

Loading Data ...
Cleaning Data ...


In [12]:
# Largest row label that belongs to the year 2015.
# The index is rebuilt as 0..n-1 first, so row labels and row positions coincide.
data = data.reset_index(drop=True)
idx = max(data.index[(data.date.dt.year == 2015).to_numpy()])
idx

1469770

In [13]:
# split into train and test data
# `idx` is the position of the last 2015 row. iloc slices exclude their stop
# position, so the training slice has to stop at train_end + 1 for that row to
# be part of the training set (the test set starts right after it).
eval_start = idx+1  # first row after 2015
train_end = idx     # last training row (inclusive)

# Column layout assumed by the positional slices: 1 = station id, 2 = target,
# 3 and onwards = features — TODO confirm against the feather file.
train_features_raw = data.iloc[:train_end+1,3:].to_numpy()
train_targets = data.iloc[:train_end+1,2].to_numpy()
train_IDs = data.iloc[:train_end+1,1].to_numpy()

test_features_raw = data.iloc[eval_start:,3:].to_numpy()
test_targets = data.iloc[eval_start:,2].to_numpy()
test_IDs = data.iloc[eval_start:,1].to_numpy()

In [14]:
# normalize data
def normalize(data, method=None, shift=None, scale=None):
    """
    Rescale every column of ``data`` as ``(column - shift) / scale``.

    :param data: 2-D numpy array with one column per feature.
    :param method: If ``"MAX"``, ``scale`` is set to the column maxima of ``data`` and
        ``shift`` to zeros, replacing any values passed in. For any other value the
        given ``shift`` and ``scale`` are used as they are.
    :param shift: Per-column offsets (one entry per column of ``data``).
    :param scale: Per-column divisors (one entry per column of ``data``).
    :return: Tuple ``(normalized, shift, scale)``; ``normalized`` is a float64 array
        with the shape of ``data``. Returning shift/scale lets statistics fitted on
        one data set be re-applied to another.
    :raises ValueError: If ``method`` is not ``"MAX"`` and ``shift`` or ``scale`` is missing.
    """
    if method == "MAX":
        scale = np.max(data, axis=0)
        shift = np.zeros(scale.shape)
    if shift is None or scale is None:
        raise ValueError('normalize needs method="MAX" or both shift and scale')
    result = np.zeros(data.shape)
    # Broadcasting applies the per-column shift/scale to all rows at once and
    # also works for input without rows. A zero in `scale` is not guarded against.
    result[...] = (data - shift) / scale
    return result, shift, scale

# Scaling statistics come from the training features only ...
train_features, train_shift, train_scale = normalize(train_features_raw, method="MAX")

# ... and are re-applied unchanged to the test features.
test_features = normalize(test_features_raw, shift=train_shift, scale=train_scale)[0]

## Tensorflow

In [15]:
import tensorflow as tf

from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model
#from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import Adam  # better for M1/M2 Mac
from tensorflow.keras.backend import clear_session
from tqdm import tqdm

In [16]:
def crps_cost_function(y_true, y_pred, theano=False):
    """Mean CRPS of Gaussian predictive distributions, usable as a Keras loss.

    Code inspired by Kai Polsterer (HITS).

    Args:
        y_true: True values. Unless ``theano`` is set, column 0 of a 2-D
            tensor is used.
        y_pred: 2-D tensor of predictions; column 0 is the mean, column 1
            the standard deviation.
        theano: Set to true if using this with pure theano; ``y_true`` is then
            used as passed in instead of taking its first column.

    Returns:
        Scalar tensor: the CRPS averaged over the batch.
    """
    mu, sigma = y_pred[:, 0], y_pred[:, 1]

    # Keras hands over y_true with an extra axis; drop it so it lines up with mu
    if not theano:
        y_true = y_true[:, 0]

    # The network output for sigma is unconstrained. Squaring it and taking the
    # square root again gives a non-negative standard deviation.
    std = tf.sqrt(sigma ** 2)

    # Standardised error and the standard normal pdf / cdf evaluated at it
    z = (y_true - mu) / std
    pdf = 1.0 / tf.sqrt(2.0 * np.pi) * tf.exp(-z ** 2 / 2.0)
    cdf = 0.5 * (1.0 + tf.math.erf(z / tf.sqrt(2.0)))

    # CRPS per sample, then averaged to a scalar cost
    per_sample = std * (z * (2. * cdf - 1.) + 2 * pdf - 1. / tf.sqrt(np.pi))
    return tf.math.reduce_mean(per_sample)

In [17]:
# training multiple models in a loop

emb_size = 2
# The embedding table needs one row per station id in 0..max_id
max_id = int(max(train_IDs.max(), test_IDs.max()))
n_features = train_features.shape[1]
n_outputs = 2  # the two columns read by the CRPS loss: mean and standard deviation

nreps = 10
base_seed = 0  # run i is seeded with base_seed + i
trn_scores = []
test_scores = []
preds = []

for i in tqdm(range(nreps)):
    clear_session()
    # Fixed but distinct seed per ensemble member, so that a re-run starts from the
    # same random state while the members still differ from each other.
    # (Hardware-level nondeterminism may still cause small differences.)
    tf.keras.utils.set_random_seed(base_seed + i)

    # Network: station id -> embedding, concatenated with the features,
    # one hidden layer, linear output
    features_in = Input(shape=(n_features,))
    id_in = Input(shape=(1,))
    emb = Embedding(max_id + 1, emb_size)(id_in)
    emb = Flatten()(emb)
    x = Concatenate()([features_in, emb])
    x = Dense(512, activation='relu')(x)
    x = Dense(n_outputs, activation='linear')(x)
    nn_aux_emb = Model(inputs=[features_in, id_in], outputs=x)

    opt = Adam(learning_rate=0.002)
    nn_aux_emb.compile(optimizer=opt, loss=crps_cost_function)

    nn_aux_emb.fit([train_features, train_IDs], train_targets, epochs=15, batch_size=4096, verbose=0)

    # Loss (mean CRPS) on train and test data, plus the raw test predictions for the ensemble
    trn_scores.append(nn_aux_emb.evaluate([train_features, train_IDs], train_targets, batch_size=4096, verbose=0))
    test_scores.append(nn_aux_emb.evaluate([test_features, test_IDs], test_targets, batch_size=4096, verbose=0))
    preds.append(nn_aux_emb.predict([test_features, test_IDs], batch_size=4096, verbose=0))

100%|██████████| 10/10 [04:30<00:00, 27.03s/it]


In [18]:
# Test-set loss (mean CRPS) of each individually trained model, in training order
test_scores

[0.7906690835952759,
 0.794194757938385,
 0.7989064455032349,
 0.7906081676483154,
 0.7908093929290771,
 0.7993164658546448,
 0.8161780834197998,
 0.7906683087348938,
 0.7885107398033142,
 0.7953349947929382]

In [19]:
# evaluate ensemble of models

from scipy.stats import norm

def crps_normal(mu, sigma, y):
    """
    CRPS of Gaussian forecasts N(mu, sigma) for the observations ``y``.

    The sign of ``sigma`` is ignored (its absolute value is used). The inputs
    are combined elementwise and one CRPS value per element is returned.
    """
    spread = np.abs(sigma)  # a standard deviation must not be negative
    z = (y - mu) / spread   # standardised forecast error
    return spread * (z * (2 * norm.cdf(z) - 1) +
                     2 * norm.pdf(z) - 1. / np.sqrt(np.pi))

# Stack the per-model predictions; the last axis holds [mean, std]
preds = np.array(preds)
preds[..., 1] = np.abs(preds[..., 1])  # Make sure std is positive
# The ensemble forecast averages the predicted parameters over the models (axis 0)
mean_preds = preds.mean(axis=0)
ens_score = crps_normal(mean_preds[:, 0], mean_preds[:, 1], test_targets).mean()
print(f'Ensemble test score = {ens_score}')

Ensemble test score = 0.7865278714841577
