In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
from collections import OrderedDict
from attrdict import AttrDict
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch import optim
import logging
from tqdm import tqdm



In [3]:
from pytorch_forecasting.models.nn.rnn import LSTM
from pytorch_lightning.callbacks import EarlyStopping
import torch

from pytorch_forecasting import Baseline, NBeats, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.data.examples import generate_ar_data
from pytorch_forecasting.metrics import SMAPE

In [2]:
from neuralprophet.forecaster_additional_models import LSTM

In [3]:
import pandas as pd

In [4]:
from neuralprophet import time_net
import pandas as pd
import numpy as np


In [5]:
# from neuralprophet import LSTM


In [6]:
data_location = "../"
df = pd.read_csv(data_location + "example_data/yosemite_temps.csv")
df.head(3)

Unnamed: 0,ds,y
0,2017-05-01 00:00:00,27.8
1,2017-05-01 00:05:00,27.0
2,2017-05-01 00:10:00,26.8


In [9]:
df.shape

(18721, 2)

In [7]:
import pytorch_lightning as pl

In [8]:
from pytorch_lightning import Trainer



In [9]:
from neuralprophet import configure
from neuralprophet import time_net
from neuralprophet import time_dataset
from neuralprophet import df_utils
from neuralprophet import utils
from neuralprophet import utils_torch
from neuralprophet.plot_forecast import plot, plot_components
from neuralprophet.plot_model_parameters import plot_parameters
from neuralprophet import metrics
from neuralprophet.utils import set_logger_level

In [10]:
import torch.nn as nn


In [11]:
import logging

log = logging.getLogger("NP.LSTM")

In [220]:
class LightLSTM(pl.LightningModule):

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers, 
                 bias, 
                 bidirectional,
                 n_forecasts):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, 
                            hidden_size=hidden_size,
                            num_layers=num_layers, 
                            bias=bias, 
                            bidirectional=bidirectional,
                            batch_first = False)
        self.linear = nn.Linear(hidden_size, n_forecasts)
        # Metrics live
        self.metrics_live = {}
        
    def set_optimizer(self, optimizer):
        self.optimizer = optimizer  ##### todo add this to init

    def set_scheduler(self, scheduler):
        self.scheduler = scheduler  ##### todo add this to init

    def set_loss_func(self, loss_func):
        self.loss_func = loss_func  ##### todo add this to init
        
    def set_forecaster(self, self_forecaster):
        self.forecaster = self_forecaster

    def forward(self, x):
        x = x["lags"]
        x.resize_((x.size()[0], 1, x.size()[1]))

        lstm_out, _ = self.lstm(x)
        y_pred = self.linear(lstm_out[:,-1])
        return y_pred
    

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_func(y_hat, y)

        self.forecaster.metrics.update(predicted = y_hat.detach(),
                                       target = y.detach(),
                                       values = {"Loss": loss})
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_func(y_hat, y)
                
        self.forecaster.val_metrics.update(predicted=y_hat.detach(), target=y.detach())
        
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_func(y_hat, y)
        
        self.forecaster.test_metrics.update(predicted=y_hat.detach(), target=y.detach())

        return loss
    
    def training_epoch_end(self, outputs):
        epoch_metrics = self.forecaster.metrics.compute(save=True)
        self.metrics_live["{}".format(list(epoch_metrics)[0])] = epoch_metrics[list(epoch_metrics)[0]]

        self.forecaster.metrics.reset()
        
    def validation_epoch_end(self, validation_step_outputs):
        val_epoch_metrics = self.forecaster.val_metrics.compute(save=True)
        self.metrics_live["val_{}".format(list(val_epoch_metrics)[0])] = val_epoch_metrics[
            list(val_epoch_metrics)[0]]
        self.forecaster.val_metrics.reset()
        

    def configure_optimizers(self):
        return [self.optimizer], [self.scheduler]

In [221]:
trainer = Trainer()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [222]:
trainer.test()

AttributeError: 'NoneType' object has no attribute 'running_stage'

In [267]:
class LSTM_NP:
    """LSTM forecaster.
    """

    def __init__(
        self,
        n_lags = 10,
        n_forecasts=1,
        num_hidden_layers=1,
        d_hidden=10,
        learning_rate=None,
        epochs=None,
        batch_size=None,
        loss_func="Huber",
        optimizer="AdamW",
        train_speed=None,
        normalize="auto",
        impute_missing=True,
        lstm_bias = True,
        lstm_biderectional = False,
    ):
        """
        Args:

            ## Model Config
            n_forecasts (int): Number of steps ahead of prediction time step to forecast.
            num_hidden_layers (int): number of hidden layer to include in AR-Net. defaults to 0.
            d_hidden (int): dimension of hidden layers of the AR-Net. Ignored if num_hidden_layers == 0.

            ## Train Config
            learning_rate (float): Maximum learning rate setting for 1cycle policy scheduler.
                default: None: Automatically sets the learning_rate based on a learning rate range test.
                For manual values, try values ~0.001-10.
            epochs (int): Number of epochs (complete iterations over dataset) to train model.
                default: None: Automatically sets the number of epochs based on dataset size.
                    For best results also leave batch_size to None.
                For manual values, try ~5-500.
            batch_size (int): Number of samples per mini-batch.
                default: None: Automatically sets the batch_size based on dataset size.
                    For best results also leave epochs to None.
                For manual values, try ~1-512.
            loss_func (str, torch.nn.modules.loss._Loss, 'typing.Callable'):
                Type of loss to use: str ['Huber', 'MSE'],
                or torch loss or callable for custom loss, eg. asymmetric Huber loss

            ## Data config
            normalize (str): Type of normalization to apply to the time series.
                options: ['auto', 'soft', 'off', 'minmax, 'standardize']
                default: 'auto' uses 'minmax' if variable is binary, else 'soft'
                'soft' scales minimum to 0.1 and the 90th quantile to 0.9
            impute_missing (bool): whether to automatically impute missing dates/values
                imputation follows a linear method up to 10 missing values, more are filled with trend.
                
            ## LSTM specific
            bias (bool): If False, then the layer does not use bias weights b_ih and b_hh. Default: True
            bidirectional (bool): If True, becomes a bidirectional LSTM. Default: False
            
        """
        kwargs = locals()

        # General
        self.name = "LSTM"
        self.n_forecasts = n_forecasts
        self.n_lags = n_lags

        # Data Preprocessing
        self.normalize = normalize
        self.impute_missing = impute_missing
        self.impute_limit_linear = 5
        self.impute_rolling = 20

        # Training
        self.config_train = configure.from_kwargs(configure.Train, kwargs)



        self.metrics = metrics.MetricsCollection(
            metrics=[
                metrics.LossMetric(self.config_train.loss_func),
                metrics.MAE(),
                metrics.MSE(),
            ],
            value_metrics=[
                # metrics.ValueMetric("Loss"),
            ],
        )

        
        # Model
        self.config_model = configure.from_kwargs(configure.Model, kwargs)
        
        # LSTM specific
        self.lstm_bias = lstm_bias
        self.lstm_biderectional = lstm_biderectional

        # set during fit()
        self.data_freq = None

        # Set during _train()
        self.fitted = False
        self.data_params = None
        self.optimizer = None
        self.scheduler = None
        self.model = None

        # set during prediction
        self.future_periods = None
        # later set by user (optional)
        self.highlight_forecast_step_n = None
        self.true_ar_weights = None
        
        
    def _init_model(self):
        """Build Pytorch model with configured hyperparamters.

        Returns:
        """
        self.model = LightLSTM(
            input_size = self.n_lags,
            hidden_size = self.config_model.d_hidden,
            num_layers = self.config_model.num_hidden_layers,
            bias = self.lstm_bias,
            bidirectional = self.lstm_biderectional,
            n_forecasts = self.n_forecasts
        )

        self.model.set_loss_func(self.config_train.loss_func)
        self.model.set_forecaster(self)
        
        log.debug(self.model)
        return self.model
    
    def _create_dataset(self, df, predict_mode):
        """Construct dataset from dataframe.

        (Configured Hyperparameters can be overridden by explicitly supplying them.
        Useful to predict a single model component.)

        Args:
            df (pd.DataFrame): containing original and normalized columns 'ds', 'y', 't', 'y_scaled'
            predict_mode (bool): False includes target values.
                True does not include targets but includes entire dataset as input
        Returns:
            TimeDataset
        """
        return time_dataset.TimeDataset(
            df,
            n_lags=self.n_lags,
            n_forecasts=self.n_forecasts,
            predict_mode=predict_mode,
        )
    
    def _handle_missing_data(self, df, freq, predicting=False):
        """Checks, auto-imputes and normalizes new data

        Args:
            df (pd.DataFrame): raw data with columns 'ds' and 'y'
            freq (str): data frequency
            predicting (bool): when no lags, allow NA values in 'y' of forecast series or 'y' to miss completely

        Returns:
            pre-processed df
        """

        # add missing dates for autoregression modelling
        df, missing_dates = df_utils.add_missing_dates_nan(df, freq=freq)
        if missing_dates > 0:
            if self.impute_missing:
                log.info("{} missing dates added.".format(missing_dates))
            else:
                raise ValueError(
                    "{} missing dates found. Please preprocess data manually or set impute_missing to True.".format(
                        missing_dates
                    )
                )

        # impute missing values
        data_columns = []
        data_columns.append("y")
        
        for column in data_columns:
            sum_na = sum(df[column].isnull())
            if sum_na > 0:
                if self.impute_missing:
                    df.loc[:, column], remaining_na = df_utils.fill_linear_then_rolling_avg(
                        df[column],
                        limit_linear=self.impute_limit_linear,
                        rolling=self.impute_rolling,
                    )
                    log.info("{} NaN values in column {} were auto-imputed.".format(sum_na - remaining_na, column))
                    if remaining_na > 0:
                        raise ValueError(
                            "More than {} consecutive missing values encountered in column {}. "
                            "{} NA remain. Please preprocess data manually.".format(
                                2 * self.impute_limit_linear + self.impute_rolling, column, remaining_na
                            )
                        )
                else:  # fail because set to not impute missing
                    raise ValueError(
                        "Missing values found. " "Please preprocess data manually or set impute_missing to True."
                    )
        return df
    
    def _init_train_loader(self, df):
        """Executes data preparation steps and initiates training procedure.

        Args:
            df (pd.DataFrame): containing column 'ds', 'y' with training data

        Returns:
            torch DataLoader
        """
        if not self.fitted:
            self.data_params = df_utils.init_data_params(
                df,
                normalize=self.normalize,
                covariates_config=None,
                regressor_config=None,
                events_config=None,
            )
            
        df = df_utils.normalize(df, self.data_params)
        self.config_train.set_auto_batch_epoch(n_data=len(df))
        self.config_train.apply_train_speed(batch=True, epoch=True)
        dataset = self._create_dataset(df, predict_mode=False)  # needs to be called after set_auto_seasonalities
        loader = DataLoader(dataset, batch_size=self.config_train.batch_size, shuffle=True)
        self.loader_size = len(loader)
    
        
        
        # выкинуть это отсюда в трейн!!!
        if not self.fitted:
            self.model = self._init_model()  # needs to be called after set_auto_seasonalities        
        
        
        assert self.config_train.learning_rate is not None, 'Please, provide a learning rate'
            
            
            
        self.config_train.apply_train_speed(lr=True)
        self.optimizer = self.config_train.get_optimizer(self.model.parameters())
        ######
        self.model.set_optimizer(self.optimizer)
        self.scheduler = self.config_train.get_scheduler(self.optimizer, steps_per_epoch=len(loader))
        self.model.set_scheduler(self.scheduler)
        ######
        return loader
    
    def _init_val_loader(self, df):
        """Executes data preparation steps and initiates evaluation procedure.

        Args:
            df (pd.DataFrame): containing column 'ds', 'y' with validation data

        Returns:
            torch DataLoader
        """
        df = df_utils.normalize(df, self.data_params)
        dataset = self._create_dataset(df, predict_mode=False)
        loader = DataLoader(dataset, batch_size=min(1024, len(dataset)), shuffle=False, drop_last=False)
        return loader
    

    def _train(self, df, df_val=None, progress_bar=True, plot_live_loss=False):
        """Execute model training procedure for a configured number of epochs.

        Args:
            df (pd.DataFrame): containing column 'ds', 'y' with training data
            df_val (pd.DataFrame): containing column 'ds', 'y' with validation data
            progress_bar (bool): display updating progress bar
            plot_live_loss (bool): plot live training loss,
                requires [live] install or livelossplot package installed.
        Returns:
            df with metrics
        """
        if plot_live_loss:
            try:
                from livelossplot import PlotLosses
            except:
                plot_live_loss = False
                log.warning(
                    "To plot live loss, please install neuralprophet[live]."
                    "Using pip: 'pip install neuralprophet[live]'"
                    "Or install the missing package manually: 'pip install livelossplot'",
                    exc_info=True,
                )

        loader = self._init_train_loader(df)
        val = df_val is not None
        ## Metrics
        if self.highlight_forecast_step_n is not None:
            self.metrics.add_specific_target(target_pos=self.highlight_forecast_step_n - 1)
        if not self.normalize == "off":
            self.metrics.set_shift_scale((self.data_params["y"].shift, self.data_params["y"].scale))
        if val:
            val_loader = self._init_val_loader(df_val)
            val_metrics = metrics.MetricsCollection([m.new() for m in self.metrics.batch_metrics])
            
            self.val_metrics = val_metrics
            
        ## Run
        start = time.time()
        if progress_bar:
            training_loop = tqdm(
                range(self.config_train.epochs), total=self.config_train.epochs, leave=log.getEffectiveLevel() <= 20
            )
        else:
            training_loop = range(self.config_train.epochs)
        if plot_live_loss:
            live_out = ["MatplotlibPlot"]
            if not progress_bar:
                live_out.append("ExtremaPrinter")
            live_loss = PlotLosses(outputs=live_out)

        self.metrics.reset()
        if val:
            self.val_metrics.reset()

        self.trainer = Trainer(max_epochs=self.config_train.epochs,
                          # logger = log
                          )
        
        if val:
            self.trainer.fit(self.model, train_dataloader = loader, val_dataloaders = val_loader)
        else:
            self.trainer.fit(self.model, train_dataloader = loader)
                    

        metrics_df = self.metrics.get_stored_as_df()

        if val:
            metrics_df_val = self.val_metrics.get_stored_as_df()
            for col in metrics_df_val.columns:
                metrics_df["{}_val".format(col)] = metrics_df_val[col]
        return metrics_df
    
    def _evaluate(self, loader):
        """Evaluates model performance.

        Args:
            loader (torch DataLoader):  instantiated Validation Dataloader (with TimeDataset)
        Returns:
            df with evaluation metrics
        """
        test_metrics = metrics.MetricsCollection([m.new() for m in self.metrics.batch_metrics])
        if self.highlight_forecast_step_n is not None:
            test_metrics.add_specific_target(target_pos=self.highlight_forecast_step_n - 1)
        ## Run
        
        self.test_metrics = test_metrics
        self.trainer.test(self.model, test_dataloaders=loader, ckpt_path=None, verbose = False)
        
        test_metrics_dict = self.test_metrics.compute(save=True)
        
        log.info("Validation metrics: {}".format(utils.print_epoch_metrics(test_metrics_dict)))
        val_metrics_df = self.test_metrics.get_stored_as_df()
        return val_metrics_df

    def split_df(self, df, freq, valid_p=0.2):
        """Splits timeseries df into train and validation sets.

        Prevents overbleed of targets. Overbleed of inputs can be configured.
        Also performs basic data checks and fills in missing data.

        Args:
            df (pd.DataFrame): data
            freq (str):Data step sizes. Frequency of data recording,
                Any valid frequency for pd.date_range, such as '5min', 'D' or 'MS'
            valid_p (float): fraction of data to use for holdout validation set
                Targets will still never be shared.

        Returns:
            df_train (pd.DataFrame):  training data
            df_val (pd.DataFrame): validation data
        """
        df = df.copy(deep=True)
        df = df_utils.check_dataframe(df, check_y=False)
        df = self._handle_missing_data(df, freq=freq, predicting=False)
        df_train, df_val = df_utils.split_df(
            df,
            n_lags=self.n_lags,
            n_forecasts=self.n_forecasts,
            valid_p=valid_p,
            inputs_overbleed=True,
        )
        return df_train, df_val

    def crossvalidation_split_df(self, df, freq, k=5, fold_pct=0.1, fold_overlap_pct=0.5):
        """Splits timeseries data in k folds for crossvalidation.

        Args:
            df (pd.DataFrame): data
            freq (str):Data step sizes. Frequency of data recording,
                Any valid frequency for pd.date_range, such as '5min', 'D' or 'MS'
            k: number of CV folds
            fold_pct: percentage of overall samples to be in each fold
            fold_overlap_pct: percentage of overlap between the validation folds.

        Returns:
            list of k tuples [(df_train, df_val), ...] where:
                df_train (pd.DataFrame):  training data
                df_val (pd.DataFrame): validation data
        """
        df = df.copy(deep=True)
        df = df_utils.check_dataframe(df, check_y=False)
        df = self._handle_missing_data(df, freq=freq, predicting=False)
        folds = df_utils.crossvalidation_split_df(
            df,
            n_lags=self.n_lags,
            n_forecasts=self.n_forecasts,
            k=k,
            fold_pct=fold_pct,
            fold_overlap_pct=fold_overlap_pct,
        )
        return folds

    def fit(
        self, df, freq, epochs=None, validate_each_epoch=False, valid_p=0.2, progress_bar=True, plot_live_loss=False
    ):
        """Train, and potentially evaluate model.

        Args:
            df (pd.DataFrame): containing column 'ds', 'y' with all data
            freq (str):Data step sizes. Frequency of data recording,
                Any valid frequency for pd.date_range, such as '5min', 'D' or 'MS'
            epochs (int): number of epochs to train.
                default: if not specified, uses self.epochs
            validate_each_epoch (bool): whether to evaluate performance after each training epoch
            valid_p (float): fraction of data to hold out from training for model evaluation
            progress_bar (bool): display updating progress bar (tqdm)
            plot_live_loss (bool): plot live training loss,
                requires [live] install or livelossplot package installed.
        Returns:
            metrics with training and potentially evaluation metrics
        """
        self.data_freq = freq
        if epochs is not None:
            default_epochs = self.config_train.epochs
            self.config_train.epochs = epochs
        if self.fitted is True:
            log.warning("Model has already been fitted. Re-fitting will produce different results.")
        df = df_utils.check_dataframe(
            df, check_y=True)
        df = self._handle_missing_data(df, freq=self.data_freq)
        if validate_each_epoch:
            df_train, df_val = df_utils.split_df(df, n_lags=self.n_lags, n_forecasts=self.n_forecasts, valid_p=valid_p)
            metrics_df = self._train(df_train, df_val, progress_bar=progress_bar, plot_live_loss=plot_live_loss)
        else:
            metrics_df = self._train(df, progress_bar=progress_bar, plot_live_loss=plot_live_loss)
        if epochs is not None:
            self.config_train.epochs = default_epochs
        self.fitted = True

        return metrics_df
    
    def test(self, df):
        """Evaluate model on holdout data.

        Args:
            df (pd.DataFrame): containing column 'ds', 'y' with holdout data
        Returns:
            df with evaluation metrics
        """
        if self.fitted is False:
            log.warning("Model has not been fitted. Test results will be random.")
        df = df_utils.check_dataframe(df, check_y=True)
        df = self._handle_missing_data(df, freq=self.data_freq)
        loader = self._init_val_loader(df)
        val_metrics_df = self._evaluate(loader)
        return val_metrics_df
    
    
    
    def make_future_dataframe(self, df, periods=None, n_historic_predictions=0):
        df = df.copy(deep=True)
        
        n_lags = 0 if self.n_lags is None else self.n_lags
        if periods is None:
            periods = 1 if n_lags == 0 else self.n_forecasts
        else:
            assert periods >= 0

        if isinstance(n_historic_predictions, bool):
            if n_historic_predictions:
                n_historic_predictions = len(df) - n_lags
            else:
                n_historic_predictions = 0
        elif not isinstance(n_historic_predictions, int):
            log.error("non-integer value for n_historic_predictions set to zero.")
            n_historic_predictions = 0

        if periods == 0 and n_historic_predictions == 0:
            raise ValueError("Set either history or future to contain more than zero values.")
            
        last_date = pd.to_datetime(df["ds"].copy(deep=True)).sort_values().max()

        if len(df) < n_lags:
            raise ValueError("Insufficient data for a prediction")
        elif len(df) < n_lags + n_historic_predictions:
            log.warning(
                "Insufficient data for {} historic forecasts, reduced to {}.".format(
                    n_historic_predictions, len(df) - n_lags
                )
            )
            n_historic_predictions = len(df) - n_lags
        if (n_historic_predictions + n_lags) == 0:
            df = pd.DataFrame(columns=df.columns)
        else:
            df = df[-(n_lags + n_historic_predictions) :]

        if len(df) > 0:
            if len(df.columns) == 1 and "ds" in df:
                assert n_lags == 0
                df = df_utils.check_dataframe(df, check_y=False)
            else:
                df = df_utils.check_dataframe(
                    df, check_y=n_lags > 0
                )
                df = self._handle_missing_data(df, freq=self.data_freq, predicting=True)
            df = df_utils.normalize(df, self.data_params)

        # future data
        # check for external events known in future
        
        if n_lags > 0:
            if periods > 0 and periods != self.n_forecasts:
                periods = self.n_forecasts
                log.warning(
                    "Number of forecast steps is defined by n_forecasts. " "Adjusted to {}.".format(self.n_forecasts)
                )

        if periods > 0:
            future_df = df_utils.make_future_df(
                df_columns=df.columns,
                last_date=last_date,
                periods=periods,
                freq=self.data_freq,
            )
            future_df = df_utils.normalize(future_df, self.data_params)
            if len(df) > 0:
                df = df.append(future_df)
            else:
                df = future_df
        df.reset_index(drop=True, inplace=True)
        return df

    def predict(self, df):
        """Runs the model to make predictions.

        and compute stats (MSE, MAE)
        Args:
            df (pandas DataFrame): Dataframe with columns 'ds' datestamps, 'y' time series values and
                other external variables

        Returns:
            df_forecast (pandas DataFrame): columns 'ds', 'y', 'trend' and ['yhat<i>']
        """
        # TODO: Implement data sanity checks?
        if self.fitted is False:
            log.warning("Model has not been fitted. Predictions will be random.")
        dataset = self._create_dataset(df, predict_mode=True)
        loader = DataLoader(dataset, batch_size=min(1024, len(df)), shuffle=False, drop_last=False)

        predicted_vectors = list()
        component_vectors = None
        with torch.no_grad():
            self.model.eval()
            for inputs, _ in loader:
                predicted = self.model.forward(inputs)
                predicted_vectors.append(predicted.detach().numpy())
                
        predicted = np.concatenate(predicted_vectors)

        scale_y, shift_y = self.data_params["y"].scale, self.data_params["y"].shift
        predicted = predicted * scale_y + shift_y

        cols = ["ds", "y"]  # cols to keep from df
        df_forecast = pd.concat((df[cols],), axis=1)

        # create a line for each forecast_lag
        # 'yhat<i>' is the forecast for 'y' at 'ds' from i steps ago.
        for i in range(self.n_forecasts):
            forecast_lag = i + 1
            forecast = predicted[:, forecast_lag - 1]
            pad_before = self.n_lags + forecast_lag - 1
            pad_after = self.n_forecasts - forecast_lag
            yhat = np.concatenate(([None] * pad_before, forecast, [None] * pad_after))
            df_forecast["yhat{}".format(i + 1)] = yhat
            df_forecast["residual{}".format(i + 1)] = yhat - df_forecast["y"]

        
        return df_forecast

    
    
    
    

In [14]:
data_location = "../"
df = pd.read_csv(data_location + "example_data/yosemite_temps.csv").iloc[:1000]
df.head(3)

Unnamed: 0,ds,y
0,2017-05-01 00:00:00,27.8
1,2017-05-01 00:05:00,27.0
2,2017-05-01 00:10:00,26.8


In [15]:
m = LSTM(n_lags = 10,
            n_forecasts=3,
            num_hidden_layers=1,
            d_hidden=64,
            learning_rate=0.1,
            epochs=10,
            batch_size=None,
            loss_func="Huber",
            optimizer="AdamW",
            train_speed=None,
            normalize="auto",
            impute_missing=True,
            lstm_bias = True,
            lstm_biderectional = False)

In [16]:
metrics_df = m.fit(df, freq = '5min', validate_each_epoch=False)

INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
  0%|          | 0/10 [00:00<?, ?it/s]GPU available: False, used: False
TPU available: False, using: 0 TPU cores


  | Name      | Type         | Params
-------------------------------------------
0 | lstm      | LSTM         | 19.5 K
1 | linear    | Linear       | 195   
2 | loss_func | SmoothL1Loss | 0     
-------------------------------------------
19.7 K    Trainable params
0         Non-trainable params
19.7 K    Total params
0.079     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]




Training: 0it [00:00, ?it/s]

In [17]:
metrics_df

Unnamed: 0,SmoothL1Loss,MAE,MSE
0,0.05994,9.500672,153.647242
1,0.024996,6.748224,64.072436
2,0.005584,3.04318,14.312893
3,0.002526,1.935654,6.475463
4,0.001938,1.686954,4.966622
5,0.001854,1.557276,4.753134
6,0.001737,1.48376,4.452983
7,0.001581,1.420255,4.05202
8,0.0015,1.396395,3.845985
9,0.001354,1.307024,3.470326


In [18]:
m.test(df)




Testing: 0it [00:00, ?it/s]

Unnamed: 0,SmoothL1Loss,MAE,MSE
0,0.001187,1.285098,3.042945


In [19]:
x = m.make_future_dataframe(df, periods=100, n_historic_predictions=10)

Number of forecast steps is defined by n_forecasts. Adjusted to 3.


In [20]:
m.predict(x)

Unnamed: 0,ds,y,yhat1,residual1,yhat2,residual2,yhat3,residual3
0,2017-05-04 09:40:00,8.7,,,,,,
1,2017-05-04 09:45:00,8.5,,,,,,
2,2017-05-04 09:50:00,8.4,,,,,,
3,2017-05-04 09:55:00,8.6,,,,,,
4,2017-05-04 10:00:00,8.5,,,,,,
5,2017-05-04 10:05:00,8.5,,,,,,
6,2017-05-04 10:10:00,8.7,,,,,,
7,2017-05-04 10:15:00,8.6,,,,,,
8,2017-05-04 10:20:00,8.5,,,,,,
9,2017-05-04 10:25:00,8.3,,,,,,


In [259]:
plt.plot(df.ds, df.y)

NameError: name 'plt' is not defined

In [232]:
df = df_utils.check_dataframe(
            df, check_y=True)

In [233]:
df = m._handle_missing_data(df, freq='D')

  0%|          | 0/400 [48:28<?, ?it/s]
  0%|          | 0/400 [42:29<?, ?it/s]
  0%|          | 0/400 [40:39<?, ?it/s]


In [234]:
df_train, df_val = df_utils.split_df(df, n_lags=10, n_forecasts=1, valid_p=0.1)

In [235]:
m._init_train_loader(df)

<torch.utils.data.dataloader.DataLoader at 0x7fd33d1732b0>

In [236]:
df

Unnamed: 0,ds,y,t,y_scaled
0,2017-05-01,27.8,0.000000,0.613466
1,2017-05-02,29.4,0.015385,0.653367
2,2017-05-03,31.3,0.030769,0.700748
3,2017-05-04,33.1,0.046154,0.745636
4,2017-05-05,33.6,0.061538,0.758105
...,...,...,...,...
61,2017-07-01,42.2,0.938462,0.972569
62,2017-07-02,40.6,0.953846,0.932668
63,2017-07-03,41.0,0.969231,0.942643
64,2017-07-04,40.9,0.984615,0.940150


In [238]:
a = time_dataset.TimeDataset(
            df,
            n_lags=n_lags,
            n_forecasts=n_forecasts,
            predict_mode=False,
        )

<neuralprophet.time_dataset.TimeDataset at 0x7fd33d1cabb0>

In [113]:
m.fit(df, freq = 'D')

INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 16
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 400


  0%|          | 0/190 [00:00<?, ?it/s]

<neuralprophet.time_dataset.TimeDataset object at 0x7fd3381d0a00>


AttributeError: 'dict' object has no attribute 'size'

In [91]:
a = LSTM(input_size = 10,
     hidden_size = 64,
     num_layers = 100,
     bias = False,
     bidirectional = False
     )

In [93]:
a.forward()

TypeError: forward() missing 1 required positional argument: 'x'

In [169]:
from torch.utils.data import Dataset, DataLoader


In [172]:
class TimeseriesDataset(Dataset):   
    '''
    Custom Dataset subclass. 
    Serves as input to DataLoader to transform X 
      into sequence data using rolling window. 
    DataLoader using this dataset will output batches 
      of `(batch_size, seq_len, n_features)` shape.
    Suitable as an input to RNNs. 
    '''
    def __init__(self, X: np.ndarray, y: np.ndarray, seq_len: int = 1):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

    def __len__(self):
        return self.X.__len__() - (self.seq_len-1)

    def __getitem__(self, index):
        return (self.X[index:index+self.seq_len], self.y[index+self.seq_len-1])

In [None]:
class TSDataModule(pl.LightningDataModule):
    '''
    PyTorch Lighting DataModule subclass:
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html

    Serves the purpose of aggregating all data loading 
      and processing work in one place.
    '''
    
    def __init__(self, seq_len = 1, batch_size = 128, num_workers=0):
        super().__init__()
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.X_test = None
        self.columns = None
        self.preprocessing = None

    def prepare_data(self):
        pass

    def setup(self, stage=None):
        '''
        Data is resampled to hourly intervals.
        Both 'np.nan' and '?' are converted to 'np.nan'
        'Date' and 'Time' columns are merged into 'dt' index
        '''

        if stage == 'fit' and self.X_train is not None:
            return 
        if stage == 'test' and self.X_test is not None:
            return
        if stage is None and self.X_train is not None and self.X_test is not None:  
            return
        
        
        df = pd.read_csv(
            path, 
            sep=';', 
            parse_dates={'dt' : ['Date', 'Time']}, 
            infer_datetime_format=True, 
            low_memory=False, 
            na_values=['nan','?'], 
            index_col='dt'
        )

        df_resample = df.resample('h').mean()

        X = df_resample.dropna().copy()
        y = X['Global_active_power'].shift(-1).ffill()
        self.columns = X.columns


        X_cv, X_test, y_cv, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=False
        )
    
        X_train, X_val, y_train, y_val = train_test_split(
            X_cv, y_cv, test_size=0.25, shuffle=False
        )

        preprocessing = StandardScaler()
        preprocessing.fit(X_train)

        if stage == 'fit' or stage is None:
            self.X_train = preprocessing.transform(X_train)
            self.y_train = y_train.values.reshape((-1, 1))
            self.X_val = preprocessing.transform(X_val)
            self.y_val = y_val.values.reshape((-1, 1))

        if stage == 'test' or stage is None:
            self.X_test = preprocessing.transform(X_test)
            self.y_test = y_test.values.reshape((-1, 1))
        

    def train_dataloader(self):
        train_dataset = TimeseriesDataset(self.X_train, 
                                          self.y_train, 
                                          seq_len=self.seq_len)
        train_loader = DataLoader(train_dataset, 
                                  batch_size = self.batch_size, 
                                  shuffle = False, 
                                  num_workers = self.num_workers)
        
        return train_loader

    def val_dataloader(self):
        val_dataset = TimeseriesDataset(self.X_val, 
                                        self.y_val, 
                                        seq_len=self.seq_len)
        val_loader = DataLoader(val_dataset, 
                                batch_size = self.batch_size, 
                                shuffle = False, 
                                num_workers = self.num_workers)

        return val_loader

    def test_dataloader(self):
        test_dataset = TimeseriesDataset(self.X_test, 
                                         self.y_test, 
                                         seq_len=self.seq_len)
        test_loader = DataLoader(test_dataset, 
                                 batch_size = self.batch_size, 
                                 shuffle = False, 
                                 num_workers = self.num_workers)

        return test_loader

In [174]:
n_lags = 10
n_forecasts = 1
valid_p = 0.1

In [248]:
freq = '5min'

In [249]:
df_for_tsd = df.copy(deep = True)
df_for_tsd['ds'] = pd.to_datetime(df_for_tsd['ds'])
df_for_tsd = pd.DataFrame(pd.date_range(start=df_for_tsd.ds.min(), end=df_for_tsd.ds.max(), freq = freq),
             columns = ['ds']).merge(df_for_tsd, how = 'left')
df_for_tsd = df_for_tsd.sort_values('ds')
df_for_tsd = df_for_tsd.reset_index()
df_for_tsd.columns = ['time_idx', 'date', 'value']
df_for_tsd['series'] = 0

In [251]:
df_for_tsd

Unnamed: 0,time_idx,date,value,series
0,0,2017-05-01 00:00:00,27.8,0
1,1,2017-05-01 00:05:00,27.0,0
2,2,2017-05-01 00:10:00,26.8,0
3,3,2017-05-01 00:15:00,26.5,0
4,4,2017-05-01 00:20:00,25.6,0
...,...,...,...,...
18716,18716,2017-07-04 23:40:00,42.8,0
18717,18717,2017-07-04 23:45:00,43.0,0
18718,18718,2017-07-04 23:50:00,42.1,0
18719,18719,2017-07-04 23:55:00,42.1,0


In [252]:
n_lags = 10
n_forecasts = 5

In [253]:
df_for_tsd.iloc[50]

time_idx                     50
date        2017-05-01 04:10:00
value                       8.5
series                        0
Name: 50, dtype: object

In [254]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
# import dataset, network to train and metric to optimize
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, QuantileLoss


In [257]:
# define the dataset, i.e. add metadata to pandas dataframe for the model to understand it
max_encoder_length = 36
max_prediction_length = 6
training_cutoff = "2017-06-20 04:10:00"  # day for cutoff

training = TimeSeriesDataSet(
    df_for_tsd[lambda x: x.date <= training_cutoff],
    time_idx= 'time_idx',  # column name of time of observation
    target= 'value',  # column name of target to predict
    group_ids= ['series'],  # column name(s) for timeseries IDs
    max_encoder_length=max_encoder_length,  # how much history to use
    max_prediction_length=max_prediction_length,  # how far to predict into future
    
)

ValueError: 12 (0.08%) of value values were found to be NA or infinite (even after encoding). NA values are not allowed `allow_missings` refers to missing rows, not to missing values. Possible strategies to fix the issue are (a) dropping the variable value, (b) using `NaNLabelEncoder(add_nan=True)` for categorical variables, (c) filling missing values and/or (d) optionally adding a variable indicating filled values

In [256]:
training

<pytorch_forecasting.data.timeseries.TimeSeriesDataSet at 0x7fd359af3df0>

In [200]:
# create validation dataset using the same normalization techniques as for the training dataset
validation = TimeSeriesDataSet.from_dataset(training, df_for_tsd,
                                            min_prediction_idx=training.index.time.max() + 1,
                                            stop_randomization=True)


In [216]:
# convert datasets to dataloaders for training
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2)


In [226]:
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])


In [229]:
for i in iter(val_dataloader):
    print(i)

({'encoder_cat': tensor([], size=(25, 36, 0), dtype=torch.int64), 'encoder_cont': tensor([], size=(25, 36, 0)), 'encoder_target': tensor([[27.8000, 29.4000, 31.3000, 33.1000, 33.6000, 22.7000,  4.4000, 11.2000,
         19.9000, 26.5000, 27.6000, 19.1000, 17.5000, 20.8000, 16.3000,  3.2000,
          7.3000, 22.0000, 26.3000, 28.0000, 31.6000, 33.3000, 36.1000, 35.0000,
         31.9000, 31.6000, 29.8000, 34.9000, 37.5000, 36.1000, 27.5000, 12.8000,
         26.2000, 37.7000, 37.3000, 36.9000],
        [29.4000, 31.3000, 33.1000, 33.6000, 22.7000,  4.4000, 11.2000, 19.9000,
         26.5000, 27.6000, 19.1000, 17.5000, 20.8000, 16.3000,  3.2000,  7.3000,
         22.0000, 26.3000, 28.0000, 31.6000, 33.3000, 36.1000, 35.0000, 31.9000,
         31.6000, 29.8000, 34.9000, 37.5000, 36.1000, 27.5000, 12.8000, 26.2000,
         37.7000, 37.3000, 36.9000, 37.2000],
        [31.3000, 33.1000, 33.6000, 22.7000,  4.4000, 11.2000, 19.9000, 26.5000,
         27.6000, 19.1000, 17.5000, 20.8000, 16.3

In [217]:
# create PyTorch Lighning Trainer with early stopping
lr_logger = LearningRateMonitor()
trainer = pl.Trainer(
    max_epochs=100,
    gpus=0,  # run on CPU, if on multiple GPUs, use accelerator="ddp"
    gradient_clip_val=0.1,
    limit_train_batches=30,  # 30 batches per epoch
    callbacks=[lr_logger],
    logger=TensorBoardLogger("lightning_logs")
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [224]:
# define network to train - the architecture is mostly inferred from the dataset, so that only a few hyperparameters have to be set by the user
tft = LSTM.from_dataset(
    # dataset
    training,

)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

AttributeError: type object 'LSTM' has no attribute 'from_dataset'

In [223]:
trainer.fit(
    tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader,
)


   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 0     
3  | prescalers                         | ModuleDict                      | 0     
4  | static_variable_selection          | VariableSelectionNetwork        | 0     
5  | encoder_variable_selection         | VariableSelectionNetwork        | 0     
6  | decoder_variable_selection         | VariableSelectionNetwork        | 0     
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K 
9  | static_context_initial_cell_lstm   | GatedResidualNetwork            | 4.3 

Validation sanity check: 0it [00:00, ?it/s]

StopIteration: 

In [None]:
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])
predictions = tft.predict(val_dataloader)
print(f"Mean absolute error of model: {(actuals - predictions).abs().mean()}")


In [None]:
tabularize_univariate_datetime

In [177]:
TimeseriesDataset(df_train, 
                  df_val, 
                  seq_len=n_lags)

ValueError: could not determine the shape of object type 'DataFrame'

In [258]:
df

Unnamed: 0,ds,y
0,2017-05-01 00:00:00,27.8
1,2017-05-01 00:05:00,27.0
2,2017-05-01 00:10:00,26.8
3,2017-05-01 00:15:00,26.5
4,2017-05-01 00:20:00,25.6
...,...,...
18716,2017-07-04 23:40:00,42.8
18717,2017-07-04 23:45:00,43.0
18718,2017-07-04 23:50:00,42.1
18719,2017-07-04 23:55:00,42.1


In [259]:
class TimeseriesDataset(Dataset):   
    '''
    Custom Dataset subclass. 
    Serves as input to DataLoader to transform X 
      into sequence data using rolling window. 
    DataLoader using this dataset will output batches 
      of `(batch_size, seq_len, n_features)` shape.
    Suitable as an input to RNNs. 
    '''
    def __init__(self, X: np.ndarray, y: np.ndarray, seq_len: int = 1):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

    def __len__(self):
        return self.X.__len__() - (self.seq_len-1)

    def __getitem__(self, index):
        return (self.X[index:index+self.seq_len], self.y[index+self.seq_len-1])

In [None]:
class PowerConsumptionDataModule(pl.LightningDataModule):
    '''
    PyTorch Lighting DataModule subclass:
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html

    Serves the purpose of aggregating all data loading 
      and processing work in one place.
    '''
    
    def __init__(self, seq_len = 1, batch_size = 128, num_workers=0):
        super().__init__()
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.X_test = None
        self.columns = None
        self.preprocessing = None

    def prepare_data(self):
        pass

    def setup(self, stage=None):
        
        if stage == 'fit' and self.X_train is not None:
            return 
        if stage == 'test' and self.X_test is not None:
            return
        if stage is None and self.X_train is not None and self.X_test is not None:  
            return
        

        X = df_resample.dropna().copy()
        y = X['Global_active_power'].shift(-1).ffill()
        self.columns = X.columns


        X_cv, X_test, y_cv, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=False
        )
    
        X_train, X_val, y_train, y_val = train_test_split(
            X_cv, y_cv, test_size=0.25, shuffle=False
        )

        preprocessing = StandardScaler()
        preprocessing.fit(X_train)

        if stage == 'fit' or stage is None:
            self.X_train = preprocessing.transform(X_train)
            self.y_train = y_train.values.reshape((-1, 1))
            self.X_val = preprocessing.transform(X_val)
            self.y_val = y_val.values.reshape((-1, 1))

        if stage == 'test' or stage is None:
            self.X_test = preprocessing.transform(X_test)
            self.y_test = y_test.values.reshape((-1, 1))
        

    def train_dataloader(self):
        train_dataset = TimeseriesDataset(self.X_train, 
                                          self.y_train, 
                                          seq_len=self.seq_len)
        train_loader = DataLoader(train_dataset, 
                                  batch_size = self.batch_size, 
                                  shuffle = False, 
                                  num_workers = self.num_workers)
        
        return train_loader

    def val_dataloader(self):
        val_dataset = TimeseriesDataset(self.X_val, 
                                        self.y_val, 
                                        seq_len=self.seq_len)
        val_loader = DataLoader(val_dataset, 
                                batch_size = self.batch_size, 
                                shuffle = False, 
                                num_workers = self.num_workers)

        return val_loader

    def test_dataloader(self):
        test_dataset = TimeseriesDataset(self.X_test, 
                                         self.y_test, 
                                         seq_len=self.seq_len)
        test_loader = DataLoader(test_dataset, 
                                 batch_size = self.batch_size, 
                                 shuffle = False, 
                                 num_workers = self.num_workers)

        return test_loader

In [262]:
from neuralprophet.time_dataset import tabularize_univariate_datetime

In [269]:
df

Unnamed: 0,ds,y,t,y_scaled
0,2017-05-01 00:00:00,27.8,0.000000,
1,2017-05-01 00:05:00,27.0,0.000053,
2,2017-05-01 00:10:00,26.8,0.000107,
3,2017-05-01 00:15:00,26.5,0.000160,
4,2017-05-01 00:20:00,25.6,0.000214,
...,...,...,...,...
18716,2017-07-04 23:40:00,42.8,0.999786,
18717,2017-07-04 23:45:00,43.0,0.999840,
18718,2017-07-04 23:50:00,42.1,0.999893,
18719,2017-07-04 23:55:00,42.1,0.999947,


In [278]:
data_location = "../"
df = pd.read_csv(data_location + "example_data/yosemite_temps.csv")
df.head(3)

Unnamed: 0,ds,y
0,2017-05-01 00:00:00,27.8
1,2017-05-01 00:05:00,27.0
2,2017-05-01 00:10:00,26.8


In [279]:
df = df_utils.check_dataframe(
            df, check_y=True)
# df = self._handle_missing_data(df, freq=self.data_freq)

In [280]:
data_params = df_utils.init_data_params(
                df,
                normalize="auto",
                covariates_config=None,
                regressor_config=None,
                events_config=None,
            )

In [281]:
df = df_utils.normalize(df, data_params)

In [282]:
df

Unnamed: 0,ds,y,t,y_scaled
0,2017-05-01 00:00:00,27.8,0.000000,
1,2017-05-01 00:05:00,27.0,0.000053,
2,2017-05-01 00:10:00,26.8,0.000107,
3,2017-05-01 00:15:00,26.5,0.000160,
4,2017-05-01 00:20:00,25.6,0.000214,
...,...,...,...,...
18716,2017-07-04 23:40:00,42.8,0.999786,
18717,2017-07-04 23:45:00,43.0,0.999840,
18718,2017-07-04 23:50:00,42.1,0.999893,
18719,2017-07-04 23:55:00,42.1,0.999947,


In [284]:
tabularize_univariate_datetime(df)[0]

OrderedDict([('time',
              array([[0.00000000e+00],
                     [5.34188034e-05],
                     [1.06837607e-04],
                     ...,
                     [9.99893162e-01],
                     [9.99946581e-01],
                     [1.00000000e+00]]))])