In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
import pandas as pd
import numpy as np
from datetime import datetime
import time
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
# NOTE(review): `neigh` is not referenced by any cell visible in this notebook
# excerpt — confirm it is still needed.
neigh = KNeighborsRegressor(n_neighbors=2)


from darts import TimeSeries
from darts.models import (NaiveSeasonal, NaiveDrift, Prophet,
                          ExponentialSmoothing, ARIMA, AutoARIMA,
                          RegressionModel, Theta, FFT)
from darts.utils.utils import ModelMode, SeasonalityMode, TrendMode
from darts.metrics import mape, mase, r2_score, smape
from darts.utils.statistics import check_seasonality, plot_acf, plot_residuals_analysis, extract_trend_and_seasonality

import warnings
# Suppress every Python warning for the rest of the kernel session.
warnings.filterwarnings("ignore")
import logging
# Disable all logging calls of severity CRITICAL and below.
logging.disable(logging.CRITICAL)

In [None]:
# Read the raw profile-growth export and make sure the date column is a datetime.
df = pd.read_csv("../../data/later/profile_growth.csv")
df['Date'] = pd.to_datetime(df['Date'])

# Build the darts TimeSeries: 'Date' is the time axis, 'Followers' the value.
series = TimeSeries.from_dataframe(df, time_col='Date', value_cols='Followers')

In [None]:
# Open a wide figure and draw the full series on it.
plt.figure(figsize=(10, 4))
series.plot()

### Creating a training and validation series
First, let's split our TimeSeries into a training and a validation series. Note: in general, it is also a good practice to keep a test series aside and never touch it until the end of the process. Here, we just build a training and a validation series for simplicity.

#### Validation - 2 weeks

In [None]:
# Hold out the last 14 observations as the validation set
# (two weeks, assuming one observation per day — TODO confirm).
plt.figure(figsize=(10, 4))
# With an integer split point, split_after(i) keeps index i in the first part,
# so split_after(n - 14) left only 13 validation points. split_before(n - 14)
# puts index n - 14 in the second part and yields exactly 14.
train, val = series.split_before(len(series) - 14)
train.plot(label='training')
val.plot(label='validation')
plt.legend();

### Quickly try a few more models
Let's train a few models and compute their respective MAPE on the validation set:

In [None]:
def eval_model(model):
    """Fit ``model`` on ``train``, forecast the validation horizon, and report MAPE.

    The model is fitted on the notebook-level ``train`` series and asked for
    ``len(val)`` steps. The MAPE against ``val`` is printed and used as the
    title of a new figure showing the full ``series`` and the forecast.

    Parameters
    ----------
    model
        A darts forecasting model instance exposing ``fit`` and ``predict``.

    Returns
    -------
    None
        Kept as ``None`` so that calling this as the last expression of a
        cell does not echo a value.
    """
    model.fit(train)
    forecast = model.predict(len(val))

    # darts metrics take (actual, predicted); MAPE divides by the actual
    # values, so the argument order matters. Compute it once and reuse it so
    # the printed value and the title always agree.
    score = mape(val, forecast)
    print('model {} obtains MAPE: {:.2f}%'.format(model, score))

    # Start a new figure per call: several calls in one cell would otherwise
    # draw onto the same axes and overwrite each other's title.
    plt.figure(figsize=(10, 4))
    plt.title("MAPE = {:.2f}%".format(score))
    series.plot(label="actual")
    forecast.plot(label=f"{model}:forecast")
    plt.legend()


# Evaluate each candidate with its default settings, in the same order as before.
for candidate_cls in (ExponentialSmoothing, Prophet, AutoARIMA, Theta, FFT):
    eval_model(candidate_cls())

In [None]:
# Baseline: AutoARIMA fitted on the untransformed training series.
horizon = len(val)
model = AutoARIMA()
model.fit(train)
forecast = model.predict(horizon)

baseline_mape = mape(val, forecast)
print('model {} obtains MAPE: {:.2f}%'.format(model, baseline_mape))

# Overlay the forecast on the full series.
series.plot(label="actual")
forecast.plot(label="forecast")
plt.legend();

### Scaled

Normalize the time series (note: we avoid fitting the transformer on the validation set)


In [None]:
from darts.dataprocessing.transformers import Scaler

# Fit the scaler on the training split only, then apply it to the other series.
transformer = Scaler()
train_transformed = transformer.fit_transform(train)
val_transformed = transformer.transform(val)
series_transformed = transformer.transform(series)

# Fit and forecast in the scaled space over the validation horizon.
horizon = len(val_transformed)
model = AutoARIMA()
model.fit(train_transformed)
preds = model.predict(horizon)

# Map the forecast back to the original units before scoring against `val`.
forecast = transformer.inverse_transform(preds)
scaled_mape = mape(val, forecast)
print('model {} obtains MAPE: {:.2f}%'.format(model, scaled_mape))

series.plot(label="actual")
forecast.plot(label=f"{model}:forecast")
plt.legend()

### Box Cox Transformed

In [None]:
from darts.dataprocessing.transformers import BoxCox
from darts.utils.utils import ModelMode, SeasonalityMode, TrendMode


# Fit the Box-Cox transformer on the training split only, then apply it to
# the validation split and the full series.
transformer_boxcox = BoxCox()
train_transformed = transformer_boxcox.fit_transform(train)
val_transformed = transformer_boxcox.transform(val)
series_transformed = transformer_boxcox.transform(series)

# Fit and forecast in the Box-Cox space over the validation horizon.
model = AutoARIMA()
model.fit(train_transformed)
preds = model.predict(len(val_transformed))

# Invert with the SAME transformer that produced the training data. The
# previous code called `transformer.inverse_transform`, i.e. the Scaler from
# the "Scaled" section, which maps Box-Cox values back with the wrong function.
forecast = transformer_boxcox.inverse_transform(preds)

print('model {} obtains MAPE: {:.2f}%'.format(model, mape(val, forecast)))

series.plot(label="actual")
forecast.plot(label=f"{model}:forecast")
plt.legend()

### Backtesting

Compute error values that the model would have produced when used on series.

It repeatedly builds a training set from the beginning of series. It trains the current model on the training set, emits a forecast of length equal to forecast_horizon, and then moves the end of the training set forward by stride time steps. A metric (given by the metric function) is then evaluated on the forecast and the actual values. Finally, the method returns a reduction (the mean by default) of all these metric scores.

In [None]:
from datetime import timedelta

# Backtesting starts this many days before the last date in the data.
days_to_subtract = 14
last_observed_date = df['Date'].max()
start_timestamp = last_observed_date - timedelta(days=days_to_subtract)

In [None]:
# Backtest from `start_timestamp` with a 1-step horizon; the per-forecast MAPE
# scores are combined with backtest's default reduction.
mean_backtest_kwargs = dict(
    start=start_timestamp,
    forecast_horizon=1,
    metric=mape,
    last_points_only=True,
    verbose=True,
)
average_error = model.backtest(series, **mean_backtest_kwargs)

print("Average error (MAPE) over all historical forecasts: {}".format(average_error))

In [None]:
# Same backtest, but the per-forecast MAPE scores are reduced with the median.
median_error = model.backtest(
    series,
    start=start_timestamp,
    forecast_horizon=1,
    metric=mape,
    last_points_only=True,
    reduction=np.median,
    verbose=True,
)

median_message = "Median error (MAPE) over all historical forecasts: {}"
print(median_message.format(median_error))

In [None]:
# reduction=None: keep the individual MAPE scores instead of aggregating them.
raw_backtest_kwargs = dict(
    start=start_timestamp,
    forecast_horizon=1,
    metric=mape,
    last_points_only=True,
    reduction=None,
    verbose=True,
)
raw_errors = model.backtest(series, **raw_backtest_kwargs)

# Distribution of the individual scores.
plt.hist(raw_errors)
plt.title("Individual error scores (histogram)")
plt.show()

In [None]:
# Historical forecasts from `start_timestamp` onward with a 7-step horizon,
# keeping only the last point of each forecast.
historical_kwargs = dict(
    start=start_timestamp,
    forecast_horizon=7,
    last_points_only=True,
    verbose=True,
)
historical_fcast = model.historical_forecasts(series, **historical_kwargs)

Let's see what this backtest forecast looks like. 

You can see it produces more accurate predictions at a 7-day horizon than the one-off prediction over the whole validation period done above, because here the model is re-fit every day.

In [None]:
# Overlay the historical forecasts on the observed data.
series.plot(label='data')
# The forecasts were produced with forecast_horizon=7, so label them as such
# (the label previously said "3-days").
historical_fcast.plot(label='backtest 7-days ahead forecast')
# darts metrics take (actual, predicted); MAPE normalises by the actual
# values, so `series` must come first.
plt.title('MAPE = {:.2f}%'.format(mape(series,
                                       historical_fcast)))
plt.legend()

Let's look at the fitted value residuals of our current `AutoARIMA` model, i.e. the difference between the 1-step forecasts at every point in time obtained by fitting the model on all previous points, and the actual observed values.

In [None]:
# Compute the model's residuals on the full series, then plot the diagnostics.
residuals = model.residuals(series)
plot_residuals_analysis(residuals)

In [None]:
# Display the residuals the model yields on `series` as the cell output.
model.residuals(series)

In [None]:
from darts.utils.statistics import plot_hist

# Histogram of the model's residuals on the full series.
residuals = model.residuals(series)
plot_hist(residuals)

In [None]:
# NOTE(review): bare name with no call — this cell only evaluates the function
# object and shows its repr; it looks like a leftover and plots nothing.
plot_residuals_analysis

### Error Analysis

In [None]:
# Convert the latest forecast to a DataFrame with a single 'Label' column.
predictions = forecast.pd_dataframe()
predictions.columns = ['Label']

In [None]:
# Round the predictions to the nearest whole number.
predictions['Label'] = predictions['Label'].round()

In [None]:
import seaborn as sns

# Build the validation DataFrame here: this cell previously read `test` before
# any cell had defined it, which raises NameError on Restart & Run All.
test = TimeSeries.pd_dataframe(val)
test.columns = ['Followers']

# Forecast error per time step (prediction minus actual), as a one-column frame.
error = (predictions['Label'] - test['Followers']).astype('int').to_frame()

In [None]:
# Validation series as a DataFrame with a single 'Followers' column.
test = val.pd_dataframe()
test.columns = ['Followers']

In [None]:
# Name the single column and summarise the error distribution.
error.columns = ['errors']
errors_mean = error['errors'].mean()
errors_std = error['errors'].std()

fig, ax = plt.subplots(figsize=(7, 3))
sns.distplot(a=error['errors'], ax=ax, bins=15, rug=True)

# Dashed vertical markers at the mean and at two standard deviations either
# side; the lower bound carries no label of its own.
marker_specs = [
    (errors_mean, dict(color='b', label=r'$\mu$')),
    (errors_mean + 2 * errors_std, dict(color='r', label=r'$\mu \pm 2\sigma$')),
    (errors_mean - 2 * errors_std, dict(color='k')),
]
for position, line_style in marker_specs:
    ax.axvline(x=position, linestyle='--', **line_style)

ax.legend()
ax.set(title='Model Errors');

In [None]:
# NOTE: this import rebinds `plot_acf`, replacing the darts function of the
# same name imported in the first cell.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation of the forecast errors on the upper axis.
fig, ax = plt.subplots(2, 1, figsize=(8, 6))
# A stray trailing comma used to turn this call into a one-element tuple whose
# repr was echoed under the cell; the semicolon suppresses the output instead.
plot_acf(x=error['errors'], ax=ax[0]);
# NOTE(review): the partial-autocorrelation plot is disabled, so the lower axis
# stays empty — confirm whether it should be restored or the axis removed.
#plot_pacf(x=error['errors'], ax=ax[1]);