# Timeseries Analysis

In [None]:
from typing import Dict
from collections import OrderedDict

In [None]:
import numpy as np
import pandas as pd
import random
import statsmodels.formula.api as smf
import statsmodels.tsa.stattools as smtsa

In [None]:
import sys
sys.path.append('lib')

import compstats
import hypothesis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from IPython.core.pylabtools import figsize
sns.set_theme()
figsize(11, 5)

In [None]:
from IPython.display import display

Load the data from "Price of Weed".

In [None]:
# NOTE(review): parse_dates=[5] picks the column to parse by position --
# presumably the 'date' column that group_by_day groups on; confirm against
# the CSV header.
transactions = pd.read_csv('data/mj-clean.csv', parse_dates=[5])
transactions.head()

The following function takes a DataFrame of transactions and computes daily averages.

In [None]:
def group_by_day(transactions: pd.DataFrame, func=np.mean) -> pd.DataFrame:
    """Groups transactions by day and aggregates the daily ppg.

    transactions: DataFrame of transactions with 'date' and 'ppg' columns
    func: aggregation applied to each day's prices (the mean by default)

    returns: DataFrame of daily prices indexed by date, with columns
        'ppg', 'date' and 'years' (time elapsed since the first day)
    """
    grouped = transactions[['date', 'ppg']].groupby('date')
    daily = grouped.aggregate(func)

    daily['date'] = daily.index
    # groupby sorts its keys, so the earliest date is also the first row.
    # Taking it from the index avoids `daily.date[0]`, an integer lookup on a
    # date-indexed Series that newer pandas no longer treats as positional.
    start = daily.index.min()
    # np.timedelta64(1, 'Y') is rejected by recent pandas because a "year" is
    # not a fixed duration; 365.2425 days is the same length numpy used for it.
    one_year = pd.Timedelta(days=365.2425)
    daily['years'] = (daily['date'] - start) / one_year

    return daily

In [None]:
group_by_day(transactions)

The following function returns a map from quality name to a DataFrame of daily averages.

In [None]:
def group_by_quality_and_day(transactions: pd.DataFrame) -> OrderedDict[str, pd.DataFrame]:
    """Splits transactions by quality and aggregates each split by day.

    transactions: DataFrame of transactions

    returns: ordered map from quality name to the DataFrame that
        group_by_day produces for that quality, in groupby iteration order
    """
    return OrderedDict(
        (quality, group_by_day(rows))
        for quality, rows in transactions.groupby('quality')
    )

In [None]:
group_by_quality_and_day(transactions).keys()

In [None]:
# put the quality categories in order
transactions['quality'] = pd.Categorical(
    transactions.quality.values,
    categories=['high', 'medium', 'low'],
    ordered=True
)

In [None]:
for name, _ in transactions.groupby('quality'):
    print(name)

In [None]:
group_by_quality_and_day(transactions).keys()

`dailies` is the map from quality name to DataFrame.

In [None]:
dailies = group_by_quality_and_day(transactions)

The following plots the daily average price for each quality.

In [None]:
# One stacked panel per quality; squeeze=False keeps `axes` two-dimensional
# regardless of how many qualities there are.
n_panels = len(dailies)
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(11, 13), squeeze=False)
for row, (name, daily) in enumerate(dailies.items()):
    ax = axes[row, 0]
    ax.scatter(daily.index, daily.ppg, alpha=0.2, label=name)
    ax.legend(loc='upper right')
    ax.set_ylim([0, 20])
    if row == 0:
        ax.set_title('Price per gram ($)')
    if row == n_panels - 1:
        # only the bottom panel keeps its date ticks
        plt.setp(ax.get_xticklabels(), rotation=30)
    else:
        ax.set_xticks([])
fig.suptitle('Time series of daily price per gram for high, medium, and low quality cannabis.');

We can use `statsmodels` to run a linear model of price as a function of time.

In [None]:
def summarize_results(results, show_std=True):
    """Prints the key numbers from a fitted regression.

    One line per parameter (name, estimate, p-value), then R^2. When the
    results object has no `rsquared` attribute, its `prsquared` is printed
    in its place.

    results: RegressionResults object
    show_std: whether to also print the standard deviation of the
        dependent variable and of the residuals
    """
    for term, estimate in results.params.items():
        p_value = results.pvalues[term]
        print(f'{term:26}: {estimate:0.4f}: {p_value:0.4f}')

    try:
        print(f'R^2      : {results.rsquared:0.4f}')
        if not show_std:
            return
        print(f'Std(ys)  : {results.model.endog.std():0.4f}')
        print(f'Std(res) : {results.resid.std():0.4f}')
    except AttributeError:
        # results without `rsquared` report a pseudo R^2 instead
        print(f'R^2      : {results.prsquared:0.4f}')

Here's what the results look like.

In [None]:
# Fit ppg as a linear function of time, separately for each quality.
for name, daily in dailies.items():
    results = smf.ols('ppg ~ years', data=daily).fit()
    print(f'\n{name}')
    summarize_results(results, show_std=False)

The estimated slopes indicate that the price of high quality cannabis dropped by about 71 cents per year during the observed interval; for medium quality it increased by 28 cents per year, and for low quality it increased by 57 cents per year. These estimates are all statistically significant with very small p-values.

The $R^2$ value for high quality cannabis is 0.44, which means that time as an explanatory variable accounts for 44% of the observed variability in price. For the other qualities, the change in price is smaller, and variability in prices is higher, so the values of $R^2$ are smaller (but still statistically significant).

In [None]:
def run_linear_model(daily: pd.DataFrame):
    """Fits ppg as a linear function of years by ordinary least squares.

    daily: DataFrame of daily prices with 'ppg' and 'years' columns

    returns: model, results
    """
    ols = smf.ols('ppg ~ years', data=daily)
    return ols, ols.fit()

def fit_linear_model(daily: pd.DataFrame):
    """Fits ppg as a linear function of years and returns only the results.

    daily: DataFrame of daily prices with 'ppg' and 'years' columns

    returns: results object
    """
    model = smf.ols('ppg ~ years', data=daily)
    return model.fit()

Now let's plot the fitted model with the data.

In [None]:
def plot_fitted_values(model, results, label=''):
    """Scatters the observed values and draws the fitted line over them.

    model: StatsModel model object
    results: StatsModel results object
    label: legend label for the scattered data
    """
    # Column 1 of the design matrix is used as the time axis -- presumably
    # column 0 is the intercept; verify if the formula changes.
    xs = model.exog[:, 1]
    plt.scatter(xs, model.endog, s=15, label=label)
    plt.plot(xs, results.fittedvalues, color='#ff7f00', label='model')
    plt.xlim([-0.1, 3.8])
    plt.xlabel('Years')
    plt.ylabel('Price per gram ($)')
    plt.legend(loc='upper right')

In [None]:
figsize(11, 5)

Time series of daily price per gram for high quality cannabis, and a linear least squares fit.

The following function plots the original data and the fitted curve.

In [None]:
def plot_linear_model(daily, name):
    """Fits a linear model to daily prices and plots the data with the fit.

    daily: DataFrame of daily prices
    name: string, legend label for the data
    """
    plot_fitted_values(*run_linear_model(daily), label=name)

Here are results for the high quality category:

In [None]:
name = 'high'
daily = dailies[name]
plot_linear_model(daily, name)

## Moving averages

As a simple example, I'll show the rolling average of the numbers from 0 to 9.

In [None]:
array = np.arange(10)

With a "window" of size 3, we get the average of the previous 3 elements, or nan when there are fewer than 3.

In [None]:
array

In [None]:
# But Series now provides `rolling`
series = pd.Series(array)
series.rolling(3).mean()

The following function plots the rolling mean.

In [None]:
def plot_rolling_mean(daily, name):
    """Plots daily prices together with their 30-day rolling mean.

    daily: DataFrame of daily prices
    name: legend label for the scattered prices
    """
    # put the prices on a gap-free daily calendar; days without data become nan
    calendar = pd.date_range(daily.index.min(), daily.index.max())
    prices = daily.reindex(calendar).ppg

    plt.scatter(prices.index, prices, s=15, alpha=0.2, label=name)
    plt.plot(prices.rolling(30).mean(), label='rolling mean', color='#ff7f00')
    plt.xticks(rotation=30)
    plt.ylabel('price per gram ($)')
    plt.legend(loc='upper right')

Here's what it looks like for the high quality category.

In [None]:
plot_rolling_mean(dailies['high'], 'High')

The exponentially-weighted moving average gives more weight to more recent points.

In [None]:
def plot_ewma(daily, name):
    """Plots daily prices and their exponentially-weighted moving average.

    daily: DataFrame of daily prices
    name: legend label for the scattered prices
    """
    # put the prices on a gap-free daily calendar; days without data become nan
    dates = pd.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)

    plt.scatter(reindexed.index, reindexed.ppg, s=15, alpha=0.2, label=name)
    # span has to be passed by keyword: the first positional argument of
    # `ewm` is `com`, which would give a much wider window than intended
    ewma = reindexed.ppg.ewm(span=30).mean()
    plt.plot(ewma, label='EWMA', color='#ff7f00')
    plt.xticks(rotation=30)
    # dates are on the x-axis, so the price label belongs on the y-axis
    plt.ylabel('price per gram ($)')
    plt.legend(loc='upper right')

In [None]:
plot_ewma(dailies['high'], 'High')

We can use resampling to generate missing values with the right amount of noise.

In [None]:
def fill_missing(daily, span=30):
    """Fills missing daily prices with an EWMA plus resampled residuals.

    Resulting DataFrame has new columns 'ewma' and 'resid'. The input
    DataFrame is not modified.

    daily: DataFrame of daily prices
    span: window size (sort of) passed to ewm

    returns: new DataFrame of daily prices, one row per calendar day
    """
    # fill in the gaps in the dates; days without data get nan prices
    dates = pd.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    # moving average of the prices on the full calendar
    ewma = reindexed.ppg.ewm(span=span).mean()
    # residuals, not including days when ppg is nan
    resid = (reindexed.ppg - ewma).dropna()
    # sum of the moving average and a random sample of the residuals
    fake_data = ewma + compstats.resample_n(resid, len(reindexed))
    # Replace nan with values from fake_data. Assign the result back rather
    # than calling fillna(inplace=True) on `reindexed.ppg`: under pandas
    # Copy-on-Write that in-place call only changes a temporary Series.
    reindexed['ppg'] = reindexed.ppg.fillna(fake_data)
    # store our moving average and noise components
    reindexed['ewma'] = ewma
    reindexed['resid'] = reindexed.ppg - ewma
    return reindexed

In [None]:
def plot_filled(daily, name):
    """Plots the EWMA and filled data.

    daily: DataFrame of daily prices
    name: legend label for the scattered prices
    """
    filled = fill_missing(daily, span=30)
    plt.scatter(filled.index, filled.ppg, s=15, alpha=0.2, label=name)
    plt.plot(filled.ewma, label='EWMA', color='#ff7f00')
    plt.xticks(rotation=30)
    # dates are on the x-axis, so the price label belongs on the y-axis
    plt.ylabel('price per gram ($)')
    plt.legend(loc='upper right')

Here's what the EWMA model looks like with missing values filled.

In [None]:
plot_filled(dailies['high'], 'High')

## Serial correlation

As prices vary from day to day, you might expect to see patterns. If the price is high on Monday, you might expect it to be high for a few more days; and if it’s low, you might expect it to stay low.

A pattern like this is called serial correlation, because each value is correlated with the next one in the series. To compute serial correlation, we can shift the time series by an interval called a lag, and then compute the correlation of the shifted series with the original:

In [None]:
series = pd.Series(array)
series.values

In [None]:
lag = 1
# everything but the first item
xs = series[lag:]
xs.values

In [None]:
# shift to the right. each element get the value of the element before it.
ys = series.shift(lag)
ys.values

In [None]:
# remove the nan at the beginning
ys = ys[lag:].astype(int)

In [None]:
ys.values

So `ys` lags behind `xs` by 1. Finally, compute the correlation between `xs` and `ys`.

In [None]:
compstats.corr(xs, ys)

In [None]:
del xs, ys, lag, series

The following function computes serial correlation with the given lag.

In [None]:
def serial_corr(series, lag=1):
    """Computes the correlation between a series and a lagged copy of itself.

    series: pandas Series
    lag: number of positions to shift by

    returns: the value of compstats.corr for the paired observations
    """
    # values from position `lag` onwards, e.g. 1..9 for 0..9 with lag 1
    current = series[lag:]
    # the value `lag` steps earlier for each of them, e.g. 0..8;
    # the leading nan entries created by the shift are sliced off
    previous = series.shift(lag)[lag:]
    return compstats.corr(current, previous)

Before computing correlations, we'll fill missing values.

In [None]:
filled_dailies = {}
for name, daily in dailies.items():
    filled_dailies[name] = fill_missing(daily, span=30)

Here are the serial correlations for raw price data.

In [None]:
for name, filled in filled_dailies.items():            
    corr = serial_corr(filled.ppg, lag=1)
    print(f'{name:12}: {corr:0.3f}')

It's not surprising that there are correlations between consecutive days, because there are obvious trends in the data.

It is more interesting to see if the correlation persists if you subtract away the trend. For example, we can compute the residual of the EWMA and then compute its serial correlation:

In [None]:
for name, daily in filled_dailies.items():
    print(f'{name:12}: {serial_corr(daily.resid, 1):0.3f}')

Even if the correlations between consecutive days are weak, there might be correlations across intervals of one week, one month, or one year.

In [None]:
for lag in [1, 7, 30, 365]:
    print(f'Lag {lag}\t')
    for name, filled in filled_dailies.items():            
        corr = serial_corr(filled.resid, lag)
        print(f'{name:12}: {corr:0.3f}\t')
    print()

The strongest correlation is a weekly cycle in the medium quality category.

## Autocorrelation

The autocorrelation function is the serial correlation computed for all lags.

We can use it to replicate the results from the previous section.

In [None]:
filled = filled_dailies['high']
acf = smtsa.acf(filled.resid, nlags=365, adjusted=True, fft=False)
# lag 0 plus the same lags used in the serial correlation section
print(', '.join(f'{acf[k]:.2g}' for k in (0, 1, 7, 30, 365)))

To get a sense of how much autocorrelation we should expect by chance, we can resample the data (which eliminates any actual autocorrelation) and compute the ACF.

In [None]:
def simulate_autocorrelation(daily, iters=1001, nlags=40):
    """Resamples residuals, computes their ACF, and shades the chance band.

    The band runs from minus to plus the 97.5th percentile of the absolute
    autocorrelation at each lag, taken across the simulations.

    daily: DataFrame
    iters: number of simulations to run
    nlags: maximum lags to compute autocorrelation
    """
    def one_run():
        # refill on every run so each simulation gets freshly filled data
        filled = fill_missing(daily, span=30)
        shuffled = compstats.resample(filled.resid)
        # drop lag 0 before taking absolute values
        acf = smtsa.acf(shuffled, nlags=nlags, adjusted=False, fft=False)[1:]
        return np.abs(acf)

    sims = np.array([one_run() for _ in range(iters)])
    upper = np.percentile(sims, 97.5, axis=0)
    plt.fill_between(range(1, nlags + 1), -upper, upper, alpha=0.2, color='gray')
In [None]:
daily = dailies['high']
simulate_autocorrelation(daily)

The following function plots the actual autocorrelation for lags up to 40 days.

The flag `add_weekly` indicates whether we should add a simulated weekly cycle.

To see what the autocorrelation function looks like when there is a seasonal component, I generated simulated data by adding a weekly cycle. Assuming that demand for cannabis is higher on weekends, we might expect the price to be higher. To simulate this effect, I select dates that fall on Friday or Saturday and add a random amount to the price, chosen from a uniform distribution from \\$0 to \\$2.

In [None]:
def add_weekly_seasonality(daily):
    """Adds a simulated weekly pattern to a copy of the daily prices.

    Every row dated Friday or Saturday has a uniform random amount between
    0 and 2 added to its ppg; the input DataFrame is left untouched.

    daily: DataFrame of daily prices

    returns: new DataFrame of daily prices
    """
    # dayofweek: 4 is Friday, 5 is Saturday
    is_weekend = daily.index.dayofweek.isin([4, 5])
    bump = np.random.uniform(0, 2, is_weekend.sum())

    prices = daily.ppg.values.copy()
    prices[is_weekend] += bump

    fake = daily.copy()
    fake.ppg = prices
    return fake

In [None]:
add_weekly_seasonality(dailies['high'])

Can we improve on this?

In [None]:
def plot_autocorrelation(dailies, nlags=40, add_weekly=False):
    """Plots autocorrelation functions over a simulated chance band.

    dailies: map from category name to DataFrame of daily prices;
        must contain the key 'high', which is used for the chance band
    nlags: number of lags to compute
    add_weekly: boolean, whether to add a simulated weekly pattern
    """
    # use the same lag range for the chance band as for the curves, so the
    # two stay aligned when nlags is not the default
    simulate_autocorrelation(dailies['high'], nlags=nlags)
    for name, daily in dailies.items():
        if add_weekly:
            daily = add_weekly_seasonality(daily)

        filled = fill_missing(daily, span=30)
        acf = smtsa.acf(filled.resid, nlags=nlags, adjusted=True, fft=False)
        lags = np.arange(len(acf))
        # skip lag 0, where the autocorrelation is not informative
        plt.plot(lags[1:], acf[1:], label=name)
    plt.xlim([0, nlags + 1])
    plt.ylim([-0.2, 0.2])
    plt.xlabel('lag (day)')
    plt.ylabel('correlation')
    plt.legend(loc='lower right')

To show what a strong weekly cycle would look like, we have the option of adding a price increase of 0-2 dollars on Fridays and Saturdays.

Here's what the real ACFs look like.  The gray regions indicate the levels we expect by chance.

In [None]:
plot_autocorrelation(dailies, add_weekly=False)

The autocorrelation functions for the three quality categories, with nlags=40. The gray region shows the normal variability we would expect if there is no actual autocorrelation; anything that falls outside this range is statistically significant, with a p-value less than 5%. Since the false positive rate is 5%, and we are computing 120 correlations (40 lags for each of 3 time series), we expect to see about 6 points outside this region. In fact, there are 7. We conclude that there are no autocorrelations in these series that could not be explained by chance.

Here's what it would look like if there were a weekly cycle.

In [None]:
plot_autocorrelation(dailies, add_weekly=True)

## Prediction

The simplest way to generate predictions is to use `statsmodels` to fit a model to the data, then use the `predict` method from the results.

In [None]:
def generate_simple_prediction(results, years):
    """Generates a simple prediction.

    results: results object
    years: sequence of times (in years) to make predictions for

    returns: sequence of predicted values
    """
    # np.square accepts plain lists as well as arrays, unlike `years**2`.
    # The Intercept and years2 columns are supplied so the same frame can be
    # handed to either the linear or the quadratic model.
    predict_df = pd.DataFrame({
        'Intercept': np.ones(len(years)),
        'years': years,
        'years2': np.square(years),
    })
    return results.predict(predict_df)

In [None]:
results = fit_linear_model(dailies['high'])
# five years ahead
years = np.linspace(0, 5, 101)
generate_simple_prediction(results, years)

In [None]:
def plot_simple_prediction(daily, results, years, name):
    """Plots the observed prices and the model's predictions.

    daily: DataFrame of daily prices
    results: results object used to generate the predictions
    years: sequence of times (in years) to make predictions for
    name: legend label for the observed prices
    """
    forecast = generate_simple_prediction(results, years)

    plt.scatter(daily.years, daily.ppg, alpha=0.2, label=name)
    plt.plot(years, forecast, color='#ff7f00', label='prediction')
    # leave a small margin on either side of the prediction range
    first, last = years[0], years[-1]
    plt.xlim([first - 0.1, last + 0.1])
    plt.xlabel('Years')
    plt.ylabel('Price per gram ($)')
    plt.legend(loc='upper right')

Here's what the prediction looks like for the high quality category, using the linear model.

In [None]:
results = fit_linear_model(dailies['high'])
# five years ahead
years = np.linspace(0, 5, 101)
plot_simple_prediction(dailies['high'], results, years, 'High')

When we generate predictions, we want to quantify the uncertainty in the prediction.  We can do that by resampling.  The following function fits a model to the data, computes residuals, then resamples from the residuals to generate fake datasets.  It fits the same model to each fake dataset and returns a list of results.

In [None]:
def simulate_results(daily, iters=101, func=fit_linear_model):
    """Refits the model to datasets built by resampling its residuals.

    daily: DataFrame of daily prices
    iters: number of simulations
    func: function that fits a model to the data

    returns: list of result objects, one per simulation
    """
    base = func(daily)
    # work on a copy so the caller's prices are never overwritten
    fake = daily.copy()

    def refit():
        # fitted values plus reshuffled noise give one fake dataset
        fake.ppg = base.fittedvalues + compstats.resample(base.resid)
        return func(fake)

    return [refit() for _ in range(iters)]

To generate predictions, we take the list of results fitted to resampled data.  For each model, we use the `predict` method to generate predictions, and return a sequence of predictions.

If `add_resid` is true, we add resampled residuals to the predicted values, which generates predictions that include predictive uncertainty (due to random noise) as well as modeling uncertainty (due to random sampling).

In [None]:
def generate_predictions(result_seq, years, add_resid=False):
    """Generates an array of predicted values from a list of model results.

    When add_resid is False, predictions represent sampling error only.

    When add_resid is True, they also include residual error (which is
    more relevant to prediction).

    result_seq: list of model results
    years: sequence of times (in years) to make predictions for
    add_resid: boolean, whether to add in resampled residuals

    returns: array of predictions, one row per model result
    """
    n = len(years)
    # np.square accepts plain lists as well as arrays, unlike `years**2`.
    # The Intercept and years2 columns are supplied so the same frame can be
    # handed to either the linear or the quadratic model.
    predict_df = pd.DataFrame({
        'Intercept': np.ones(n),
        'years': years,
        'years2': np.square(years),
    })

    predict_seq = []
    for fake_results in result_seq:
        predict = fake_results.predict(predict_df)
        if add_resid:
            predict += compstats.resample_n(fake_results.resid, n)
        predict_seq.append(predict)

    return np.array(predict_seq)

To visualize predictions, I show a darker region that quantifies modeling uncertainty and a lighter region that quantifies predictive uncertainty.

In [None]:
years = np.linspace(0, 5, 101)
result_seq = simulate_results(daily, iters=101)
predict_seq = generate_predictions(result_seq, years, add_resid=True)

In [None]:
percent = 90
p = (100 - percent) / 2
percents = p, 100-p

In [None]:
low, high = np.percentile(predict_seq, percents, axis=0)

In [None]:
plt.scatter(
    daily.years,
    daily.ppg
);
plt.fill_between(
    years,
    low,
    high,
    alpha=0.3,
    color='gray'
);

In [None]:
def plot_predictions(daily, years, iters=101, percent=90, func=fit_linear_model):
    """Shades the prediction intervals obtained by resampling.

    Draws two bands: a lighter one that includes resampled residuals and a
    darker one, on top, that reflects sampling error only.

    daily: DataFrame of daily prices
    years: sequence of times (in years) to make predictions for
    iters: number of simulations
    percent: what percentile range to show
    func: function that fits a model to the data
    """
    result_seq = simulate_results(daily, iters=iters, func=func)
    tail = (100 - percent) / 2
    bounds = (tail, 100 - tail)

    # lighter band (with residuals) first, then the darker one over it
    for add_resid, alpha in ((True, 0.3), (False, 0.5)):
        draws = generate_predictions(result_seq, years, add_resid=add_resid)
        lower, upper = np.percentile(draws, bounds, axis=0)
        plt.fill_between(years, lower, upper, alpha=alpha, color='gray')

Here are the results for the high quality category.

In [None]:
years = np.linspace(0, 5, 101)
plt.scatter(daily.years, daily.ppg, alpha=0.1, label='High')
plot_predictions(daily, years)
plt.xlim([years[0]-0.1, years[-1]+0.1])
plt.xlabel('Years');
plt.ylabel('Price per gram ($)');

But there is one more source of uncertainty: how much past data should we use to build the model?

The following function generates a sequence of models based on different amounts of past data.

In [None]:
def simulate_intervals(daily, iters=101, func=fit_linear_model):
    """Runs the residual-resampling simulation on successively shorter histories.

    daily: DataFrame of daily prices
    iters: number of start points to spread over the data, and number of
        simulations run for each start point
    func: function that fits a model to the data

    returns: list of result objects
    """
    # evenly spaced start positions; the last two are dropped
    cut_points = np.linspace(0, len(daily), iters).astype(int)[:-2]

    result_seq = []
    for start in cut_points:
        # each subset gets the same fit / resample / refit treatment as the
        # full dataset does in simulate_results
        result_seq.extend(simulate_results(daily[start:], iters=iters, func=func))
    return result_seq

And this function plots the results.

In [None]:
def plot_intervals(daily, years, iters=101, percent=90, func=fit_linear_model):
    """Shades the prediction interval across models fit to different intervals.

    daily: DataFrame of daily prices
    years: sequence of times (in years) to make predictions for
    iters: number of simulations
    percent: what percentile range to show
    func: function that fits a model to the data
    """
    fits = simulate_intervals(daily, iters=iters, func=func)
    tail = (100 - percent) / 2

    draws = generate_predictions(fits, years, add_resid=True)
    lower, upper = np.percentile(draws, (tail, 100 - tail), axis=0)
    plt.fill_between(years, lower, upper, alpha=0.2, color='gray')

Here's what the high quality category looks like if we take into account uncertainty about how much past data to use.

In [None]:
# this takes a very long time
name = 'high'
daily = dailies[name]

plt.scatter(daily.years, daily.ppg, alpha=0.1, label=name)
plot_intervals(daily, years)
plot_predictions(daily, years)
plt.xlim([years[0]-0.1, years[-1]+0.1])
plt.xlabel('Years');
plt.ylabel('Price per gram ($)');
plt.title('Predictions');

## Exercises

**Exercise:**   The linear model I used in this chapter has the obvious drawback that it is linear, and there is no reason to expect prices to change linearly over time. We can add flexibility to the model by adding a quadratic term, as we did in Section 11.3.

Use a quadratic model to fit the time series of daily prices, and use the model to generate predictions. You will have to write a version of `run_linear_model` that runs that quadratic model, but after that you should be able to reuse code from the chapter to generate predictions.

In [None]:
# Solution

def run_quadratic_model(daily):
    """Fits ppg as a quadratic function of years by ordinary least squares.

    Note: adds a 'years2' column (years squared) to `daily` in place.

    daily: DataFrame of daily prices

    returns: model, results
    """
    daily['years2'] = daily.years ** 2
    quadratic = smf.ols('ppg ~ years + years2', data=daily)
    return quadratic, quadratic.fit()

def fit_quadratic_model(daily):
    """Fits ppg as a quadratic function of years and returns the results.

    daily: DataFrame of daily prices with 'ppg' and 'years' columns

    returns: results object
    """
    # Build the squared term on a copy, so this works on any daily frame
    # (not only one that already has 'years2') and does not alter the input.
    data = daily.assign(years2=daily.years ** 2)
    return smf.ols('ppg ~ years + years2', data=data).fit()

In [None]:
# Solution

name = 'high'
daily = dailies[name]

model, results = run_quadratic_model(daily)
summarize_results(results)

In [None]:
plot_fitted_values(model, results, label='High')

In [None]:
years = np.linspace(0, 5, 101)
plt.scatter(daily.years, daily.ppg, alpha=0.1, label=name)
plot_predictions(daily, years, func=fit_quadratic_model)
plt.xlim([years[0]-0.1, years[-1]+0.1])
plt.xlabel('Years');
plt.ylabel('Price per gram ($)');

**Exercise:** Write a definition for a class named `SerialCorrelationTest` that extends `HypothesisTest` from Section 9.2. It should take a series and a lag as data, compute the serial correlation of the series with the given lag, and then compute the p-value of the observed correlation.

Use this class to test whether the serial correlation in raw price data is statistically significant. Also test the residuals of the linear model and (if you did the previous exercise), the quadratic model.

In [None]:
class SerialCorrelationTest(hypothesis.HypothesisTest):
    """Permutation test for the serial correlation of a series at one lag."""

    def __init__(self, series: pd.Series, lag: int):
        """Stores the data and computes the observed test statistic.

        series: the series to test
        lag: lag at which the serial correlation is computed
        """
        # NOTE(review): the base-class __init__ is not called; confirm that
        # HypothesisTest needs no state beyond what is set here.
        self.series = series
        self.lag = lag
        self.actual = self.test_statistic(self.series)

    def make_model(self):
        """Does nothing; run_model works directly from the stored series."""

    def test_statistic(self, data: pd.Series):
        """Computes the test statistic.

        data: Series to measure

        returns: absolute serial correlation of `data` at the stored lag
        """
        return abs(serial_corr(data, self.lag))

    def run_model(self):
        """Run the model of the null hypothesis.

        returns: simulated data, the stored series in a random order
        """
        shuffled_index = np.random.permutation(self.series.index)
        return self.series.reindex(shuffled_index)

In [None]:
# test the correlation between consecutive prices

name = 'high'
daily = dailies[name]

test = SerialCorrelationTest(daily.ppg, 1)
p_val = test.p_value()
print(f'Actual: {test.actual:0.2f}: p-val: {p_val:0.2f}')

In [None]:
# test for serial correlation in residuals of the linear model

results = fit_linear_model(daily)
series = results.resid
test = SerialCorrelationTest(results.resid, 1)
p_val = test.p_value()
print(f'Actual: {test.actual:0.2f}: p-val: {p_val:0.2f}') 

In [None]:
# test for serial correlation in residuals of the quadratic model
results = fit_quadratic_model(daily)
test = SerialCorrelationTest(results.resid, 1)
p_val = test.p_value()
print(f'Actual: {test.actual:0.2f}: p-val: {p_val:0.2f}') 

**Worked example:** There are several ways to extend the EWMA model to generate predictions. One of the simplest is something like this:

1. Compute the EWMA of the time series and use the last point as an intercept, `inter`.

2. Compute the EWMA of differences between successive elements in the time series and use the last point as a slope, `slope`.

3. To predict values at future times, compute `inter + slope * dt`, where `dt` is the difference between the time of the prediction and the time of the last observation.


In [None]:
name = 'high'
daily = dailies[name]

filled = fill_missing(daily)
diffs = filled.ppg.diff()
plt.plot(diffs)
plt.xticks(rotation=30)
plt.ylabel('Daily change in price per gram ($)');

In [None]:
filled['slope'] = diffs.ewm(span=365).mean()
plt.plot(filled.slope[-365:])
plt.xticks(rotation=30)
plt.ylabel('EWMA of diff ($)');

In [None]:
# extract the last inter and the mean of the last 30 slopes
start = filled.index[-1]
# .iloc makes the lookups positional; a bare [-1] on a date-indexed Series
# is treated as a label lookup by newer pandas
inter = filled.ewma.iloc[-1]
slope = filled.slope.iloc[-30:].mean()
print(f'start: {start}, intercept: {inter:0.2f}, slope: {slope:0.4f}')

In [None]:
# reindex the DataFrame, adding a year to the end
dates = pd.date_range(
    filled.index.min(), 
    filled.index.max() + np.timedelta64(365, 'D')
)
predicted = filled.reindex(dates)

In [None]:
# generate predicted values and add them to the end
predicted['date'] = predicted.index
one_day = np.timedelta64(1, 'D')
predicted['days'] = (predicted.date - start) / one_day
predict = inter + slope * predicted.days
# assign the filled column back: fillna(inplace=True) on `predicted.ewma`
# only changes a temporary Series under pandas Copy-on-Write
predicted['ewma'] = predicted.ewma.fillna(predict)

In [None]:
# plot the actual values and predictions
# scatter needs explicit x positions; the dates in the index put the points
# on the same axis as the date-indexed prediction line
plt.scatter(daily.index, daily.ppg, alpha=0.1, label=name)
plt.plot(predicted.ewma, color='#ff7f00');