In [1]:
""" 
----- IMPORTS ----- 
"""
import numpy as np
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.arima.model import ARIMA

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn 

import yfinance as yf

from datetime import datetime

%matplotlib inline

In [2]:
""" 
----- COLOURS ----- 
"""
palette = ["#072F5F", "#1261A0", "#3895D3", "#58CCED"]

# VALE 

**SUMMARY**

Vale is a Brazilian multinational corporation in the metals and mining industry, and is one of the largest logistics operators in Brazil. They are the largest producer of iron ore and nickel globally, and also produce manganese, ferroalloys, copper, bauxite, potash, kaolin, and cobalt. The company operates hydroelectric plants, railroads, ships, and ports to transport their products. Despite being the most valuable company in Latin America, Vale has faced criticism for its two catastrophic tailings dam failures in Mariana (2015) and Brumadinho (2019), which resulted in the loss of its license to operate eight dams and a decline in stock value. (- Wikipedia page)

**RATIONAL FOR CHOOSING TIME SERIES**

The article mentions that Vale, the Brazilian mining group, has received multiple bids for a stake in its base metals business, which includes nickel, copper, cobalt, and platinum group metals that are vital for the energy transition. The CEO of Vale has stated that the base metals unit could one day outgrow the company and float on the stock market. As the world moves towards electrification of transport and power, the demand for energy transition metals is expected to grow. Share prices for major mining companies have risen over the past year due to the growth in demand for energy transition metals. Therefore, as Vale's base metals unit is positioned to benefit from this trend, the stock of Vale could be worth watching for investors interested in the energy transition. Therefore, this report investigates this hypothesis by investigating the last 300 trading days of VALE stock.

**FT ARTICLE:** https://www.ft.com/content/b03bc946-73da-44d5-8d9e-5541ddb5038b

# **Time Series**

## 1. Download a price time series using an API. The length of the time series T, with $T=300$. The resolution could be any, from tick data to months.

In [140]:
def download_vale_data( start_date, end_date ):
    """ 
    Downloades VALE time series data from Yahoo! Finance's API\\
        between the specified start and end date.
    
    Arguments:
    ----------
        start_date (datetime) : earliest time point from which to collect data
        end_date (datetime) : lates time point from which to collect data
    
    Returns:
    ---------
        data (dataframe) : dataframe of VALE stock data
    """
    vale = yf.download(tickers = 'VALE', start = start_date, end = end_date)
    vale['Date'] = [idx.date() for idx in vale.index] # add date

    return vale

In [141]:
# get VALE data
start_date = datetime(2021, 11, 24) # yyyy-mm-dd
end_date = datetime(2023, 2, 6) # yyyy-mm-dd
vale = download_vale_data( start_date, end_date )

[*********************100%***********************]  1 of 1 completed


In [146]:
# print data
vale.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-11-24 00:00:00-05:00,12.56,12.74,12.48,12.71,12.045043,27093600,2021-11-24
2021-11-26 00:00:00-05:00,12.2,12.45,12.11,12.37,11.722832,23473100,2021-11-26
2021-11-29 00:00:00-05:00,12.64,12.66,12.35,12.44,11.789168,25153400,2021-11-29
2021-11-30 00:00:00-05:00,12.66,12.81,12.21,12.37,11.722832,37493900,2021-11-30
2021-12-01 00:00:00-05:00,12.69,12.81,12.24,12.25,11.60911,35263600,2021-12-01


In [147]:
# print meta
vale.info();


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 300 entries, 2021-11-24 00:00:00-05:00 to 2023-02-03 00:00:00-05:00
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       300 non-null    float64
 1   High       300 non-null    float64
 2   Low        300 non-null    float64
 3   Close      300 non-null    float64
 4   Adj Close  300 non-null    float64
 5   Volume     300 non-null    int64  
 6   Date       300 non-null    object 
dtypes: float64(5), int64(1), object(1)
memory usage: 18.8+ KB


In [148]:
# print data statistics
vale.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,15.602733,15.832533,15.376033,15.618333,15.10448,34696640.0
std,2.292406,2.301912,2.271924,2.296789,2.17158,12598940.0
min,12.0,12.19,11.72,12.14,11.60911,11023100.0
25%,13.5675,13.7875,13.37,13.6,13.226705,25951650.0
50%,15.295,15.56,15.135,15.355,14.760154,32599800.0
75%,17.24,17.5175,17.01,17.2825,16.692041,41765350.0
max,21.09,21.290001,21.040001,21.23,20.119297,83276300.0


## 2. Plot the price time series

In [167]:
def plot_time_series( dates, P, title, xlabel, ylabel ):
    """
    Plots a given time series

    Arguments:
    ----------
        P (array_like) : time series to plot
        dates (datetime) : coresponding time stamps of time series
        title (str) : plot title
        xlabel (str) : x-axis label
        ylabel (str) : y-axis label

    Returns:
    ----------
        fig (figure) : figure object of plot
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dates, y=P, mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series
    fig.update_layout(
        height = 500,
        title={'text': f"<b>{title}</b>", 'y':0.9, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
        xaxis_title="Date",
        yaxis_title="Open Price USD",)
    fig.update_traces(line_color='#000000')

    return fig


In [170]:
# plot time series
dates = vale.Date.values
open_price = vale.Open.values
open_price_plot = plot_time_series( dates, open_price, 'VALE open price USD', 'Date', 'Open Price USD' )
open_price_plot.show()
open_price_plot.write_image("images/fig_1_vale_open_price.png")

In [5]:

fig = px.line(vale, x='Date', y="Open")

fig.update_layout(
    title={
        'text': "<b>VALE open price USD </b>",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Open Price USD",)

fig.update_traces(line_color='#000000')

fig.show()

**COMMENT ON THE TIME SERIES**

From a technical analysis perspective, the period around March 2022 can be seen as exhibiting a "cup" trend in the context of a "cup and handle" chart pattern. The pattern is characterized by a rounded bottom formation that is created by a gradual decline in the price of Vale stock, reaching its low point around September 2022, followed by a slow recovery back to its original level. This kind pattern is typically completed with a slight pullback that creates the "handle" part of the chart. Therefore, we might expect a slight drop in Vale stock in the short term. However, this pattern is considered bullish in the long term, indicating a potential for a continued future price increase in Vale. 

We can dive deeper in understanding the behaviour of this times series and its characterist cup-like shape by referencing Vale's history over the past 300 trading days. Specifically, in January 2022, Vale experienced a tailings dam disaster in Minas Gerais, Brazil, which had severe consequences for both the environment and human life. In response, Vale suspended several of its operations and announced plans to decommission its remaining tailings dams. The aftermath of the disaster included financial challenges and a damaged reputation, such as a decrease in iron ore prices, lawsuits, and increased public scrutiny. These factors likely contributed to the sharp decline in Vale's stock price.

Despite these challenges, Vale continued to invest in new mining projects, and in August 2022, the company announced a settlement with the Brazilian authorities, bringing closure to the January 2022 events. Throughout the scrutiny Vale maintained its position as a leading base metal producer, and this has likely driven the gradual recovery seen in the stock price after August 2022. However, it is important to note that technical analysis alone is insuficient and provides guarantee of future price movements and should thefefore always be used in conjunction with other advanced analytical methods, such asARIMA models considered in this report.

# 2. MOVING AVERAGES

**(QUESTION 3) Define mathematically the moving average of the price time series with an arbitrary time window $\tau$**

**(SIMPLE) MOVING AVERAGE**

The moving average of a time series with time window $\tau$ is the average of the $\tau$ consecutive values in the time series, calculated for each time step and shifted by one time step at a time, resulting in a smoothed version of the original time series. The choice of $\tau$ determines the smoothness of the moving average. A larger $\tau$ results in a smoother moving average, while a smaller $\tau$ will result in a less smooth but more responsive moving average that is closer to the original time series.

Mathematically we can define it as follows: 

Let $X$ be the time series and $X_t$ be the value of the time series at time step $t$. Then, the moving average of $X$ with window size $\tau$ is defined as:

$$ Y_t = \frac{1}{\tau}(X_{t} + X_{t-1} + X_{t-2} + ... + X_{t-\tau}) $$

where $Y_t$ is the moving average at time step t. The moving average is calculated for each time step by taking the average of the $\tau$ consecutive values in the time series, starting from the current time step and including the $\tau - 1$ preceding values. The result is a smoothed version of the original time series.

In [6]:
def moving_average(X, t):
    """ 
    Computes the moving average for a time sereis X and window t

    Arguments:
        X: time series 
        t: window of moving average
    
    Returns:
        Y: moving average time series
    """
    cumsum = np.cumsum(X)
    Y = (cumsum[t-1:] - cumsum[:-t+1]) / t
    return Y

In [7]:
# get series of interest
open_p = vale.Open.values
dates = vale.Date.values

# get moving average
open_ma_10 = moving_average(vale.Open.values, 10)
open_ma_20 = moving_average(vale.Open.values, 20)
open_ma_30 = moving_average(vale.Open.values, 30)



In [8]:
# plot moving averages against time series
fig = go.Figure()

fig.add_trace(go.Scatter(x=dates, y=open_p, mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series
fig.add_trace(go.Scatter(x=dates[10-1:], y=open_ma_10, mode='lines', name='10 MA', line=dict(color='#072F5F'))) # t = 10 MA
fig.add_trace(go.Scatter(x=dates[20-1:], y=open_ma_20, mode='lines', name='20 MA', line=dict(color='#1261A0'))) # t = 20 MA
fig.add_trace(go.Scatter(x=dates[30-1:], y=open_ma_30, mode='lines', name='30 MA', line=dict(color='#58CCED'))) # t = 30 MA

fig.update_layout(
    title={
        'text': "<b>VALE open price USD with t = 10, 20, 30 Moving Averages (MAs)</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Open Price USD",
    showlegend=False)

**COMMENT ON THE MOVING AVERAGES**

Moving average series smooth out fluctuations in a time series and as a result make it easier to identify trends and patterns that may be obfuscated by the noise and "chatoic" nature of the raw data. Ultimatley, they filter out short-term fluctuations in the data leaving the underlying trend of the series and are therefore a powerfult tool used in technical analysis for financial time series. From the plot above, we can we can see that they provide further evidence for our "cup" hypothesis poised. That is, a gradual decline in the open price of Vale's stock followed by a gradual increase beginning around the time that Vale reached a settlement with the Brazilian government about the tailing dam disaster. 

Comparing the different MAs to each other, we can see that they provide varying perspectives on the general patter of the data. Specifically, the 10 day MA is more sensitive to short term fluctuations and therefore responds more quickly that the 20 and 30 day MA. However, it is also more susceptible to random variations. Comparatively, the 30 day MA, is far more "smooth" and can be said to capture the general trend better than the 10 day MA, but is also far less sensitive to short term fluctuations in the data. Finally the 20 day MA, provides some form of compromise between the two. Ultimatley, all series provide some support the "cup" pattern, and combining the MAs provides a more comprehensive view of the trends and patterns in Vale's open price series.

**LINEAR RETURN**

The linear return of a time series is the change in the value of an investment over a specific period of time, calculated as the ratio of the current value to the initial value, minus 1. In other words, it measures the proportionate change in the value of an investment over time and is used as a basic measure of an investment's performance.

Mathematicall it is defined as follow: Given a price time series $P_t$, the return time series is defined as

$$ R_t = \frac{P_t - P_{t-1}}{P_t} $$

**LOG RETURN**

The log-return series $RL_t$ is similiarly defined as

$$ RL_t = \log(\frac{P_t}{P_{t-1}}) $$

In [9]:
def linear_return(P):
    """ 
    Computes the linear return of a price times series P

    Arguments:
        P: price time series

    Returns:
        R: linear return series
    """
    R = (P[1:] - P[:-1]) / P[:-1]
    
    return R

In [10]:
def log_return(P):
    """ 
    Computes the log return of a price times series P

    Arguments:
        P: price time series

    Returns:
        RL: log return series
    """
    RL = np.log(P[1:]/P[:-1])
    
    return RL

In [11]:
# linear return
linear_r = linear_return(open_p)
linear_r_ma_10 = moving_average(linear_r, 10)
linear_r_ma_20 = moving_average(linear_r, 20)
linear_r_ma_30 = moving_average(linear_r, 30)

# log return
log_r = log_return(open_p)
log_r_ma_10 = moving_average(log_r, 10)
log_r_ma_20 = moving_average(log_r, 20)
log_r_ma_30 = moving_average(log_r, 30)

# plot
fig = make_subplots(rows=2, cols=1, subplot_titles=("<b>Linear Return</b>", "<b>Log Return</b>"))

# linear return
fig.add_trace(go.Scatter(x=dates[1:], y=linear_r, mode='lines', name='VALE open USD', line=dict(color='#000000')), row=1, col=1) # Linear Return series
fig.add_trace(go.Scatter(x=dates[1:], y=[np.mean(linear_r)]*len(dates[1:]), mode='lines', name='0', line=dict(color='red')), row=1, col=1) # mean of linear return
fig.add_trace(go.Scatter(x=dates[1+10-1:], y=linear_r_ma_10, mode='lines', name='10 MA', line=dict(color='#072F5F')), row=1, col=1) # t = 10 MA
fig.add_trace(go.Scatter(x=dates[1+20-1:], y=linear_r_ma_20, mode='lines', name='20 MA', line=dict(color='#1261A0')), row=1, col=1) # t = 20 MA
fig.add_trace(go.Scatter(x=dates[1+30-1:], y=linear_r_ma_30, mode='lines', name='30 MA', line=dict(color='#58CCED')), row=1, col=1) # t = 30 MA

# log return
fig.add_trace(go.Scatter(x=dates[1:], y=log_r, mode='lines', name='VALE open USD', line=dict(color='#000000')), row=2, col=1) # Linear Return series
fig.add_trace(go.Scatter(x=dates[1:], y=[np.mean(log_r)]*len(dates[1:]), mode='lines', name='0', line=dict(color='red')), row=2, col=1) # mean of linear return
fig.add_trace(go.Scatter(x=dates[1+10-1:], y=log_r_ma_10, mode='lines', name='10 MA', line=dict(color='#072F5F')), row=2, col=1) # t = 10 MA
fig.add_trace(go.Scatter(x=dates[1+20-1:], y=log_r_ma_20, mode='lines', name='20 MA', line=dict(color='#1261A0')), row=2, col=1) # t = 20 MA
fig.add_trace(go.Scatter(x=dates[1+30-1:], y=log_r_ma_30, mode='lines', name='30 MA', line=dict(color='#58CCED')), row=2, col=1) # t = 30 MA

# Update xaxis properties
fig.update_xaxes(title_text="Date", row=1, col=1)
fig.update_xaxes(title_text="Date", row=2, col=1)

# Update yaxis properties
fig.update_yaxes(title_text="Linear Return", row=1, col=1)
fig.update_yaxes(title_text="Log Return", row=2, col=1)

fig.update_layout(
    height=700,
    showlegend= False)

In [12]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=dates[1:], y=log_return(open_p), mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series

fig.add_trace(go.Scatter(x=dates[1:], y=[0]*len(dates[1:]), mode='lines', name='0', line=dict(color='red'))) # Price series

fig.update_layout(
    title={
        'text': "Log Return of Vale Open Price USD",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Linear Return",)

In [13]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=dates[1:], y=linear_return(open_p) - log_return(open_p), mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series

fig.add_trace(go.Scatter(x=dates[1:], y=[0]*len(dates[1:]), mode='lines', name='0', line=dict(color='red'))) # Price series

fig.update_layout(
    title={
        'text': "Log Return of Vale Open Price USD",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Linear Return",)

In [14]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=[-0.1, 0.1], y=[-.1,0.1], mode='lines', name='VALE open USD', line=dict(color='red'))) # Price series

fig.add_trace(go.Scatter(x=linear_return(open_p), y=log_return(open_p), mode='markers', name='VALE open USD', line=dict(color='#000000'))) # Price series

fig.update_layout(
    title={
        'text': "<b>Linear Return against Log Return</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Linear Return",
    yaxis_title="Log Return",
    showlegend = False)



**COMMENT ON LINEAR VS LOG RETURN**

The plot above represents the comparison between the linear and log returns of the Vale price time series. The red line indicates a perfectly linear relationship, with the Y-axis and X-axis values being equal. Observing the plot, we can see that the relationship between the two return types is almost linear, but it starts to exhibit a curved shape as the values deviate from zero. This curvature is due to the logarithmic nature of log returns.

For small changes in the price, the difference between linear and log returns is negligible, and both methods provide similar results, leading to a linear-like relationship. However, it's important to note that log returns are not linear, and their behavior changes as the price changes. The log return will grow more slowly as the price increases, and this creates the curved shape that we observe in the plot. The straight red line indicates a proportional relationship between log and linear returns, but this is only applicable to small changes in price.

In [15]:
# TODO: MAKE SERIES STATIONARY 
# - WHAT FOLLOWS RELIES ON THE SERIES BEING STATIONARY

# 3.   TIME-SERIES ANALYSIS

Visual inspection of the plots above suggest that the the time series for Vale's open price is not stationary since it exhibits some cyclic patter. To investigate this hypothesis more rigorously we use the _The Augmented Dickey-Fuller (ADF) test is a statistical test for stationarity._ Specifically, the ADF test tests the null hypothesis that a unit root is present in the sample of the time series (i.n $H_0$: Series is not stationary) against the alternative that no such root is present (i.e $H_1$: series is stationary).

The testing procedure for the ADF test uses the augmented Dickey-Fuller regression equation, which takes the form:

$$ \Delta y_t = \alpha + \beta t + \gamma t_{t-1} + \delta_1 \Delta y_{t-1} + ... + \delta_{p-1} \Delta y_{t-p+1} + \epsilon_t $$


where:

* $\Delta y_t$ is the first difference of the time series $y_t$
* $\gamma$ is the coefficient of the lagged value $y_{t-1}$ which tests for the presence of a unit root
* $p$ is the number of lags
* $\delta_i$ are the coefficients of the lagged differences 
* $\epsilon$ is the error term (assumed to be white noise)

Formally, the ADF test specified the null $H_0: \gamma = 0$ (that the series is stationary) against the alternative $H_1: \gamma > 0$ by computing the test statistics:

$$ DF_{\rho} = \frac{\hat{\gamma}}{SE(\hat{\gamma})} $$ 

The test statistic and its p-value is compared to a critical value from a table based on the sample size and significance level.

We undertake this test below to determine wheteher the open price time series for Vale's stock is stationary or non-stationary.

In [16]:
# Testing For Stationarity
from statsmodels.tsa.stattools import adfuller

def adfuller_test( P ):
    """
    Function conducts the ADF test for stationarity for a price time series P

    # Arguments:
        P: price time series
    
    # Returns:
        None
    """
    result = adfuller(P)
    labels = ['ADF Test Statistic', 'p-value', 'No. of Lags Used', 'Number of Observations Used']
    for value, label in zip(result,labels):
        print(f'{label}: {value:.4f}')
        
    # accept / reject
    if result[1] <= 0.05:
        print("P value is less than 0.05 that means we can reject the null hypothesis(Ho). Therefore we can conclude that data has no unit root and is stationary")
    else:
        print("Weak evidence against null hypothesis that means time series has a unit root which indicates that it is non-stationary ")

adfuller_test(open_p)

ADF Test Statistic: -2.0347
p-value: 0.2716
No. of Lags Used: 1.0000
Number of Observations Used: 298.0000
Weak evidence against null hypothesis that means time series has a unit root which indicates that it is non-stationary 


Since the series is non-stationary we compute the first difference of the series in order to render a stationary time series despite having computed the linear and log return above. There exists a relationship between the linear return and the first difference of a time series, that is the linear return can be approximated by the first difference because the linear return is the first difference scalled by the current price. Therefore, the approximation holds for small changes in the value of the time series, but for this reason we choose to continue with the first difference for simplicity (at no real cost).

In [17]:
def first_difference( P ):
    """
    Compute the first difference of a time series P.
    
    Parameters
    ----------
    P : array_like
        A time series to difference.
    
    Returns
    -------
    diff : array_like
        The first difference of the time series.
    """    
    diff = np.diff(P, n=1)
    
    return diff

In [18]:
# - PLOTS FIRST DIFFERENCE OF PRICE TIME SERIES - #

# get 1st dif
open_p_dif1 = first_difference(open_p)

# plots
fig = go.Figure()
fig.add_trace(go.Scatter(x=dates[1:], y=open_p_dif1, mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series
fig.add_trace(go.Scatter(x=dates[1:], y=[np.mean(open_p_dif1)]*len(dates[1:]), mode='lines', name='0', line=dict(color='red'))) # Price series

fig.update_layout(
    title={
        'text': "First difference of Vale Open Price USD",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="first difference",
    showlegend=False)

# 3. TIME-SERIES ANALYSIS

**AUTO-CORRELATION FUNCTION (ACF) OF A STATIONARY TIME SERIES**

The acf of a stationary time series is defined as follow:

$$ \rho(h) = \frac{\gamma(t+h, t)}{\sqrt{\gamma(t+h, t+h)\gamma(t, t)}} = \frac{\gamma(h, 0)}{\gamma(0, 0)} $$

where the following results have be used:
1. A weakly stationary times series has constant mean $\mu$
2. A weekly staitonary times sereies's $\gamma(s,t)$ depends only on $s$ and $t$ via their lag $h = |s - t| = |t+h - t|$. Therefore, $\gamma(t+h , t) = \gamma(h, 0)$ because the "lag" of these two shifts are equal (justifying the numerator). Further note that for any $\gamma(t, t) = \gamma(0,0)$ because again, the value of the "lag" are equal (justifying the denominator). 

**SAMPLE AUTO-CORRELATION FUNCTION**

The ACF for a limited number of observations $x_1, ..., x_n$ is define as:

$$ \hat{\rho}(h) = \frac{\hat{\gamma}(h, 0)}{\hat{\gamma}(0, 0)} $$

where,

* $ \hat{\rho}(h) = \frac{1}{n} \sum_{t=1}^{n-h} (x_{t+h} - \bar{x})(x_t - \bar{x}) $
* $ \bar{x} = \frac{1}{n} \sum_{t=1}^n x_t $

In [19]:
def acf( P ):
    """  
    Returns the sample auto-correlation function for a price seiers P

    Arguments:
        P: Price time series

    Returns:
        rho: acf for the price time series P
    """
    n = len(P)
    mu = np.mean(P)

    # get gamma(0,0)
    gamma_0 = (1/n) * ((P - mu) @ (P - mu))

    # get gamma(h, 0)
    def gamma( h ):
        gamma_h = (1/n) * ((P[h:] - mu) @ (P[:n-h] - mu))
        return gamma_h
    
    gamma_h = np.array([gamma(h) for h in range(n)])

    # get rho(h)
    rho = gamma_h / gamma_0

    return rho

In [20]:
# plot the ACF for the price time series
fig = go.Figure()

fig.add_trace(
    go.Bar(x=list(range(1,301)), y=acf(open_p), )
)

# peak significance bars
fig.add_trace(go.Scatter(x = [1, 300], y = [2/(np.sqrt(len(open_p)))]*2, mode='lines', line=dict(color='red', dash='dash', width = 1.5),))
fig.add_trace(go.Scatter(x = [1, 300], y = [-2/(np.sqrt(len(open_p)))]*2, mode='lines', line=dict(color='red', dash='dash', width = 1.5),))



#fill in ['none', 'tozeroy', 'tozerox', 'tonexty', 'tonextx','toself', 'tonext']

fig.update_traces(marker_color='#000000')

fig.update_layout(
    title={
        'text': "<b>ACF of Vale Open Price USD Series</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Lag",
    yaxis_title="ACF",
    showlegend = False)

fig.show()

In [21]:
# plot the ACF for the price time series
fig = go.Figure()

fig.add_trace(
    go.Bar(x=list(range(1,301)), y=acf(open_p_dif1), )
)

# peak significance bars
fig.add_trace(go.Scatter(x = [1, 300], y = [2/(np.sqrt(len(open_p_dif1)))]*2, mode='lines', line=dict(color='red', dash='dash'),))
fig.add_trace(go.Scatter(x = [1, 300], y = [-2/(np.sqrt(len(open_p_dif1)))]*2, mode='lines', line=dict(color='red', dash='dash'),))

fig.update_traces(marker_color='#000000')

fig.update_layout(
    title={
        'text': "ACF of Vale Open Price USD First Difference Series",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Lag",
    yaxis_title="ACF",
    showlegend = False)

fig.show()

**COMMENT ON ACF PLOT**

The above figure plots the sample Autocorrelation Function (ACF) of Vale's open price time series. The ACF function quantities the dependence of points oon the same series observed at different times. (Time series book) This plot provides valuable information on the correlation between the open price at various time lags. Therin, due to the "Large sample distribution of the ACF" property (Time series book), we have plotted the thresholds for a peak to be defined as "significant" illustrated by the red dashed line in the plots, that are ploted at $\pm$ two standard errors (that is $\pm\frac{2}{\sqrt{300}}$). The ACF plot shows a damping periodic sequence of signals with an estimated pitch period of 50-100 points. This suggests that the open price of Vale stock has a cyclical pattern that repeats every 50-100 points. The cycle trend seen in the ACF can help guide statistical analysis, such as transforming the series for stationarity or modeling trends for prediction. This periodic pattern can also be confirmed by observing the original open price time series and the moving average plots, which show a rising and falling pattern with a cycle duration of approximately 50-100 points. These findings provide further important information in understanding the behavior of Vale's stock price. 



**PARTIAL AUTO-CORRELATION FUNCTION (ACF)**

One issue with the vanialla ACF is that alone is provide little information about the orders of dependence for non MA processes. That is, for any point $x_t$, it is dependent on $x_{t-h}$ via all intervening points $x_{t-1}, ..., x_{t - h + 1}$.(Time Series Book p.118) The Partial Autocorrelation Function (PACF) attempts to solve this issue by providing a means to measure the correlation between an observed time series and its lagged values, while controlling for the intermediate lagged values. That is, it provides a way of measure the linear dependendence between two observed values on the same series while eliminating the effect of the intermediate lags. It is defined as follows:

_The partial autocorrelation function (PCAF) of a stationary time series x_t denoted $\phi_{hh}$, for $h = 1, 2, ...,$ is_

$$ \phi_{11} = corr(x_{t+1}, x_t) = \rho(1) $$

_and for $h \geq 2$ ,_

$$ \phi_{hh} = corr(x_{t+h} - \hat{x}_{t+h}, x_t - \hat{x}_t) $$

In the above definition, $\hat{x}_{t+h}$ denotes the regression of $x_{t+h}$ on $\{ x_{t+h−1} , x_{t+h−2} , ... , x_{t+1} \}$, that is, 

$$ \hat{x}_{t+h} =  \beta_1 x_{t+h−1} + \beta_2 x_{t+h−2} + ... + \beta_{h−1}x_{t+1} $$

andn similiarly, $\hat{x}_t$ denotes the regression of $x_t$ on $\{ x_{t+1}, x_{t+2}, ... , x_{t+h−1} \}$ that is, 

$$ \hat{x}_t = \beta_1 x_{t+1} + \beta_2 x_{t+2} + ... + \beta_{h−1} x_{t+h−1} $$

Further, if the mean is not $0$, then we need only replace $x_t$ by $x_t - \mu_x$ where $\mu_x$ is the mean of the series.

NOTE:

In practice, the number of lags is limited upper bounded by $\frac{n}{2}$ when $n$ is the length or number of samples in the series when computing the PCAF. This is done to avoid the issue of serial correlation. That is, when the auto-correlation of the residuals in the regression models for $\hat{x}_{t+h}$ and $\hat{x}_t$ are non-zero. If the size of the lag is too high, and the set of observations on which we are regression becomes too high, the residuals of this regressio model can become serially correlated and therefore the results from the PCAF will be unreliable. Therefore, by limiting the number of lags to half of the sample size we are guarding against this issue and ensure that the PCAF results are reliable.

The reason for limiting the lag size to less than half the sample size when computing the PCAF is due to the idea of stationarity in time series analysis.

A time series is considered stationary if its statistical properties, such as the mean and variance, are constant over time. A key assumption when computing the PCAF is that the time series is stationary. However, if the time series is non-stationary, it may have a high autocorrelation at large lags, which can result in biased or unstable estimates of the PCAF.

To ensure that the PCAF is a valid estimate, it is important to limit the number of lags to less than half the sample size. This is because, when computing the PCAF for large lags, the sample size for each lag becomes smaller, making it difficult to accurately estimate the autocorrelation. By limiting the number of lags to less than half the sample size, we can ensure that the sample size remains large enough to produce a reliable estimate of the PCAF.

In [22]:
def pcaf( P ):
    """
    Computes the PCAF for a time series P

    # Arguments:
        P: Price Time series

    # Returns:
        pcaf: PCAF of time series
    """
    x = np.array(P)
    PCAF = pacf(x, nlags=len(x)//2 - 1, method='ols')
    return PCAF

In [23]:
fig = go.Figure()

fig.add_trace(go.Bar(x=list(range(1,len(open_p/2))), y=pcaf(open_p),))

# peak significance bars
fig.add_trace(go.Scatter(x = [1, 150], y = [2/(np.sqrt(len(open_p)))]*2, mode='lines', line=dict(color='red', dash='dash'),))
fig.add_trace(go.Scatter(x = [1, 150], y = [-2/(np.sqrt(len(open_p)))]*2, mode='lines', line=dict(color='red', dash='dash'),))

fig.update_traces(marker_color='#000000')

fig.update_layout(
    title={
        'text': "<b>PACF of Vale Open Price USD Series</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Lag",
    yaxis_title="PACF",
    showlegend = False)

fig.show()


In [24]:
fig = go.Figure()

fig.add_trace(go.Bar(x=list(range(1,len(open_p_dif1/2))), y=pcaf(open_p_dif1),))

# peak significance bars
fig.add_trace(go.Scatter(x = [1, 300], y = [2/(np.sqrt(len(open_p_dif1)))]*2, mode='lines', line=dict(color='red', dash='dash'),))
fig.add_trace(go.Scatter(x = [1, 300], y = [-2/(np.sqrt(len(open_p_dif1)))]*2, mode='lines', line=dict(color='red', dash='dash'),))

fig.update_traces(marker_color='#000000')

fig.update_layout(
    title={
        'text': "PACF of Vale Open Price USD Series (First Difference)",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Lag",
    yaxis_title="PACF",
    showlegend = False)

fig.show()

**COMMENT ON PCAF**

Similiarly to the ACF plot, the "significant" threshold bounds have been plotted. From these thresholds it can be determined that notably the $h = 1, 2$ lags are the only "reasonably" significant lags, and for $h > 2$ the PCAF is approximatley $0$, suggesting that a second order $p = 2$ auto-regression model might provide a good fit for modelling the open price of Vale's stock.

Note, that around lag $150$ we see "significant" PCAF values, but we can attribute this to the cyclic variation in the process and that it is no stationary. Differencing for seasobal varying and then re-computing the PCAF should correct this.



In [25]:
linear_r = linear_return(open_p)
log_r = log_return(open_p)

acf_linear_return = acf(linear_r)
acf_log_return = acf(log_r)

pcaf_linear_return = pcaf(linear_r)
pcaf_log_return = pcaf(log_r)

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("<b>ACF Linear Return</b>", "<b>ACF Log Return</b>", "<b>PCAF Linear Return</b>", "<b>PCAF Log Return</b>"),)

# ACF
fig.add_trace(go.Bar(x = list(range(1, len(acf_linear_return//2))), y = acf_linear_return[:len(acf_linear_return)//2 -1]), row=1, col=1)
fig.add_trace(go.Bar(x=list(range(1, len(acf_log_return//2))), y=acf_log_return[:len(acf_log_return)//2 -1]),row=1, col=2)

# PCAF
fig.add_trace(go.Bar(x=list(range(1, len(pcaf_linear_return)+1)), y=pcaf_linear_return), row=2, col=1)
fig.add_trace(go.Bar(x=list(range(1, len(pcaf_log_return)+1)), y=pcaf_log_return), row=2, col=2)

# peak significance bars
fig.add_trace(go.Scatter(x = [1, 149], y = [2/(np.sqrt(len(open_p)))]*2, mode='lines', line=dict(color='red', dash='dash',  width=1.5),), row=[1,1,2,2], col=[1,2,1,2])
fig.add_trace(go.Scatter(x = [1, 149], y = [-2/(np.sqrt(len(open_p)))]*2, mode='lines', line=dict(color='red', dash='dash',  width=1.5,),), row=[1,1,2,2], col=[1,2,1,2])

# Update xaxis properties
fig.update_xaxes(title_text="Lag", row=1, col=1)
fig.update_xaxes(title_text="Lag", row=1, col=2)
fig.update_xaxes(title_text="Lag", row=2, col=1)
fig.update_xaxes(title_text="Lag", row=2, col=2)

# Update yaxis properties
fig.update_yaxes(title_text="ACF", row=1, col=1)
fig.update_yaxes(title_text="ACF", row=1, col=2)
fig.update_yaxes(title_text="PCAF", row=2, col=1)
fig.update_yaxes(title_text="PCAF", row=2, col=2)

fig.update_traces(marker_color='#000000')

fig.update_layout(
    height=700,
    showlegend = False)


In [26]:
# TODO: MAKE A COMMENT ON THIS PLOT
# - WHY IS IT USEFUL 
# - WHY HAS IT BEEN ADDED THE CW
# - HOW DOES IT HELP US

# 4. ARMA MODEL

The notion of autoregressive (AR) and moving average (MA) models can be mixed to an autoregressive moving average (ARMA), models for stationary time series. It can formally be defined as follows:

_A time series $\{ x_t; t = 0, \pm1, \pm2, ... \}$ is ARMA(p,q) if it is,_

1. _Stationary_
2. $ x_t  = \phi_1 x_{t-1} + ... + \phi_p x_{t-p} + w_t + \theta_1 w_{t-1} + ... + \theta_q w_{t-q} $

_with $\phi_p \neq 0$, $\theta_q \neq 0$, and $\sigma_w^2 > 0$._ 

Note: The parameters p and q are called the autoregressive and the moving average orders, respectively.

_If the series $x_t$ has a nonzero mean $\mu$, we set $\alpha=\mu(1−\phi_1 − ... −\phi_p)$ and write the model as_

$$ x_t = \alpha + \phi_1 x_{t−1} + ... + \phi_p x_{t−p} + w_t + \theta_1 w_{t−1} + ... + \theta_q w_{t−q} $$

_Alternatively, the ARMA model can be written in terms of the AR and MA operators concisely as_

$$ \phi(B)x_t = \theta(B)w_t  $$




**Defining a training and test set and fit an ARMA model to the price time series**

In [27]:
# TODO: MAKE JUSTIFICATIONS FOR EVERYTHING THAT YOU DO HERE

In [79]:
# - AUXILIARY FUNCTINO - #
def print_model_summary( arma_model ):
    """
    Prints model summary for a given ARMA(p,q) model

    Arguments:
    ----------
        arma_model: fitted ARMA(p, q) model
    
    Returns:
    ----------
        None
    """
    print(arma_model.summary())

    return None

def print_model_measures_of_fit( arma_model ):
    """
    Prints the AIC, BIC and HQIC for a given ARMA(p, q) model

    Arguments:
    ----------
        model: fitted model 
    
    Returns:
    ----------
        none
    """
    print('##### MEASURES OF FIT #####')
    print(f'AIC:     {arma_model.aic:.3f}')
    print(f'BIC:     {arma_model.bic:.3f}')
    print(f'HQIC:    {arma_model.hqic:.3f}')

    return None

def train_test_split( X, test_size ):
    """
    Split data into a training and test set preserving 
    the order of the data using spefied split

    Arguments:
    ----------
        X (array_like) :  Time series data to split
        test_size (float) : percentage of data to put in training set
    
    Returns:
        train (array_like) : Training data containing 100*(1-test_size)% of the data
        test (array_like) : test data containing 100*test_size% of the data
    """

    assert test_size <= 1, 'Not a valid percentage, requires 0 <= test_size <= 1'

    # split dataset
    split_value = int(len(X) * (1-test_size))
    train, test = X[:split_value], X[split_value:]

    return train, test

def train_val_test_split( X, val_size, test_size ):
    """
    Split data into a training, validation and test set preserving 
    the order of the data using spefied split

    Arguments:
    ----------
        X (array_like) :  Time series data to split
        val_size (float) : percentage of data to put in training set
    
    Returns:
        train (array_like) : Training data containing 100*(1-val_size-test_size)% of the data
        val (array_like) : Validation data containing 100*val_size of the data
        test (array_like) : test data containing 100*test_size% of the data
    """

    assert val_size + test_size <= 1, 'Not a valid percentage, requires 0 <= val_size + test_size <= 1'

    # split dataset
    train, test = train_test_split( X, test_size )
    train, val = train_test_split( train, val_size )

    return train, val, test


def mean_squared_error( y_true, y_pred ):
    """ 
    Computes the MSE for predicted values

    Parameters
    ----------
        y_true : array_like
                True values 

        y_pred : array_like
                Predicted values from model

    Returns
    -------
        mse : float
                MSE for predicted values given true observations
    """
    return ((y_true - y_pred) ** 2).mean()

def multi_arma_aic_bic_mse( train_data, max_p, max_q ):
    """
    Cycles through all ARMA models formed by combinations 
    of 0 <= p <= max_p, and 0 <= q <= max_q and stores the
    respective AIC and BIC in dictionary fomat

    Arguments:
    ----------
        train_data (array_like) : Training data for model
        max_p (int) : max auto-regressive order of ARMA model
        max_q (int) : max moving average order of ARMA model
    
    Returns:
    ----------
        aic_bic_dict (dict) : Dictionary with respective AIC and BIC
                                indexed by string 'ARMA(p,q)' : {'AIC' : value, 'BIC': value}
    """
    aic_bic_dict = {}
    for p in range(1, max_p+1):
        for q in range(1, max_q+1):
            arma_model = ARIMA(train_data, order=(p, 0, q)).fit()
            y_train_pred  = arma_model.predict(start=max(p,q)+1, end = len(train_data))
            mse = mean_squared_error(train_data[max(p,q):], y_train_pred)
            aic_bic_dict[f'ARMA({p},{q})'] = {'AIC' : round(arma_model.aic,3), 'BIC' : round(arma_model.bic, 3), 'MSE' : round(mse, 5)}
    
    return aic_bic_dict

In [80]:
# split dataset
# split dataset
train, test = train_test_split( open_p, 0.2 )

In [81]:
criterion_dict = multi_arma_aic_bic_mse( train, 2, 2 )

In [82]:
criterion_dict

{'ARMA(1,1)': {'AIC': 327.691, 'BIC': 341.613, 'MSE': 0.00393},
 'ARMA(1,2)': {'AIC': 329.48, 'BIC': 346.884, 'MSE': 0.00402},
 'ARMA(2,1)': {'AIC': 329.532, 'BIC': 346.935, 'MSE': 0.00397},
 'ARMA(2,2)': {'AIC': 329.003, 'BIC': 349.887, 'MSE': 0.00732}}

**UNIVARIATE FORECAST**

The forecast above may not look very impressive, as it is almost a straight line. This is because this is a very simple, univariate forecasting model. Nonetheless, keep in mind that these simple forecasting models can be extremely competitive.

In [83]:
train, test = train_test_split(open_p, 0.2)

# train ARMA(1, 1) model
arma11 = ARIMA(train, order=(1, 0, 1)).fit()

# print coeffients
print_model_summary(arma11)

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  240
Model:                 ARIMA(1, 0, 1)   Log Likelihood                -159.845
Date:                Thu, 16 Feb 2023   AIC                            327.691
Time:                        10:47:49   BIC                            341.613
Sample:                             0   HQIC                           333.301
                                - 240                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         14.8411      1.337     11.098      0.000      12.220      17.462
ar.L1          0.9765      0.013     76.633      0.000       0.952       1.001
ma.L1          0.0818      0.059      1.375      0.1

In [84]:
# split dataset
train, test = train_test_split( open_p, 0.2 )
n_train = len(train)
n_test = len(test)


# train ARMA(1, 1) model
arma11 = ARIMA(train, order=(1, 0, 1)).fit()

# get training error
y_train_pred_arma11  = arma11.predict(start=1, end = len(train))

# make forecasts
y_test_pred_arma11 = arma11.forecast(len(test))

# get MSE
print('##### ARMA(1, 1) #####')
print(f'- Training MSE:    {mean_squared_error(train, y_train_pred_arma11):.3f}')
print(f'- Test MSE:  {mean_squared_error(test, y_test_pred_arma11 ):.3f}')

# plot

fcast = arma11.get_forecast(steps = len(test)).summary_frame()

y_pred = fcast['mean']
y_pred_mean_ci_lower = fcast['mean_ci_lower'].values
y_pred_mean_ci_upper = fcast['mean_ci_upper'].values

fig = go.Figure()

fig.add_trace(go.Scatter(x=dates[n_train:], y=y_pred_mean_ci_lower, mode='lines', name='0', line=dict(color='grey'))) # Lower CI
fig.add_trace(go.Scatter(x=dates[n_train:], y=y_pred_mean_ci_upper, mode='lines', name='0', line=dict(color='grey'), fill='tonexty')) # Upper CI

fig.add_trace(go.Scatter(x=dates[n_train-n_test:n_train], y=train[-n_test:], mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Train Price series
fig.add_trace(go.Scatter(x=dates[n_train:], y=test, mode='lines', name='VALE open USD', line=dict(color='blue'))) # Test Price series

fig.add_trace(go.Scatter(x=dates[n_train:], y=y_test_pred_arma11, mode='lines', name='VALE open USD', line=dict(color='red'))) # Out of sample forecast


fig.update_layout(
    title={
        'text': "<b>Out of Sample Test Set Forecast for ARMA(1,1)</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Vale Price USD",
    showlegend=False)



##### ARMA(1, 1) #####
- Training MSE:    0.004
- Test MSE:  5.999


**PSEUDO OUT-OF-SMAPLE FORECAST**

In [85]:
# split dataset
train, test = train_test_split( open_p, 0.2 )
n_train = len(train)
n_test = len(test)

# train ARMA(1, 1) model
arma11 = ARIMA(train, order=(1, 0, 1)).fit()

# Setup forecasts
forecasts = {}

# Save initial forecast
forecasts[n_train] = arma11.forecast()

# Step through the rest of the sample
for t in range(1,n_test):
    # Update the results by appending the next observation
    arma11 = arma11.append(test[t-1:t], refit=False)

    # Save the new set of forecasts
    forecasts[n_train+t] = arma11.forecast()

# plot
y_pred = np.array([x[0] for x in forecasts.values()])
print(f'Test MSE:        {mean_squared_error(test, y_pred )}')

fig = go.Figure()

fig.add_trace(go.Scatter(x=dates[n_train-n_test:n_train], y=train[-n_test:], mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series

fig.add_trace(go.Scatter(x=dates[n_train:], y=test, mode='lines', name='VALE open USD', line=dict(color='blue'))) # Price series
fig.add_trace(go.Scatter(x=dates[n_train:], y=y_pred, mode='lines', name='0', line=dict(color='red'))) # Price series


fig.update_layout(
    title={
        'text': "<b>Pseudo Out-of-Sample Forecast for ARMA(1,1)</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Vale Price USD",
    showlegend=False)


Test MSE:        0.12337914791453146


**Fit an ARMA model to the return time series**

In [86]:
train, test = train_test_split(log_r, 0.2)

In [91]:
criterion_dict = multi_arma_aic_bic_mse( train, 2, 1 )
print(criterion_dict)

{'ARMA(1,1)': {'AIC': -988.515, 'BIC': -974.609, 'MSE': 0.00081}, 'ARMA(2,1)': {'AIC': -986.749, 'BIC': -969.366, 'MSE': 0.00082}}


In [93]:
# train ARMA(1, 1) model
arma21 = ARIMA(train, order=(2, 0, 1)).fit()

# print coeffients
print_model_summary(arma21)

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  239
Model:                 ARIMA(2, 0, 1)   Log Likelihood                 498.374
Date:                Thu, 16 Feb 2023   AIC                           -986.749
Time:                        10:58:52   BIC                           -969.366
Sample:                             0   HQIC                          -979.744
                                - 239                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0005      0.002      0.258      0.796      -0.003       0.004
ar.L1          0.0256      1.625      0.016      0.987      -3.160       3.212
ar.L2         -0.0403      0.109     -0.371      0.7

In [94]:
# split dataset
train, test = train_test_split( linear_r, 0.2 )
n_train = len(train)
n_test = len(test)


# train ARMA(1, 1) model
arma21 = ARIMA(train, order=(2, 0, 1)).fit()

# get training error
y_train_pred_arma11  = arma21.predict(start=3, end = len(train))

# make forecasts
y_test_pred_arma11 = arma21.forecast(len(test))

# get MSE
print('##### ARMA(1, 1) #####')
print(f'- Training MSE:    {mean_squared_error(train[2:], y_train_pred_arma11):.3f}')
print(f'- Test MSE:  {mean_squared_error(test, y_test_pred_arma11 ):.6f}')

# plot

fcast = arma21.get_forecast(steps = len(test)).summary_frame()

y_pred = fcast['mean']
y_pred_mean_ci_lower = fcast['mean_ci_lower'].values
y_pred_mean_ci_upper = fcast['mean_ci_upper'].values

fig = go.Figure()

fig.add_trace(go.Scatter(x=dates[n_train:], y=y_pred_mean_ci_lower, mode='lines', name='0', line=dict(color='grey'))) # Lower CI
fig.add_trace(go.Scatter(x=dates[n_train:], y=y_pred_mean_ci_upper, mode='lines', name='0', line=dict(color='grey'), fill='tonexty')) # Upper CI

fig.add_trace(go.Scatter(x=dates[n_train-n_test:n_train], y=train[-n_test:], mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Train Price series
fig.add_trace(go.Scatter(x=dates[n_train:], y=test, mode='lines', name='VALE open USD', line=dict(color='blue'))) # Test Price series

fig.add_trace(go.Scatter(x=dates[n_train:], y=y_test_pred_arma11, mode='lines', name='VALE open USD', line=dict(color='red'))) # Out of sample forecast


fig.update_layout(
    title={
        'text': "<b>Out of Sample Log Return Test Set Forecast for ARMA(2,1)</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Vale Price USD",
    showlegend=False)

##### ARMA(1, 1) #####
- Training MSE:    0.001
- Test MSE:  0.000465


In [95]:
# split dataset
train, test = train_test_split( log_r, 0.2 )
n_train = len(train)
n_test = len(test)

# train ARMA(2, 1) model
arma21 = ARIMA(train, order=(2, 0, 1)).fit()

# Setup forecasts
forecasts = {}

# Save initial forecast
forecasts[n_train] = arma21.forecast()

# Step through the rest of the sample
for t in range(1,n_test):
    # Update the results by appending the next observation
    arma21 = arma21.append(test[t-1:t], refit=False)

    # Save the new set of forecasts
    forecasts[n_train+t] = arma21.forecast()

# plot
y_pred = np.array([x[0] for x in forecasts.values()])
print(f'Test MSE:        {mean_squared_error(test, y_pred )}')

fig = go.Figure()

fig.add_trace(go.Scatter(x=dates[n_train-n_test:n_train], y=train[-n_test:], mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series

fig.add_trace(go.Scatter(x=dates[n_train:], y=test, mode='lines', name='VALE open USD', line=dict(color='blue'))) # Price series
fig.add_trace(go.Scatter(x=dates[n_train:], y=y_pred, mode='lines', name='0', line=dict(color='red'))) # Price series


fig.update_layout(
    title={
        'text': "<b>Pseudo Out of Sample Log Return Test Set Forecast for ARMA(2,1)</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Vale Price USD",
    showlegend=False)


Test MSE:        0.0004602429826990649


In [66]:
# split dataset
train, test = train_test_split( log_r, 0.2 )
n_train = len(train)
n_test = len(test)

# train ARMA(2, 1) model
arma21 = ARIMA(train, order=(2, 0, 1)).fit()

# predict
y_pred = arma21.predict(start=3, end = len(train))

fig = go.Figure()

fig.add_trace(go.Scatter(x=dates, y=train, mode='lines', name='VALE open USD', line=dict(color='#000000'))) # Price series

fig.add_trace(go.Scatter(x=dates[2:], y=y_pred, mode='lines', name='0', line=dict(color='red'))) # Price series


fig.update_layout(
    title={
        'text': "<b>Pseudo Out-of-Sample Forecast for ARMA(2,1)</b>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Date",
    yaxis_title="Vale Price USD",
    showlegend=False)


# 5. Gaussianity and Stationarity test

**Introduce mathematically a Gaussianity test**

The Shapiro-Wilk test is a statistical test used to test the null hypothesis that a sample of data comes from a normal distribution. In the context of ARMA models, the Shapiro-Wilk test can be used to test whether the residuals of the model follow a normal distribution specifically in our context whether they follow a white noise distribution i.e $w_t \sim N(0, \sigma_w^2)$.

The test statistic for the Shapiro-Wilk test is given by:

$$ W = \frac{\left( \sum_{i=1}^T a_i r_{(i)} \right)^2}{\sum_{i=1}^T (r_{(i)} - \mu)^2} $$

where $r_{(i)}$ are the ordered sample values of the time series, and $a_i$ are the coefficients that depend on the sample size given by,

$$ a_i = \frac{m^T V^{-1}}{N} $$

where N is a normalization factor., such that $\sum_{i=1}^T a_i^2 = 1$, and $m$ is the vector of expected values of all the order statistics in a Gaussian distribution, and $V$ is the expected covariance of pairs of order statistics. The test statistic $W$ measures the degree of departure of the sample from the normal distribution. Under the null hypothesis of normality, the Shapiro-Wilk test statistic follows a distribution that is close to a standard normal distribution.

If the p-value is less than the significance level (e.g., 0.05), then the null hypothesis of normality is rejected, and it is concluded that the residuals are not normally distributed, and therefore the residuals do not follow a normal distribution violating a modelling assumption.



**Perform a Gaussianity test of the return time series**

In [None]:
from scipy.stats import shapiro

In [112]:
def shaprio_wilk_normality_test( sample ):
    """ 
    Computes the test statistic and p-value of 
    the Shapiro-Wilk normality test on a provided sample. \\
    Precisely, it tests the null hypothesis that the give sample was 
    sampled from a normal population. i.e \\
    H0: Sample is drawn from normal population \\
    H1: Sample is not drawn from normal population \\

    Arguments:
    ----------
    sample (array_like) : sample to be tested

    Returns:
    ---------
    W (float) : test stiatics
    p_value (float) : p_value for associated test statistic W
    rejected (bool) : truth value of test 
    """
    # get stats
    W, p_value = shapiro(sample)
    rejected = False

    # accept / reject
    if p_value >= 0.05:
        print('Insuficient evidence to reject null, i.e sample is normally distributed')
        print(f'Test Statistics:  |   {W:.3f}')
        print(f'p_value:          |   {p_value:.3f}')
        print(f'Null rejected:    |   {rejected}')
    else:
        rejected = True
        print('Evidence rejects null, i.e sample is not normally distributed')
        print(f'Test Statistics:  |   {W:.3f}')
        print(f'p_value:          |   {p_value:.3f}')
        print(f'Null rejected:    |   {rejected}')
        
    
    return W, p_value, rejected


In [110]:
# Normality test for Log Return Series
W, p_value, rejected = shaprio_wilk_normality_test( log_r )

Insuficient evidence to reject null, i.e sample is normally distributed
Test Statistics:  |   0.996
p_value:          |   0.737
Null rejected:    |   False


In [111]:
# Normality test for ARMA(2,1) residuals on Log Return Series
W, p_value, rejected = shaprio_wilk_normality_test( arma21.resid )

Insuficient evidence to reject null, i.e sample is normally distributed
Test Statistics:  |   0.997
p_value:          |   0.858
Null rejected:    |   False


**Introduce mathematically a stationarity test**


_The Augmented Dickey-Fuller (ADF) test is a statistical test for stationarity._ Specifically, the ADF test tests the null hypothesis that a unit root is present in the sample of the time series (i.n $H_0$: Series is not stationary) against the alternative that no such root is present (i.e $H_1$: series is stationary).

The testing procedure for the ADF test uses the augmented Dickey-Fuller regression equation, which takes the form:

$$ \Delta y_t = \alpha + \beta t + \gamma t_{t-1} + \delta_1 \Delta y_{t-1} + ... + \delta_{p-1} \Delta y_{t-p+1} + \epsilon_t $$


where:

* $\Delta y_t$ is the first difference of the time series $y_t$
* $\gamma$ is the coefficient of the lagged value $y_{t-1}$ which tests for the presence of a unit root
* $p$ is the number of lags
* $\delta_i$ are the coefficients of the lagged differences 
* $\epsilon$ is the error term (assumed to be white noise)

Formally, the ADF test specified the null $H_0: \gamma = 0$ (that the series is stationary) against the alternative $H_1: \gamma > 0$ by computing the test statistics:

$$ DF_{p} = \frac{\hat{\gamma}}{SE(\hat{\gamma})} $$ 

The test statistic and its p-value is compared to a critical value from a table based on the sample size and significance level.

**Perform a stationarity test of the return time series**

In [135]:
from statsmodels.tsa.stattools import adfuller

In [136]:
def adfull_test( sample ):
    """ 
    Computes the test statistic and p-value of 
    the ADF stationarity test on a provided sample. \\
    Precisely, it tests the null hypothesis that the give sample has 
    a unit root i.e \\
    H0: Sample has unit root => non-stationary \\
    H1: Sample does not have unit root => stationary \\

    Arguments:
    ----------
    sample (array_like) : sample to be tested

    Returns:
    ---------
    DF (float) : test stiatics
    p_value (float) : p_value for associated test statistic W
    rejected (bool) : truth value of test
    """
    DF, p_value, _, _, _, _ = adfuller(sample)
    rejected = False

    # accept / reject
    if p_value >= 0.05:
        print('Insuficient evidence to reject null, i.e sample has unit root => series is non-stationary')
        print(f'Test Statistics:  |   {DF:.3f}')
        print(f'p_value:          |   {p_value:.3f}')
        print(f'Null rejected:    |   {rejected}')
    else:
        rejected = True
        print('Evidence rejects null, i.e sample has no unit root => series is stationary')
        print(f'Test Statistics:  |   {DF:.3f}')
        print(f'p_value:          |   {p_value:.3f}')
        print(f'Null rejected:    |   {rejected}')
    
    return DF, p_value, rejected
    

In [137]:
# Stationarity test on Log Return Series
DF, p_value, rejected = adfull_test(log_r)

Evidence rejects null, i.e sample has no unit root => series is stationary
Test Statistics:  |   -8.373
p_value:          |   0.000
Null rejected:    |   True


In [138]:
# Stationarity test on Original Opening Price Time Series
DF, p_value, rejected = adfull_test(open_p)

Insuficient evidence to reject null, i.e sample has unit root => series is non-stationary
Test Statistics:  |   -2.035
p_value:          |   0.272
Null rejected:    |   False
