# Time Series Model on Stock Prices

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import itertools
from statistics import mean, median
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import AutoARIMA
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from datetime import date, timedelta
import yfinance as yf

In [29]:
# Getting the date five years ago to download the current timeframe
years = (date.today() - timedelta(weeks=400)).strftime("%Y-%m-%d")

# Stocks to analyze
stocks = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG']

# Getting the data for multiple stocks
df = yf.download(stocks, start=years)

print("Rows in DataFrame: ", df.shape[0])

[*********************100%***********************]  5 of 5 completed
Rows in DataFrame:  1931


In [30]:
# Storing the dataframes in a dictionary
stock_df = {}

for col in set(df.columns.get_level_values(0)):
    
    # Assigning the information (High, Low, etc.) for each stock in the dictionary
    stock_df[col] = df[col]

# Preprocessing Data

Scale the data using a logarithmic scale.  Also rounding the log result by 2 decimal points in order to reduce any unnecessary noise.

In [31]:
# Finding the log returns
stock_df['LogReturns'] = stock_df['Adj Close'].apply(np.log).diff().dropna()

# Logarithmic scaling of the data and rounding the result
stock_df['LogClose'] = stock_df['Adj Close'].apply(np.log).apply(lambda x: round(x, 2))

# Visualizing the Data

In [32]:
px.line(stock_df['Adj Close'], 
        x=stock_df['Adj Close'].index, 
        y=stock_df['Adj Close'].columns,
        labels={'variable': 'Stock',
                'value': 'Price'},
        title='Adj Close')


In [33]:
px.line(stock_df['LogClose'], 
        x=stock_df['LogClose'].index, 
        y=stock_df['LogClose'].columns,
        labels={'variable': 'Stock',
                'value': 'Log Price'},
        title='Log of Closing Prices')

## Optimum Parameter Search Function

In [34]:
opt_param = AutoARIMA(start_p=0, start_q=0,
                      start_P=0, start_Q=0,
                      max_p=8, max_q=8,
                      max_P=5, max_Q=5,
                      error_action='ignore',
                      information_criterion='bic',
                      suppress_warnings=True)

for stock in tqdm(stocks):

    opt_param.fit(stock_df['LogClose'][stock])

    print(f'Summary for {stock}', '--'*20)
    display(opt_param.summary())

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Summary for FB ----------------------------------------


0,1,2,3
Dep. Variable:,y,No. Observations:,1931.0
Model:,"SARIMAX(0, 1, 0)",Log Likelihood,4644.292
Date:,"Mon, 07 Sep 2020",AIC,-9286.585
Time:,17:59:43,BIC,-9281.019
Sample:,0,HQIC,-9284.538
,- 1931,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
sigma2,0.0005,4.74e-06,100.405,0.000,0.000,0.000

0,1,2,3
Ljung-Box (Q):,56.43,Jarque-Bera (JB):,28758.91
Prob(Q):,0.04,Prob(JB):,0.0
Heteroskedasticity (H):,1.14,Skew:,0.43
Prob(H) (two-sided):,0.11,Kurtosis:,21.89


Summary for AMZN ----------------------------------------


0,1,2,3
Dep. Variable:,y,No. Observations:,1931.0
Model:,"SARIMAX(0, 1, 0)",Log Likelihood,4835.911
Date:,"Mon, 07 Sep 2020",AIC,-9667.822
Time:,17:59:44,BIC,-9656.691
Sample:,0,HQIC,-9663.727
,- 1931,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0013,0.000,2.889,0.004,0.000,0.002
sigma2,0.0004,6.19e-06,63.065,0.000,0.000,0.000

0,1,2,3
Ljung-Box (Q):,51.5,Jarque-Bera (JB):,3161.95
Prob(Q):,0.11,Prob(JB):,0.0
Heteroskedasticity (H):,1.18,Skew:,0.14
Prob(H) (two-sided):,0.04,Kurtosis:,9.26


Summary for AAPL ----------------------------------------


0,1,2,3
Dep. Variable:,y,No. Observations:,1931.0
Model:,"SARIMAX(1, 1, 0)",Log Likelihood,4990.62
Date:,"Mon, 07 Sep 2020",AIC,-9977.241
Time:,17:59:49,BIC,-9966.11
Sample:,0,HQIC,-9973.147
,- 1931,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,-0.0824,0.013,-6.229,0.000,-0.108,-0.056
sigma2,0.0003,5.28e-06,62.931,0.000,0.000,0.000

0,1,2,3
Ljung-Box (Q):,112.99,Jarque-Bera (JB):,4549.85
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,1.86,Skew:,-0.5
Prob(H) (two-sided):,0.0,Kurtosis:,10.46


Summary for NFLX ----------------------------------------


0,1,2,3
Dep. Variable:,y,No. Observations:,1931.0
Model:,"SARIMAX(0, 1, 0)",Log Likelihood,4128.691
Date:,"Mon, 07 Sep 2020",AIC,-8253.382
Time:,17:59:51,BIC,-8242.252
Sample:,0,HQIC,-8249.288
,- 1931,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0019,0.001,2.760,0.006,0.001,0.003
sigma2,0.0008,8.45e-06,96.105,0.000,0.001,0.001

0,1,2,3
Ljung-Box (Q):,39.29,Jarque-Bera (JB):,28894.78
Prob(Q):,0.5,Prob(JB):,0.0
Heteroskedasticity (H):,0.7,Skew:,1.28
Prob(H) (two-sided):,0.0,Kurtosis:,21.78


Summary for GOOG ----------------------------------------


0,1,2,3
Dep. Variable:,y,No. Observations:,1931.0
Model:,"SARIMAX(0, 1, 1)",Log Likelihood,5190.825
Date:,"Mon, 07 Sep 2020",AIC,-10377.65
Time:,17:59:58,BIC,-10366.52
Sample:,0,HQIC,-10373.556
,- 1931,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ma.L1,-0.0994,0.015,-6.662,0.000,-0.129,-0.070
sigma2,0.0003,3.58e-06,75.295,0.000,0.000,0.000

0,1,2,3
Ljung-Box (Q):,102.14,Jarque-Bera (JB):,7830.23
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,1.65,Skew:,0.43
Prob(H) (two-sided):,0.0,Kurtosis:,12.83





# Using the ARIMA Model
Using the price history from the past N days to make predictions

In [35]:
# Days in the past to train on
days_to_train = 90 

# Days in the future to predict
days_to_predict = 10

# Establishing a new DFs for predictions
stock_df['Predictions'] = pd.DataFrame(index=stock_df['LogClose'].index,
                                       columns=stock_df['LogClose'].columns)

# Iterate through each stock
for stock in tqdm(stocks):
    
    # Training a model for each day and getting predictions
    for day in tqdm(range(days_to_train, stock_df['LogClose'].shape[0], days_to_predict)):

        # Data to use, containing rolling amount of past days
        training = stock_df['LogClose'][stock].iloc[day-days_to_train:day].dropna()

        # Finding the best parameters
        model    = AutoARIMA(start_p=0, start_q=0,
                             start_P=0, start_Q=0,
                             max_p=8, max_q=8,
                             max_P=5, max_Q=5,
                             error_action='ignore',
                             information_criterion='aic',
                             suppress_warnings=True)

        # Getting predictions for the optimum parameters by fitting to the training set            
        forecast = model.fit_predict(training,
                                     n_periods=days_to_predict)


        # Getting the overall average prediction for the next N days
        stock_df['Predictions'][stock].iloc[day-days_to_predict:day] = np.exp(mean(forecast))


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=185.0), HTML(value='')))





# Predictions vs Actual Values

In [36]:
# Shift ahead by 2 to compare the actual values to the predictions
pred_df = stock_df['Predictions'].shift(2).astype(float).dropna()

pred_df

Unnamed: 0_level_0,AAPL,AMZN,FB,GOOG,NFLX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-05-06,13.463738,267.735620,26.575773,461.089922,35.367469
2013-05-07,13.463738,267.735620,26.575773,461.089922,35.367469
2013-05-08,13.463738,267.735620,26.575773,461.089922,35.367469
2013-05-09,13.463738,267.735620,26.575773,461.089922,35.367469
2013-05-10,13.463738,267.735620,26.575773,461.089922,35.367469
...,...,...,...,...,...
2020-08-31,124.229954,3516.374454,297.293117,1676.309393,523.218940
2020-09-01,124.229954,3516.374454,297.293117,1676.309393,523.218940
2020-09-02,124.229954,3516.374454,297.293117,1676.309393,523.218940
2020-09-03,124.229954,3516.374454,297.293117,1676.309393,523.218940


## Plotting the Predictions
Comparing the actual values with the predictions

In [37]:
for stock in stocks:
    
    fig = go.Figure()
    
    # Plotting the actual moving average values
    fig.add_trace(go.Scatter(x=pred_df.index,
                             y=stock_df['Adj Close'][stock].loc[pred_df.index],
                             name='Actual Adj Close',
                             mode='lines'))
    
    # Plotting the predicted moving average value
    fig.add_trace(go.Scatter(x=pred_df.index,
                             y=pred_df[stock],
                             name='Predicted Adj Close',
                             mode='lines'))
    
    # Setting the labels
    fig.update_layout(title=f'Predicting the Average Adj Close for the Next {days_to_predict} days for {stock}',
                      xaxis_title='Date',
                      yaxis_title='Prices')
    
    fig.show()

## Evaluation Metric

In [38]:
for stock in stocks:
    
    # Finding the root mean squared error
    rmse = mean_squared_error(stock_df['Adj Close'][stock].loc[pred_df.index],
                              pred_df[stock],
                              squared=False)

    print(f"On average, the model is off by ${round(rmse, 2)} for {stock}\n")

On average, the model is off by $5.65 for FB

On average, the model is off by $55.17 for AMZN

On average, the model is off by $1.89 for AAPL

On average, the model is off by $11.03 for NFLX

On average, the model is off by $29.88 for GOOG



# Trading Signal
Turning the model into a Trading Signal

In [39]:
def get_positions(difference, thres=3, short=True):
    """
    Compares the percentage difference between actual values and the respective predictions.
    
    Returns the decision or positions to long or short based on the difference.
    
    Optional: shorting in addition to buying
    """
    
    if difference > thres/100:
        
        return 1
    
    
    elif short and difference < -thres/100:
        
        return -1
    
    
    else:
        
        return 0

### Creating a Trading DF
__Note:__ _On Preventing Lookahead Bias_

For example, if the model is ran after hours and a position is established on the next day's opening, then a shift ahead of 1 is ok.  But if a position is established on the next day, near the close, then it needs to be shifted ahead by 2, because the position missed any gains or losses that day.  These are due to the fact that gains or losses in the day are determined when a trade is entered.

(This can also determine how long the predicted forecast remains valid.)

In [47]:
# Creating a DF for trading the model
trade_df = {}

# Getting the percentage difference between the predictions and the actual values
trade_df['PercentDiff'] = (stock_df['Predictions'].dropna() / stock_df['Adj Close'].loc[stock_df['Predictions'].dropna().index]) - 1

# Getting positions and preventing lookahead bias
trade_df['Positions'] = trade_df['PercentDiff'].applymap(lambda x: get_positions(x, thres=1, short=False) / len(stocks))

trade_df['Positions'] = trade_df['Positions'].shift(2).dropna()

# Getting Log Returns
trade_df['LogReturns'] = stock_df['LogReturns'].loc[trade_df['Positions'].index]                                    
    
display(trade_df['PercentDiff'])
display(trade_df['Positions'])

Unnamed: 0_level_0,AAPL,AMZN,FB,GOOG,NFLX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-05-02,-0.0350395,0.0601291,-0.082645,0.115749,0.154237
2013-05-03,-0.0446038,0.0375339,-0.0612585,0.0944951,0.159861
2013-05-06,-0.066855,0.0469874,-0.0360619,0.074385,0.175055
2013-05-07,-0.0626845,0.0388221,-0.0116856,0.0797994,0.20035
2013-05-08,-0.0731521,0.0350071,-0.0200674,0.0595291,0.186771
...,...,...,...,...,...
2020-08-28,-0.00462752,0.0336805,0.0123718,0.0193987,-0.00128095
2020-08-31,-0.0372756,0.0189554,0.0139601,0.0257801,-0.0119742
2020-09-01,-0.0741544,0.00493105,0.00627239,0.00939323,-0.0598887
2020-09-02,-0.0545665,-0.00426893,-0.0172128,-0.0300707,-0.0535799


Unnamed: 0_level_0,AAPL,AMZN,FB,GOOG,NFLX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-05-06,0.0,0.2,0.0,0.2,0.2
2013-05-07,0.0,0.2,0.0,0.2,0.2
2013-05-08,0.0,0.2,0.0,0.2,0.2
2013-05-09,0.0,0.2,0.0,0.2,0.2
2013-05-10,0.0,0.2,0.0,0.2,0.2
...,...,...,...,...,...
2020-08-28,0.0,0.2,0.0,0.2,0.0
2020-08-31,0.0,0.2,0.2,0.2,0.0
2020-09-01,0.0,0.2,0.2,0.2,0.0
2020-09-02,0.0,0.2,0.2,0.2,0.0


## Plotting the Positions

In [48]:
# Getting the number of positions
pos = trade_df['Positions'].apply(pd.value_counts)

# Plotting total positions
fig = px.bar(pos, 
             x=pos.index, 
             y=pos.columns,
             title='Total Positions',
             labels={'variable':'Stocks',
                      'value':'Count of Positions',
                      'index':'Position'})

fig.show()


# Calculating and Plotting the Potential Returns

## Returns on Each Individual Stock

In [49]:
# Calculating Returns by multiplying the positions by the log returns
returns = trade_df['Positions'] * trade_df['LogReturns']

# Calculating the performance as we take the cumulative sum of the returns and transform the values back to normal
performance = returns.cumsum().apply(np.exp)

# Plotting the performance per stock
px.line(performance,
        x=performance.index,
        y=performance.columns,
        title='Returns Per Stock Using ARIMA Forecast',
        labels={'variable':'Stocks',
                'value':'Returns'})

## Returns on the Overall Portfolio

In [50]:
# Returns for the portfolio
returns = (trade_df['Positions'] * trade_df['LogReturns']).sum(axis=1)

# Returns for SPY
spy = yf.download('SPY', start=returns.index[0])

spy = spy['Adj Close'].apply(np.log).diff().dropna().cumsum().apply(np.exp)

# Calculating the performance as we take the cumulative sum of the returns and transform the values back to normal
performance = returns.cumsum().apply(np.exp)


# Plotting the comparison between SPY returns and GARCH returns
fig = go.Figure()

fig.add_trace(go.Scatter(x=spy.index,
                         y=spy,
                         name='SPY Returns',
                         mode='lines'))

fig.add_trace(go.Scatter(x=performance.index,
                         y=performance.values,
                         name='ARIMA Returns on Portfolio',
                         mode='lines'))

fig.update_layout(title='SPY vs ARIMA Overall Portfolio Returns',
                  xaxis_title='Date',
                  yaxis_title='Returns')

fig.show()

[*********************100%***********************]  1 of 1 completed
