In [1]:
# Let's start by importing the libraries we will need below.
# Order matters in this cell: Drive is mounted first, then the project folder is
# added to sys.path, and only then are the project-local modules imported.

# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import math
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot

import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_pacf

import sys
# Extend the module search path with the project folder on Drive.
# NOTE(review): presumably this is also where the `pipelines` package lives — confirm.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/April')  # Here is the location of my stock historical database
from pipelines import loaders, filters, interpolators # Import my own data pipelines.

# Simple marker that the cell ran to completion.
print("Imported Libraries")

Mounted at /content/drive


  import pandas.util.testing as tm


Imported Libraries


In [2]:
# Let's choose from a number of stocks to get a good feel for our EDA
# For argument's sake, let's select two of the worst-performing stocks ($WYNN and $ETSY) and one of the highest-performing stocks ($AAPL); $AMD is loaded as well.
# However, depending on your strategy you might weight trades more to positive momentum, market cap, or trade volume.

tickers = ['AAPL', 'AMD', 'WYNN', 'ETSY']
yf_filepath = '/content/drive/MyDrive/Colab Notebooks/April/data/yf'
columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']  # We will request OHLCV data from our database.

# Keep the complete load result under its own name so the other tickers stay
# available, instead of overwriting it with a single ticker's data.
historicals_by_ticker = loaders.load_historicals(tickers, columns, yf_filepath)

# The cells below work on one ticker at a time; select AAPL from the loaded result.
ticker_historical = historicals_by_ticker['AAPL']

In [3]:
# Raw price data varies in level over time, so we transform it before modelling.
# "Standardization" here means casting the data to log-returns,
# whereas "normalization" would be something akin to moving averages.

# Standardize our dataset so the values that we plug into our model are constrained within a range and repeatable for any point in time.
# Below I go over three common ways to make the data stationary.

def difference_by_previous_data(ticker_historical, periods=1):
  '''Return the change of each value relative to the value `periods` rows earlier.

  Args:
    ticker_historical: pandas Series or DataFrame to difference.
    periods: Row offset passed to ``.diff`` (defaults to 1).

  Returns:
    The differenced data with every row that contains a missing value removed
    (this includes the leading rows that have no earlier value to compare to).
  '''
  return ticker_historical.diff(periods=periods).dropna()

def difference_by_return_percentage(ticker_historical):
  '''Return the row-over-row percentage change of the data.

  Each value is ``(current - previous) / previous * 100`` and is labelled with
  the index of the *current* row, i.e. the return sits on the day it was
  realised.

  The change is computed against the previous row directly (``shift(1)``)
  rather than against the next row followed by a manual index relabel. On
  data without missing values both give the same result, but the relabel
  raised a length-mismatch error whenever the input already contained a NaN,
  because ``dropna`` then removed more than just the final row.

  Args:
    ticker_historical: pandas Series or DataFrame of values.

  Returns:
    Percentage changes with every row that contains a missing value removed
    (this always includes the first row, which has no previous value).
  '''
  previous = ticker_historical.shift(1)
  ticker_diff = ((ticker_historical - previous) / previous) * 100
  return ticker_diff.dropna()
 
def difference_by_log_returns(ticker_historical, base=10):
  '''Return the logarithm of each value's ratio to the previous row.

  Args:
    ticker_historical: pandas Series or DataFrame of values.
    base: ``10`` selects ``log10`` of the ratio multiplied by 100. Any other
      value selects the natural logarithm of the ratio, which is NOT
      multiplied by 100 (the value of `base` itself is not used as a base).

  Returns:
    The log ratios with every row that contains a missing value removed
    (this includes the first row, which has no previous value).
  '''
  previous = ticker_historical.shift(1)
  ratio = ticker_historical / previous
  if base == 10:
    # Base-10 branch is the only one expressed on a x100 scale.
    return (np.log10(ratio) * 100).dropna()
  return np.log(ratio).dropna()

def plot_stationary_graphs(ticker_diff):
  '''Draw a 2x2 diagnostic figure for a differenced series and print summary stats.

  Panels:
    top-left:     autocorrelation plot
    top-right:    partial autocorrelation plot
    bottom-left:  histogram of the values
    bottom-right: Q-Q plot with a 45-degree reference line

  Also prints ``describe()``, skewness and kurtosis of `ticker_diff`, then
  shows the figure. Nothing is returned.

  Args:
    ticker_diff: The series to inspect (the notebook passes a single column
      of differenced data).
  '''
  figure_dims = (20, 10)
  # The seaborn style has to be set before the axes are created; axes that
  # already exist do not pick it up.
  sns.set_style('whitegrid')
  fig, axes = plt.subplots(2, 2, figsize=figure_dims)

  autocorrelation_plot(ticker_diff, ax=axes[0][0])
  axes[0][0].set_title('Autocorrelation')

  # plot_pacf draws onto the axes it is given and returns the Figure, so its
  # return value must not be stored back into the axes grid.
  plot_pacf(ticker_diff, ax=axes[0][1])

  sns.histplot(data=ticker_diff, ax=axes[1][0])
  axes[1][0].set_title('Distribution')

  # NOTE(review): the 45-degree line is only a fair reference when the sample
  # is on the same scale as the theoretical distribution; consider line='s'.
  sm.qqplot(ticker_diff, line='45', ax=axes[1][1])
  axes[1][1].set_title('Q-Q Plot')

  print(ticker_diff.describe())
  print('Skewness is {}'.format(ticker_diff.skew()))
  print('Kurtosis is {}'.format(ticker_diff.kurtosis()))
  plt.show()

### Standardization

In [None]:
print("Log Differences\n")
# At some point we will have to look into natural log base vs log10 base
# Drop Volume *before* differencing: the helper removes every row that has a
# missing value in any column, so a missing Volume log-ratio would otherwise
# throw away that day's price log-returns too.
ticker_log_diff = difference_by_log_returns(ticker_historical.drop(columns='Volume'), base=10)
plot_stationary_graphs(ticker_log_diff['Close'])

### Normalization