## Libraries

In [58]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import yfinance as yf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

from functions import grangerTests, plot_seasonal_decompose

## Functions

In [59]:
def daySignal(day_change):
    """Map a daily price change onto a direction flag.

    Parameters
    ----------
    day_change : number
        Difference between the closing and opening price for one day.

    Returns
    -------
    int
        1 when the change is positive, -1 when it is negative, and 0
        otherwise (a zero change, or a value such as NaN that compares
        neither above nor below zero).
    """
    # Guard clause for gains; everything else is either a loss or flat.
    if day_change > 0:
        return 1
    return -1 if day_change < 0 else 0

def addDateParts(df):
    """Add calendar features derived from the frame's DatetimeIndex.

    The columns are written onto ``df`` itself (the input is modified in
    place) and the same frame is returned so the call can be chained or
    re-assigned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is a ``DatetimeIndex``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with four extra columns:

        * ``day_of_year`` - ordinal day within the year (1-366)
        * ``day_of_week`` - Monday=0 ... Sunday=6
        * ``week``        - ISO 8601 week number (nullable ``UInt32``)
        * ``month``       - month number (1-12)
    """
    # `.day` is the day of the *month*; the column name calls for `.dayofyear`.
    df['day_of_year'] = df.index.dayofyear
    df['day_of_week'] = df.index.dayofweek
    # `DatetimeIndex.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is the supported replacement and yields the same
    # ISO week numbers. The resulting Series shares df's index, so it aligns 1:1.
    df['week'] = df.index.isocalendar().week
    df['month'] = df.index.month
    return df

## Get Data

In [60]:
# Yahoo Finance symbol to analyse (the alternative 'MSFT' is kept for quick switching).
tickerSymbol = 'ETH-USD' #'MSFT'

# Requested date window. `end` is "now", so the downloaded frame grows every
# time this cell is re-run.
start = dt.datetime(2010,1,1)
end = dt.datetime.now()

# Handle for the ticker.
tickerData = yf.Ticker(tickerSymbol)
# Daily price history for the window. The bar size is controlled by `interval`;
# `period` is a look-back range that yfinance ignores once `start` is supplied,
# so passing period='1D' alongside start/end had no effect.
df = tickerData.history(interval='1d', start=start, end=end)

# yfinance returns an empty frame (rather than raising) when the download
# fails or the symbol is unknown; stop here instead of failing later in an
# unrelated cell.
if df.empty:
    raise ValueError(f"No price history returned for {tickerSymbol!r}")

# Drop columns that hold a single distinct value: they carry no information.
df = df.loc[:, df.nunique() != 1]

# Derived intraday features. `assign` builds a new frame, so nothing is
# written into the `.loc` slice created above.
df = df.assign(
    # Absolute open-to-close move.
    day_change=lambda d: d['Close'] - d['Open'],
    # The same move as a percentage of the open.
    day_change_pct=lambda d: (d['day_change'] / d['Open']) * 100,
    # Direction of the move: 1 up, -1 down, 0 flat.
    day_change_signal=lambda d: d['day_change'].apply(daySignal),
)

# Optional (disabled): lag close, high and low by one day
#for feature in ['Close', 'High', 'Low']:
#    df[f"{feature}_lag1"] = df[feature].shift(1).bfill()

# Optional (disabled): add calendar features from the datetime index
#df = addDateParts(df)

# Move the datetime index into a regular 'Date' column (index becomes 0..n-1).
df = df.reset_index()

# View dataframe
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,day_change,day_change_pct,day_change_signal
0,2017-11-09 00:00:00+00:00,308.644989,329.451996,307.056000,320.884003,893249984,12.239014,3.965402,1
1,2017-11-10 00:00:00+00:00,320.670990,324.717987,294.541992,299.252991,885985984,-21.417999,-6.679120,-1
2,2017-11-11 00:00:00+00:00,298.585999,319.453003,298.191986,314.681000,842300992,16.095001,5.390407,1
3,2017-11-12 00:00:00+00:00,314.690002,319.153015,298.513000,307.907990,1613479936,-6.782013,-2.155141,-1
4,2017-11-13 00:00:00+00:00,307.024994,328.415009,307.024994,316.716003,1041889984,9.691010,3.156424,1
...,...,...,...,...,...,...,...,...,...
1902,2023-01-24 00:00:00+00:00,1627.848267,1639.723877,1551.389771,1556.604248,8180274691,-71.244019,-4.376576,-1
1903,2023-01-25 00:00:00+00:00,1556.807495,1632.241699,1530.797852,1611.711060,10598973448,54.903564,3.526677,1
1904,2023-01-26 00:00:00+00:00,1611.080933,1626.198242,1586.598145,1603.105957,8395315241,-7.974976,-0.495008,-1
1905,2023-01-27 00:00:00+00:00,1603.080078,1617.000854,1565.244995,1598.156494,8124465373,-4.923584,-0.307133,-1


In [61]:
# Print column dtypes, non-null counts and memory usage to check the frame built in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1907 entries, 0 to 1906
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   Date               1907 non-null   datetime64[ns, UTC]
 1   Open               1907 non-null   float64            
 2   High               1907 non-null   float64            
 3   Low                1907 non-null   float64            
 4   Close              1907 non-null   float64            
 5   Volume             1907 non-null   int64              
 6   day_change         1907 non-null   float64            
 7   day_change_pct     1907 non-null   float64            
 8   day_change_signal  1907 non-null   int64              
dtypes: datetime64[ns, UTC](1), float64(6), int64(2)
memory usage: 134.2 KB


## EDA

### Descriptive statistics

In [62]:
# Summary statistics (count, mean, std, quartiles, min/max), transposed so each feature is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,1907.0,1134.215,1180.549,84.27969,211.8501,527.194,1745.32,4810.071
High,1907.0,1170.456,1216.56,85.34274,217.9251,543.717,1802.073,4891.705
Low,1907.0,1093.216,1138.905,82.82989,207.7129,511.889,1680.912,4718.039
Close,1907.0,1134.642,1179.966,84.3083,211.6746,531.702,1745.43,4812.087
Volume,1907.0,12815610000.0,10737550000.0,621733000.0,4483747000.0,10463090000.0,18086390000.0,84482910000.0
day_change,1907.0,0.4265643,79.05506,-921.978,-11.68945,0.3034668,14.93256,543.655
day_change_pct,1907.0,0.2012952,5.032611,-42.30885,-2.152596,0.1006195,2.691741,25.95129
day_change_signal,1907.0,0.02149974,1.000031,-1.0,-1.0,1.0,1.0,1.0


### Correlation

In [63]:
# Pairwise correlation of the numeric features, rendered as an annotated heatmap.
# `df` also carries the tz-aware 'Date' column; DataFrame.corr() no longer
# drops non-numeric columns silently (it warns on pandas 1.5 and raises on
# 2.x), so restrict the input to numeric dtypes explicitly.
corr_matrix = df.select_dtypes(include='number').corr()
fig = px.imshow(corr_matrix, text_auto=True, aspect='auto')
fig.show()





### Decomposition

In [64]:
# Column to decompose; the stationarity cell further down tests the same column.
feat_ex = 'Close'
# Number of observations per seasonal cycle. The rows are daily, so 365
# models a yearly cycle.
SEASONAL_PERIOD = 365

# Additive model: observed = trend + seasonal + residual.
# (`seasonal_decompose` is already imported in the Libraries cell.)
decomposition = seasonal_decompose(df[feat_ex], model='additive', period=SEASONAL_PERIOD)
# `plot_seasonal_decompose` is a project helper from the local `functions`
# module; `.show()` is called on whatever figure object it returns.
fig = plot_seasonal_decompose(decomposition, dates=df['Date'])
fig.show()

### Stationarity

In [65]:
# Augmented Dickey-Fuller test on `feat_ex`. The null hypothesis is that the
# series has a unit root (is non-stationary); a p-value above ADF_ALPHA means
# we cannot reject it, in which case every non-date column is replaced by its
# first difference and the test is repeated.
ADF_ALPHA = 0.05

# adfuller returns a tuple: [0] test statistic, [1] p-value, [4] critical values.
result = adfuller(df[feat_ex])
print(f'Test Statistics: {result[0]}')
print(f'p-value: {result[1]}')
print(f'critical_values: {result[4]}')
print("------------------")
if result[1] > ADF_ALPHA:
    print("Series is not stationary and will be diff'd")
    print("-------------------------------------------")
    # Keep the undifferenced frame reachable: `df` is overwritten below and
    # every later cell that reads `df` will see differences, not levels.
    df_levels = df.copy()
    # Difference everything except the date. An explicit `!=` is used because
    # `col not in ('Date')` is a substring test against the string 'Date'
    # (the parentheses do not make a tuple) and would also skip columns
    # named e.g. 'D' or 'ate'.
    value_cols = [col for col in df.columns if col != 'Date']
    df = (
        df.assign(**{col: df[col].diff() for col in value_cols})
          .dropna()  # the first row has no predecessor, so its diff is NaN
    )
    # Retest
    result = adfuller(df[feat_ex])
    if result[1] > ADF_ALPHA:
        print("Series is still not stationary after diff")
    else:
        print("Series is stationary after diff")
else:
    print("Series is stationary")

Test Statistics: -1.403060161763486
p-value: 0.5808081623666652
critical_values: {'1%': -3.4338164946764294, '5%': -2.863071261312085, '10%': -2.5675851863579537}
------------------
Series is not stationary and will be diff'd
-------------------------------------------
Series is stationary after diff


### Causality

In [67]:
# Granger-causality screen of the other columns against 'Close'.
# `grangerTests` is imported from the local `functions` module and its body is not visible here;
# judging by the printed output the arguments are (frame, target column, maximum lag) - TODO confirm.
# NOTE(review): if the stationarity cell took its differencing branch, `df` holds first differences here, not price levels.
grangerTests(df, 'Close', 5)

Open granger causes Close at lag: 1. P=0.0
Open granger causes Close at lag: 2. P=0.0
Open granger causes Close at lag: 3. P=0.0
Open granger causes Close at lag: 4. P=0.0
Open granger causes Close at lag: 5. P=0.0
High granger causes Close at lag: 1. P=0.0
High granger causes Close at lag: 2. P=0.0
High granger causes Close at lag: 3. P=0.0
High granger causes Close at lag: 4. P=0.0
High granger causes Close at lag: 5. P=0.0
Low granger causes Close at lag: 1. P=0.0
Low granger causes Close at lag: 2. P=0.0
Low granger causes Close at lag: 3. P=0.0
Low granger causes Close at lag: 4. P=0.0
Low granger causes Close at lag: 5. P=0.0
Volume granger causes Close at lag: 1. P=0.024
Volume granger causes Close at lag: 2. P=0.003
Volume granger causes Close at lag: 3. P=0.001
Volume granger causes Close at lag: 4. P=0.001
Volume granger causes Close at lag: 5. P=0.0
day_change granger causes Close at lag: 1. P=0.0
day_change granger causes Close at lag: 2. P=0.0
day_change granger causes Clo

### Charting

### Profit/loss

In [68]:
# Histogram of the `day_change_signal` column.
# NOTE(review): the stationarity cell replaces every non-Date column of `df` with its first difference
# when the ADF test fails, so this may be plotting day-over-day changes in the signal rather than the
# original -1/0/1 values - confirm which one is intended.
px.histogram(df, 'day_change_signal', title='Histogram of day change signal')

In [69]:
# Line chart of the 'Open' and 'Close' columns over 'Date'.
# NOTE(review): if the stationarity cell differenced `df`, these are daily changes in Open/Close,
# not price levels - confirm which one is intended.
px.line(df, x='Date', y=['Open','Close'])

In [70]:
# Line chart of the 'day_change_pct' column over 'Date'.
# NOTE(review): if the stationarity cell differenced `df`, this is the day-over-day change of the
# percentage move, not the percentage move itself - confirm which one is intended.
px.line(df, x='Date', y=['day_change_pct'])