In [1]:
import datetime as dt
import os
import time

import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import grangercausalitytests

In [2]:
# Analysis period; both frames are sliced to this date range after loading.
start_date = "2016-01-04"
end_date = "2020-03-01"

In [3]:
# Price-series identifier; used in the price CSV path and in the output file name.
exchange = "btc-usd"

In [4]:
# Daily price table for the selected exchange.
data1 = pd.read_csv(f"generated_price_data/{exchange}_daily.csv")

In [5]:
# User metric: either "wiki_pageview" or "trends".
um = "wiki_pageview"
# Article name or keyword name, depending on the metric.
ak = "Bitcoin"
# Time unit of the metric file.
tu = "daily"

In [6]:
# User-metric table for the selected metric, article/keyword and time unit.
data2 = pd.read_csv(f"generated_{um}_data/{ak}_{tu}.csv")

In [7]:
## Restrict both tables to the analysis period
# Index each table by its 'day' column parsed as timestamps so that rows can be
# selected with a date-string slice.
data1.index = list(map(pd.Timestamp, data1['day']))
data2.index = list(map(pd.Timestamp, data2['day']))

# Label-based slice (both end points included). Work on copies so that later
# in-place edits do not touch the loaded tables.
d1 = data1.loc[start_date:end_date].copy()
d2 = data2.loc[start_date:end_date].copy()

In [8]:
# Keep a single row per day in each table (the first occurrence wins).
d1 = d1.drop_duplicates(subset=['day'], keep='first')
d2 = d2.drop_duplicates(subset=['day'], keep='first')

In [9]:
# Row/column counts of the two filtered tables, one per line.
for frame in (d1, d2):
    print(frame.shape)

(1519, 8)
(1519, 17)


In [10]:
# Base column names; a transformation prefix is prepended before lookup.
feature_names1 = ["close"]
# First element is either "trend" or "pageview".
feature_names2 = ["pageview", "sma10", "sma21"]

In [11]:
# Column-name prefixes of the transformed series that will be tested.
features_interested = [
    "log_",
    "log_diff_",
    "diff_ratio_",
]

In [12]:
#stationarity test
def is_stationary(x, alpha=0.05):
    """Tell whether the Augmented Dickey-Fuller test rejects a unit root for ``x``.

    NaN entries are removed before the test is run.

    Parameters
    ----------
    x : array-like
        Numeric observations of the series.
    alpha : float, default 0.05
        Significance level; the series counts as stationary when the ADF
        p-value is strictly below it.

    Returns
    -------
    bool
        True when the p-value is below ``alpha``. False otherwise, including
        when the test raises for any reason.
    """
    try:
        ts = np.array(x)
        ts = ts[~np.isnan(ts)]
        p_value = adfuller(ts)[1]
    except Exception:
        # Best effort: a series that cannot be tested is treated as
        # non-stationary. Only ordinary errors are caught, so interrupting the
        # kernel (KeyboardInterrupt) still stops a long-running caller.
        return False
    return bool(p_value < alpha)

In [13]:
# Result rows, keyed by a running integer row id.
data_dict = {}
# Next free row id in data_dict.
i = 0

# Largest lag order passed to the Granger test.
max_lag = 15
# Rolling-window lengths, counted in rows of the filtered tables.
window_size = [120, 360, 720]

# Keys selecting the chi-square and F statistics in each per-lag test result.
chisq_test = 'ssr_chi2test'
F_test = 'ssr_ftest'

In [14]:
# Rolling-window Granger causality scan.
#
# For every transformation prefix, every (d1 column, d2 column) pair and every
# window length, slide a window over the rows and, when both windowed series
# pass the stationarity check, run grangercausalitytests on them. One result
# row per lag (1..max_lag) is stored in data_dict; windows that are skipped or
# whose test fails get NaN p-values.
#
# Per the statsmodels documentation, the test's null hypothesis is that the
# second column ('d2') does not Granger-cause the first column ('d1').
st = time.time()

# Start from an empty result store on every run, so that re-running this cell
# does not append a second copy of the rows to the results of an earlier run.
data_dict = dict()
i = 0

# Per-row day labels, extracted once instead of once per window.
days_1 = d1['day'].tolist()
days_2 = d2['day'].tolist()
n_rows = d1.shape[0]

# NOTE(review): d1 and d2 are paired by row position; this assumes both tables
# hold the same days in the same order -- confirm upstream.
# NOTE(review): a window covers rows t .. t+w-1, but the end date in its label
# is taken from row t+w (the first row after the window), and the window that
# ends on the very last row is never evaluated. Left as is so labels stay
# compatible with previously generated files -- confirm intent.
for fi in features_interested:
    fn1 = [fi+fn for fn in feature_names1]
    fn2 = [fi+fn for fn in feature_names2]
    for col1 in fn1:
        values_1 = d1[col1].values
        for col2 in fn2:
            values_2 = d2[col2].values
            for w in window_size:
                for t in range(n_rows-w):
                    ts_1 = pd.Series(values_1[t:t+w])
                    tf_1 = str(days_1[t])+' - '+str(days_1[t+w])

                    ts_2 = pd.Series(values_2[t:t+w])
                    tf_2 = str(days_2[t])+' - '+str(days_2[t+w])

                    # Default outcome: no usable test for this window.
                    chisq_p_values = [np.nan] * max_lag
                    ftest_p_values = [np.nan] * max_lag
                    try:
                        if is_stationary(ts_1) and is_stationary(ts_2):
                            granger_test_data = pd.DataFrame({'d1': ts_1, 'd2': ts_2})
                            test_result = grangercausalitytests(granger_test_data,maxlag=max_lag,verbose=False)
                            chisq_p_values = [round(test_result[lag][0][chisq_test][1], 4)
                                              for lag in range(1, max_lag+1)]
                            ftest_p_values = [round(test_result[lag][0][F_test][1], 4)
                                              for lag in range(1, max_lag+1)]
                    except Exception:
                        # A failed test leaves NaNs for the whole window. Only
                        # ordinary errors are caught, so the kernel can still be
                        # interrupted during this long-running loop.
                        chisq_p_values = [np.nan] * max_lag
                        ftest_p_values = [np.nan] * max_lag

                    for l in range(1,max_lag+1):
                        data_dict[i] = {"window_size":w,
                                        "lag":l,
                                        "feature_1":col1,
                                        "feature_1_timeframe":tf_1,
                                        "feature_2":col2,
                                        "feature_2_timeframe":tf_2,
                                        "F_test_p_val":ftest_p_values[l-1],
                                        "chisq_test_p_val":chisq_p_values[l-1]}

                        i += 1
ft = time.time()

In [15]:
# Wall-clock duration of the scan, in seconds (difference of two time.time() readings).
ft-st

1272.4937012195587

In [16]:
# One table row per stored result; the integer row ids become the index.
data = pd.DataFrame.from_dict(data_dict, orient="index")
data

Unnamed: 0,window_size,lag,feature_1,feature_1_timeframe,feature_2,feature_2_timeframe,F_test_p_val,chisq_test_p_val
0,120,1,log_close,2016-01-04 - 2016-05-03,log_pageview,2016-01-04 - 2016-05-03,,
1,120,2,log_close,2016-01-04 - 2016-05-03,log_pageview,2016-01-04 - 2016-05-03,,
2,120,3,log_close,2016-01-04 - 2016-05-03,log_pageview,2016-01-04 - 2016-05-03,,
3,120,4,log_close,2016-01-04 - 2016-05-03,log_pageview,2016-01-04 - 2016-05-03,,
4,120,5,log_close,2016-01-04 - 2016-05-03,log_pageview,2016-01-04 - 2016-05-03,,
...,...,...,...,...,...,...,...,...
453190,720,11,diff_ratio_close,2018-03-12 - 2020-03-01,diff_ratio_sma21,2018-03-12 - 2020-03-01,0.3038,0.2736
453191,720,12,diff_ratio_close,2018-03-12 - 2020-03-01,diff_ratio_sma21,2018-03-12 - 2020-03-01,0.3439,0.3087
453192,720,13,diff_ratio_close,2018-03-12 - 2020-03-01,diff_ratio_sma21,2018-03-12 - 2020-03-01,0.4588,0.4184
453193,720,14,diff_ratio_close,2018-03-12 - 2020-03-01,diff_ratio_sma21,2018-03-12 - 2020-03-01,0.5083,0.4638


In [17]:
# Keep only the rows for which the Granger test actually produced a p-value;
# skipped or failed windows carry NaN. `.notna()` is used rather than `> 0`
# because the p-values are rounded to 4 decimals when they are collected, so
# the most significant results are stored as exactly 0.0 and a `> 0` filter
# would silently discard them together with the NaN rows.
data_ = data[data['F_test_p_val'].notna()].copy()

In [18]:
# (rows, columns) of the filtered result table.
data_.shape

(270348, 8)

In [19]:
# Output path: one folder per user metric, one file per article/keyword and exchange.
file_name = f"generated_causality_data/{um}/{ak}_{exchange}.csv"

In [20]:
# Show the assembled output path.
file_name

'generated_causality_data/wiki_pageview/Bitcoin_btc-usd.csv'

In [21]:
# Create the output folder on first use, so that the slow-to-recompute results
# are not lost to a missing-directory error at the very last step.
os.makedirs(os.path.dirname(file_name) or ".", exist_ok=True)
# Write the filtered results without the integer row index.
data_.to_csv(file_name, index=False)