In [1]:
import datetime as dt
import os
import time

import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing
from statsmodels.tsa.stattools import adfuller

In [2]:
# Date window (YYYY-MM-DD) used to slice both data sets by their date index.
start_date = "2016-01-04"
end_date = "2020-03-01"

In [3]:
# Load the two daily CSV inputs. Going by the file names, data1 holds the
# generated price data and data2 the generated Bitcoin pageview data.
# Both files are expected to carry a 'day' column, which is used below as the date index.
data1 = pd.read_csv("../data generators/generated_price_data_daily.csv")
data2 = pd.read_csv("../data generators/generated_pageview_data_bitcoin_daily.csv")

In [4]:
## Filter data by dates
# Re-index each raw frame by its parsed 'day' value so rows can be selected
# with date-string labels.
data1.index = list(map(pd.Timestamp, data1['day']))
data2.index = list(map(pd.Timestamp, data2['day']))

# Work on independent copies restricted to the configured date window, so the
# raw frames are not affected by later clean-up steps.
d1 = data1.loc[start_date:end_date].copy()
d2 = data2.loc[start_date:end_date].copy()

In [5]:
# Keep only the first row seen for each 'day' value in both frames.
d1 = d1.drop_duplicates(subset=['day'], keep='first')
d2 = d2.drop_duplicates(subset=['day'], keep='first')

In [6]:
# Print both shapes, one per line, to eyeball that the frames have the same row count.
print(d1.shape, d2.shape, sep='\n')

(1519, 17)
(1519, 17)


In [7]:
# Base column names; a transform prefix is prepended to each of these when the
# correlation sweep builds the actual column names.
feature_names1 = [
    'close',
    'sma10',
    'sma21',
]
feature_names2 = [
    'pageview',
    'sma10',
    'sma21',
]

In [8]:
# Column-name prefixes; each is combined with every base feature name.
features_interested = [
    'log_',
    'log_diff_',
    'diff_ratio_',
]

In [9]:
# Result store: running integer id -> one record per computed correlation.
data_dict = {}
i = 0

# Largest row offset between the two series, and the window lengths (in rows)
# over which correlations are computed.
lag = 14
window_size = [14, 30, 60]

In [10]:
# Rolling Pearson correlation sweep.
#
# For every prefix in `features_interested`, every (d1 column, d2 column)
# pair, every window length `w` and every start row `t`, the d1 window
# starting at row t+lag is correlated against d2 windows starting at rows
# t+0 .. t+lag. The stored "lag" is `lag - shift`, i.e. how many rows the d2
# window starts before the d1 window (0 means both windows cover the same rows).
#
# Results go to `data_dict` (int id -> record); `st` / `ft` hold the start and
# finish time.time() stamps.
#
# NOTE: the "timeframe" labels use the day of the row just past the window as
# their end (exclusive end). Because that row must exist, the last possible
# window of each series is not evaluated.
st = time.time()

# Start from an empty store so re-running this cell does not append a second
# copy of every record on top of the previous run.
data_dict = dict()
i = 0

# Windows are paired purely by row position, so both frames must cover the
# same dates in the same order for the lag to mean anything.
if not d1.index.equals(d2.index):
    raise ValueError("d1 and d2 must share an identical date index; "
                     "row-position lags are meaningless otherwise")

# Per-row lookups like d1.iloc[k]['day'] build a whole row object each time;
# the labels never change, so render them to strings once.
days_1 = [str(day) for day in d1['day']]
days_2 = [str(day) for day in d2['day']]
n_rows = d1.shape[0]

for fi in features_interested:
    fn1 = [fi+fn for fn in feature_names1]
    fn2 = [fi+fn for fn in feature_names2]
    for col1 in fn1:
        # Pull each column out once instead of re-slicing the frame per window.
        values_1 = d1[col1].values
        for col2 in fn2:
            values_2 = d2[col2].values
            for w in window_size:
                for t in range(n_rows-lag-w):
                    window_1 = values_1[t+lag:t+lag+w]
                    tf_1 = days_1[t+lag]+' - '+days_1[t+lag+w]

                    for shift in range(lag+1):
                        window_2 = values_2[t+shift:t+shift+w]
                        tf_2 = days_2[t+shift]+' - '+days_2[t+shift+w]
                        try:
                            corr = scipy.stats.pearsonr(np.float32(window_1),np.float32(window_2))[0]
                        except Exception:
                            # Best effort: a window that cannot be correlated is
                            # recorded as NaN. Catching Exception (not a bare
                            # except) keeps Ctrl-C / kernel interrupt working
                            # during this long loop.
                            corr = np.nan

                        data_dict[i] = {"window_size":w,
                                        "lag":lag-shift,
                                        "feature_1":col1,
                                        "feature_1_timeframe":tf_1,
                                        "feature_2":col2,
                                        "feature_2_timeframe":tf_2,
                                        "correlation":corr}

                        i += 1
ft = time.time()

In [11]:
# Elapsed wall-clock time in seconds between the two time.time() stamps taken around the sweep.
ft-st

1184.6411788463593

In [12]:
# Turn the id -> record mapping into a table: one row per record, with the
# integer ids as the row index. The bare name on the last line displays it.
data = pd.DataFrame.from_dict(data_dict, orient="index")
data

Unnamed: 0,window_size,lag,feature_1,feature_1_timeframe,feature_2,feature_2_timeframe,correlation
0,14,14,log_close,2016-01-18 - 2016-02-01,log_pageview,2016-01-04 - 2016-01-18,-0.194988
1,14,13,log_close,2016-01-18 - 2016-02-01,log_pageview,2016-01-05 - 2016-01-19,-0.494150
2,14,12,log_close,2016-01-18 - 2016-02-01,log_pageview,2016-01-06 - 2016-01-20,-0.795461
3,14,11,log_close,2016-01-18 - 2016-02-01,log_pageview,2016-01-07 - 2016-01-21,-0.693944
4,14,10,log_close,2016-01-18 - 2016-02-01,log_pageview,2016-01-08 - 2016-01-22,-0.161153
...,...,...,...,...,...,...,...
1786450,60,4,diff_ratio_sma21,2020-01-01 - 2020-03-01,diff_ratio_sma21,2019-12-28 - 2020-02-26,
1786451,60,3,diff_ratio_sma21,2020-01-01 - 2020-03-01,diff_ratio_sma21,2019-12-29 - 2020-02-27,
1786452,60,2,diff_ratio_sma21,2020-01-01 - 2020-03-01,diff_ratio_sma21,2019-12-30 - 2020-02-28,
1786453,60,1,diff_ratio_sma21,2020-01-01 - 2020-03-01,diff_ratio_sma21,2019-12-31 - 2020-02-29,


In [14]:
# Output path: <folder>/<article>_<market>.csv, where the two name parts are
# read from the first row of the (unfiltered) input frames.
article_name = data2['article'].values[0]
market_name = data1['market'].values[0]
file_name = 'correlation_data/wiki_pageview/' + article_name + '_' + market_name + '.csv'

In [15]:
# Display the assembled output path before writing to it.
file_name

'correlation_data/wiki_pageview/Bitcoin_btc-usd.csv'

In [16]:
# Write the correlation table without its integer row index.
# to_csv does not create missing folders, so make sure the target folder
# exists first; otherwise the write fails only after the long sweep has run.
output_dir = os.path.dirname(file_name)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
data.to_csv(file_name,index=False)