In [97]:
import pandas as pd
import numpy as np
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.stattools import adfuller
import matplotlib.pylab as plt

# Lagi niestacjonarne

In [98]:
def calc_rmse(ts1, ts2):
    """Return the RMSE between ``ts1`` and ``ts2`` over the non-missing part of ``ts1``.

    Missing values are dropped from ``ts1``; ``ts2`` is then indexed with the
    labels that remain, and both pieces are handed to ``rmse``.
    """
    observed = ts1.dropna()
    counterpart = ts2[observed.index]
    return rmse(observed, counterpart)

def scale(ts):
    """Divide ``ts`` by its own maximum value."""
    peak = ts.max()
    return ts / peak

def find_best_lag(ts1, ts2, k=30):
    """Find the shift of ``ts1`` (0 .. k-1 periods) that is closest to ``ts2``.

    Both series are first divided by their own maximum (``scale``). ``ts1`` is
    then shifted by every lag in ``range(k)`` and compared with ``ts2`` through
    ``calc_rmse``.

    Returns:
        tuple: ``(lag, rmse_list)`` - the position of the smallest RMSE and the
        list of RMSE values, one entry per tested lag.
    """
    scaled_first = scale(ts1)
    scaled_second = scale(ts2)
    rmse_list = [
        calc_rmse(scaled_first.shift(step), scaled_second)
        for step in range(k)
    ]
    lag = rmse_list.index(min(rmse_list))

    return lag, rmse_list

def dft(df, column, d=0):
    """Run the Augmented Dickey-Fuller test on one column of ``df``.

    Args:
        df: DataFrame holding the series.
        column: Name of the column to test.
        d: Differencing order applied (via ``differentiated``) before testing;
            0 tests the raw series.

    Returns:
        pandas.Series: the first four ``adfuller`` results labelled
        'Test Statistic', 'p-value', '#Lags Used' and
        'Number of Observations Used', followed by one
        'Critical Value (<level>)' entry per reported critical value.
    """
    series = df[column].copy()
    if d != 0:
        series = differentiated(series, d)
    series = series.dropna()  # missing values are removed before testing

    # Augmented Dickey-Fuller test; autolag='AIC' is passed through to adfuller.
    adf_result = adfuller(series, autolag='AIC')

    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    summary = pd.Series(adf_result[0:4], index=labels)
    for level, critical_value in adf_result[4].items():
        summary['Critical Value (%s)' % level] = critical_value

    return summary

In [99]:
# Load the data set used for the lag search; the first CSV column becomes the index.
df_lag = pd.read_csv('datasets/cale/zbior_do_lagow.csv', index_col=0)

# Columns of df_lag that are checked for stationarity.
variables = [
    'TensorFlow_',
    'deep learning_',
    'Artificial Intelligence_',
    'Machine Learning_',
    'neural network_',
    'MXNet_',
    'blockchain_news',
    'GPU_news',
    'gaming_news',
    'BTC',
    'SandP',
    'NasdaqTech',
    'ATVI_gaming',
    'TTWO_gaming',
    'UBSFY_gaming',
    'fps_transformed_ffill',
    'fps_transformed_interpolated',
    'fps_interpolated_transformed',
    'ATVI_gaming_rr',
    'TTWO_gaming_rr',
    'AMD_rr',
    'SandP_rr',
    'NasdaqTech_rr',
]
df_lag.head()

Unnamed: 0,open,high,low,close,Adj Close,volume,ATR,MOM,CCI,EMA,...,ATVI_gaming_rr,TTWO_gaming_rr,AMD_rr,SandP_rr,NasdaqTech_rr,return ratio,fps_transformed_ffill,fps_transformed_0fill,fps_transformed_interpolated,fps_interpolated_transformed
2012-01-03,14.3,14.4,14.01,14.04,12.981244,11701100,,,,,...,,,,,,,,0.0,,
2012-01-04,14.05,14.26,13.92,14.2,13.129175,8684300,,,,,...,-0.013788,0.00539,-0.007266,0.00726,-0.008122,0.011396,,0.0,,
2012-01-05,14.13,14.78,14.07,14.71,13.600721,14088700,,,,,...,-0.003701,0.032881,-0.00183,0.001578,0.007389,0.035915,,0.0,,
2012-01-06,14.7,14.71,14.37,14.54,13.44354,13331300,,,,,...,0.004127,0.017301,-0.003666,0.000149,0.007123,-0.011557,,0.0,,
2012-01-09,14.55,14.82,14.44,14.54,13.44354,12706100,,,,,...,0.00411,0.010204,0.012879,-8.2e-05,0.006352,0.0,,0.0,,


In [100]:
# Split `variables` by the ADF p-value of the raw (undifferenced) column:
# a p-value below 0.05 counts as stationary, everything else as non-stationary.
nonstationairy = []
stationairy = []
for var in variables:
    p_value = dft(df_lag, var).loc['p-value']
    if p_value < 0.05:
        stationairy.append(var)
    else:
        nonstationairy.append(var)

In [101]:
# For every non-stationary variable, search for the shift (0-29 with find_best_lag's
# default k=30) with the smallest RMSE against the 'close' column, and keep the
# pair (best lag, RMSE at that lag).
nonstationairy_lags = {}
for col in nonstationairy:
    lag, rmse_list = find_best_lag(df_lag[col], df_lag['close'])
    nonstationairy_lags[col] = (lag, rmse_list[lag])

In [102]:
# Show the selected (lag, RMSE) pair for each non-stationary variable.
nonstationairy_lags

{'ATVI_gaming': (29, 0.1789218461391596),
 'Artificial Intelligence_': (29, 0.24346104169574315),
 'BTC': (0, 0.22050114349282332),
 'GPU_news': (6, 0.18791021119177917),
 'MXNet_': (29, 0.11632617023875003),
 'Machine Learning_': (29, 0.15078105527635832),
 'NasdaqTech': (29, 0.322570178843348),
 'SandP': (29, 0.4738276052427284),
 'TTWO_gaming': (0, 0.09655558673731462),
 'TensorFlow_': (29, 0.1092558962244424),
 'UBSFY_gaming': (0, 0.07772626861299578),
 'blockchain_news': (0, 0.08023894850124773),
 'deep learning_': (29, 0.17194288935718868),
 'fps_interpolated_transformed': (0, 0.2151711339282012),
 'fps_transformed_ffill': (2, 0.2190667769608292),
 'fps_transformed_interpolated': (0, 0.21525800430149886),
 'gaming_news': (29, 0.4447378360088254),
 'neural network_': (29, 0.30537558703484496)}

In [103]:
# Extend the econometric data set with two columns taken from df_lag and save it.
ekon = pd.read_csv('datasets/cale/NVIDIA_czysty_zbior_do_modeli_ekonometrycznych.csv', index_col=0)
# GPU_news is added lagged by 6 periods; the fps column is added without a shift.
ekon['GPU_news_shift_6'] = df_lag['GPU_news'].shift(6)
ekon['fps_interpolated_transformed'] = df_lag['fps_interpolated_transformed']
ekon.to_csv('datasets/cale/ekonometryczne_wszystkie.csv')

In [104]:
# Build the machine-learning data set and save it. Every feature added here is
# shifted one period further than its nominal lag (e.g. 'GPU_news_shift_6' uses
# shift(7)) - presumably so that a row only holds values known before the target
# date; TODO confirm.
ml = pd.read_csv('datasets/cale/NVIDIA_dataset_machine_learning_adjustment.csv', index_col=0)
# DataFrame.drop returns a new frame, so the result must be assigned for the
# columns to actually be removed (the call was previously a no-op).
ml = ml.drop(['return ratio', 'return ratio_shift'], axis=1)
ml['GPU_news_shift_6'] = df_lag['GPU_news'].shift(7)
ml['fps_interpolated_transformed'] = df_lag['fps_interpolated_transformed'].shift(1)
ml['return_ratio'] = df_lag['return ratio'].shift(1)
# 'return_ratio_shift_<n>' holds the return ratio from n + 1 periods earlier.
for n in range(1, 8):
    ml[f'return_ratio_shift_{n}'] = df_lag['return ratio'].shift(n + 1)
ml.to_csv('datasets/cale/ml_wszystkie.csv')

# Lagi stacjonarne

In [105]:
from statsmodels.tsa.stattools import adfuller
def differentiated(ts, d=1):
    """Return ``ts`` differenced ``d`` times.

    Each pass subtracts the series shifted by one period from itself. The input
    is copied first, so the caller's series is left unchanged.
    """
    result = ts.copy()
    for _ in range(d):
        previous = result.shift()
        result = result - previous
    return result


def make_stationary(df, column, d=0):
    """Find the lowest differencing order (from ``d`` up to 3) that makes a column stationary.

    The column is tested with ``dft`` at each order in turn; the first order
    whose ADF p-value is below 0.05 wins. Earlier, the result of the recursive
    retry was discarded, so a non-stationary column always came back differenced
    once, whether or not that order had passed the test.

    Args:
        df: DataFrame holding the column.
        column: Name of the column to test.
        d: First differencing order to try.

    Returns:
        tuple: ``(order, series)``. ``series`` is the column differenced
        ``order`` times (the untouched column when ``order`` is 0).
        ``(0, None)`` is returned when no order up to 3 passes the test or when
        testing raises an exception.
    """
    max_order = 3
    try:
        for order in range(d, max_order + 1):
            p_val = dft(df, column, order).loc['p-value']
            if p_val < 0.05:
                if order == 0:
                    # Already stationary: hand back the column as it is.
                    return order, df[column]
                return order, differentiated(df[column], order)
    except Exception:
        # Best effort: a column that cannot be tested is reported like a column
        # that could not be made stationary, so the caller can skip it.
        return 0, None
    return 0, None

In [106]:
# Names of the technical-analysis indicator columns that are looked up in df_lag
# and run through make_stationary together with the non-stationary variables.
technical_analysis = [ 'ATR',
 'MOM',
 'CCI',
 'EMA',
 'RSI',
 'Will_R',
 'MFI',
 'DX',
 'PLUS_DM',
 'PPO',
 'AROONOSC',
 'BOP',
 'MINUS_DM',
 'ULTOSC',
 'AD',
 'OBV',
 'DEMA',
 'HT_TRENDLINE',
 'KAMA',
 'MIDPOINT',
 'MIDPRICE',
 'SAR',
 'SAREXT',
 'T3',
 'TEMA',
 'TRIMA',
 'WMA',
 'ADX',
 'ADXR',
 'APO',
 'CMO',
 'MINUS_DI',
 'PLUS_DI',
 'ROC',
 'ROCP',
 'ROCR',
 'ROCR100',
 'ADOSC',
 'NATR',
 'TRANGE',
 'AVGPRICE',
 'MEDPRICE',
 'TYPPRICE',
 'WCLPRICE',
 'HT_DCPERIOD',
 'HT_DCPHASE',
 'HT_TRENDMODE',
 'CDL2CROWS',
 'CDL3BLACKCROWS',
 'CDL3INSIDE',
 'CDL3LINESTRIKE',
 'CDL3OUTSIDE',
 'CDL3STARSINSOUTH',
 'CDL3WHITESOLDIERS',
 'CDLABANDONEDBABY',
 'CDLADVANCEBLOCK',
 'CDLBELTHOLD',
 'CDLBREAKAWAY',
 'CDLCLOSINGMARUBOZU',
 'CDLCONCEALBABYSWALL',
 'CDLCOUNTERATTACK',
 'CDLDARKCLOUDCOVER',
 'CDLDOJI',
 'CDLDOJISTAR',
 'CDLDRAGONFLYDOJI',
 'CDLENGULFING',
 'CDLEVENINGDOJISTAR',
 'CDLEVENINGSTAR',
 'CDLGAPSIDESIDEWHITE',
 'CDLGRAVESTONEDOJI',
 'CDLHAMMER',
 'CDLHANGINGMAN',
 'CDLHARAMI',
 'CDLHARAMICROSS',
 'CDLHIGHWAVE',
 'CDLHIKKAKE',
 'CDLHIKKAKEMOD',
 'CDLHOMINGPIGEON',
 'CDLIDENTICAL3CROWS',
 'CDLINNECK',
 'CDLINVERTEDHAMMER',
 'CDLKICKING',
 'CDLKICKINGBYLENGTH',
 'CDLLADDERBOTTOM',
 'CDLLONGLEGGEDDOJI',
 'CDLLONGLINE',
 'CDLMARUBOZU',
 'CDLMATCHINGLOW',
 'CDLMATHOLD',
 'CDLMORNINGDOJISTAR',
 'CDLMORNINGSTAR',
 'CDLONNECK',
 'CDLPIERCING',
 'CDLRICKSHAWMAN',
 'CDLRISEFALL3METHODS',
 'CDLSEPARATINGLINES',
 'CDLSHOOTINGSTAR',
 'CDLSHORTLINE',
 'CDLSPINNINGTOP',
 'CDLSTALLEDPATTERN',
 'CDLSTICKSANDWICH',
 'CDLTAKURI',
 'CDLTASUKIGAP',
 'CDLTHRUSTING',
 'CDLTRISTAR',
 'CDLUNIQUE3RIVER',
 'CDLUPSIDEGAP2CROWS',
 'CDLXSIDEGAP3METHODS',
 'BETA',
 'CORREL',
 'LINEARREG',
 'LINEARREG_ANGLE',
 'LINEARREG_INTERCEPT',
 'LINEARREG_SLOPE',
 'STDDEV',
 'TSF',
 'VAR',
 'ADD',
 'DIV',
 'MAX',
 'MAXINDEX',
 'MIN',
 'MININDEX',
 'MULT',
 'SUB',
 'SUM',
 'ATAN',
 'CEIL',
 'COS',
 'COSH',
 'EXP',
 'FLOOR',
 'LN',
 'LOG10',
 'SIN',
 'SINH',
 'SQRT',
 'TAN',
 'TANH']

In [107]:
# All candidate columns for differencing: the non-stationary variables followed by
# the technical-analysis indicators.
nonstationairy_plus_technical = nonstationairy + technical_analysis

In [108]:
# Try to make every candidate column stationary by differencing.
# make_stationary returns d == 0 when the column needs no differencing or when it
# gave up (the series is None in that case), so those columns are skipped here.
converted_to_stationairy = []
for col in nonstationairy_plus_technical:
    d, ts = make_stationary(df_lag, col)
    if d==0: continue
    # Record the differencing order in the series name, e.g. '<col>_diff_1'.
    ts.name = col+f'_diff_{d}'
    converted_to_stationairy.append(ts)

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  return self.params / self.bse
  return self.params / self.bse


In [109]:
# Put the differenced series side by side (one column each) and join the columns
# of df_lag that were classified as stationary without differencing.
stationairy_df = pd.concat(converted_to_stationairy, axis=1).join(df_lag[stationairy])
stationairy_df.head()

Unnamed: 0,TensorFlow__diff_1,deep learning__diff_1,Artificial Intelligence__diff_1,Machine Learning__diff_1,neural network__diff_1,MXNet__diff_1,blockchain_news_diff_1,GPU_news_diff_1,gaming_news_diff_1,BTC_diff_1,...,CEIL_diff_1,FLOOR_diff_1,LN_diff_1,LOG10_diff_1,SQRT_diff_1,ATVI_gaming_rr,TTWO_gaming_rr,AMD_rr,SandP_rr,NasdaqTech_rr
2012-01-03,,,,,,,,,,,...,,,,,,,,,,
2012-01-04,0.0,0.032258,0.096774,0.064516,0.129032,0.0,0.0,-0.064516,-0.322581,0.175,...,0.0,0.0,0.011332,0.004921,0.02129,-0.013788,0.00539,-0.007266,0.00726,-0.008122
2012-01-05,0.0,0.032258,0.096774,0.064516,0.129032,0.0,0.0,-0.064516,-0.322581,1.035,...,0.0,0.0,0.035286,0.015324,0.067073,-0.003701,0.032881,-0.00183,0.001578,0.007389
2012-01-06,0.0,0.032258,0.096774,0.064516,0.129032,0.0,0.0,-0.064516,-0.322581,0.565,...,0.0,0.0,-0.011624,-0.005048,-0.022227,0.004127,0.017301,-0.003666,0.000149,0.007123
2012-01-09,0.0,0.096774,0.290323,0.193548,0.387097,0.0,0.0,-0.193548,-0.967742,-0.105,...,0.0,0.0,0.0,0.0,0.0,0.00411,0.010204,0.012879,-8.2e-05,0.006352


In [110]:
# For every stationary column, search for the shift (0-29 with find_best_lag's
# default k=30) with the smallest RMSE against the 'return ratio' column, and keep
# the pair (best lag, RMSE at that lag).
stationairy_lags = {}
for col in stationairy_df.columns:
    lag, rmse_list = find_best_lag(stationairy_df[col], df_lag['return ratio'])
    stationairy_lags[col] = (lag, rmse_list[lag])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [111]:
# Drop the entries whose best RMSE is NaN.
stationairy_lags = {k:v for k, v in stationairy_lags.items() if not np.isnan(v[1])}

In [112]:
# Show the selected (lag, RMSE) pair for each remaining stationary column.
stationairy_lags

{'ADD_diff_1': (0, 0.12384309351590543),
 'AD_diff_1': (0, 0.11460118477699821),
 'AMD_rr': (0, 0.09540053794189046),
 'ATAN_diff_1': (0, 0.11768888609803435),
 'ATR_diff_1': (29, 0.10369089328398175),
 'ATVI_gaming_diff_1': (0, 0.11085281916069531),
 'ATVI_gaming_rr': (0, 0.10704900375983717),
 'AVGPRICE_diff_1': (0, 0.128324760086223),
 'Artificial Intelligence__diff_1': (19, 0.12330451243859841),
 'BTC_diff_1': (29, 0.10120603990184197),
 'CEIL_diff_1': (0, 0.10364095830513056),
 'DEMA_diff_1': (29, 0.2530165480038307),
 'EMA_diff_1': (29, 0.26779451303822943),
 'FLOOR_diff_1': (0, 0.10355882381742272),
 'GPU_news_diff_1': (17, 0.19304940811366036),
 'HT_TRENDLINE_diff_1': (29, 0.2680998843125545),
 'KAMA_diff_1': (29, 0.14767740720615646),
 'LINEARREG_INTERCEPT_diff_1': (29, 0.15145354136794548),
 'LINEARREG_diff_1': (0, 0.20940994737842378),
 'LN_diff_1': (0, 0.011030227869935957),
 'LOG10_diff_1': (0, 0.011030227869937133),
 'MAXINDEX_diff_1': (0, 0.13321686425639204),
 'MAX_diff

In [115]:
# Add every stationary column to `ekon`, shifted by its best lag.
# Columns whose best lag is 29 are left out: 29 is the largest lag find_best_lag
# tries with its default k=30 - presumably a hit on the edge of the search range
# is treated as "no usable lag found"; TODO confirm.
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    # A lag of 0 keeps the plain column name; otherwise '_shift_<lag>' is appended.
    suffix = '_shift_' + str(best_lag) if best_lag != 0 else ""
    ekon[col + suffix] = stationairy_df[col].shift(best_lag)

In [121]:
# Preview the extended econometric data set.
ekon.head()

Unnamed: 0,return ratio,ATR,MOM,CCI,EMA,RSI,Will_R,MFI,DX,PLUS_DM,...,MAXINDEX_diff_1,MIN_diff_1,MININDEX_diff_1_shift_19,MULT_diff_1,ATAN_diff_1,CEIL_diff_1,FLOOR_diff_1,LN_diff_1,LOG10_diff_1,SQRT_diff_1
2012-07-02,-0.026773,0.481986,1.05,103.024634,12.824644,57.664648,-22.613065,52.602204,23.907183,2.182347,...,0.0,0.0,0.0,-2.103,-0.00198,0.0,0.0,-0.027138,-0.011786,-0.050102
2012-07-03,0.026022,0.475416,0.56,110.859729,12.88757,61.020501,-5.025126,57.027455,23.907183,2.026465,...,0.0,0.0,0.0,0.4492,0.001876,0.0,0.0,0.025689,0.011157,0.047411
2012-07-05,-0.010145,0.466458,0.21,90.224949,12.937404,59.005672,-12.698413,58.891864,23.907183,1.881717,...,0.0,0.0,0.0,-0.6687,-0.000739,0.0,0.0,-0.010197,-0.004428,-0.018891
2012-07-06,-0.019034,0.462425,0.56,44.708625,12.967249,55.350451,-27.777778,54.004518,15.702364,1.747309,...,0.0,0.0,0.0,-5.0065,-0.001413,0.0,0.0,-0.019217,-0.008346,-0.035343
2012-07-09,-0.013433,0.450109,0.22,-3.784987,12.983555,52.906929,-47.887324,50.153908,10.201357,1.622501,...,0.0,0.0,0.0,-4.7882,-0.00101,0.0,0.0,-0.013524,-0.005873,-0.024669


In [122]:
# Save the extended econometric data set (same path that `ekon` was written to earlier).
ekon.to_csv('datasets/cale/ekonometryczne_wszystkie.csv')

In [123]:
# Add every stationary column to `ml`. The column name carries the best lag, but the
# data is shifted by one extra period (lag + 1).
# Columns whose best lag is 29 are left out: 29 is the largest lag find_best_lag
# tries with its default k=30 - presumably a hit on the edge of the search range
# is treated as "no usable lag found"; TODO confirm.
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    # A lag of 0 keeps the plain column name; otherwise '_shift_<lag>' is appended.
    suffix = '_shift_' + str(best_lag) if best_lag != 0 else ""
    ml[col + suffix] = stationairy_df[col].shift(best_lag + 1)

In [124]:
# Preview the extended machine-learning data set.
ml.head()

Unnamed: 0,y_return_ratio,ATR,MOM,CCI,EMA,RSI,Will_R,MFI,DX,PLUS_DM,...,MAXINDEX_diff_1,MIN_diff_1,MININDEX_diff_1_shift_19,MULT_diff_1,ATAN_diff_1,CEIL_diff_1,FLOOR_diff_1,LN_diff_1,LOG10_diff_1,SQRT_diff_1
2012-07-02,-0.026773,0.475985,1.53,148.118434,12.781515,62.987843,-1.546392,59.313104,31.365741,2.35022,...,7.0,0.0,1.0,15.5678,0.003153,0.0,0.0,0.042874,0.01862,0.078845
2012-07-03,0.026022,0.481986,1.05,103.024634,12.824644,57.664648,-22.613065,52.602204,23.907183,2.182347,...,0.0,0.0,0.0,-2.103,-0.00198,0.0,0.0,-0.027138,-0.011786,-0.050102
2012-07-05,-0.010145,0.475416,0.56,110.859729,12.88757,61.020501,-5.025126,57.027455,23.907183,2.026465,...,0.0,0.0,0.0,0.4492,0.001876,0.0,0.0,0.025689,0.011157,0.047411
2012-07-06,-0.019034,0.466458,0.21,90.224949,12.937404,59.005672,-12.698413,58.891864,23.907183,1.881717,...,0.0,0.0,0.0,-0.6687,-0.000739,0.0,0.0,-0.010197,-0.004428,-0.018891
2012-07-09,-0.013433,0.462425,0.56,44.708625,12.967249,55.350451,-27.777778,54.004518,15.702364,1.747309,...,0.0,0.0,0.0,-5.0065,-0.001413,0.0,0.0,-0.019217,-0.008346,-0.035343


In [125]:
# Save the extended machine-learning data set (same path that `ml` was written to earlier).
ml.to_csv('datasets/cale/ml_wszystkie.csv')

In [126]:
# Econometric data set made only of stationary columns: the target 'return ratio'
# plus every stationary column shifted by its best lag.
ekon_stationairy = pd.DataFrame()
ekon_stationairy['y_return_ratio'] = df_lag['return ratio']
# Columns whose best lag is 29 are left out: 29 is the largest lag find_best_lag
# tries with its default k=30 - presumably a hit on the edge of the search range
# is treated as "no usable lag found"; TODO confirm.
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    # A lag of 0 keeps the plain column name; otherwise '_shift_<lag>' is appended.
    suffix = '_shift_' + str(best_lag) if best_lag != 0 else ""
    ekon_stationairy[col + suffix] = stationairy_df[col].shift(best_lag)

In [127]:
# Preview the stationary econometric data set.
ekon_stationairy.head()

Unnamed: 0,y_return_ratio,TensorFlow__diff_1_shift_4,deep learning__diff_1_shift_4,Artificial Intelligence__diff_1_shift_19,Machine Learning__diff_1_shift_4,neural network__diff_1_shift_9,MXNet__diff_1_shift_9,blockchain_news_diff_1_shift_4,GPU_news_diff_1_shift_17,gaming_news_diff_1_shift_2,...,CEIL_diff_1,FLOOR_diff_1,LN_diff_1,LOG10_diff_1,SQRT_diff_1,ATVI_gaming_rr,TTWO_gaming_rr,AMD_rr,SandP_rr,NasdaqTech_rr
2012-01-03,,,,,,,,,,,...,,,,,,,,,,
2012-01-04,0.011396,,,,,,,,,,...,0.0,0.0,0.011332,0.004921,0.02129,-0.013788,0.00539,-0.007266,0.00726,-0.008122
2012-01-05,0.035915,,,,,,,,,,...,0.0,0.0,0.035286,0.015324,0.067073,-0.003701,0.032881,-0.00183,0.001578,0.007389
2012-01-06,-0.011557,,,,,,,,,-0.322581,...,0.0,0.0,-0.011624,-0.005048,-0.022227,0.004127,0.017301,-0.003666,0.000149,0.007123
2012-01-09,0.0,,,,,,,,,-0.322581,...,0.0,0.0,0.0,0.0,0.0,0.00411,0.010204,0.012879,-8.2e-05,0.006352


In [128]:
# Machine-learning data set made only of stationary columns: the target
# 'return ratio', seven lagged copies of it, and every stationary column shifted
# by its best lag plus one extra period.
ml_stationairy = pd.DataFrame()
ml_stationairy['y_return_ratio'] = df_lag['return ratio']
# 'return_ratio_shift_<n>' holds the return ratio from n + 1 periods earlier.
# NOTE(review): unlike the cell that builds `ml`, no 'return_ratio' column
# (shift(1)) is created here - confirm that this is intended.
for n in range(1, 8):
    ml_stationairy[f'return_ratio_shift_{n}'] = df_lag['return ratio'].shift(n + 1)
# Columns whose best lag is 29 are left out: 29 is the largest lag find_best_lag
# tries with its default k=30 - presumably a hit on the edge of the search range
# is treated as "no usable lag found"; TODO confirm.
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    # A lag of 0 keeps the plain column name; otherwise '_shift_<lag>' is appended.
    suffix = '_shift_' + str(best_lag) if best_lag != 0 else ""
    ml_stationairy[col + suffix] = stationairy_df[col].shift(best_lag + 1)

In [129]:
# Preview the stationary machine-learning data set.
ml_stationairy.head()

Unnamed: 0,y_return_ratio,return_ratio_shift_1,return_ratio_shift_2,return_ratio_shift_3,return_ratio_shift_4,return_ratio_shift_5,return_ratio_shift_6,return_ratio_shift_7,TensorFlow__diff_1_shift_4,deep learning__diff_1_shift_4,...,CEIL_diff_1,FLOOR_diff_1,LN_diff_1,LOG10_diff_1,SQRT_diff_1,ATVI_gaming_rr,TTWO_gaming_rr,AMD_rr,SandP_rr,NasdaqTech_rr
2012-01-03,,,,,,,,,,,...,,,,,,,,,,
2012-01-04,0.011396,,,,,,,,,,...,,,,,,,,,,
2012-01-05,0.035915,,,,,,,,,,...,0.0,0.0,0.011332,0.004921,0.02129,-0.013788,0.00539,-0.007266,0.00726,-0.008122
2012-01-06,-0.011557,0.011396,,,,,,,,,...,0.0,0.0,0.035286,0.015324,0.067073,-0.003701,0.032881,-0.00183,0.001578,0.007389
2012-01-09,0.0,0.035915,0.011396,,,,,,,,...,0.0,0.0,-0.011624,-0.005048,-0.022227,0.004127,0.017301,-0.003666,0.000149,0.007123


In [130]:
# Write both stationary data sets to CSV.
ekon_stationairy.to_csv('datasets/cale/ekonometryczne_stacjonarne.csv')
ml_stationairy.to_csv('datasets/cale/ml_stacjonarne.csv')