In [78]:
import pandas as pd
import numpy as np
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.stattools import adfuller
import matplotlib.pylab as plt

# Lagi niestacjonarne

In [79]:
def calc_rmse(ts1, ts2):
    """Return the RMSE between ``ts1`` and ``ts2`` on the rows where ``ts1`` has data.

    Missing values are dropped from ``ts1`` only; ``ts2`` is then selected by
    the index labels that survive in ``ts1``. Any NaN left in that part of
    ``ts2`` is handed to ``rmse`` unchanged.
    """
    observed = ts1.dropna()
    return rmse(observed, ts2[observed.index])

def scale(ts):
    """Divide every value of ``ts`` by the maximum of ``ts``."""
    peak = ts.max()
    return ts / peak

def find_best_lag(ts1, ts2, k=30):
    """Find the shift of ``ts1`` (0 .. k-1 rows) with the lowest RMSE against ``ts2``.

    Both series are divided by their own maximum (``scale``); ``ts1`` is then
    shifted forward by ``i`` rows for each ``i`` in ``range(k)`` and compared
    with ``ts2`` through ``calc_rmse``.

    Parameters
    ----------
    ts1 : pandas.Series
        Series that gets shifted.
    ts2 : pandas.Series
        Reference series; it is indexed with the labels of ``ts1``.
    k : int, default 30
        Number of shifts to evaluate.

    Returns
    -------
    lag : int
        Shift with the smallest non-NaN RMSE (the first one on ties). ``0``
        when no shift produced a valid RMSE; ``rmse_list[lag]`` is NaN then.
    rmse_list : list
        RMSE of every evaluated shift, positioned by shift.
    """
    ts1_scaled = scale(ts1)
    ts2_scaled = scale(ts2)
    rmse_list = [calc_rmse(ts1_scaled.shift(i), ts2_scaled) for i in range(k)]

    # Built-in min() is order-dependent with NaN (a NaN in front wins, a NaN
    # further back is ignored), so NaN entries are excluded explicitly.
    valid = [(value, i) for i, value in enumerate(rmse_list) if not np.isnan(value)]
    lag = min(valid)[1] if valid else 0

    return lag, rmse_list

def dft(df, column, d=0):
    """Run the Augmented Dickey-Fuller test on ``df[column]`` and summarise it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series.
    column : str
        Column to test.
    d : int, default 0
        When non-zero the series is passed through ``differentiated(ts, d)``
        before testing.

    Returns
    -------
    pandas.Series
        The first four items returned by ``adfuller`` labelled as test
        statistic, p-value, lags used and number of observations, followed by
        one ``'Critical Value (<key>)'`` entry per item of the fifth element.
    """
    series = df[column].copy()
    if d != 0:
        series = differentiated(series, d)
    series = series.dropna()

    adf_result = adfuller(series, autolag='AIC')

    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    dfoutput = pd.Series(adf_result[0:4], index=labels)
    for level, critical in adf_result[4].items():
        dfoutput['Critical Value (%s)'%level] = critical

    return dfoutput

In [80]:
# Columns that are tested for stationarity further down.
variables = [
    'TensorFlow_',
    'deep learning_',
    'Artificial Intelligence_',
    'Machine Learning_',
    'neural network_',
    'MXNet_',
    'blockchain_news',
    'GPU_news',
    'gaming_news',
    'BTC',
    'SandP',
    'NasdaqTech',
    'ATVI_gaming',
    'TTWO_gaming',
    'UBSFY_gaming',
    'fps_transformed_ffill',
    'fps_transformed_interpolated',
    'fps_interpolated_transformed',
    'ATVI_gaming_rr',
    'TTWO_gaming_rr',
    'AMD_rr',
    'SandP_rr',
    'NasdaqTech_rr',
]
# The first CSV column becomes the index.
df_lag = pd.read_csv('datasets/cale/zbior_do_lagow.csv', index_col=0)
df_lag.head()

Unnamed: 0,open,high,low,close,Adj Close,volume,ATR,MOM,CCI,EMA,...,ATVI_gaming_rr,TTWO_gaming_rr,AMD_rr,SandP_rr,NasdaqTech_rr,return ratio,fps_transformed_ffill,fps_transformed_0fill,fps_transformed_interpolated,fps_interpolated_transformed
2012-01-03,14.3,14.4,14.01,14.04,12.981244,11701100,,,,,...,,,,,,,,0.0,,
2012-01-04,14.05,14.26,13.92,14.2,13.129175,8684300,,,,,...,-0.013788,0.00539,-0.007266,0.00726,-0.008122,0.011396,,0.0,,
2012-01-05,14.13,14.78,14.07,14.71,13.600721,14088700,,,,,...,-0.003701,0.032881,-0.00183,0.001578,0.007389,0.035915,,0.0,,
2012-01-06,14.7,14.71,14.37,14.54,13.44354,13331300,,,,,...,0.004127,0.017301,-0.003666,0.000149,0.007123,-0.011557,,0.0,,
2012-01-09,14.55,14.82,14.44,14.54,13.44354,12706100,,,,,...,0.00411,0.010204,0.012879,-8.2e-05,0.006352,0.0,,0.0,,


In [81]:
# Split the variables by the ADF p-value: below 0.05 goes to `stationairy`,
# everything else (including a NaN p-value) goes to `nonstationairy`.
stationairy, nonstationairy = [], []
for var in variables:
    p_value = dft(df_lag, var).loc['p-value']
    target = stationairy if p_value < 0.05 else nonstationairy
    target.append(var)

In [82]:
# For every non-stationary column: (best shift against 'close', RMSE at that shift).
nonstationairy_lags = {}
for col in nonstationairy:
    found = find_best_lag(df_lag[col], df_lag['close'])
    lag, rmse_list = found
    nonstationairy_lags.update({col: (lag, rmse_list[lag])})

In [83]:
nonstationairy_lags

{'TensorFlow_': (29, 0.1092558962244424),
 'deep learning_': (29, 0.17194288935718868),
 'Artificial Intelligence_': (29, 0.24346104169574315),
 'Machine Learning_': (29, 0.15078105527635832),
 'neural network_': (29, 0.30537558703484496),
 'MXNet_': (29, 0.11632617023875003),
 'blockchain_news': (0, 0.08023894850124773),
 'GPU_news': (6, 0.18791021119177917),
 'gaming_news': (29, 0.4447378360088254),
 'BTC': (0, 0.22050114349282332),
 'SandP': (29, 0.4738276052427284),
 'NasdaqTech': (29, 0.322570178843348),
 'ATVI_gaming': (29, 0.1789218461391596),
 'TTWO_gaming': (0, 0.09655558673731462),
 'UBSFY_gaming': (0, 0.07772626861299578),
 'fps_transformed_ffill': (2, 0.2190667769608292),
 'fps_transformed_interpolated': (0, 0.21525800430149886),
 'fps_interpolated_transformed': (0, 0.2151711339282012)}

In [84]:
ml_czysty = pd.read_csv('datasets/cale/NVIDIA_dataset_machine_learning_adjustment.csv', index_col=0)

ekon_czysty = pd.read_csv('datasets/cale/NVIDIA_czysty_zbior_do_modeli_ekonometrycznych.csv', index_col=0)
# Both columns are excluded here because they get their own lag handling later.
# Index.drop(labels, errors) accepts a single `labels` argument, so passing the
# second name positionally set `errors` instead and left that column in the list.
excluded_columns = {'GPU_news', 'fps_interpolated_transformed'}
ekon_columns = [col for col in ekon_czysty.columns if col not in excluded_columns]
ekon_columns

['return ratio',
 'ATR',
 'MOM',
 'CCI',
 'EMA',
 'RSI',
 'Will_R',
 'MFI',
 'DX',
 'PLUS_DM',
 'PPO',
 'AROONOSC',
 'BOP',
 'MINUS_DM',
 'ULTOSC',
 'AD',
 'OBV',
 'DEMA',
 'HT_TRENDLINE',
 'KAMA',
 'MIDPOINT',
 'MIDPRICE',
 'SAR',
 'SAREXT',
 'T3',
 'TEMA',
 'TRIMA',
 'WMA',
 'ADX',
 'ADXR',
 'APO',
 'CMO',
 'MINUS_DI',
 'PLUS_DI',
 'ROC',
 'ROCP',
 'ROCR',
 'ROCR100',
 'ADOSC',
 'NATR',
 'TRANGE',
 'AVGPRICE',
 'MEDPRICE',
 'TYPPRICE',
 'WCLPRICE',
 'HT_DCPERIOD',
 'HT_DCPHASE',
 'HT_TRENDMODE',
 'CDL2CROWS',
 'CDL3BLACKCROWS',
 'CDL3INSIDE',
 'CDL3LINESTRIKE',
 'CDL3OUTSIDE',
 'CDL3STARSINSOUTH',
 'CDL3WHITESOLDIERS',
 'CDLABANDONEDBABY',
 'CDLADVANCEBLOCK',
 'CDLBELTHOLD',
 'CDLBREAKAWAY',
 'CDLCLOSINGMARUBOZU',
 'CDLCONCEALBABYSWALL',
 'CDLCOUNTERATTACK',
 'CDLDARKCLOUDCOVER',
 'CDLDOJI',
 'CDLDOJISTAR',
 'CDLDRAGONFLYDOJI',
 'CDLENGULFING',
 'CDLEVENINGDOJISTAR',
 'CDLEVENINGSTAR',
 'CDLGAPSIDESIDEWHITE',
 'CDLGRAVESTONEDOJI',
 'CDLHAMMER',
 'CDLHANGINGMAN',
 'CDLHARAMI',
 'CDLHA

In [85]:
# Start from the target column and add every econometric column lagged by one row.
ekon_wszystkie = pd.DataFrame(ml_czysty['y_return_ratio'])
not_found = []
for col in ekon_columns:
    try:
        ekon_wszystkie[col+'_1_day_before'] = df_lag[col].shift(1)
    except KeyError:
        # Column is absent from df_lag; retry from ml_czysty below.
        not_found.append(col)

for col in not_found:
    try:
        # NOTE(review): taken from ml_czysty without a shift - presumably that
        # file already stores lagged values; confirm against how it was built.
        ekon_wszystkie[col+'_1_day_before'] = ml_czysty[col]
    except KeyError:
        # Present in neither source: report the column name.
        print(col)

In [86]:
ekon_wszystkie.columns.tolist()

['y_return_ratio',
 'return ratio_1_day_before',
 'ATR_1_day_before',
 'MOM_1_day_before',
 'CCI_1_day_before',
 'EMA_1_day_before',
 'RSI_1_day_before',
 'Will_R_1_day_before',
 'MFI_1_day_before',
 'DX_1_day_before',
 'PLUS_DM_1_day_before',
 'PPO_1_day_before',
 'AROONOSC_1_day_before',
 'BOP_1_day_before',
 'MINUS_DM_1_day_before',
 'ULTOSC_1_day_before',
 'AD_1_day_before',
 'OBV_1_day_before',
 'DEMA_1_day_before',
 'HT_TRENDLINE_1_day_before',
 'KAMA_1_day_before',
 'MIDPOINT_1_day_before',
 'MIDPRICE_1_day_before',
 'SAR_1_day_before',
 'SAREXT_1_day_before',
 'T3_1_day_before',
 'TEMA_1_day_before',
 'TRIMA_1_day_before',
 'WMA_1_day_before',
 'ADX_1_day_before',
 'ADXR_1_day_before',
 'APO_1_day_before',
 'CMO_1_day_before',
 'MINUS_DI_1_day_before',
 'PLUS_DI_1_day_before',
 'ROC_1_day_before',
 'ROCP_1_day_before',
 'ROCR_1_day_before',
 'ROCR100_1_day_before',
 'ADOSC_1_day_before',
 'NATR_1_day_before',
 'TRANGE_1_day_before',
 'AVGPRICE_1_day_before',
 'MEDPRICE_1_day_be

In [87]:
# Two columns with individually chosen lags: (new column, source column, rows to shift).
for new_name, source_col, n_rows in [
    ('GPU_news_6_days_before', 'GPU_news', 6),
    ('fps_interpolated_transformed_1_day_before', 'fps_interpolated_transformed', 1),
]:
    ekon_wszystkie[new_name] = df_lag[source_col].shift(n_rows)
# ekon_wszystkie.to_csv('datasets/cale/ekonometryczne_wszystkie.csv')

In [88]:
ml = pd.read_csv('datasets/cale/NVIDIA_dataset_machine_learning_adjustment.csv', index_col=0)
# DataFrame.drop returns a new frame; the result has to be assigned, otherwise
# both raw return columns remain in the exported dataset.
ml = ml.drop(['return ratio', 'return ratio_shift'], axis=1)
# Naming used in this table: a "_shift_k" column holds the value from k+1 rows
# earlier, and the un-suffixed column holds the value from 1 row earlier.
ml['GPU_news_shift_6'] = df_lag['GPU_news'].shift(7)
ml['fps_interpolated_transformed'] = df_lag['fps_interpolated_transformed'].shift(1)
ml['return_ratio'] = df_lag['return ratio'].shift(1)
for k in range(1, 8):
    ml[f'return_ratio_shift_{k}'] = df_lag['return ratio'].shift(k + 1)
ml.to_csv('datasets/cale/ml_wszystkie.csv')

# Lagi stacjonarne

In [89]:
from statsmodels.tsa.stattools import adfuller
def differentiated(ts, d=1):
    """Return a copy of ``ts`` differenced ``d`` times.

    Each pass subtracts the series shifted by one row from itself, so every
    pass turns one more leading value into NaN. ``ts`` itself is not modified.
    """
    result = ts.copy()
    for _ in range(d):
        previous = result.shift()
        result = result - previous
    return result


def make_stationary(df, column, d=0, max_d=3):
    """Find the lowest differencing order at which ``df[column]`` passes the ADF test.

    Orders ``d, d+1, ..., max_d`` are tried in turn; the first one whose
    ``dft`` p-value is below 0.05 wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series.
    column : str
        Column to test.
    d : int, default 0
        Differencing order to start from.
    max_d : int, default 3
        Highest differencing order to try.

    Returns
    -------
    (int, pandas.Series or None)
        ``(order, series)`` where ``series`` is ``df[column]`` itself for
        order 0 and ``differentiated(df[column], order)`` otherwise.
        ``(0, None)`` when no tested order is stationary or when the test
        raises (for example because the column is missing).
    """
    try:
        # Iterate instead of recursing: the result of each test decides what is
        # returned, so a series is only reported at an order that actually passed.
        for order in range(d, max_d + 1):
            p_val = dft(df, column, order).loc['p-value']
            if p_val < 0.05:
                if order == 0:
                    return order, df[column]
                return order, differentiated(df[column], order)
    except Exception:
        # Best effort: a column that cannot be tested is reported as not converted.
        return 0, None
    return 0, None

In [90]:
technical_analysis = [ 'ATR',
 'MOM',
 'CCI',
 'EMA',
 'RSI',
 'Will_R',
 'MFI',
 'DX',
 'PLUS_DM',
 'PPO',
 'AROONOSC',
 'BOP',
 'MINUS_DM',
 'ULTOSC',
 'AD',
 'OBV',
 'DEMA',
 'HT_TRENDLINE',
 'KAMA',
 'MIDPOINT',
 'MIDPRICE',
 'SAR',
 'SAREXT',
 'T3',
 'TEMA',
 'TRIMA',
 'WMA',
 'ADX',
 'ADXR',
 'APO',
 'CMO',
 'MINUS_DI',
 'PLUS_DI',
 'ROC',
 'ROCP',
 'ROCR',
 'ROCR100',
 'ADOSC',
 'NATR',
 'TRANGE',
 'AVGPRICE',
 'MEDPRICE',
 'TYPPRICE',
 'WCLPRICE',
 'HT_DCPERIOD',
 'HT_DCPHASE',
 'HT_TRENDMODE',
 'CDL2CROWS',
 'CDL3BLACKCROWS',
 'CDL3INSIDE',
 'CDL3LINESTRIKE',
 'CDL3OUTSIDE',
 'CDL3STARSINSOUTH',
 'CDL3WHITESOLDIERS',
 'CDLABANDONEDBABY',
 'CDLADVANCEBLOCK',
 'CDLBELTHOLD',
 'CDLBREAKAWAY',
 'CDLCLOSINGMARUBOZU',
 'CDLCONCEALBABYSWALL',
 'CDLCOUNTERATTACK',
 'CDLDARKCLOUDCOVER',
 'CDLDOJI',
 'CDLDOJISTAR',
 'CDLDRAGONFLYDOJI',
 'CDLENGULFING',
 'CDLEVENINGDOJISTAR',
 'CDLEVENINGSTAR',
 'CDLGAPSIDESIDEWHITE',
 'CDLGRAVESTONEDOJI',
 'CDLHAMMER',
 'CDLHANGINGMAN',
 'CDLHARAMI',
 'CDLHARAMICROSS',
 'CDLHIGHWAVE',
 'CDLHIKKAKE',
 'CDLHIKKAKEMOD',
 'CDLHOMINGPIGEON',
 'CDLIDENTICAL3CROWS',
 'CDLINNECK',
 'CDLINVERTEDHAMMER',
 'CDLKICKING',
 'CDLKICKINGBYLENGTH',
 'CDLLADDERBOTTOM',
 'CDLLONGLEGGEDDOJI',
 'CDLLONGLINE',
 'CDLMARUBOZU',
 'CDLMATCHINGLOW',
 'CDLMATHOLD',
 'CDLMORNINGDOJISTAR',
 'CDLMORNINGSTAR',
 'CDLONNECK',
 'CDLPIERCING',
 'CDLRICKSHAWMAN',
 'CDLRISEFALL3METHODS',
 'CDLSEPARATINGLINES',
 'CDLSHOOTINGSTAR',
 'CDLSHORTLINE',
 'CDLSPINNINGTOP',
 'CDLSTALLEDPATTERN',
 'CDLSTICKSANDWICH',
 'CDLTAKURI',
 'CDLTASUKIGAP',
 'CDLTHRUSTING',
 'CDLTRISTAR',
 'CDLUNIQUE3RIVER',
 'CDLUPSIDEGAP2CROWS',
 'CDLXSIDEGAP3METHODS',
 'BETA',
 'CORREL',
 'LINEARREG',
 'LINEARREG_ANGLE',
 'LINEARREG_INTERCEPT',
 'LINEARREG_SLOPE',
 'STDDEV',
 'TSF',
 'VAR',
 'ADD',
 'DIV',
 'MAX',
 'MAXINDEX',
 'MIN',
 'MININDEX',
 'MULT',
 'SUB',
 'SUM',
 'ATAN',
 'CEIL',
 'COS',
 'COSH',
 'EXP',
 'FLOOR',
 'LN',
 'LOG10',
 'SIN',
 'SINH',
 'SQRT',
 'TAN',
 'TANH']

In [91]:
# Candidates for differencing: the non-stationary variables followed by the technical indicators.
nonstationairy_plus_technical = [*nonstationairy, *technical_analysis]

In [92]:
# Collect the differenced series; order 0 (nothing differenced, or no usable
# result) is skipped. Each kept series is named '<column>_<order>_diff'.
converted_to_stationairy = []
for col in nonstationairy_plus_technical:
    d, ts = make_stationary(df_lag, col)
    if d != 0:
        ts.name = col+f'_{d}_diff'
        converted_to_stationairy.append(ts)

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  return self.params / self.bse
  return self.params / self.bse


In [93]:
# Differenced series side by side, joined with the columns that were stationary as-is.
differenced_part = pd.concat(converted_to_stationairy, axis=1)
already_stationary_part = df_lag[stationairy]
stationairy_df = differenced_part.join(already_stationary_part)
stationairy_df.head()

Unnamed: 0,TensorFlow__1_diff,deep learning__1_diff,Artificial Intelligence__1_diff,Machine Learning__1_diff,neural network__1_diff,MXNet__1_diff,blockchain_news_1_diff,GPU_news_1_diff,gaming_news_1_diff,BTC_1_diff,...,CEIL_1_diff,FLOOR_1_diff,LN_1_diff,LOG10_1_diff,SQRT_1_diff,ATVI_gaming_rr,TTWO_gaming_rr,AMD_rr,SandP_rr,NasdaqTech_rr
2012-01-03,,,,,,,,,,,...,,,,,,,,,,
2012-01-04,0.0,0.032258,0.096774,0.064516,0.129032,0.0,0.0,-0.064516,-0.322581,0.175,...,0.0,0.0,0.011332,0.004921,0.02129,-0.013788,0.00539,-0.007266,0.00726,-0.008122
2012-01-05,0.0,0.032258,0.096774,0.064516,0.129032,0.0,0.0,-0.064516,-0.322581,1.035,...,0.0,0.0,0.035286,0.015324,0.067073,-0.003701,0.032881,-0.00183,0.001578,0.007389
2012-01-06,0.0,0.032258,0.096774,0.064516,0.129032,0.0,0.0,-0.064516,-0.322581,0.565,...,0.0,0.0,-0.011624,-0.005048,-0.022227,0.004127,0.017301,-0.003666,0.000149,0.007123
2012-01-09,0.0,0.096774,0.290323,0.193548,0.387097,0.0,0.0,-0.193548,-0.967742,-0.105,...,0.0,0.0,0.0,0.0,0.0,0.00411,0.010204,0.012879,-8.2e-05,0.006352


In [94]:
# For every stationary column: (best shift against 'return ratio', RMSE at that shift).
stationairy_lags = {}
for col in stationairy_df.columns:
    found = find_best_lag(stationairy_df[col], df_lag['return ratio'])
    lag, rmse_list = found
    stationairy_lags.update({col: (lag, rmse_list[lag])})

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [95]:
# Keep only the entries whose RMSE is a number.
stationairy_lags = {
    name: (best, err)
    for name, (best, err) in stationairy_lags.items()
    if not np.isnan(err)
}

In [96]:
stationairy_lags

{'TensorFlow__1_diff': (4, 0.1298794824289712),
 'deep learning__1_diff': (4, 0.14658689193171912),
 'Artificial Intelligence__1_diff': (19, 0.12330451243859841),
 'Machine Learning__1_diff': (4, 0.18734240906674698),
 'neural network__1_diff': (9, 0.21350003223585282),
 'MXNet__1_diff': (9, 0.17174096666238003),
 'blockchain_news_1_diff': (4, 0.138333183019606),
 'GPU_news_1_diff': (17, 0.19304940811366036),
 'gaming_news_1_diff': (2, 0.15041671884515803),
 'BTC_1_diff': (29, 0.10120603990184197),
 'SandP_1_diff': (0, 0.20056131850671866),
 'NasdaqTech_1_diff': (0, 0.19946965127863228),
 'ATVI_gaming_1_diff': (0, 0.11085281916069531),
 'TTWO_gaming_1_diff': (0, 0.0970372634243754),
 'UBSFY_gaming_1_diff': (0, 0.12976874031977245),
 'fps_transformed_ffill_1_diff': (2, 0.0962432645030855),
 'fps_transformed_interpolated_1_diff': (2, 0.08368482281963738),
 'fps_interpolated_transformed_1_diff': (2, 0.08439060551927864),
 'ATR_1_diff': (29, 0.10369089328398175),
 'EMA_1_diff': (29, 0.2677

In [97]:
# Add each stationary column at its selected lag. Lag 29 is skipped, and lag 0
# is stored with a one-row shift under the '_1_day_before' suffix.
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    if best_lag == 0:
        new_name, n_rows = col+'_1_day_before', 1
    else:
        new_name, n_rows = col+'_'+str(best_lag)+'_days_before', best_lag
    ekon_wszystkie[new_name] = stationairy_df[col].shift(n_rows)

In [98]:
# Give the lagged return column the same 'y_' prefix as the target column.
column_renames = {'return ratio_1_day_before': 'y_return_ratio_1_day_before'}
ekon_wszystkie = ekon_wszystkie.rename(columns=column_renames)
ekon_wszystkie.columns.tolist()

['y_return_ratio',
 'y_return_ratio_1_day_before',
 'ATR_1_day_before',
 'MOM_1_day_before',
 'CCI_1_day_before',
 'EMA_1_day_before',
 'RSI_1_day_before',
 'Will_R_1_day_before',
 'MFI_1_day_before',
 'DX_1_day_before',
 'PLUS_DM_1_day_before',
 'PPO_1_day_before',
 'AROONOSC_1_day_before',
 'BOP_1_day_before',
 'MINUS_DM_1_day_before',
 'ULTOSC_1_day_before',
 'AD_1_day_before',
 'OBV_1_day_before',
 'DEMA_1_day_before',
 'HT_TRENDLINE_1_day_before',
 'KAMA_1_day_before',
 'MIDPOINT_1_day_before',
 'MIDPRICE_1_day_before',
 'SAR_1_day_before',
 'SAREXT_1_day_before',
 'T3_1_day_before',
 'TEMA_1_day_before',
 'TRIMA_1_day_before',
 'WMA_1_day_before',
 'ADX_1_day_before',
 'ADXR_1_day_before',
 'APO_1_day_before',
 'CMO_1_day_before',
 'MINUS_DI_1_day_before',
 'PLUS_DI_1_day_before',
 'ROC_1_day_before',
 'ROCP_1_day_before',
 'ROCR_1_day_before',
 'ROCR100_1_day_before',
 'ADOSC_1_day_before',
 'NATR_1_day_before',
 'TRANGE_1_day_before',
 'AVGPRICE_1_day_before',
 'MEDPRICE_1_day_

In [99]:
# Write the econometric feature table to disk; the timestamp in the file name is hard-coded.
ekon_wszystkie.to_csv('datasets/cale/ekonometryczne_wszystkie_2019_03_28__11_52_34.csv')

In [None]:
# Add each stationary column to the ML table, shifted by its selected lag plus
# one row. Lag 29 is skipped; lag 0 keeps the plain column name.
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    suffix = '_shift_'+str(best_lag) if best_lag != 0 else ""
    ml[col+suffix] = stationairy_df[col].shift(best_lag+1)

In [None]:
ml.head()

In [None]:
# Write the ML table again, now including the lagged stationary columns
# (same path as the earlier export of `ml`, which gets overwritten).
ml.to_csv('datasets/cale/ml_wszystkie.csv')

In [None]:
# Econometric table built only from stationary series: the target plus every
# stationary column shifted by its selected lag (lag 29 is skipped).
ekon_stationairy = pd.DataFrame()
ekon_stationairy['y_return_ratio'] = df_lag['return ratio']
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    suffix = '_shift_'+str(best_lag) if best_lag != 0 else ""
    ekon_stationairy[col+suffix] = stationairy_df[col].shift(best_lag)

In [None]:
ekon_stationairy.head()

In [None]:
# ML table built only from stationary series: the target, seven lagged copies
# of the return ratio, and every stationary column at its selected lag + 1.
ml_stationairy = pd.DataFrame()
ml_stationairy['y_return_ratio'] = df_lag['return ratio']
# 'return_ratio_shift_k' holds the return ratio from k+1 rows earlier.
# NOTE(review): unlike the `ml` table there is no 1-row-lag 'return_ratio'
# column here - confirm whether that omission is intended.
for k in range(1, 8):
    ml_stationairy['return_ratio_shift_'+str(k)] = df_lag['return ratio'].shift(k+1)
for col, tpl in stationairy_lags.items():
    best_lag = tpl[0]
    if best_lag == 29:
        continue
    suffix = '_shift_'+str(best_lag) if best_lag != 0 else ""
    ml_stationairy[col+suffix] = stationairy_df[col].shift(best_lag+1)

In [None]:
ml_stationairy.head()

In [None]:
# Write both stationary-only tables to disk.
ekon_stationairy.to_csv('datasets/cale/ekonometryczne_stacjonarne.csv')
ml_stationairy.to_csv('datasets/cale/ml_stacjonarne.csv')