In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer

In [None]:
#functions

def rolling_mean(series, window):
    """Return the mean of `series` over a trailing window of `window` rows."""
    roller = series.rolling(window)
    return roller.mean()


def rolling_std(series, window):
    """Return the standard deviation of `series` over a trailing window of `window` rows."""
    roller = series.rolling(window)
    return roller.std()


def rolling_sum(series, window):
    """Return the sum of `series` over a trailing window of `window` rows."""
    roller = series.rolling(window)
    return roller.sum()


def ewma(series, span, min_periods):
    """Return the exponentially weighted mean of `series` for the given span and warm-up."""
    weighted = series.ewm(span=span, min_periods=min_periods)
    return weighted.mean()
def get_value(df, idx, col):
    """Return the scalar stored in column `col` at integer row position `idx`.

    The column is selected first so that only one cell is read. Selecting the
    row first (`df.iloc[idx][col]`) materialises the whole row as a Series,
    which is slow inside loops and upcasts mixed-dtype rows (ints become floats).
    """
    return df[col].iloc[idx]
def minMaxScaling(column):
    """Rescale `column` linearly so its minimum maps to 0 and its maximum to 1.

    A constant column has zero range; instead of dividing 0 by 0 (which would
    turn the whole column into NaN) its non-missing entries are mapped to 0.0.
    """
    low = column.min()
    span = column.max() - low
    if span == 0:
        # column - low is 0 for every non-missing entry; missing entries stay NaN
        return (column - low).astype(float)
    return (column - low) / span

#Moving Average
def MA(df, n, feature='Close'):
    """Add the `n`-row simple moving average of `feature` as column 'MA_<feature>'.

    The frame is modified in place and also returned.
    """
    averaged = rolling_mean(df[feature], n)
    df['MA' + '_' + feature] = averaged.to_numpy()
    return df

#Exponential Moving Average
def EMA(df, n, feature='Close'):
    """Add the span-`n` exponential moving average of `feature` as column 'EMA_<feature>'.

    Values are emitted once n - 1 observations are available. The frame is
    modified in place and also returned.
    """
    smoothed = ewma(df[feature], span=n, min_periods=n - 1)
    df['EMA' + '_' + feature] = smoothed.to_numpy()
    return df

#Momentum
def MOM(df, n, feature='Close'):
    """Add the `n`-row difference of `feature` as column 'MOM_<n>_<feature>'.

    The frame is modified in place and also returned.
    """
    delta = df[feature].diff(n)
    column = 'MOM' + '_' + str(n) + '_' + feature
    df[column] = delta.to_numpy()
    return df

#Rate of Change
def ROC(df, n, feature='Close'):
    """Add the relative change of `feature` over n - 1 rows as column 'ROC_<n>_<feature>'.

    Each value is (x[t] - x[t - (n - 1)]) / x[t - (n - 1)]. The frame is
    modified in place and also returned.
    """
    lag = n - 1
    values = df[feature]
    relative_change = values.diff(lag) / values.shift(lag)
    df['ROC' + '_' + str(n) + '_' + feature] = relative_change.to_numpy()
    return df

#Average True Range
def ATR(df, n):
    """Add the average true range as column 'ATR'.

    The true range of row t (t >= 1) is
    max(High[t], Close[t-1]) - min(Low[t], Close[t-1]); row 0 is set to 0.
    The series is then smoothed with an exponentially weighted mean
    (span=n, min_periods=n).

    The computation is vectorised rather than looping over rows with scalar
    lookups, and an empty frame yields an empty column instead of a
    length-mismatch error. The frame is modified in place and also returned.
    """
    high = df['High'].to_numpy(dtype=float)
    low = df['Low'].to_numpy(dtype=float)
    close = df['Close'].to_numpy(dtype=float)

    true_range = np.zeros(len(df))
    prev_close = close[:-1]
    true_range[1:] = np.maximum(high[1:], prev_close) - np.minimum(low[1:], prev_close)

    smoothed = pd.Series(true_range).ewm(span=n, min_periods=n).mean()
    df['ATR'] = smoothed.to_numpy()
    return df

#Bollinger Bands
def BBANDS(df, n, feature='Close'):
    """Add Bollinger band width and %b of `feature` as 'B1_<feature>' and 'B2_<feature>'.

    With MA and SD the `n`-row rolling mean and standard deviation of `feature`:
      B1 = 4 * SD / MA                       (relative band width)
      B2 = (x - MA + 2 * SD) / (4 * SD)      (position inside the bands)

    The statistics are computed on `feature` itself; previously they were
    always taken from 'Close', so every feature received identical columns.
    The frame is modified in place and also returned.
    """
    values = df[feature]
    window = values.rolling(n)
    mid = window.mean()
    spread = window.std()

    df['B1' + '_' + feature] = (4 * spread / mid).to_numpy()
    df['B2' + '_' + feature] = ((values - mid + 2 * spread) / (4 * spread)).to_numpy()
    return df

#Pivot Points, Supports and Resistances
def PPSR(df):
    """Add pivot point and three support/resistance levels as 'PSR_*' columns.

    Columns are added in the order PP, R1, S1, R2, S2, R3, S3. The frame is
    modified in place and also returned.
    """
    high, low, close = df['High'], df['Low'], df['Close']
    pivot = pd.Series((high + low + close) / 3)
    levels = [
        ('PP', pivot),
        ('R1', 2 * pivot - low),
        ('S1', 2 * pivot - high),
        ('R2', pivot + high - low),
        ('S2', pivot - high + low),
        ('R3', high + 2 * (pivot - low)),
        ('S3', low - 2 * (high - pivot)),
    ]
    for label, level in levels:
        df['PSR_' + label] = level.to_numpy()
    return df

#Stochastic oscillator %K
def STOK(df):
    """Add the per-row stochastic %K, (Close - Low) / (High - Low), as column 'SOk'.

    The frame is modified in place and also returned.
    """
    low = df['Low']
    position = (df['Close'] - low) / (df['High'] - low)
    df['SOk'] = position.to_numpy()
    return df

# Stochastic Oscillator, EMA smoothing, nS = slowing (1 if no slowing)
def STO(df, nK, nD, nS=1):
    """Add the stochastic oscillator %K and %D as columns 'SOk' and 'SOd'.

    %K compares Close with the rolling `nK`-row Low minimum / High maximum.
    %D is %K smoothed with an exponentially weighted mean of span `nD`.
    Both are then smoothed once more with span `nS` (slowing).
    The frame is modified in place and also returned.
    """
    lowest = df['Low'].rolling(nK).min()
    highest = df['High'].rolling(nK).max()
    raw_k = (df['Close'] - lowest) / (highest - lowest)

    def smooth(values, span):
        # same EWM settings for every smoothing pass; warm-up is span - 1 rows
        return values.ewm(ignore_na=False, span=span, min_periods=span - 1, adjust=True).mean()

    slow_d = smooth(smooth(raw_k, nD), nS)
    slow_k = smooth(raw_k, nS)
    df['SOk'] = slow_k.to_numpy()
    df['SOd'] = slow_d.to_numpy()
    return df

#Trix
def TRIX(df, n, feature='Close'):
    """Add the one-row relative change of a triple-smoothed `feature` as 'Trix_<feature>'.

    `feature` is passed three times through an exponentially weighted mean
    (span=n, min_periods=n - 1); the output is (EX3[t] - EX3[t-1]) / EX3[t-1]
    with row 0 set to 0.

    The change is computed on whole arrays rather than in a Python loop, and
    an empty frame yields an empty column instead of a length-mismatch error.
    The frame is modified in place and also returned.
    """
    smoothed = df[feature]
    for _ in range(3):
        smoothed = smoothed.ewm(span=n, min_periods=n - 1).mean()

    ex3 = smoothed.to_numpy(dtype=float)
    trix = np.zeros(len(ex3))
    trix[1:] = (ex3[1:] - ex3[:-1]) / ex3[:-1]
    df['Trix' + '_' + feature] = trix
    return df

#Average Directional Movement Index
def ADX(df, n, n_ADX):
    """Add the average directional movement index as column 'ADX'.

    For every row t >= 1:
      up   = High[t] - High[t-1],  down = Low[t-1] - Low[t]
      +DM  = up   if up > down and up > 0 else 0
      -DM  = down if down > up and down > 0 else 0
      TR   = max(High[t], Close[t-1]) - min(Low[t], Close[t-1])
    Row 0 gets 0 for all three. +DI / -DI are the EWM (span=n) of +DM / -DM
    divided by the EWM of TR; ADX is the EWM (span=n_ADX) of
    |+DI - -DI| / (+DI + -DI).

    The directional-movement series now start with a 0 for row 0, like the
    true range does. Without it they were one row shorter, so the movement
    between rows t and t+1 was paired with the true range of row t and each
    ADX value depended on the following bar. The frame is modified in place
    and also returned.
    """
    high = df['High'].to_numpy(dtype=float)
    low = df['Low'].to_numpy(dtype=float)
    close = df['Close'].to_numpy(dtype=float)
    size = len(df)

    up_move = high[1:] - high[:-1]
    down_move = low[:-1] - low[1:]
    plus_dm = np.zeros(size)
    minus_dm = np.zeros(size)
    plus_dm[1:] = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    minus_dm[1:] = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)

    true_range = np.zeros(size)
    prev_close = close[:-1]
    true_range[1:] = np.maximum(high[1:], prev_close) - np.minimum(low[1:], prev_close)

    atr = pd.Series(true_range).ewm(span=n, min_periods=n).mean()
    pos_di = pd.Series(plus_dm).ewm(span=n, min_periods=n - 1).mean() / atr
    neg_di = pd.Series(minus_dm).ewm(span=n, min_periods=n - 1).mean() / atr

    dx = abs(pos_di - neg_di) / (pos_di + neg_di)
    df['ADX'] = dx.ewm(span=n_ADX, min_periods=n_ADX - 1).mean().to_numpy()
    return df

#MACD, MACD Signal and MACD difference
def MACD(df, n_fast, n_slow, feature='Close'):
    """Add MACD, its signal line and their difference for `feature`.

    MACD is the fast EWM minus the slow EWM (both warm up for n_slow - 1
    rows); the signal is a span-9 EWM of MACD. Columns added, in order:
    'MACD_<feature>', 'MACDsign_<feature>', 'MACDdiff_<feature>'.
    The frame is modified in place and also returned.
    """
    values = df[feature]
    warmup = n_slow - 1
    fast = ewma(values, span=n_fast, min_periods=warmup)
    slow = ewma(values, span=n_slow, min_periods=warmup)
    line = fast - slow
    signal = ewma(line, span=9, min_periods=8)
    histogram = line - signal
    for prefix, series in (('MACD', line), ('MACDsign', signal), ('MACDdiff', histogram)):
        df[prefix + '_' + feature] = series.to_numpy()
    return df

#Mass Index
def MassI(df):
    """Add the mass index as column 'MassI'.

    The High - Low range is smoothed once and twice with a span-9 EWM; the
    ratio single / double is summed over a rolling window of 25 rows.
    The frame is modified in place and also returned.
    """
    bar_range = df['High'] - df['Low']
    single = ewma(bar_range, span=9, min_periods=8)
    double = ewma(single, span=9, min_periods=8)
    ratio = single / double
    df['MassI'] = rolling_sum(ratio, 25).to_numpy()
    return df

#Vortex Indicator: http://www.vortexindicator.com/VFX_VORTEX.PDF
def Vortex(df, n):
    """Add the vortex indicator as column 'VI'.

    For every row t >= 1:
      TR = max(High[t], Close[t-1]) - min(Low[t], Close[t-1])
      VM = |High[t] - Low[t-1]| - |Low[t] - High[t-1]|
    Row 0 gets 0 for both. VI is the rolling `n`-row sum of VM divided by the
    rolling `n`-row sum of TR.

    The series are built with array operations rather than per-row scalar
    lookups, and an empty frame yields an empty column instead of a
    length-mismatch error. The frame is modified in place and also returned.
    """
    high = df['High'].to_numpy(dtype=float)
    low = df['Low'].to_numpy(dtype=float)
    close = df['Close'].to_numpy(dtype=float)
    size = len(df)

    true_range = np.zeros(size)
    prev_close = close[:-1]
    true_range[1:] = np.maximum(high[1:], prev_close) - np.minimum(low[1:], prev_close)

    movement = np.zeros(size)
    movement[1:] = np.abs(high[1:] - low[:-1]) - np.abs(low[1:] - high[:-1])

    vortex = pd.Series(movement).rolling(n).sum() / pd.Series(true_range).rolling(n).sum()
    df['VI'] = vortex.to_numpy()
    return df

#KST Oscillator
def KST(df, r1, r2, r3, r4, n1, n2, n3, n4, feature='Close'):
    """Add the KST oscillator of `feature` as column 'KST_<feature>'.

    Four rates of change (lags r1-1 .. r4-1) are each summed over rolling
    windows n1 .. n4 and combined with weights 1, 2, 3 and 4.
    The frame is modified in place and also returned.
    """
    values = df[feature]

    def summed_roc(r, n):
        lag = r - 1
        return rolling_sum(values.diff(lag) / values.shift(lag), n)

    combined = (summed_roc(r1, n1)
                + summed_roc(r2, n2) * 2
                + summed_roc(r3, n3) * 3
                + summed_roc(r4, n4) * 4)
    df['KST' + '_' + feature] = pd.Series(combined).to_numpy()
    return df

#Relative Strength Index
def RSI(df, n):
    """Add a directional-movement based strength ratio as column 'RSI'.

    For every row t >= 1:
      up   = High[t] - High[t-1],  down = Low[t-1] - Low[t]
      +DM  = up   if up > down and up > 0 else 0
      -DM  = down if down > up and down > 0 else 0
    Row 0 gets 0 for both. With P and N the EWM (span=n, min_periods=n - 1)
    of +DM and -DM, the output is P / (P + N).

    The movements are computed with array operations rather than per-row
    scalar lookups, and an empty frame yields an empty column instead of a
    length-mismatch error. The frame is modified in place and also returned.
    """
    high = df['High'].to_numpy(dtype=float)
    low = df['Low'].to_numpy(dtype=float)
    size = len(df)

    up_move = high[1:] - high[:-1]
    down_move = low[:-1] - low[1:]
    plus_dm = np.zeros(size)
    minus_dm = np.zeros(size)
    plus_dm[1:] = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    minus_dm[1:] = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)

    gains = pd.Series(plus_dm).ewm(span=n, min_periods=n - 1).mean()
    losses = pd.Series(minus_dm).ewm(span=n, min_periods=n - 1).mean()
    df['RSI'] = (gains / (gains + losses)).to_numpy()
    return df

#True Strength Index
def TSI(df, r, s, feature='Close'):
    """Add the true strength index of `feature` as column 'TSI_<feature>'.

    The one-row change and its absolute value are each smoothed twice (EWM
    span r, then span s); the output is the ratio of the two results.
    The frame is modified in place and also returned.
    """
    change = pd.Series(df[feature].diff(1))

    def double_smooth(values):
        first_pass = ewma(values, span=r, min_periods=r - 1)
        return ewma(first_pass, span=s, min_periods=s - 1)

    ratio = double_smooth(change) / double_smooth(abs(change))
    df['TSI' + '_' + feature] = ratio.to_numpy()
    return df

#Accumulation/Distribution
def ACCDIST(df, n):
    """Add the rate of change of the accumulation/distribution value as column 'AD'.

    Per row, ad = (2*Close - High - Low) / (High - Low) * Volume; the output
    is the relative change of ad over n - 1 rows.
    The frame is modified in place and also returned.
    """
    high, low = df['High'], df['Low']
    flow = (2 * df['Close'] - high - low) / (high - low) * df['Volume']
    lag = n - 1
    relative_change = flow.diff(lag) / flow.shift(lag)
    df['AD'] = relative_change.to_numpy()
    return df

#Chaikin Oscillator
def Chaikin(df):
    """Add the Chaikin oscillator as column 'Chaikin'.

    Per row, ad = (2*Close - High - Low) / (High - Low) * Volume; the output
    is the span-3 EWM of ad minus its span-10 EWM.
    The frame is modified in place and also returned.
    """
    high, low = df['High'], df['Low']
    flow = (2 * df['Close'] - high - low) / (high - low) * df['Volume']
    fast = ewma(flow, span=3, min_periods=2)
    slow = ewma(flow, span=10, min_periods=9)
    df['Chaikin'] = (fast - slow).to_numpy()
    return df

#Money Flow Index and Ratio
def MFI(df, n):
    """Add a money-flow ratio averaged over `n` rows as column 'MFI'.

    With PP = (High + Low + Close) / 3, the positive money flow of row t >= 1
    is PP[t] * Volume[t] when PP[t] > PP[t-1] and 0 otherwise (row 0 is 0).
    It is divided by the total flow PP * Volume and averaged over a rolling
    window of `n` rows.

    The flows are computed with array operations rather than per-row scalar
    lookups, and an empty frame yields an empty column instead of a
    length-mismatch error. The frame is modified in place and also returned.
    """
    typical = ((df['High'] + df['Low'] + df['Close']) / 3).to_numpy(dtype=float)
    volume = df['Volume'].to_numpy(dtype=float)

    positive_flow = np.zeros(len(df))
    rising = typical[1:] > typical[:-1]
    positive_flow[1:] = np.where(rising, typical[1:] * volume[1:], 0.0)

    total_flow = typical * volume
    ratio = pd.Series(positive_flow / total_flow)
    df['MFI'] = ratio.rolling(n).mean().to_numpy()
    return df

#On-balance Volume
def OBV(df, n):
    """Add the `n`-row rolling mean of signed volume as column 'OBV_ma'.

    For row t >= 1 the signed volume is +Volume[t] when Close rose, -Volume[t]
    when it fell and 0 when it is unchanged; row 0 is 0.

    A close change that is NaN now produces NaN for that row. Previously such
    a row matched none of the three comparisons, nothing was appended, and
    the column assignment failed with a length mismatch. The frame is
    modified in place and also returned.
    """
    close = df['Close'].to_numpy(dtype=float)
    volume = df['Volume'].to_numpy(dtype=float)

    signed = np.zeros(len(df))
    change = close[1:] - close[:-1]
    signed[1:] = np.select(
        [change > 0, change < 0, change == 0],
        [volume[1:], -volume[1:], 0.0],
        default=np.nan,  # change is NaN: direction unknown
    )
    df['OBV_ma'] = pd.Series(signed).rolling(n).mean().to_numpy()
    return df

#Force Index
def FORCE(df, n):
    """Add the force index, Close.diff(n) * Volume.diff(n), as column 'F'.

    The frame is modified in place and also returned.
    """
    price_change = df['Close'].diff(n)
    volume_change = df['Volume'].diff(n)
    df['F'] = (price_change * volume_change).to_numpy()
    return df

#Ease of Movement
def EOM(df, n):
    """Add the `n`-row rolling mean of ease of movement as column 'Eom_ma'.

    Per row: (High.diff(1) + Low.diff(1)) * (High - Low) / (2 * Volume).
    The frame is modified in place and also returned.
    """
    high, low = df['High'], df['Low']
    raw = (high.diff(1) + low.diff(1)) * (high - low) / (2 * df['Volume'])
    df['Eom_ma'] = rolling_mean(raw, n).to_numpy()
    return df

#Commodity Channel Index
def CCI(df, n):
    """Add the commodity channel index as column 'CCI'.

    The typical price (High + Low + Close) / 3 is centred on its `n`-row
    rolling mean and divided by its `n`-row rolling standard deviation.
    The frame is modified in place and also returned.
    """
    typical = (df['High'] + df['Low'] + df['Close']) / 3
    deviation = typical - rolling_mean(typical, n)
    df['CCI'] = (deviation / rolling_std(typical, n)).to_numpy()
    return df

#Coppock Curve
def COPP(df, n, feature='Close'):
    """Add the Coppock curve of `feature` as column 'Copp_<feature>'.

    Two rates of change with lags int(n*11/10) - 1 and int(n*14/10) - 1 are
    added and smoothed with an EWM (span=n, min_periods=n).
    The frame is modified in place and also returned.
    """
    values = df[feature]

    def rate_of_change(lag):
        return values.diff(lag) / values.shift(lag)

    combined = rate_of_change(int(n * 11 / 10) - 1) + rate_of_change(int(n * 14 / 10) - 1)
    curve = ewma(combined, span=n, min_periods=n)
    df['Copp' + '_' + feature] = curve.to_numpy()
    return df

#Keltner Channel
def KELCH(df, n):
    """Add Keltner channel middle, upper and lower lines as 'KelChM', 'KelChU', 'KelChD'.

    Each line is the `n`-row rolling mean of a weighted High/Low/Close mix.
    The frame is modified in place and also returned.
    """
    high, low, close = df['High'], df['Low'], df['Close']
    raw_lines = (
        ('KelChM', (high + low + close) / 3),
        ('KelChU', (4 * high - 2 * low + close) / 3),
        ('KelChD', (-2 * high + 4 * low + close) / 3),
    )
    # compute all three lines before the first column is written
    averaged = [(label, rolling_mean(raw, n)) for label, raw in raw_lines]
    for label, line in averaged:
        df[label] = line.to_numpy()
    return df

#Ultimate Oscillator
def ULTOSC(df):
    """Add the ultimate oscillator as column 'UltO'.

    For every row t >= 1, with TL = min(Low[t], Close[t-1]):
      TR = max(High[t], Close[t-1]) - TL      (true range)
      BP = Close[t] - TL                      (buying pressure)
    Row 0 gets 0 for both. The output is
      4 * sum7(BP)/sum7(TR) + 2 * sum14(BP)/sum14(TR) + sum28(BP)/sum28(TR)
    where sumK is a rolling K-row sum.

    The series are built with array operations rather than per-row scalar
    lookups, and an empty frame yields an empty column instead of a
    length-mismatch error. The frame is modified in place and also returned.
    """
    high = df['High'].to_numpy(dtype=float)
    low = df['Low'].to_numpy(dtype=float)
    close = df['Close'].to_numpy(dtype=float)
    size = len(df)

    prev_close = close[:-1]
    true_low = np.minimum(low[1:], prev_close)
    true_range = np.zeros(size)
    pressure = np.zeros(size)
    true_range[1:] = np.maximum(high[1:], prev_close) - true_low
    pressure[1:] = close[1:] - true_low

    tr = pd.Series(true_range)
    bp = pd.Series(pressure)
    oscillator = ((4 * bp.rolling(7).sum() / tr.rolling(7).sum())
                  + (2 * bp.rolling(14).sum() / tr.rolling(14).sum())
                  + (bp.rolling(28).sum() / tr.rolling(28).sum()))
    df['UltO'] = oscillator.to_numpy()
    return df

#Donchian Channel
def DONCH(df, n):
    """Add a lagged Donchian-style channel width (max High - min Low) as column 'DonCh'.

    The frame is modified in place and also returned.
    """
    i = 0
    DC_l = []
    # Pad the first n - 1 positions with 0 so the list ends up as long as the frame.
    while i < n - 1:
        DC_l.append(0)
        i = i + 1
    i = 0
    while i + n - 1 <= (df.shape[0]) - 1:
        # iloc slices exclude their end, so each window covers the n - 1 rows i .. i + n - 2.
        # NOTE(review): an n-row window would need iloc[i:i + n] - confirm which is intended.
        DC = max(df['High'].iloc[i:i + n - 1]) - min(df['Low'].iloc[i:i + n - 1])
        DC_l.append(DC)
        i = i + 1
    DonCh = pd.Series(DC_l, name = 'Donchian_' + str(n))
    #print(DonCh)
    # Lag the whole series by a further n - 1 rows: the first n - 1 entries become NaN
    # and the 0 padding moves to positions n - 1 .. 2n - 3.
    DonCh = DonCh.shift(n - 1)
    #print(DonCh)
    df['DonCh'] = DonCh.to_numpy()
    return df

#Standard Deviation
def STDDEV(df, n, feature='Close'):
    """Add the `n`-row rolling standard deviation of `feature` as 'std_dev_<feature>'.

    The frame is modified in place and also returned.
    """
    dispersion = rolling_std(df[feature], n)
    df['std_dev' + '_' + feature] = dispersion.to_numpy()
    return df

#volatility functions

def realized1(close, N=240):
    """Return the realized volatility of `close`, scaled by `N`.

    This is the sample standard deviation (ddof=1) of consecutive log returns
    multiplied by sqrt(N).
    """
    log_returns = [np.log(cur / prev) for prev, cur in zip(close[:-1], close[1:])]
    count = len(log_returns)
    average = sum(log_returns) / count
    squared_dev = sum((r - average) ** 2 for r in log_returns)
    return np.sqrt(squared_dev * N / (count - 1))

def parkinson1(high, low, N=240):
    """Return the Parkinson high/low volatility estimate, scaled by `N`.

    Computes sqrt( sum(ln(H/L)^2) * N / (4 * len(high) * ln 2) ).
    """
    squared_logs = [np.log(top / bottom) ** 2 for top, bottom in zip(high, low)]
    total = sum(squared_logs)
    return np.sqrt(total * N / (4 * len(high) * np.log(2)))

def garman_klass1(open, high, low, close, N=240):
    """Return the Garman-Klass volatility estimate, scaled by `N`.

    Computes sqrt( (sum(ln(H/L)^2)/2 - (2 ln 2 - 1) * sum(ln(C/O)^2)) * N / len(close) ).
    """
    range_term = sum(np.log(top / bottom) ** 2 for top, bottom in zip(high, low)) / 2
    body_weight = 2 * np.log(2) - 1
    body_term = sum(np.log(last / first) ** 2 for last, first in zip(close, open)) * body_weight
    return np.sqrt((range_term - body_term) * N / len(close))

def roger_satchell1(open1, high, low, close, N=240):
    """Return the Rogers-Satchell volatility estimate, scaled by `N`.

    Computes sqrt( sum(ln(H/C)*ln(H/O) + ln(L/C)*ln(L/O)) * N / len(close) ).
    """
    total = 0  # accumulate left to right, starting from 0 like the built-in sum
    for first, top, bottom, last in zip(open1, high, low, close):
        total = total + (np.log(top / last) * np.log(top / first)
                         + np.log(bottom / last) * np.log(bottom / first))
    return np.sqrt(total * N / len(close))

def yang_zhang1(open, high, low, close, N=240):
    """Return the Yang-Zhang volatility estimate, scaled by `N`.

    Combines three variances, each scaled by `N`:
      - overnight: sample variance of ln(O[t] / C[t-1])
      - intraday:  sample variance of ln(C[t] / O[t]) for t >= 1
      - Rogers-Satchell variance over all rows
    as sqrt(overnight + k * intraday + (1 - k) * rs), with
    k = 0.34 / (1.34 + (n + 1) / (n - 1)) and n the number of overnight returns.

    `N` is forwarded to `roger_satchell1`; before, that term always used the
    default of 240, so a non-default `N` mixed two different scalings.
    """
    overnight = [np.log(o / c_prev) for o, c_prev in zip(open[1:], close[:-1])]
    n = len(overnight)
    overnight_mean = sum(overnight) / n
    overnight_var = sum((x - overnight_mean) ** 2 for x in overnight) * N / (n - 1)

    intraday = [np.log(c / o) for o, c in zip(open[1:], close[1:])]
    intraday_mean = sum(intraday) / n
    intraday_var = sum((x - intraday_mean) ** 2 for x in intraday) * N / (n - 1)

    rs_var = roger_satchell1(open, high, low, close, N) ** 2
    k = 0.34 / (1.34 + (n + 1) / (n - 1))
    return np.sqrt(overnight_var + k * intraday_var + (1 - k) * rs_var)

def garkla_yangzh1(open, high, low, close, N=240):
    """Return the Garman-Klass estimate with the Yang-Zhang overnight term, scaled by `N`.

    For the len(close) - 1 rows that have a predecessor:
      sum( ln(O[t]/C[t-1])^2 + ln(H[t]/L[t])^2 / 2 - (2 ln 2 - 1) * ln(C[t]/O[t])^2 )
    is multiplied by N / (len(close) - 1) and square-rooted.

    The overnight term pairs each open with the previous close. It used to
    pair open and close of the same row, which duplicated the intraday term.
    All three sums now run over the same len(close) - 1 rows that the divisor
    assumes.
    """
    overnight = sum(np.log(o / c_prev) ** 2 for o, c_prev in zip(open[1:], close[:-1]))
    range_term = sum(np.log(h / l) ** 2 for h, l in zip(high[1:], low[1:])) / 2
    body_term = sum(np.log(c / o) ** 2 for c, o in zip(close[1:], open[1:])) * (2 * np.log(2) - 1)
    return np.sqrt((overnight + range_term - body_term) * N / (len(close) - 1))

def realized(df, n=60, feature='Close'):
    """Add the rolling `n`-row realized volatility of `feature`.

    The result is stored as column 'realized_<n>_<feature>'. The frame is
    modified in place and also returned.
    """
    column = 'realized' + '_' + str(n) + '_' + feature
    windows = df[feature].rolling(n)
    df[column] = windows.apply(realized1)
    return df
    
def parkinson(df, n=60):
    """Add the rolling `n`-row Parkinson volatility as column 'parkinson'.

    The rolling window runs over the 'Low' column only and is used just to
    obtain each window's row labels; High and Low are then looked up in `df`.
    Rolling over the whole frame recomputed the same estimate once per column
    and kept only the 'Low' result. The frame is modified in place and also
    returned.
    """
    def window_estimate(window):
        rows = window.index
        return parkinson1(df.loc[rows, 'High'], df.loc[rows, 'Low'])

    # raw=False so the callback receives a Series that carries the row labels
    df['parkinson'] = df['Low'].rolling(int(n)).apply(window_estimate, raw=False)
    return df

def garman_klass(df, n=60):
    """Add the rolling `n`-row Garman-Klass volatility as column 'garman_klass'.

    The rolling window runs over the 'Low' column only and is used just to
    obtain each window's row labels; the OHLC values are then looked up in
    `df`. Rolling over the whole frame recomputed the same estimate once per
    column and kept only the 'Low' result. The frame is modified in place and
    also returned.
    """
    def window_estimate(window):
        rows = window.index
        return garman_klass1(df.loc[rows, 'Open'], df.loc[rows, 'High'],
                             df.loc[rows, 'Low'], df.loc[rows, 'Close'])

    # raw=False so the callback receives a Series that carries the row labels
    df['garman_klass'] = df['Low'].rolling(n).apply(window_estimate, raw=False)
    return df

def roger_satchell(df, n=60):
    """Add the rolling `n`-row Rogers-Satchell volatility as column 'roger_satchell'.

    The rolling window runs over the 'Low' column only and is used just to
    obtain each window's row labels; the OHLC values are then looked up in
    `df`. Rolling over the whole frame recomputed the same estimate once per
    column and kept only the 'Low' result. The frame is modified in place and
    also returned.
    """
    def window_estimate(window):
        rows = window.index
        return roger_satchell1(df.loc[rows, 'Open'], df.loc[rows, 'High'],
                               df.loc[rows, 'Low'], df.loc[rows, 'Close'])

    # raw=False so the callback receives a Series that carries the row labels
    df['roger_satchell'] = df['Low'].rolling(n).apply(window_estimate, raw=False)
    return df

def yang_zang(df, n=60):
    """Add the rolling `n`-row Yang-Zhang volatility as column 'yang_zang'.

    The rolling window runs over the 'Low' column only and is used just to
    obtain each window's row labels; the OHLC values are then looked up in
    `df`. Rolling over the whole frame produced a multi-column DataFrame,
    which cannot be assigned to a single column. The frame is modified in
    place and also returned.
    """
    def window_estimate(window):
        rows = window.index
        return yang_zhang1(df.loc[rows, 'Open'], df.loc[rows, 'High'],
                           df.loc[rows, 'Low'], df.loc[rows, 'Close'])

    # raw=False so the callback receives a Series that carries the row labels
    df['yang_zang'] = df['Low'].rolling(n).apply(window_estimate, raw=False)
    return df

def garkla_yangzh(df, n=60):
    """Add the rolling `n`-row Garman-Klass/Yang-Zhang volatility as column 'garkla_yangzh'.

    The rolling window runs over the 'Low' column only and is used just to
    obtain each window's row labels; the OHLC values are then looked up in
    `df`. Rolling over the whole frame produced a multi-column DataFrame,
    which cannot be assigned to a single column. The frame is modified in
    place and also returned.
    """
    def window_estimate(window):
        rows = window.index
        return garkla_yangzh1(df.loc[rows, 'Open'], df.loc[rows, 'High'],
                              df.loc[rows, 'Low'], df.loc[rows, 'Close'])

    # raw=False so the callback receives a Series that carries the row labels
    df['garkla_yangzh'] = df['Low'].rolling(n).apply(window_estimate, raw=False)
    return df

def volumeFeatures(df, n=60):
    """Add five rolling volatility estimates computed from the previous `n` rows.

    Columns 'parkinson', 'garman_klass', 'roger_satchell', 'yang_zang' and
    'garkla_yangzh' are added. Row p (p >= n) holds the estimate over rows
    p - n .. p - 1, so a row never sees its own bar; the first `n` rows are 0.

    Changes from the earlier loop:
      - the loop bound follows `n` (it was hard-coded to 60, which broke any
        other window length);
      - windows and results are addressed by position, so non-unique or
        unsorted index labels cannot mis-select rows;
      - results are collected in float arrays and assigned once, instead of
        writing floats cell by cell into integer-initialised columns.
    The frame is modified in place and also returned.
    """
    names = ['parkinson', 'garman_klass', 'roger_satchell', 'yang_zang', 'garkla_yangzh']
    total = df.shape[0]
    estimates = {name: np.zeros(total) for name in names}

    for start in range(total - n):
        stop = start + n  # first row after the window; receives the estimate
        window = df.iloc[start:stop]
        open1 = window['Open']
        high = window['High']
        low = window['Low']
        close = window['Close']
        estimates['parkinson'][stop] = parkinson1(high, low)
        estimates['garman_klass'][stop] = garman_klass1(open1, high, low, close)
        estimates['roger_satchell'][stop] = roger_satchell1(open1, high, low, close)
        estimates['yang_zang'][stop] = yang_zhang1(open1, high, low, close)
        estimates['garkla_yangzh'][stop] = garkla_yangzh1(open1, high, low, close)

    for name in names:
        df[name] = estimates[name]
    return df

def addTimeFeature(biggestBatch):
    """Add calendar columns derived from the 'timestamp' column.

    Each timestamp is converted with `datetime.fromtimestamp`, i.e. to the
    local time zone of the machine running the notebook. Columns added, in
    order: 'Weekday', 'Hour', 'Month', 'Minute', 'Year' (all int64).

    Every column is built as one array and assigned positionally. Writing
    cell by cell through `.loc` was slow and, with repeated index labels,
    overwrote all rows sharing a label with the last one's values.
    The frame is modified in place and also returned.
    """
    moments = [datetime.fromtimestamp(stamp) for stamp in biggestBatch['timestamp']]

    def as_column(values):
        # explicit int64 keeps the dtype stable even for an empty frame
        return np.array(values, dtype=np.int64)

    biggestBatch['Weekday'] = as_column([m.weekday() for m in moments])
    biggestBatch['Hour'] = as_column([m.hour for m in moments])
    biggestBatch['Month'] = as_column([m.month for m in moments])
    biggestBatch['Minute'] = as_column([m.minute for m in moments])
    biggestBatch['Year'] = as_column([m.year for m in moments])
    return biggestBatch

def upper_shadow(df):
    """Return High minus the larger of Close and Open (upper candle wick)."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top


def lower_shadow(df):
    """Return the smaller of Close and Open minus Low (lower candle wick)."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def normalFeatures(df_feat):
    """Add per-row candle, volume and ratio features derived from OHLC, Volume, Count and VWAP.

    Columns are appended in a fixed order, starting with 'spread' and ending
    with 'volume2count'. 'Mean' is a single scalar (the mean of the four OHLC
    column means) broadcast to every row. The frame is modified in place and
    also returned.
    """
    high, low = df_feat['High'], df_feat['Low']
    df_feat['spread'] = high - low
    volume, count = df_feat['Volume'], df_feat['Count']
    df_feat['mean_trade'] = volume / count
    close, open_ = df_feat['Close'], df_feat['Open']
    df_feat['log_price_change'] = np.log(close / open_)

    # candle shape
    upper = upper_shadow(df_feat)
    df_feat['upper_Shadow'] = upper
    lower = lower_shadow(df_feat)
    df_feat['lower_Shadow'] = lower
    df_feat['high_div_low'] = high / low
    body = close - open_
    df_feat['trade'] = body
    df_feat['gtrade'] = body / count
    df_feat['shadow1'] = body / volume
    df_feat['shadow3'] = upper / volume
    df_feat['shadow5'] = lower / volume
    df_feat['diff1'] = volume - count

    # pairwise averages of the features above
    df_feat['mean1'] = (df_feat['shadow5'] + df_feat['shadow3']) / 2
    df_feat['mean2'] = (df_feat['shadow1'] + volume) / 2
    df_feat['mean3'] = (body + df_feat['gtrade']) / 2
    df_feat['mean4'] = (df_feat['diff1'] + upper) / 2
    df_feat['mean5'] = (df_feat['diff1'] + lower) / 2

    df_feat['UPS'] = high - np.maximum(close, open_)
    df_feat['LOS'] = np.minimum(close, open_) - low

    # VWAP-relative features
    vwap = df_feat['VWAP']
    df_feat['RNG'] = (high - low) / vwap
    df_feat['MOV'] = (close - open_) / vwap
    df_feat['CLS'] = (close - vwap) / vwap

    df_feat['LOGVOL'] = np.log(1. + volume)
    df_feat['LOGCNT'] = np.log(1. + count)

    df_feat['Close/Open'] = close / open_
    df_feat['Close-Open'] = close - open_
    df_feat['High-Low'] = high - low
    df_feat['High/Low'] = high / low

    ohlc = df_feat[['Open', 'High', 'Low', 'Close']]
    # one scalar for the whole frame: mean of the four column means
    df_feat['Mean'] = np.mean(ohlc.mean().to_numpy())
    df_feat['High/Mean'] = high / df_feat['Mean']
    df_feat['Low/Mean'] = low / df_feat['Mean']
    df_feat['Volume/Count'] = volume / (count + 1)

    # per-row OHLC mean and median
    mean_price = ohlc.mean(axis=1)
    median_price = ohlc.median(axis=1)
    df_feat['high2mean'] = high / mean_price
    df_feat['low2mean'] = low / mean_price
    df_feat['high2median'] = high / median_price
    df_feat['low2median'] = low / median_price
    df_feat['volume2count'] = volume / (count + 1)
    return df_feat

def normalizeFeatures(df):
    """Min-max scale every column except the target, identifier and batch columns.

    The frame is modified in place and also returned.
    """
    excluded = ['Target','Asset_ID', 'timestamp', 'batch', 'batchSize']
    scalable = df.columns.drop(excluded)
    for name in scalable:
        df[name] = minMaxScaling(df[name])
    return df

def neutralizeFeatures(df, proportion=1.0):
    """Add 'neutralized_<feature>' columns with the linear exposure to 'Target' removed.

    For every column except 'Target', 'Asset_ID', 'timestamp', 'batch' and
    'batchSize', the feature (NaN replaced by 0) is regressed by least squares
    on two columns: the target (NaN replaced by 0) and a constant column
    filled with the feature's mean. `proportion` times the fitted values is
    subtracted from the feature.

    Changes: the `feature == 'Target'` early exit was unreachable because
    'Target' is excluded from the loop, so it is removed; `rcond=None` is
    passed to `lstsq` explicitly, which selects the machine-precision cutoff
    and avoids the FutureWarning older NumPy versions emit when it is omitted.
    The frame is modified in place and also returned.
    """
    # the target exposure is the same for every feature, so build it once
    target = np.nan_to_num(df['Target'].values).reshape(-1, 1)

    for feature in df.columns.drop(['Target','Asset_ID', 'timestamp', 'batch', 'batchSize']):
        series = df[feature]
        scores = np.nan_to_num(series.values).reshape(-1, 1)
        constant = np.array([np.mean(scores)] * len(target)).reshape(-1, 1)
        exposures = np.hstack((target, constant))

        coefficients = np.linalg.lstsq(exposures, scores, rcond=None)[0]
        correction = proportion * exposures.dot(coefficients)
        corrected_scores = scores - correction
        df['neutralized'+'_'+feature] = pd.Series(corrected_scores.ravel(), index=series.index)
    return df

def addFeaturesToDataSet(df):
    """Run the full feature pipeline on one asset's frame and return it.

    Steps, in order:
      1. rolling volatility estimates (window 60) and the OHLCV indicators;
      2. the per-row features from `normalFeatures`;
      3. for every original column (all columns except 'Target', 'Asset_ID',
         'timestamp', 'batch', 'batchSize'): rolling mean and sum, the
         per-feature indicators, long/short/intermediate rates of change and
         realized volatility;
      4. rates of change (2, 5, 20) of every column present at that point
         except 'Target', 'Asset_ID' and 'timestamp';
      5. calendar columns from `addTimeFeature`.

    The rolling sum is stored under '<feature>_RollingSum_<n>'. It was written
    to the '_RollingMean_' key, which overwrote the rolling mean just computed.
    The called helpers add their columns to `df` in place.
    """
    n = 15
    nK = 5
    nD = 7
    n_fast = 5
    n_slow = 10
    n_ADX = 10
    r = 3
    s = 7
    n_longTerm = 10000
    n_shortTerm = 100
    n_intermediate = 1000

    original_features = df.columns.drop(['Target','Asset_ID', 'timestamp', 'batch', 'batchSize'])

    df = volumeFeatures(df, 60)
    df = ATR(df, n)
    df = PPSR(df)
    df = STOK(df)
    df = STO(df,  nK, nD, nS=1)
    df = ADX(df, n, n_ADX)
    df = MassI(df)
    df = Vortex(df, n)
    df = RSI(df, n)
    df = ACCDIST(df, n)
    df = Chaikin(df)
    df = MFI(df, n)
    df = OBV(df, n)
    df = FORCE(df, n)
    df = EOM(df, n)
    df = CCI(df, n)
    df = KELCH(df, n)
    df = ULTOSC(df)
    df = DONCH(df, n)

    df = normalFeatures(df)
    print("finished adding normal features")
    #feature independent
    for feature in original_features:
        df[feature + '_RollingMean_' + str(n)] = rolling_mean(df[feature], n)
        df[feature + '_RollingSum_' + str(n)] = rolling_sum(df[feature], n)
        df = MA(df, n, feature)
        df = EMA(df, n, feature)
        df = MOM(df, n, feature)
        df = ROC(df, n, feature)
        df = BBANDS(df, n, feature)
        df = TRIX(df, n, feature)
        df = MACD(df, n_fast, n_slow, feature)
        df = KST(df, 10, 15, 20, 30, 10, 10, 10, 15, feature)
        df = TSI(df, r, s, feature)
        df = COPP(df, n, feature)
        df = STDDEV(df, n, feature)

        df = ROC(df, n_longTerm, feature)
        df = ROC(df, n_shortTerm, feature)
        df = ROC(df, n_intermediate, feature)

        df = realized(df, 60, feature)
    print("finished adding feature independent features")
    # NOTE(review): unlike `original_features`, this list keeps 'batch' and 'batchSize',
    # so rates of change are also computed for those two columns - confirm this is intended.
    for feature in df.columns.drop(['Target','Asset_ID', 'timestamp']):
        df = ROC(df, 2, feature)
        df = ROC(df, 5, feature)
        df = ROC(df, 20, feature)

    df = addTimeFeature(df)

    return df



In [None]:
# Load the stored frame, then select the rows whose batchSize equals the largest value present.
data = pd.read_hdf('03_non_padded_with_batch_info.h5')
batchSize = max(np.unique(data.batchSize))
batch = data.loc[data.batchSize == batchSize]

In [None]:
# Writes `data` to the same path it was loaded from; mode='w' replaces the existing file.
# NOTE(review): no visible cell modifies `data` between load and save - confirm this cell is still needed.
data.to_hdf('03_non_padded_with_batch_info.h5', key = 'df', mode = 'w')

In [None]:
#np.unique(data.batchSize)
# Small test selection: rows with batchSize == 100, then only Asset_ID == 2.
#np.unique(data.batchSize)
batch = data.loc[data.batchSize == 100]
batch2 = batch.loc[batch.Asset_ID == 2]

In [None]:
# addFeaturesToDataSet adds columns to the frame it receives. `batch2` is a row
# selection of `batch`, so pass an explicit copy: this avoids SettingWithCopyWarning
# and keeps `batch2` itself unchanged when the cell is re-run.
batch_feat = addFeaturesToDataSet(batch2.copy())

In [None]:
def processAllAssets(df):
    """Build the feature set separately for every Asset_ID in `df` and stack the results.

    Each asset's rows are copied and passed through `addFeaturesToDataSet`;
    the per-asset frames are concatenated in ascending Asset_ID order.

    The loop body was the incomplete expression `cache.append(.to_numpy())`,
    a syntax error. `pd.concat` needs DataFrames, so the feature frame itself
    is appended.
    NOTE(review): `addFeaturesToDataSet` is assumed to be the call that was
    lost from the loop body - confirm.
    """
    cache = []
    for i in np.unique(df.Asset_ID):
        # copy so the in-place feature columns never touch the caller's frame
        batch = df.loc[df.Asset_ID == i].copy()
        cache.append(addFeaturesToDataSet(batch))
    return pd.concat(cache)

def processBatch(df, size):
    """Select the rows of `df` whose batchSize equals `size` and build features per asset.

    The selection uses the `df` argument; it previously read the notebook
    global `data`, so the argument was ignored.
    """
    batch = df.loc[df.batchSize == size]
    return processAllAssets(batch)

In [None]:
# Trial run on the current `batch`; the result is not stored, only shown as the cell output.
processAllAssets(batch)

In [None]:
# Not the last expression of the cell, so this value is computed but not displayed.
np.unique(data.batchSize)

# Build the full feature frame for the batch of size 28276.
biggestBatch = processBatch(data, 28276)

In [None]:
# Build features for asset 3 of the size-1019 batch and report the missing values per column.
batch = data.loc[data.batchSize == 1019]#28276]
batch3 = batch.loc[batch.Asset_ID == 3]
# addFeaturesToDataSet adds columns to the frame it receives; `batch3` is a row
# selection of `batch`, so pass an explicit copy to avoid SettingWithCopyWarning.
a = addFeaturesToDataSet(batch3.copy())

print(a.shape)
for feature in a.columns.values:
    print(feature, a[feature].isnull().values.sum())

In [None]:
# Remove the names `batch` and `data` from the notebook namespace.
# NOTE(review): cells above that read `data` cannot be re-run after this without reloading it.
del batch, data

In [None]:
# Report the frame's shape and the number of missing values in each column.
print(biggestBatch.shape)
for feature in biggestBatch.columns.values:
    missing = biggestBatch[feature].isnull().values.sum()
    print(feature, missing)

In [None]:
# Persist the feature frame; mode='w' replaces any existing file at this path.
biggestBatch.to_hdf('05_biggestBatch_features.h5', key = 'df', mode = 'w')

In [None]:
from datetime import datetime

# Inspect the calendar fields of one timestamp. `i` was not assigned in any
# visible cell, so this cell failed on a fresh kernel; it is set explicitly here.
# NOTE(review): 60*24*i presumably steps through the data one day at a time
# (one row per minute) - confirm, and pick the day you want to inspect.
i = 0
info = datetime.fromtimestamp(biggestBatch.timestamp.iloc[60*24*i])
print(info)
# show all fields at once; only the last expression of a cell is displayed
info.year, info.month, info.weekday(), info.hour, info.minute

In [None]:
def addTimeFeature(biggestBatch):
    """Add calendar columns derived from the 'timestamp' column.

    This definition replaces the function of the same name defined in the
    functions cell earlier in the notebook.

    Each timestamp is converted with `datetime.fromtimestamp`, i.e. to the
    local time zone of the machine running the notebook. Columns added, in
    order: 'Weekday', 'Hour', 'Month', 'Minute', 'Year' (all int64).

    Every column is built as one array and assigned positionally. Writing
    cell by cell through `.loc` was slow and, with repeated index labels,
    overwrote all rows sharing a label with the last one's values.
    The frame is modified in place and also returned.
    """
    moments = [datetime.fromtimestamp(stamp) for stamp in biggestBatch['timestamp']]

    def as_column(values):
        # explicit int64 keeps the dtype stable even for an empty frame
        return np.array(values, dtype=np.int64)

    biggestBatch['Weekday'] = as_column([m.weekday() for m in moments])
    biggestBatch['Hour'] = as_column([m.hour for m in moments])
    biggestBatch['Month'] = as_column([m.month for m in moments])
    biggestBatch['Minute'] = as_column([m.minute for m in moments])
    biggestBatch['Year'] = as_column([m.year for m in moments])
    return biggestBatch


In [None]:
# Adds the calendar columns to `biggestBatch` in place; the returned frame is
# not assigned and is only shown as the cell output.
addTimeFeature(biggestBatch)

In [None]:
# Display the frame (cell output only).
biggestBatch

In [None]:
# Remove the row(s) whose index label is 0.
# NOTE(review): running this cell a second time raises KeyError once label 0 is gone
# (drop's default is errors='raise'); the reason for dropping label 0 is not visible here - confirm.
biggestBatch = biggestBatch.drop([0])

In [None]:
# Replace every missing value in the frame with 0.
biggestBatch = biggestBatch.fillna(0)

In [None]:
# Display the frame again after the drop and fill steps (cell output only).
biggestBatch

In [None]:
# Persist the frame including the calendar columns; mode='w' replaces any existing file at this path.
biggestBatch.to_hdf('05_biggestBatch_features_time.h5', key = 'df', mode = 'w')

In [None]:
# Preview the model inputs for asset 0. The result of drop() is not assigned,
# so `batch` keeps all of its columns and the reduced frame is only displayed.
batch =  biggestBatch.loc[biggestBatch.Asset_ID == 0]
batch.drop(['Target','Asset_ID', 'timestamp', 'batch', 'batchSize'], axis = 1)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit one random forest per asset on all feature columns and plot its feature
# importances. The fitted models are collected in `cache`, in ascending Asset_ID order.
assets = np.unique(biggestBatch.Asset_ID)
cache = []
for asset in assets:
    batch = biggestBatch.loc[biggestBatch.Asset_ID == asset]
    #print(batch)
    X_train = batch.drop(['Target','Asset_ID', 'timestamp', 'batch', 'batchSize'], axis = 1).astype(np.float32)
    Y_train = batch.Target.astype(np.float32)
    # sanity check: total missing values in the inputs and in the target
    print(np.sum(X_train.isnull().sum()), Y_train.isnull().sum())

    # fixed seed so the fitted forests and their importances are reproducible
    clf = RandomForestRegressor(random_state=0)
    clf.fit(X_train, Y_train)

    y = clf.feature_importances_
    x = np.linspace(0, len(y)-1, len(y))
    # `plt` was never imported in this notebook, so the pyplot calls failed with
    # NameError. DataFrame.plot opens a new figure per call and needs no extra import.
    ax = pd.DataFrame({'importance': y}, index=x).plot(
        legend=False, title='Feature importances, asset ' + str(asset))
    fig = ax.get_figure()

    cache.append(clf)