In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_datareader as pdr
import datetime
import numpy as np

In [2]:
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2020,12,31)

In [3]:
# 삼성전자 10년치 일 데이터 - 하나의 일 데이터를 하나의 tick data로 생각
# close price를 p_t 라고 생각
df = pdr.get_data_yahoo('005930.KS', start, end)
tick = df[['Volume', 'Close']]
tick

Unnamed: 0_level_0,Volume,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-04,11950800.0,16180.0
2010-01-05,27925850.0,16440.0
2010-01-06,22948850.0,16820.0
2010-01-07,22107950.0,16260.0
2010-01-08,14777550.0,16420.0
...,...,...
2020-12-23,19411326.0,73900.0
2020-12-24,32502870.0,77800.0
2020-12-28,40085044.0,78700.0
2020-12-29,30339449.0,78300.0


In [4]:
# b_t(tick rule)
def tick_rule(df):
    diff = df['Close'] - df['Close'].shift(1)
    b_t = np.abs(diff) / diff
    return b_t

## Tick_imbalance_Bar

In [169]:
# 첫 번째 tib(tick imbalance bar) 정하기위해서는 최초의 값들을 정해줘야 함(이전 bar size, 이전 tick rule 값)
def tick_imbalance_bar(df, initial_expected_bar_size = 10, initial_expected_tick_rule = 1):
    
    i = 0
    bar_dict = dict()
    
    while True:

        if i == 0:
            temp = df
            temp['T'] = np.arange(len(temp)) + 1
            temp['tick_rule'] = tick_rule(tick).fillna(method = 'ffill')
            temp['tick_rule'][0] = 1
            temp['tick_imbalance'] = temp['tick_rule'].cumsum()
#             temp['expected_tick_imbalance'] = initial_expected_bar_size * initial_expected_tick_rule
            expected_tick_imbalance = initial_expected_bar_size * abs(initial_expected_tick_rule)
            result = temp[abs(temp['tick_imbalance']) - expected_tick_imbalance >=0]
            T_star = result.index[0]
            bar_sample = temp.loc[:T_star, :]

            bar_dict[i] = bar_sample
            i += 1
            df = df.loc[T_star + datetime.timedelta(days = 1):, :]

        else:
            temp = df
            temp['T'] = np.arange(len(temp)) + 1
            temp['tick_rule'] = tick_rule(tick).fillna(method = 'ffill')
            temp['tick_rule'][0] = bar_dict[i-1]['T'][-1]
            temp['tick_imbalance'] = temp['tick_rule'].cumsum()
            estimated_E_T = bar_dict[i-1].ewm(com = 0.5).mean()['T'][-1]
            estimated_b_t = bar_dict[i-1].ewm(com = 0.5).mean()['tick_rule'][-1]
            temp['expected_tick_imbalance'] = estimated_E_T * abs(estimated_b_t)
            result = temp[abs(temp['tick_imbalance']) - temp['expected_tick_imbalance'] >=0]
            if len(result) != 0:
                T_star = result.index[0]
                bar_sample = temp.loc[:T_star, :]

                bar_dict[i] = bar_sample
                i += 1
                df = df.loc[T_star + datetime.timedelta(days = 1):, :]
            else: 
                bar_dict[i] = temp
                break

    return bar_dict

## volume_imbalance_Bar

In [175]:
# 첫 번째 tib(tick imbalance bar) 정하기위해서는 최초의 값들을 정해줘야 함(이전 bar size, 이전 tick rule 값)
def volume_imbalance_bar(df, initial_expected_bar_size = 10, initial_expected_volume_rule = 1):
    
    i = 0
    bar_dict = dict()
    
    while True:

        if i == 0:
            temp = df
            temp['T'] = np.arange(len(temp)) + 1
            temp['tick_rule'] = tick_rule(tick).fillna(method = 'ffill')
            temp['tick_rule'][0] = 1
            temp['volume_tick_rule'] = temp['tick_rule'] * temp['Volume']
            temp['volume_imbalance'] = temp['volume_tick_rule'].cumsum()
#             temp['expected_tick_imbalance'] = initial_expected_bar_size * initial_expected_tick_rule
            expected_volume_imbalance = initial_expected_bar_size * abs(initial_expected_volume_rule)
            result = temp[abs(temp['volume_imbalance']) - expected_volume_imbalance >=0]
            T_star = result.index[0]
            bar_sample = temp.loc[:T_star, :]

            bar_dict[i] = bar_sample
            i += 1
            df = df.loc[T_star + datetime.timedelta(days = 1):, :]

        else:
            temp = df
            temp['T'] = np.arange(len(temp)) + 1
            temp['tick_rule'] = tick_rule(tick).fillna(method = 'ffill')
            temp['tick_rule'][0] = bar_dict[i-1]['T'][-1]
            temp['volume_tick_rule'] = temp['tick_rule'] * temp['Volume']
            temp['volume_imbalance'] = temp['volume_tick_rule'].cumsum()
            estimated_E_T = bar_dict[i-1].ewm(com = 0.5).mean()['T'][-1]
            estimated_b_t = bar_dict[i-1].ewm(com = 0.5).mean()['volume_tick_rule'][-1]
            temp['expected_volume_imbalance'] = estimated_E_T * abs(estimated_b_t)
            result = temp[abs(temp['volume_imbalance']) - temp['expected_volume_imbalance'] >=0]
            if len(result) != 0:
                T_star = result.index[0]
                bar_sample = temp.loc[:T_star, :]

                bar_dict[i] = bar_sample
                i += 1
                df = df.loc[T_star + datetime.timedelta(days = 1):, :]
            else: 
                bar_dict[i] = temp
                break

    return bar_dict

## dollar_imbalance_Bar

In [182]:
# 첫 번째 tib(tick imbalance bar) 정하기위해서는 최초의 값들을 정해줘야 함(이전 bar size, 이전 tick rule 값)
def dolloar_imbalance_bar(df, initial_expected_bar_size = 10, initial_expected_dollar_rule = 1):
    
    i = 0
    bar_dict = dict()
    
    while True:

        if i == 0:
            temp = df
            temp['T'] = np.arange(len(temp)) + 1
            temp['tick_rule'] = tick_rule(tick).fillna(method = 'ffill')
            temp['tick_rule'][0] = 1
            temp['dollar_tick_rule'] = temp['tick_rule'] * temp['Volume'] * temp['Close']
            temp['dollar_imbalance'] = temp['dollar_tick_rule'].cumsum()
#             temp['expected_tick_imbalance'] = initial_expected_bar_size * initial_expected_tick_rule
            expected_dollar_imbalance = initial_expected_bar_size * abs(initial_expected_dollar_rule)
            result = temp[abs(temp['dollar_imbalance']) - expected_dollar_imbalance >=0]
            T_star = result.index[0]
            bar_sample = temp.loc[:T_star, :]

            bar_dict[i] = bar_sample
            i += 1
            df = df.loc[T_star + datetime.timedelta(days = 1):, :]

        else:
            temp = df
            temp['T'] = np.arange(len(temp)) + 1
            temp['tick_rule'] = tick_rule(tick).fillna(method = 'ffill')
            temp['tick_rule'][0] = bar_dict[i-1]['T'][-1]
            temp['dollar_tick_rule'] = temp['tick_rule'] * temp['Volume'] * temp['Close']
            temp['dollar_imbalance'] = temp['dollar_tick_rule'].cumsum()
            estimated_E_T = bar_dict[i-1].ewm(com = 0.5).mean()['T'][-1]
            estimated_b_t = bar_dict[i-1].ewm(com = 0.5).mean()['dollar_tick_rule'][-1]
            temp['expected_dollar_imbalance'] = estimated_E_T * abs(estimated_b_t)
            result = temp[abs(temp['dollar_imbalance']) - temp['expected_dollar_imbalance'] >=0]
            if len(result) != 0:
                T_star = result.index[0]
                bar_sample = temp.loc[:T_star, :]

                bar_dict[i] = bar_sample
                i += 1
                df = df.loc[T_star + datetime.timedelta(days = 1):, :]
            else: 
                bar_dict[i] = temp
                break

    return bar_dict

In [183]:
result_bar3 = dolloar_imbalance_bar(tick)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a s

In [184]:
len(result_bar3)

8

In [176]:
result_bar2 = volume_imbalance_bar(tick)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

In [179]:
len(result_bar)

3

In [171]:
result_bar = tick_imbalance_bar(tick)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

#### tick data가 아니라서 결과값이 이상한건가...?

In [172]:
result_bar

{0:                 Volume    Close    T  tick_rule  tick_imbalance
 Date                                                           
 2010-01-04  11950800.0  16180.0    1        1.0             1.0
 2010-01-05  27925850.0  16440.0    2        1.0             2.0
 2010-01-06  22948850.0  16820.0    3        1.0             3.0
 2010-01-07  22107950.0  16260.0    4       -1.0             2.0
 2010-01-08  14777550.0  16420.0    5        1.0             3.0
 ...                ...      ...  ...        ...             ...
 2011-02-23  17193000.0  18600.0  286       -1.0            -6.0
 2011-02-24  18110900.0  18580.0  287       -1.0            -7.0
 2011-02-25  15181200.0  18520.0  288       -1.0            -8.0
 2011-02-28  16274800.0  18460.0  289       -1.0            -9.0
 2011-03-02  16134300.0  18360.0  290       -1.0           -10.0
 
 [290 rows x 5 columns],
 1:                 Volume    Close  T  tick_rule  tick_imbalance  \
 Date                                                   

----------------------------------------------
----------------------------------------------
## PCA WEIGHT

In [132]:
mean = np.array([2,1,0.5,0.3,0])
cov = np.array([[0.4, 0.1, 0.2, 0.1,0.3], 
                [0.1, 0.3, 0.1, 0.2, 0],
               [0.2, 0.1, 0.5, 0.2, 0.3], 
               [0.1, 0.2, 0.2, 0.5, 0.1], 
               [0.3, 0, 0.3, 0.1, 0.4]])
sample = np.random.multivariate_normal(mean, cov, 10)
sample

array([[ 2.33360058,  1.30346419,  0.9279701 ,  1.13270384,  0.61277748],
       [ 1.875077  ,  1.78293961,  0.44652488,  0.59788315, -0.41803078],
       [ 2.84798468,  0.88464159,  1.2388578 ,  1.62589207,  0.3690929 ],
       [ 1.24029052,  0.46458092,  0.15556368,  0.02439971, -0.71871668],
       [ 0.97147216,  1.82503741,  0.47105828,  0.52636513, -1.62616976],
       [ 2.56565922,  0.626323  ,  1.55148172, -0.37853266,  0.87747592],
       [ 2.15411933,  1.08852034, -0.01088963, -0.0216187 , -0.43259115],
       [ 1.21606862,  0.10811402,  0.92980864,  0.90699082,  0.14770369],
       [ 1.49232625,  0.89374734,  1.09589781,  0.74267008,  0.29757879],
       [ 0.58099691,  1.808902  , -1.29322928, -0.02880777, -1.75183923]])

In [139]:
cov2 = np.zeros((5,5))
cov2_values = np.array([0.1,0.2,0.3,0.4,0.5])
np.fill_diagonal(cov2, cov2_values)
cov2

array([[0.1, 0. , 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. ],
       [0. , 0. , 0.3, 0. , 0. ],
       [0. , 0. , 0. , 0.4, 0. ],
       [0. , 0. , 0. , 0. , 0.5]])

In [140]:
np.diag(cov2)

array([0.1, 0.2, 0.3, 0.4, 0.5])

In [141]:
sample2 = np.random.multivariate_normal(mean, cov2, 10)
sample2

array([[ 1.55622132,  0.86247847,  0.86975233, -0.42104822,  0.31191359],
       [ 1.65240761,  1.0845652 ,  0.72339042, -0.39501548,  0.2304178 ],
       [ 2.22923407,  1.13902598,  0.45189571,  1.41294848,  1.38056582],
       [ 2.34553551,  1.46595232, -0.26814678,  0.19373503, -1.93987113],
       [ 2.1650914 ,  1.37329198,  0.37766928,  0.14529139, -0.77069529],
       [ 1.96641561,  0.92012585,  1.2270505 , -0.28456116, -0.24204928],
       [ 2.24793998,  1.68197258,  1.09934959, -0.14681363,  0.65092632],
       [ 1.74970146,  1.30189429,  0.08753593,  0.86869902, -0.49773811],
       [ 1.68365273,  0.78261956,  1.31456432,  0.8013142 ,  0.42967353],
       [ 2.54052651,  0.62975484,  0.9025042 ,  0.60968155,  0.71826796]])

In [162]:
# 대각원소일 떄 - 공분산이 존재하지 않을 때
def inverse_variance_weight(cov):
    weights = (1 / np.diag(cov)) / sum(1 / np.diag(cov))
    return weights

In [157]:
inverse_variance_cov = inverse_variance_weight(cov).T @ cov2 @ inverse_variance_weight(cov)
inverse_variance_cov

8.772222222222222

In [163]:
inverse_variance_weight(cov)

array([0.2027027 , 0.27027027, 0.16216216, 0.16216216, 0.2027027 ])

In [164]:
# risk contribtuion
def RC(weight, covmat) :
    weight = np.array(weight)
    variance = weight.T @ covmat @ weight
    sigma = variance ** 0.5
    mrc = 1/sigma * (covmat @ weight)
    rc = weight * mrc
    rc = rc / rc.sum()
    return(rc)

In [165]:
RC(inverse_variance_weight(cov), cov)

array([0.2210324 , 0.20318506, 0.19659528, 0.16913784, 0.21004942])

In [161]:
RC(pcaWeights(cov), cov)

array([[0.27110515],
       [0.14601499],
       [0.08000791],
       [0.00706984],
       [0.49580212]])

In [148]:
# 포트폴리오의 총 변동성을 정했을 때 각 포트폴리오의 구성 비율을 구해줌
def pcaWeights(cov, riskDist = None, riskTarget = 1.):   # riskDist - 각 component들이 전체 주어진 변동성에서 차지하는 비율, riskTarget - 포트폴리오의 총 변동성 
    eVal, eVec = np.linalg.eigh(cov)
    indices = eVal.argsort()[::-1] # eigen value의 내림차순 정렬 인덱스
    eVal, eVec = eVal[indices], eVec[:, indices] # eigen value의 내림차순 정렬 
    if riskDist is None:
        riskDist = np.zeros(cov.shape[0])
        riskDist[-1] = 1
    loads = riskTarget * (riskDist / eVal) ** 0.5  #### beta
    wghts = np.dot(eVec, np.reshape(loads, (-1, 1)))
    
    return wghts

In [149]:
pcaWeights(cov2, riskDist=np.array([0.2,0.3,0.1,0.2,0.2]), riskTarget=1)

array([[1.41421356],
       [1.        ],
       [0.57735027],
       [0.8660254 ],
       [0.63245553]])

In [152]:
pcaWeights(cov, riskDist=np.array([0.2,0.3,0.1,0.2,0.2]), riskTarget=1)

array([[-0.57672942],
       [-0.06011628],
       [-1.62732812],
       [-0.35776057],
       [ 1.88046691]])

----------------------------------------
----------------------------------------

## CUMSUM Filter

In [11]:
def getTevents(gRaw, h):
    tEvents = []
    sPos = 0
    sNeg = 0
    diff = gRaw.diff()
    for i in diff.index[1:]:
        sPos = max(0, sPos + diff.loc[i])
        sNeg = min(0, sNeg + diff.loc[i])
        if sNeg < -h:
            sNeg = 0
            tEvents.append(i)
        elif sPos > h:
            sPos = 0
            tEvents.append(i)
    return pd.DatetimeIndex(tEvents)

In [18]:
getTevents(tick['Close'], 20000)

DatetimeIndex(['2016-08-23', '2017-07-12', '2020-01-09', '2020-11-13'], dtype='datetime64[ns]', freq=None)