In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

## External DB
### 1) **Simulatable** instruments
- Filtering conditions:

- e.g.: `itemtype` may be ETF, stock, etc.

In [2]:
# Instrument-type label written to the `itemtype` column of every external price table built below.
itemtype = 'ETF'

#### Load external data formatted as plain files, and then pickle them.
- We <span class="mark">drop</span> any instrument if:
  - it has fewer than `min_datapoints` data points, i.e. <span class="mark">less than 3 years of history (plus one extra record for the return calculation).</span>

In [3]:
# Base names of the wide price/volume tables: one file per (measure, frequency) pair.
data_files = ['price_d', 'volume_d', 'price_w', 'volume_w', 'price_m', 'volume_m']
filepath = '../../data/external/'
df = {}
for filename in data_files:
    # The column names sit on the 8th line of each file (header=7).
    raw = pd.read_csv(filepath + filename + '.dat', header=7)
    # Skip the six rows directly under the header (presumably vendor metadata -- TODO confirm)
    # by position, and rename the first column to `date`.
    table = raw.iloc[6:].rename(columns={'Code': 'date'})
    table = table.assign(date=pd.to_datetime(table['date']))
    # Every column after `date` holds one instrument's series.
    name_cols = table.columns[1:]
    # Convert column by column (not row by row): same values, far fewer Python-level calls.
    # Unparseable cells become NaN, and the explicit float64 cast keeps every instrument
    # column on the same dtype regardless of where missing values occur.
    table[name_cols] = table[name_cols].apply(pd.to_numeric, errors='coerce').astype('float64')
    df[filename] = table


#### Merge data sets and melt them into one tall and thin data frame.
- `df_db[freq]` is your final data frame.
  - `freq` = {d|w|m} for different frequencies.
- df_db[freq].`trading_amt_mln` is a three-month average trading amount.

In [4]:
# Per-frequency containers: tall price table, tall volume table, and the merged final table.
df_pr = {}
df_vol = {}
df_db = {}

frequency = ['d', 'w', 'm']
# Minimum number of non-missing observations an instrument needs in order to be kept.
# We need an extra 1 record for return calculation.
min_datapoints = {'d': 365*3+1, 'w': 52*3+1, 'm': 12*3+1}
# Rolling-window length, in rows, used for the average trading amount.
window_3m = {'d': 90, 'w': 12, 'm': 3}

for freq in frequency:
    # Keep instruments with enough observations, then keep only the dates on which
    # every remaining instrument has a value.
    pr = df['price_'+freq].dropna(thresh=min_datapoints[freq], axis=1).dropna()
    vol = df['volume_'+freq].dropna(thresh=min_datapoints[freq], axis=1).dropna()

    # Wide (one column per instrument) -> tall (one row per date/instrument).
    df_pr[freq] = pd.melt(pr, id_vars=['date'], var_name='itemcode', value_name='price')
    df_vol[freq] = pd.melt(vol, id_vars=['date'], var_name='itemcode', value_name='volume')
    merged = pd.merge(df_pr[freq], df_vol[freq], on=['date', 'itemcode'], how='outer')

    # Trading amount in millions, averaged over the rolling window *within each instrument*.
    # Rolling over the whole tall frame would let one instrument's first rows average in the
    # previous instrument's amounts.
    # NOTE(review): assumes each instrument's rows are in chronological order -- confirm.
    amount_mln = (merged.price * merged.volume).divide(10**6)
    trading_amt_mln = amount_mln.groupby(merged.itemcode, sort=False).transform(
        lambda s: s.rolling(window_3m[freq], min_periods=1).mean()
    )

    # Log return per instrument; the first record of each instrument is NaN.
    ret = np.log(1 + merged.groupby('itemcode').price.pct_change())

    df_db[freq] = merged.assign(trading_amt_mln=trading_amt_mln, ret=ret, itemtype=itemtype)

### 2) **Non-tradable** instruments
- e.g.: rates

In [5]:
# Load the weekly non-tradable series, tag it as the risk-free 'CALL' instrument,
# and expose its `call` column under the common `price` name.
ecos_path = filepath + 'ecos_w' + '.dat'
df['ecos_w'] = (
    pd.read_csv(ecos_path, header=3, parse_dates=['date'])
    .assign(itemcode='CALL', itemtype='riskfree')
    .rename(columns={'call': 'price'})
)

In [6]:
# Append the non-tradable series to the weekly table.
# Rows that already carry the appended series' itemtype are removed first, so re-running
# this cell does not stack duplicate copies (the first run is unaffected).
weekly_without_ecos = df_db['w'][~df_db['w'].itemtype.isin(df['ecos_w'].itemtype.unique())]
df_db['w'] = pd.concat([weekly_without_ecos, df['ecos_w']])

## Pickle final results

`price_db_#.pkl`

In [7]:
# Persist each frequency's table as price_db_<freq>.pkl under `filepath`.
for freq in frequency:
    pickle_target = filepath + 'price_db_' + freq + '.pkl'
    df_db[freq].to_pickle(pickle_target)

`simulatable_instruments.pkl`

In [8]:
# Simulatable instruments are the weekly rows stamped with the configured `itemtype`
# (rather than a hard-coded 'ETF'), so changing `itemtype` keeps this selection in sync.
df_simulatables = df_db['w'][df_db['w'].itemtype == itemtype]
# Keep one row per instrument: its record on the most recent date in the weekly table.
latest_date = df_simulatables.date.max()
simulatable_instruments = (
    df_simulatables.loc[df_simulatables.date == latest_date]
    .drop(['date', 'ret'], axis=1)
    .reset_index(drop=True)
)
simulatable_instruments.to_pickle(filepath + 'simulatable_instruments.pkl')

In [9]:
# Render the final table of every frequency, in the order listed in `frequency`.
display(*[df_db[freq] for freq in frequency])

Unnamed: 0,date,itemcode,price,volume,trading_amt_mln,ret,itemtype
0,2016-08-17,A069500,23459.0,2999724.0,70370.525316,,ETF
1,2016-08-18,A069500,23629.0,3610366.0,77839.931765,0.007221,ETF
2,2016-08-19,A069500,23684.0,3608250.0,80379.218843,0.002325,ETF
3,2016-08-22,A069500,23546.0,2882931.0,77254.787464,-0.005844,ETF
4,2016-08-23,A069500,23670.0,1943488.0,71004.302163,0.005252,ETF
...,...,...,...,...,...,...,...
24613,2021-03-04,A245340,17685.0,18258.0,385.770184,-0.005639,ETF
24614,2021-03-05,A245340,17520.0,13537.0,386.990621,-0.009374,ETF
24615,2021-03-08,A245340,17995.0,40753.0,394.363003,0.026751,ETF
24616,2021-03-09,A245340,18355.0,52565.0,403.612825,0.019808,ETF


Unnamed: 0,date,itemcode,price,volume,trading_amt_mln,ret,itemtype
0,2017-12-15,A069500,30640.0000,8519096.0,261025.101440,,ETF
1,2017-12-22,A069500,30152.0000,8482482.0,258394.449352,-0.016055,ETF
2,2017-12-28,A069500,30762.0000,7560958.0,249793.029567,0.020029,ETF
3,2018-01-05,A069500,31203.0000,7465878.0,245584.219983,0.014234,ETF
4,2018-01-12,A069500,30982.0000,5001713.0,227459.990420,-0.007108,ETF
...,...,...,...,...,...,...,...
736,2021-02-10,CALL,0.0049,,,,riskfree
737,2021-02-19,CALL,0.0051,,,,riskfree
738,2021-02-26,CALL,0.0059,,,,riskfree
739,2021-03-05,CALL,0.0047,,,,riskfree


Unnamed: 0,date,itemcode,price,volume,trading_amt_mln,ret,itemtype
0,2017-12-28,A069500,30762.0,7560958.0,232590.189996,,ETF
1,2018-01-31,A069500,31630.0,18955268.0,416072.658418,0.027826,ETF
2,2018-02-28,A069500,29739.0,7129364.0,348055.157611,-0.061647,ETF
3,2018-03-30,A069500,29992.0,8620284.0,356704.946855,0.008471,ETF
4,2018-04-30,A069500,30750.0,4494562.0,202922.498408,0.024959,ETF
...,...,...,...,...,...,...,...
1009,2020-10-30,A245340,14841.0,20089.0,565.419004,-0.085022,ETF
1010,2020-11-30,A245340,16414.0,20275.0,279.354700,0.100741,ETF
1011,2020-12-30,A245340,16615.0,29841.0,375.580971,0.012171,ETF
1012,2021-01-29,A245340,17005.0,24363.0,414.298293,0.023202,ETF


In [11]:
# Show the simulatable-instrument snapshot that was pickled above.
display(simulatable_instruments)

Unnamed: 0,itemcode,price,volume,trading_amt_mln,itemtype
0,A069500,41870.0,10363864.0,450745.579929,ETF
1,A105190,42090.0,298303.0,17458.762105,ETF
2,A102110,41680.0,1024274.0,66844.032313,ETF
3,A232080,14065.0,325605.0,10167.063267,ETF
4,A278540,13455.0,340637.0,7333.746462,ETF
5,A229200,14025.0,3158681.0,84901.170569,ETF
6,A196230,104320.0,795740.0,69645.756634,ETF
7,A130730,100975.0,910079.0,56617.558903,ETF
8,A122260,101175.0,79699.0,44600.656765,ETF
9,A139260,39010.0,290847.0,34582.868955,ETF
