In [1]:
import torch , h5py
import numpy as np
import pandas as pd
import os, shutil , gc , time , argparse
from scripts.data_utils.ModelData import save_block_data
from scripts.functional.func import *
from scripts.util.environ import get_logger , get_config , DIR_data

# Per-frequency integer constants keyed by frequency name
# (presumably the number of bars per trading day -- TODO confirm).
NBARS      = {'day' : 1 , '15m' : 16 ,}
# Data-type table read from the project configuration.
DATATYPE   = get_config('data_type')['DATATYPE']

logger = get_logger()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='manual to this script')
    parser.add_argument("--confirm", type=str, default='')
    # parse_known_args() instead of parse_args(): inside a Jupyter kernel
    # sys.argv carries the kernel's own options (e.g. `-f <connection file>`),
    # which parse_args() rejects by raising SystemExit.
    cli_args , _unknown_args = parser.parse_known_args()
    # `--confirm no` skips the update entirely; anything else asks interactively.
    if cli_args.confirm != 'no':
        a = input('You want to update data? print "yes" to confirm!')
        if a == 'yes':
            t1 = time.time()
            logger.critical('Data loading start!')

            #update_trading_data()
            #prepare_model_data()
            #cal_norm_param()
            
            t2 = time.time()
            logger.critical('Data loading Finished! Cost {:.2f} Seconds'.format(t2-t1))


In [91]:
import numpy as np
# Reset the three scratch names to None and echo them as the cell result.
secid , date , ndim = None , None , None
(secid , date , ndim)

(None, None, None)

In [1]:
import torch , h5py
import numpy as np
import pandas as pd
import os, shutil , gc , time , argparse
from scripts.data_utils.ModelData import (
     DataBlock)
from scripts.data_utils.DataTank import DataTank
from scripts.functional.func import *
from scripts.util.environ import get_logger , get_config , DIR_data
from scripts.util.basic import timer

# Date window forwarded as 'start_dt' / 'end_dt' to every data source below.
# Example of a short window: 20150101 , 20150331
start_dt , end_dt = None , None

# One entry per block to build. Each entry is expanded as keyword arguments
# by the cells that read, process and normalise the blocks.
process_param = dict(
    y = dict(
        dtank_h5   = 'DB_labels.h5' ,
        dtank_file = ['10days/lag1' , '20days/lag1'] ,
        start_dt   = start_dt ,
        end_dt     = end_dt ,
    ) ,
    trade_day = dict(
        dtank_h5       = 'DB_trade_day.h5' ,
        dtank_file     = 'day/trade' ,
        feature        = ['adjfactor', 'close', 'high', 'low', 'open', 'volume', 'vwap'] ,
        process_method = 'adj_order' ,
        start_dt       = start_dt ,
        end_dt         = end_dt ,
    ) ,
    trade_15m = dict(
        dtank_h5       = 'DB_trade_Xmin.h5' ,
        dtank_file     = '15min/trade' ,
        feature        = ['minute' , 'close', 'high', 'low', 'open', 'volume', 'vwap'] ,
        process_method = 'order' ,
        start_dt       = start_dt ,
        end_dt         = end_dt ,
    ) ,
    # 'gp' : {}
)
# Accumulates {key: {'avg': ..., 'std': ...}} entries produced by block_hist_norm.
x_trade_norm_dict = {}
# Output locations: block folder under the data directory, and the file the
# normalisation dict is saved to.
DIR_block  = '{}/block_data'.format(DIR_data)
path_xnorm = '{}/X_normdict.pt'.format(DIR_block)
logger = get_logger()

def block_default_path(key):
    """
    Return the default .npz path of a block inside DIR_block.

    The label block (key equal to 'y', case-insensitive) maps to 'Y.npz';
    every other key maps to 'X_<key>.npz' with the key used verbatim.
    """
    stem = 'Y' if key.lower() == 'y' else f'X_{key}'
    return f'{DIR_block}/{stem}.npz'
    
def block_process(data_block , process_method = 'default' , feature = None , **kwargs):
    """
    Post-process a DataBlock (price adjustment and/or feature re-ordering) and return it.

    Parameters
    ----------
    data_block : DataBlock
        Block whose last axis of `values` is indexed by `feature`.
        `values` may be modified in place by the adjustment step.
    process_method : str
        Matched by substring; 'default' is treated as 'order'.
          * contains 'adj'   : if the block has an 'adjfactor' feature, multiply the
                               price features ('close','high','low','open','vwap')
                               by it, then drop the 'adjfactor' column.
          * contains 'order' : re-order the feature axis so that the names listed in
                               `feature` come first, in that order; features of the
                               block not listed keep their current relative order
                               and follow after.
    feature : sequence of str, optional
        Desired leading feature order. Names absent from the block are ignored,
        repeated names are used once. None or empty leaves the order unchanged.
    **kwargs
        Ignored; lets callers pass a whole parameter dict.

    Returns
    -------
    DataBlock
        The same `data_block` object after `update(...)` calls.

    Raises
    ------
    AssertionError
        If `data_block` is not a DataBlock.
    """
    assert isinstance(data_block , DataBlock) , type(data_block)
    if process_method == 'default':
        process_method = 'order'
    wanted = [] if feature is None else list(feature)

    # errstate restores the caller's floating-point error settings on exit,
    # including when an exception is raised (NaN * x would otherwise warn).
    with np.errstate(invalid = 'ignore'):
        if 'adj' in process_method and 'adjfactor' in data_block.feature:
            price_feat = np.intersect1d(['close', 'high', 'low', 'open', 'vwap'] , data_block.feature)
            ifeat = np.where(np.isin(data_block.feature , price_feat))[0]
            iadj  = np.where(data_block.feature == 'adjfactor')[0]
            # iadj keeps a length-1 last axis so it broadcasts over the price columns
            data_block.values[...,ifeat] = np.multiply(data_block.values[...,ifeat] , data_block.values[...,iadj])
            ifeat = np.where(data_block.feature != 'adjfactor')[0]
            data_block.update(values = data_block.values[...,ifeat] , feature = data_block.feature[ifeat])

        if 'order' in process_method:
            current   = list(data_block.feature)
            # requested names first (deduplicated, only those present), then the rest
            new_order = list(dict.fromkeys(f for f in wanted if f in current))
            new_order += [f for f in current if f not in new_order]
            # Gather index: position in the CURRENT layout of each feature of the
            # NEW layout, so that feature[ifeat] == new_order.
            ifeat = np.array([current.index(f) for f in new_order] , dtype = int)
            data_block.update(values = data_block.values[...,ifeat] , feature = data_block.feature[ifeat])
    return data_block

def block_hist_norm(data_block , key , norm_dict = None , 
                    start_dt = None , end_dt = 20161231 , 
                    step_day = 5 , eps = 1e-4 , **kwargs):
    """
    Compute per-bar historical mean / std of a block's values relative to the
    last bar of each sampled end date.

    `data_block.values` is used with layout (secid, date, inday, feature), as
    implied by the indexing below. End dates are sampled every `step_day` dates
    (counting back from the last kept date). For each end date a window of
    `maxday` dates ending there is divided by the last inday bar of the end date
    (both shifted by `eps`). Samples with any NaN in the window, or a negative
    divisor, are excluded.

    Parameters
    ----------
    data_block : object with `values`, `secid`, `date`, `feature`
    key        : block name; only keys starting with one of
                 'trade','day','15m','min','30m','60m' are normalised.
                 Window length is 120 for 'trade_day', 20 for 'trade_min', else 60.
    norm_dict  : optional dict updated in place with the result
    start_dt , end_dt : inclusive date bounds; None disables that bound
    step_day   : spacing (in dates) between sampled end dates
    eps        : added to numerator and denominator of the ratio
    **kwargs   : ignored

    Returns
    -------
    `norm_dict` unchanged when `key` is not normalised; otherwise
    {key: {'avg': tensor, 'std': tensor}} (merged into `norm_dict` when given),
    each tensor of shape (maxday * inday, n_feature).
    """
    normalised_prefixes = ('trade','day','15m','min','30m','60m')
    if not key.startswith(normalised_prefixes):
        return norm_dict

    lookback_by_key = {'trade_day' : 120 , 'trade_min' : 20 , 'others' : 60}
    maxday = lookback_by_key.get(key , lookback_by_key['others'])

    # boolean mask of dates inside [start_dt , end_dt]
    all_dates = data_block.date
    keep = np.ones(len(all_dates) , dtype = bool)
    if start_dt is not None:
        keep &= ~(all_dates < start_dt)
    if end_dt is not None:
        keep &= ~(all_dates > end_dt)

    raw    = data_block.values[:,keep]
    n_sec  = len(data_block.secid)
    n_feat = len(data_block.feature)
    n_date = len(all_dates[keep])
    inday  = raw.shape[2]
    len_step = n_date // step_day

    # prepend `maxday` NaN dates so windows reaching before the data are flagged
    nan_pad = np.full((n_sec , maxday , *raw.shape[2:]) , np.nan)
    x = torch.tensor(np.concatenate([nan_pad , raw] , axis = 1))

    last_idx   = x.shape[1] - 1
    x_endpoint = last_idx + step_day * np.arange(-len_step + 1 , 1)

    # divisor: last inday bar of each end date
    x_div = torch.ones(n_sec , len_step , 1 , n_feat)
    x_div.copy_(x[:,x_endpoint,-1:])

    # a (secid , end date) sample is invalid if its divisor is negative
    # or any value in its window is NaN
    invalid = (x_div < 0).sum(dim = (-2,-1)) > 0
    for lag in range(maxday):
        invalid |= x[:,x_endpoint - lag].reshape(n_sec , len_step , -1).isnan().any(dim = -1)
    valid = invalid == 0

    avg_x = torch.zeros(maxday * inday , n_feat)
    std_x = torch.zeros(maxday * inday , n_feat)
    for i in range(maxday):
        window = x[:,x_endpoint - maxday + 1 + i]
        ratio  = ((window + eps) / (x_div + eps))[valid]
        rows   = slice(i * inday , (i + 1) * inday)
        avg_x[rows] = ratio.mean(dim = 0)
        std_x[rows] = ratio.std(dim = 0)

    # assert avg_x.isnan().sum() + std_x.isnan().sum() == 0 , ((nan_sample == 0).sum())

    stats = {key : {'avg' : avg_x , 'std' : std_x}}
    if norm_dict is None:
        return stats
    norm_dict.update(stats)
    return norm_dict


In [2]:
# Build, save and normalise one block per entry of process_param, then
# persist the accumulated normalisation dict.
for key , param in process_param.items():
    logger.error(f'Preparing {key} data...')

    # a single path is wrapped so the read loop below always iterates a list
    if isinstance(param['dtank_file'] , str): param['dtank_file'] = [param['dtank_file']]
    blocks = []
    dtank  = DataTank('/'.join([DIR_data,param['dtank_h5']]) , open = True , mode = 'r')
    try:
        for f in param['dtank_file']:
            with timer(f'{key} reading {f} Data1D\'s'):
                blocks.append(DataBlock().from_dtank(dtank,f,**param))
    finally:
        # close the tank even when a read fails, so the file handle is not leaked
        dtank.close()
    with timer(f'{key} merging blocks'):
        new_block = DataBlock().merge_others(blocks)
    with timer(f'{key} process blocks'):
        new_block = block_process(new_block , **param)
    with timer(f'{key} savenpz blocks'):
        new_block.save_npz(block_default_path(key))
    with timer(f'norming {key} blocks'):
        # updates x_trade_norm_dict in place (no-op for keys that are not normalised)
        block_hist_norm(new_block , key , x_trade_norm_dict , **param)
torch.save(x_trade_norm_dict , path_xnorm)

[1m[37m[45m23-12-21 23:07:19|MOD:1789959765  |[0m: [1m[35mPreparing y data...[0m


y reading 10days/lag1 Data1D's ...... cost 0.18 secs
y reading 20days/lag1 Data1D's ...... cost 0.20 secs
y merging blocks ...... cost 0.02 secs
y process blocks ...... cost 0.00 secs
y savenpz blocks ...

[1m[37m[45m23-12-21 23:07:19|MOD:1789959765  |[0m: [1m[35mPreparing trade_day data...[0m


... cost 0.21 secs
norming y blocks ...... cost 0.00 secs
trade_day reading day/trade Data1D's ...... cost 0.35 secs
trade_day merging blocks ...... cost 0.00 secs
trade_day process blocks ...... cost 0.02 secs
trade_day savenpz blocks ...

[1m[37m[45m23-12-21 23:07:20|MOD:1789959765  |[0m: [1m[35mPreparing trade_15m data...[0m


... cost 0.30 secs
norming trade_day blocks ...... cost 0.14 secs
trade_15m reading 15min/trade Data1D's ...... cost 10.52 secs
trade_15m merging blocks ...... cost 0.00 secs
trade_15m process blocks ...... cost 0.08 secs
trade_15m savenpz blocks ...... cost 1.47 secs
norming trade_15m blocks ...... cost 1.20 secs


In [8]:
# Single-key run of the block preparation steps for the daily trade data.
key = 'trade_day'
param = process_param[key]
logger.error(f'Preparing {key} data...')

# a single path is wrapped so the read loop below always iterates a list
if isinstance(param['dtank_file'] , str): param['dtank_file'] = [param['dtank_file']]
blocks = []
dtank  = DataTank('/'.join([DIR_data,param['dtank_h5']]) , open = True , mode = 'r')
try:
    for f in param['dtank_file']:
        with timer(f'{key} reading {f} Data1D\'s'):
            blocks.append(DataBlock().from_dtank(dtank,f,**param))
finally:
    # close the tank even when a read fails, so the file handle is not leaked
    dtank.close()
with timer(f'{key} merging blocks'):
    new_block = DataBlock().merge_others(blocks)
with timer(f'{key} process blocks'):
    new_block = block_process(new_block , **param)
with timer(f'{key} savenpz blocks'):
    new_block.save_npz(block_default_path(key))
with timer(f'norming {key} blocks'):
    # updates x_trade_norm_dict in place
    block_hist_norm(new_block , key , x_trade_norm_dict , **param)


[1m[37m[45m23-12-21 23:06:03|MOD:3381047532  |[0m: [1m[35mPreparing trade_day data...[0m


trade_day reading day/trade Data1D's ... , cost 0.61 secs
trade_day merging blocks ... , cost 0.00 secs
trade_day process blocks ... , cost 0.01 secs
trade_day savenpz blocks ... , cost 0.29 secs
norming trade_day blocks ... , cost 0.13 secs


In [9]:
# Single-key run of the block preparation steps for the 15-minute trade data.
key = 'trade_15m'
param = process_param[key]
logger.error(f'Preparing {key} data...')

# a single path is wrapped so the read loop below always iterates a list
if isinstance(param['dtank_file'] , str): param['dtank_file'] = [param['dtank_file']]
blocks = []
dtank  = DataTank('/'.join([DIR_data,param['dtank_h5']]) , open = True , mode = 'r')
try:
    for f in param['dtank_file']:
        with timer(f'{key} reading {f} Data1D\'s'):
            blocks.append(DataBlock().from_dtank(dtank,f,**param))
finally:
    # close the tank even when a read fails, so the file handle is not leaked
    dtank.close()
with timer(f'{key} merging blocks'):
    new_block = DataBlock().merge_others(blocks)
with timer(f'{key} process blocks'):
    new_block = block_process(new_block , **param)
with timer(f'{key} savenpz blocks'):
    new_block.save_npz(block_default_path(key))
with timer(f'norming {key} blocks'):
    # updates x_trade_norm_dict in place
    block_hist_norm(new_block , key , x_trade_norm_dict , **param)

[1m[37m[45m23-12-21 23:06:13|MOD:4182274638  |[0m: [1m[35mPreparing trade_15m data...[0m


trade_15m reading 15min/trade Data1D's ... , cost 10.52 secs
trade_15m merging blocks ... , cost 0.00 secs
trade_15m process blocks ... , cost 0.10 secs
trade_15m savenpz blocks ... , cost 1.47 secs
norming trade_15m blocks ... , cost 1.22 secs


In [6]:
def block_hist_norm(data_block , key , norm_dict = None , 
                    start_dt = None , end_dt = 20161231 , 
                    step_day = 5 , eps = 1e-4 , **kwargs):
    """
    Compute per-bar historical mean / std of a block's values relative to the
    last bar of each sampled end date, and assert the result contains no NaN.

    NOTE: this redefines the function of the same name from an earlier cell;
    the difference is that the NaN assertion is active here.

    `data_block.values` is used with layout (secid, date, inday, feature), as
    implied by the indexing below. End dates are sampled every `step_day` dates
    (counting back from the last kept date). For each end date a window of
    `maxday` dates ending there is divided by the last inday bar of the end date
    (both shifted by `eps`). Samples with any NaN in the window, or a negative
    divisor, are excluded.

    Parameters
    ----------
    data_block : object with `values`, `secid`, `date`, `feature`
    key        : block name; only keys starting with one of
                 'trade','day','15m','min','30m','60m' are normalised.
                 Window length is 120 for 'trade_day', 20 for 'trade_min', else 60.
    norm_dict  : optional dict updated in place with the result
    start_dt , end_dt : inclusive date bounds; None disables that bound
    step_day   : spacing (in dates) between sampled end dates
    eps        : added to numerator and denominator of the ratio
    **kwargs   : ignored

    Returns
    -------
    `norm_dict` unchanged when `key` is not normalised; otherwise
    {key: {'avg': tensor, 'std': tensor}} (merged into `norm_dict` when given),
    each tensor of shape (maxday * inday, n_feature).

    Raises
    ------
    AssertionError
        If any mean or std is NaN; the message is the number of valid samples.
    """
    normalised_prefixes = ('trade','day','15m','min','30m','60m')
    if not key.startswith(normalised_prefixes):
        return norm_dict

    lookback_by_key = {'trade_day' : 120 , 'trade_min' : 20 , 'others' : 60}
    maxday = lookback_by_key.get(key , lookback_by_key['others'])

    # boolean mask of dates inside [start_dt , end_dt]
    all_dates = data_block.date
    keep = np.ones(len(all_dates) , dtype = bool)
    if start_dt is not None:
        keep &= ~(all_dates < start_dt)
    if end_dt is not None:
        keep &= ~(all_dates > end_dt)

    raw    = data_block.values[:,keep]
    n_sec  = len(data_block.secid)
    n_feat = len(data_block.feature)
    n_date = len(all_dates[keep])
    inday  = raw.shape[2]
    len_step = n_date // step_day

    # prepend `maxday` NaN dates so windows reaching before the data are flagged
    nan_pad = np.full((n_sec , maxday , *raw.shape[2:]) , np.nan)
    x = torch.tensor(np.concatenate([nan_pad , raw] , axis = 1))

    last_idx   = x.shape[1] - 1
    x_endpoint = last_idx + step_day * np.arange(-len_step + 1 , 1)

    # divisor: last inday bar of each end date
    x_div = torch.ones(n_sec , len_step , 1 , n_feat)
    x_div.copy_(x[:,x_endpoint,-1:])

    # a (secid , end date) sample is invalid if its divisor is negative
    # or any value in its window is NaN
    invalid = (x_div < 0).sum(dim = (-2,-1)) > 0
    for lag in range(maxday):
        invalid |= x[:,x_endpoint - lag].reshape(n_sec , len_step , -1).isnan().any(dim = -1)
    valid = invalid == 0

    avg_x = torch.zeros(maxday * inday , n_feat)
    std_x = torch.zeros(maxday * inday , n_feat)
    for i in range(maxday):
        window = x[:,x_endpoint - maxday + 1 + i]
        ratio  = ((window + eps) / (x_div + eps))[valid]
        rows   = slice(i * inday , (i + 1) * inday)
        avg_x[rows] = ratio.mean(dim = 0)
        std_x[rows] = ratio.std(dim = 0)

    # fails when no (or too few) valid samples were left to average over
    assert avg_x.isnan().sum() + std_x.isnan().sum() == 0 , (valid.sum())

    stats = {key : {'avg' : avg_x , 'std' : std_x}}
    if norm_dict is None:
        return stats
    norm_dict.update(stats)
    return norm_dict
    
# Re-run the normalisation for the block prepared in the preceding cell.
# Uses x_trade_norm_dict, the accumulator defined with the other globals
# (the former name `xnorm_dict` is not defined anywhere in this notebook).
block_hist_norm(new_block , key , x_trade_norm_dict , **param)

torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size([2679, 11])
torch.Size(

AssertionError: tensor(0)

In [14]:
# Open the tank file named by the current `param` read-only.
dtank = DataTank(f"{DIR_data}/{param['dtank_h5']}" , open = True , mode = 'r')

'DataUpdater1.h5'

In [15]:
# Open the tank read-only, look up the '15min/trade' object and read the
# Data1D stored under date 20100104.
dtank = DataTank(f"{DIR_data}/{param['dtank_h5']}" , open = True , mode = 'r')
file = ['15min' , 'trade']
g = dtank.get_object(file)
g.keys()   # result is not displayed: not the last expression of the cell
k = dtank.read_data1D(['15min' , 'trade' , '20100104'])

In [16]:
# Display the Data1D read in the previous cell.
k


<class 'scripts.data_utils.DataTank.Data1D'>
secid len (27120): array([     1,      1,      1, ..., 601999, 601999, 601999], dtype=int32)
feature len (8): array(['minute', 'open', 'high', 'low', 'close', 'amount', 'volume',
       'vwap'], dtype='<U6')
values shape (27120, 8)

In [17]:
# Convert the Data1D via its to_kline() method.
# NOTE(review): the recorded output of this cell is an AttributeError
# ('numpy.ndarray' object has no attribute 'isnan').
k.to_kline()

AttributeError: 'numpy.ndarray' object has no attribute 'isnan'

In [19]:
# Open the tank file named by the current `param` read-only.
dtank = DataTank(f"{DIR_data}/{param['dtank_h5']}" , open = True , mode = 'r')

In [20]:
# Look up an object in the open tank.
# NOTE(review): `f` is not assigned in this cell; it is whatever an earlier
# `for f in param['dtank_file']` loop left behind in the kernel.
dtank.get_object(f)

In [None]:
# List the keys stored under '15min/trade' in the tank's underlying file object.
dtank.file['15min/trade'].keys()