In [1]:
import gc
import time , argparse
import scripts.util as U
from datetime import datetime , timedelta
from dataclasses import dataclass , field

from scripts.util.basic import Timer
from scripts.util.logger import get_logger
from scripts.environ import DIR_data
from scripts.util.data.ModelData import (
    block_load_DB , block_process , block_mask , path_block_data , block_hist_norm , path_norm_data)
 
# Shared logger instance; the processing cell further down logs through it.
logger = get_logger()

# Path strings for two sub-directories located under DIR_data.
DIR_block     = '{}/block_data'.format(DIR_data)
DIR_hist_norm = '{}/hist_norm'.format(DIR_data)


def today(offset = 0):
    """Return the local date shifted by ``offset`` days as a ``YYYYMMDD`` integer.

    ``offset`` is a number of days; negative values reach into the past.
    """
    shifted = datetime.today() + timedelta(days=offset)
    # Same digits as strftime('%Y%m%d'), assembled arithmetically.
    return shifted.year * 10000 + shifted.month * 100 + shifted.day

@dataclass
class DataProcessConfig:
    load_start_dt : int | None
    load_end_dt   : int | None
    save_start_dt : int | None
    save_end_dt   : int | None
    hist_start_dt : int | None
    hist_end_dt   : int | None
    mask : dict = field(default_factory=dict)
    data : dict = field(default_factory=dict)

# Notebook-wide settings. The only dataset, 'y', is read from two DB sources:
# 'labels' (two inner paths) and 'models' (one inner path).
Configs = DataProcessConfig(
    load_start_dt = None ,
    load_end_dt   = None ,
    save_start_dt = 20070101 ,
    save_end_dt   = None ,
    hist_start_dt = None ,
    hist_end_dt   = 20161231 ,
    mask          = {'list_dt':True} ,
    data          = {
        'y' : {
            'DB_source'  : {
                'labels': {'inner_path' : ['10days/lag1' , '20days/lag1']} ,
                'models': {'inner_path' : 'risk_model/exposure'} ,
            }
        }
    } ,
)


Use device name: NVIDIA GeForce RTX 4090


In [2]:
if_train = False

# Number of days of history loaded when if_train is False.
PREDICT_LOOKBACK_DAYS = 181
# The process / mask / save stage is currently switched off. Leave this False
# to only load the raw blocks into BlockDict (the following cells read it
# directly); set it to True to run the full pipeline and write the block file.
RUN_PROCESS_AND_SAVE = False

t1 = time.time()
logger.critical('Data Processing start!')
# NOTE(review): error level is used here for an informational message.
logger.error(f'{len(Configs.data)} datas :' + str(list(Configs.data.keys())))

for key , param in Configs.data.items():
    tt1 = time.time()
    print(f'{time.ctime()} : {key} start ...')

    # Training runs use the configured bounds; otherwise only the recent
    # window is loaded, with no end bound.
    BlockDict = block_load_DB(
        param['DB_source'] ,
        start_dt = Configs.load_start_dt if if_train else today(-PREDICT_LOOKBACK_DAYS),
        end_dt   = Configs.load_end_dt   if if_train else None)

    if RUN_PROCESS_AND_SAVE:
        with Timer(f'{key} blocks process'):
            ThisBlock = block_process(BlockDict , key)

        with Timer(f'{key} blocks masking'):
            ThisBlock = block_mask(
                ThisBlock ,
                mask = Configs.mask)

        with Timer(f'{key} blocks saving '):
            ThisBlock.save(
                path_block_data(key , if_train) ,
                start_dt = Configs.save_start_dt if if_train else None ,
                end_dt   = Configs.save_end_dt   if if_train else None)

        # Drop the processed block so gc.collect() below can reclaim it.
        del ThisBlock

    tt2 = time.time()
    print(f'{time.ctime()} : {key} finished! Cost {tt2-tt1:.2f} Seconds')

    gc.collect()

t2 = time.time()
logger.critical('Data Processing Finished! Cost {:.2f} Seconds'.format(t2-t1))


[1m[37m[41m24-03-24 16:46:30|MOD:1840015316  |[0m: [1m[31mData Processing start![0m
[1m[37m[45m24-03-24 16:46:30|MOD:1840015316  |[0m: [1m[35m1 datas :['y'][0m


Sun Mar 24 16:46:30 2024 : y start ...
labels blocks reading 10days/lag1 Data1D's ...... cost 0.18 secs
labels blocks reading 20days/lag1 Data1D's ...... cost 0.15 secs
labels blocks merging ...... cost 0.05 secs
models blocks reading risk_model/exposure Data1D's ...... cost 1.17 secs
models blocks merging ...... cost 0.00 secs
2 blocks aligning ...

[1m[37m[41m24-03-24 16:46:32|MOD:1840015316  |[0m: [1m[31mData Processing Finished! Cost 2.04 Seconds[0m


... cost 0.39 secs
Sun Mar 24 16:46:31 2024 : y finished! Cost 1.94 Seconds


In [3]:
# Shapes of the two raw blocks left in BlockDict by the loading loop.
tuple(BlockDict[name].shape for name in ('labels' , 'models'))

((5112, 101, 1, 4), (5112, 101, 1, 48))

In [4]:
from scripts.function.primas import neutralize_2d , process_factor
import torch

# Aliases for the two blocks loaded by the processing loop.
models_block = BlockDict['models']
labels_block = BlockDict['labels']

# Step 1: add a neutralised 'std...' label for every 'rtn...' label.
# Keep the exposure columns up to and including 'size'. For a 4-D block such as
# the (5112, 101, 1, 48) one shown above, swapping the first two axes and
# squeezing axis 2 yields a 3-D tensor.
# NOTE(review): the meaning of each axis is not visible here — confirm.
n_exposures = models_block.feature.tolist().index('size') + 1
x = torch.FloatTensor(models_block.values[..., :n_exposures]).permute(1,0,2,3).squeeze(2)

for i_feat,lb_name in enumerate(labels_block.feature):
    if lb_name[:3] != 'rtn':
        continue
    # Same axis swap as for x, applied to a single label column.
    y_raw = torch.FloatTensor(labels_block.values[..., i_feat]).permute(1,0,2).squeeze(2)
    # Swap the axes back and restore the squeezed axis before storing.
    y_std = neutralize_2d(y_raw , x).permute(1,0).unsqueeze(2)
    labels_block.add_feature('std' + lb_name[3:] , y_std)

# Step 2: run process_factor over every label column (including the ones just
# added) and write the result back in place.
y_ts = torch.FloatTensor(labels_block.values)[:,:,0]
for i_feat,lb_name in enumerate(labels_block.feature):
    y_pro = process_factor(y_ts[..., i_feat] , dim = 0)
    # process_factor may return something other than a tensor; such columns are left untouched.
    if isinstance(y_pro , torch.Tensor):
        y_pro = y_pro.unsqueeze(-1).numpy()
        labels_block.values[..., i_feat] = y_pro