In [19]:
import os , torch
import numpy as np
import pandas as pd

from itertools import product
from tqdm import tqdm
from typing import Any , Literal , Optional

from src.nn_model.classes import BatchOutput
from src.nn_model.trainer.models import ModelEnsembler
from src.nn_model.trainer import NetDataModule
from src.nn_model.util import Deposition , Device , TrainConfig
from src.basic import RegModel , PATH , REG_MODELS

class HiddenExtractor:
    '''for a model to predict recent/history data'''
    def __init__(self , reg_model : RegModel):
        self.reg_model = reg_model 

        self.contents : dict[str,pd.DataFrame] = {}

        self.config     = TrainConfig.load(f'{PATH.model}/{self.reg_model.name}' , override={'short_test':False})
        self.deposition = Deposition(self.config)
        self.device     = Device()

        self.data = NetDataModule(self.config , False).load_data()
        self.target_path = os.path.join(PATH.hidden , self.reg_model.name)

        if not np.isin(self.reg_model.model_dates , self.data.model_date_list).all():
            print('Caution! Not all model dates are in data.model_date_list, possibly due to short_test!')
            self.model_dates = self.data.model_date_list
        else:
            self.model_dates = self.reg_model.model_dates

    def hidden_key(self , model_num , model_type , model_date) : 
        return f'hidden.{model_num}.{model_type}.{model_date}.feather'

    def deploy(self):
        '''deploy df in contents to target path'''
        os.makedirs(self.target_path , exist_ok=True)
        for hidden_key , hidden_df in self.contents.items():
            hidden_df.to_feather(os.path.join(self.target_path , hidden_key))
        return self

    def update_model_iter(self):
        model_iter = [(d , n , t) for (d , n , t) 
                      in product(self.model_dates[:-1] , self.reg_model.model_nums , self.reg_model.model_types)
                      if not os.path.exists(os.path.join(self.target_path , self.hidden_key(n , t , d)))]
        model_iter += list(product(self.model_dates[-1:] , self.reg_model.model_nums , self.reg_model.model_types))
        return model_iter
    
    def given_model_iter(self , model_dates : Optional[list | np.ndarray | int] = None):
        if model_dates is None:
            print('Input model_dates to extract.')
            return []
        
        if isinstance(model_dates , int): model_dates = [model_dates]
        model_dates = np.intersect1d(self.model_dates , model_dates)
        model_iter = list(product(model_dates , self.reg_model.model_nums , self.reg_model.model_types))
        return model_iter

    def extract_hidden(self , what : Literal['given' , 'update'] , model_dates : Optional[list | np.ndarray | int] = None ,
                       verbose = True , deploy = False):
        if what == 'given':
            model_iter = self.given_model_iter(model_dates)
        elif what == 'update':
            model_iter = self.update_model_iter()
        else:
            raise KeyError(what)

        with torch.no_grad():
            for model_date , model_num , model_type in model_iter:
                hidden_key = self.hidden_key(model_num , model_type , model_date)
                hidden_df  = self.model_hidden(model_num , model_type , model_date , verbose = verbose)
                if deploy:
                    hidden_df.to_feather(os.path.join(self.target_path , hidden_key))
                else:
                    self.contents[hidden_key] = hidden_df
        return self
    
    def model_hidden(self , model_num , model_type , model_date , verbose = True) -> pd.DataFrame:
        model_param = self.config.model_param[model_num]
        
        model = self.deposition.load_model(model_date , model_num , model_type)
        self.net = ModelEnsembler.get_net(self.config.model_module , model_param , model['state_dict'] , self.device)
        self.net.eval()

        df_list : list[pd.DataFrame] = []
        desc = f'Extract {model_num}/{model_type}/{model_date}' if verbose else ''

        self.data.setup('fit' ,  model_param , model_date)
        df_list += self.loader_hidden('train' , desc)
        df_list += self.loader_hidden('valid' , desc)

        self.data.setup('test' ,  model_param , model_date)
        df_list += self.loader_hidden('test' , desc)

        df = pd.concat(df_list , axis=0)
        return df
    
    def loader_hidden(self, dataset : Literal['train' , 'valid' , 'test'] , desc = ''):
        if dataset == 'train': loader = self.data.train_dataloader()
        elif dataset == 'valid': loader = self.data.val_dataloader()
        elif dataset == 'test': loader = self.data.test_dataloader()

        df_list : list[pd.DataFrame] = []
        if desc: 
            loader = tqdm(loader , total=len(loader))
            desc = f'{desc}/{dataset}'

        for batch_data in loader:
            batch_output = BatchOutput(self.net(batch_data.x))
            df = batch_output.hidden_df(batch_data , self.data.y_secid , self.data.y_date).assign(dataset = dataset)
            df_list.append(df)
            if isinstance(loader , tqdm): loader.set_description(desc)
        return df_list
    
# extractor for the first entry of REG_MODELS ; the constructor already loads config and data
aa = HiddenExtractor(REG_MODELS[0])

try using d:\Coding\learndl\learndl\data\DataSet/day.20240607.pt , success!
Load  2 DataBlocks...... finished! Cost 0.04 secs
Align 2 DataBlocks...... finished! Cost 0.16 secs
Pre-Norming method of [day] : {'divlast': True, 'histnorm': True}


In [20]:
# 'update' run : process update_model_iter() and write each result straight to its feather file
aa.extract_hidden('update' , deploy = True)

Extract 0/best/20170103/train: 100%|██████████| 380/380 [00:39<00:00,  9.70it/s]
Extract 0/best/20170103/valid: 100%|██████████| 96/96 [00:12<00:00,  7.84it/s]
Extract 0/best/20170103/test: 100%|██████████| 120/120 [00:18<00:00,  6.37it/s]
Extract 0/swabest/20170103/train: 100%|██████████| 380/380 [00:43<00:00,  8.78it/s]
Extract 0/swabest/20170103/valid: 100%|██████████| 96/96 [00:12<00:00,  7.82it/s]
Extract 0/swabest/20170103/test: 100%|██████████| 120/120 [00:16<00:00,  7.14it/s]
Extract 0/swalast/20170103/train: 100%|██████████| 380/380 [00:42<00:00,  9.00it/s]
Extract 0/swalast/20170103/valid: 100%|██████████| 96/96 [00:14<00:00,  6.59it/s]
Extract 0/swalast/20170103/test: 100%|██████████| 120/120 [00:16<00:00,  7.19it/s]
Extract 1/best/20170103/train: 100%|██████████| 380/380 [01:24<00:00,  4.47it/s]
Extract 1/best/20170103/valid: 100%|██████████| 96/96 [00:28<00:00,  3.34it/s]
Extract 1/best/20170103/test: 100%|██████████| 120/120 [00:43<00:00,  2.78it/s]
Extract 1/swabest/2017

In [16]:
# preview the (model_date , model_num , model_type) combinations an 'update' run would process
aa.update_model_iter()

[(20170704, 0, 'best'),
 (20170704, 0, 'swabest'),
 (20170704, 0, 'swalast'),
 (20170704, 1, 'best'),
 (20170704, 1, 'swabest'),
 (20170704, 1, 'swalast'),
 (20171226, 0, 'best'),
 (20171226, 0, 'swabest'),
 (20171226, 0, 'swalast'),
 (20171226, 1, 'best'),
 (20171226, 1, 'swabest'),
 (20171226, 1, 'swalast'),
 (20180627, 0, 'best'),
 (20180627, 0, 'swabest'),
 (20180627, 0, 'swalast'),
 (20180627, 1, 'best'),
 (20180627, 1, 'swabest'),
 (20180627, 1, 'swalast'),
 (20181220, 0, 'best'),
 (20181220, 0, 'swabest'),
 (20181220, 0, 'swalast'),
 (20181220, 1, 'best'),
 (20181220, 1, 'swabest'),
 (20181220, 1, 'swalast'),
 (20190624, 0, 'best'),
 (20190624, 0, 'swabest'),
 (20190624, 0, 'swalast'),
 (20190624, 1, 'best'),
 (20190624, 1, 'swabest'),
 (20190624, 1, 'swalast'),
 (20191217, 0, 'best'),
 (20191217, 0, 'swabest'),
 (20191217, 0, 'swalast'),
 (20191217, 1, 'best'),
 (20191217, 1, 'swabest'),
 (20191217, 1, 'swalast'),
 (20200617, 0, 'best'),
 (20200617, 0, 'swabest'),
 (20200617, 0

In [3]:
# inspect the model dates carried by the wrapped reg_model
aa.reg_model.model_dates

array([20170103, 20170704, 20171226, 20180627, 20181220, 20190624,
       20191217, 20200617, 20201214, 20210615, 20211209, 20220613,
       20221206, 20230606, 20231201, 20240604])

In [18]:
import gc , torch
import numpy as np

from abc import abstractmethod
from numpy.random import permutation
from torch import Tensor
from torch.utils.data import BatchSampler
from typing import Any , Iterator , Literal , Optional

from src.nn_model.classes import BaseDataModule , BatchData
from src.nn_model.util import BufferSpace , DataloaderStored , Device , LoaderWrapper , Storage , TrainConfig
from src.data import DataBlockNorm , DataProcessor , ModuleData , BoosterData
from src.basic import PATH , CONF
from src.func import tensor_standardize_and_weight , match_values
from src.nn_model.trainer.data_module import DataModule

from src.func import index_intersect
from src.data import DataBlock

class AggregatorDataModule(DataModule):
    '''
    Data module for boosting (such as algo.boost.lgbm): x is the hidden output stored by other models
    (feather files under PATH.hidden) , served as BoosterData through stored dataloaders.

    NOTE(review): setup() uses `os` and `pd` -- confirm both are in scope wherever this class is defined.
    '''
    def train_dataloader(self) -> Iterator[BoosterData]: return self.loader_dict['train']
    def val_dataloader(self) -> Iterator[BoosterData]:   return self.loader_dict['valid']
    def test_dataloader(self) -> Iterator[BoosterData]:  return self.loader_dict['test']
    # prediction reuses the loader stored under 'test'
    def predict_dataloader(self) -> Iterator[BoosterData]: return self.loader_dict['test']
        
    def load_data(self):
        '''load label data , derive model dates / test dates , reset dataloaders , return self'''
        with CONF.Silence():
            # first argument is an empty list -- presumably no x blocks are requested here (x comes from hidden files)
            self.datas = ModuleData.load([] , self.config['data.labels'], self.predict, self.config.precision)
        self.labels_n = min(self.datas.y.shape[-1] , self.config.Model.max_num_output)
        if self.predict:
            self.model_date_list = self.datas.date[0]
            self.test_full_dates = self.datas.date[1:]
        else:
            self.model_date_list = self.datas.date_within(self.config['beg_date'] , self.config['end_date'] , self.config['interval'])
            self.test_full_dates = self.datas.date_within(self.config['beg_date'] , self.config['end_date'])[1:]

        self.static_prenorm_method = {}
        self.reset_dataloaders()
        return self

    def setup(self, stage : Literal['fit' , 'test' , 'predict'] , 
              param = {'seqlens' : {'day': 30 , '30m': 30 , 'style': 30}} , 
              model_date = -1) -> None:
        '''
        read the hidden features stored for `model_date` and build the stored dataloaders of `stage`

        stage      : 'fit' keeps the train / valid rows of the hidden files , otherwise the test / predict rows ;
                     forced to 'predict' when self.predict is set
        param      : never read in this method (and therefore never mutated)
        model_date : must be positive , selects the hidden feather files

        Does nothing if (stage , model_date) equals the last request.
        Raises AssertionError for an invalid stage / model_date , ValueError if config['data.hidden'] is empty.
        '''
        if self.predict: stage = 'predict'

        # validate BEFORE the request is looked up / remembered : a rejected request must not be cached ,
        # otherwise repeating the same bad call would return silently without any loader being built
        assert stage in ['fit' , 'test' , 'predict'] and model_date > 0 , (stage , model_date)

        if self.loader_param == (stage , model_date): return

        hidden_keys = self.config['data.hidden']
        if len(hidden_keys) == 0:
            raise ValueError("config['data.hidden'] is empty , no hidden features to load")

        # NOTE(review): remembered before the loaders are built -- a failure below (e.g. missing feather file)
        # still leaves this request marked as done
        self.loader_param = stage , model_date

        self.stage = stage
        self.seqs = {'hidden':1}
        self.seq0 = self.seqx = self.seqy = 1

        hidden_dates : list[np.ndarray] = []
        hidden_df : pd.DataFrame | Any = None
        ds_list = ['train' , 'valid'] if stage == 'fit' else ['test' , 'predict']
        for hidden_key in hidden_keys:
            # key format : '{model_name}.{model_num}.{model_type}'
            model_name , model_num , model_type = hidden_key.split('.')
            hidden_path = os.path.join(PATH.hidden , model_name , f'hidden.{model_num}.{model_type}.{model_date}.feather')
            df = pd.read_feather(hidden_path)
            df = df[df['dataset'].isin(ds_list)].drop(columns='dataset').set_index(['secid','date'])
            hidden_dates.append(df.index.get_level_values('date').unique().to_numpy())
            # prefix columns with the source key so features of different models do not collide in the join
            df.columns = [f'{hidden_key}.{col}' for col in df.columns]
            hidden_df = df if hidden_df is None else hidden_df.join(df , how='outer')

        stage_date = index_intersect(hidden_dates)[0]
        if self.stage != 'fit':
            stage_date = index_intersect([stage_date , self.test_full_dates])[0]
        self.day_len = len(stage_date)
        self.step_len = len(stage_date)
        # every stage date is a step here , so both indices are simply 0 .. day_len - 1
        self.date_idx , self.step_idx = torch.arange(self.day_len) , torch.arange(self.day_len)

        y_aligned = self.datas.y.align_date(stage_date , inplace=False)
        self.y_secid , self.y_date = y_aligned.secid , y_aligned.date

        if stage == 'fit':
            ...
        elif stage in ['predict' , 'test']:
            self.model_test_dates = stage_date
            self.early_test_dates = stage_date[:0]
        else:
            raise KeyError(stage)

        x = {'hidden':DataBlock.from_dataframe(hidden_df).align_secid_date(self.y_secid , self.y_date).as_tensor().values}
        y = Tensor(y_aligned.values).squeeze(2)[...,:self.labels_n]
        self.hidden_cols = hidden_df.columns
        self.y , _ = self.standardize_y(y , None , None , no_weight = True)

        if stage != 'fit':
            w , valid = None , None
            y , _ = self.standardize_y(self.y , None , self.step_idx)
        else:
            valid = self.full_valid_sample(x , self.y , self.step_idx)
            y , w = self.standardize_y(self.y , valid , self.step_idx)

        self.y[:,self.step_idx] = y[:]
        self.static_dataloader(x , y , w , valid)

        gc.collect() 
        torch.cuda.empty_cache()

    def static_dataloader(self , x : dict[str,Tensor] , y : Tensor , w = None , valid = None) -> None:
        '''
        update loader_dict : save BoosterData to f'{PATH.batch}/{set_key}.{bnum}.pt' through self.storage and later load them

        'train' / 'valid' : all dates of the set are concatenated into ONE BoosterData (file number 0)
        'test'            : one BoosterData file per date
        x must hold exactly the 'hidden' entry ; w and valid are accepted but not used here
        '''
        index0, index1 = torch.arange(len(y)) , self.step_idx
        sample_index = self.split_sample(self.stage , index0 , index1 , self.config.train_ratio)
        self.storage.del_group(self.stage)
        assert len(x) == 1 , len(x)
        x0 = x['hidden']
        for set_key , set_samples in sample_index.items():
            if set_key in ['train' , 'valid']:
                bb_x , bb_y , bb_d = [] , [] , []
                for b_i in set_samples:
                    # b_i[:,0] / b_i[:,1] : first / second axis positions of one date (see split_sample)
                    i0 , i1 , yindex1 = b_i[:,0] , b_i[:,1] , match_values(b_i[:,1] , index1)

                    bb_x.append(x0[i0 , i1].reshape(len(self.y_secid),1,-1))
                    bb_y.append(y[i0 , yindex1])
                    bb_d.append(self.y_date[index1[yindex1[0]]])
                bb_x = torch.concat(bb_x , dim = 1)
                bb_y = torch.concat(bb_y , dim = 1)
                bb_d = np.array(bb_d)
                # the whole set lives in a single file , numbered 0
                batch_files = [f'{PATH.batch}/{set_key}.0.pt']
                self.storage.save(BoosterData(bb_x , bb_y , self.y_secid , bb_d , self.hidden_cols) , batch_files[0] , group = self.stage)
            elif set_key == 'test':
                batch_files = [f'{PATH.batch}/{set_key}.{bnum}.pt' for bnum in range(len(set_samples))]
                for bnum , b_i in enumerate(set_samples):
                    i0 , i1 , yindex1 = b_i[:,0] , b_i[:,1] , match_values(b_i[:,1] , index1)

                    b_x = x0[i0,i1].reshape(len(self.y_secid),1,-1)
                    b_y = y[i0 , yindex1] # [n_stock x num_output]
                    dates = np.array([self.y_date[index1[yindex1[0]]]])
                    self.storage.save(BoosterData(b_x , b_y , self.y_secid , dates , self.hidden_cols) , batch_files[bnum] , group = self.stage)
            else:
                raise KeyError(set_key)
            self.loader_dict[set_key] = DataloaderStored(self.storage , batch_files)

    @staticmethod
    def split_sample(stage , index0 : Tensor , index1 : Tensor , train_ratio   : float = 0.8) -> dict[str,list]:
        '''
        split the (index0 x index1) grid into per-column samples

        returns {'train' : [...] , 'valid' : [...]} for stage 'fit' (first int(l1 * train_ratio) columns are train) ,
        {'test' : [...]} otherwise ; each list element is a [len(index0) x 2] tensor of (index0 , index1) pairs
        sharing one index1 value
        '''
        l0 , l1 = len(index0) , len(index1)
        # pos[i , j] == (index0[i] , index1[j])
        pos = torch.stack([index0.repeat_interleave(l1) , index1.repeat(l0)] , -1).reshape(l0,l1,2)

        def sequential_sampling(beg , end , posit = pos): return [posit[:,j] for j in range(beg , end)]
        
        sample_index = {}
        if stage == 'fit':
            # must be sequential
            sep = int(l1 * train_ratio)
            sample_index['train'] = sequential_sampling(0 , sep)
            sample_index['valid'] = sequential_sampling(sep , l1)
        else:
            # test dataloader should have the same length as dates, so no filtering of val[:,j].sum() > 0
            sample_index['test'] = sequential_sampling(0 , l1)
        return sample_index

In [19]:
# notebook-level instance deliberately named `self` , so the body of setup() can be run cell by cell below
self = AggregatorDataModule(predict=False)
self.load_data()

<__main__.AggregatorDataModule at 0x1abc3770c10>

In [20]:
# build the train / valid loaders from the hidden files of model date 20170103
self.setup('fit',model_date=20170103)

In [7]:
import os
import pandas as pd

from src.func import index_intersect
from src.data import DataBlock

# scratch copy of the first half of AggregatorDataModule.setup , executed against the notebook-level `self`
stage : Literal['fit' , 'test' , 'predict'] = 'fit'
param = {'seqlens' : {'day': 30 , '30m': 30 , 'style': 30}}
model_date = 20170103

# a predicting module always works in the 'predict' stage
if self.predict:
    stage = 'predict'

# the early return of setup() cannot be executed at cell level , so it stays commented out
# if self.loader_param == (stage , model_date): return
self.loader_param = (stage , model_date)

assert stage in ['fit' , 'test' , 'predict'] and model_date > 0 , (stage , model_date)

self.stage = stage
x_keys = ['hidden']
y_keys = []
self.seqs = {'hidden':1}
self.seq0 = 1
self.seqx = 1
self.seqy = 1

hidden_dates : list[np.ndarray] = []
hidden_df : pd.DataFrame | Any = None

# rows of the stored hidden files that belong to the current stage
if stage == 'fit':
    ds_list = ['train' , 'valid']
else:
    ds_list = ['test' , 'predict']

for hidden_key in self.config['data.hidden']:
    # key format : '{model_name}.{model_num}.{model_type}'
    model_name , model_num , model_type = hidden_key.split('.')
    hidden_path = os.path.join(PATH.hidden , model_name , f'hidden.{model_num}.{model_type}.{model_date}.feather')

    df = pd.read_feather(hidden_path)
    df = df.loc[df['dataset'].isin(ds_list)]
    df = df.drop(columns='dataset').set_index(['secid','date'])
    hidden_dates.append(df.index.get_level_values('date').unique().to_numpy())

    # prefix the feature columns with their source key , then merge into the running frame
    df.columns = [f'{hidden_key}.{name}' for name in df.columns]
    if hidden_df is None:
        hidden_df = df
    else:
        hidden_df = hidden_df.join(df , how='outer')



In [10]:

# scratch copy of the second half of AggregatorDataModule.setup ; expects stage / hidden_dates / hidden_df
# from the previous cell and the notebook-level `self`
stage_date = index_intersect(hidden_dates)[0]
# same restriction as in setup() : outside of fitting only dates that are in test_full_dates are kept
if self.stage != 'fit':
    stage_date = index_intersect([stage_date , self.test_full_dates])[0]

self.day_len = len(stage_date)
self.step_len = len(stage_date)
self.date_idx , self.step_idx = torch.arange(self.day_len) , torch.arange(self.day_len)

y_aligned = self.datas.y.align_date(stage_date , inplace=False)
self.y_secid , self.y_date = y_aligned.secid , y_aligned.date

if stage == 'fit':
    ...
elif stage in ['predict' , 'test']:
    self.model_test_dates = stage_date
    self.early_test_dates = stage_date[:0]
else:
    raise KeyError(stage)

x = {'hidden':DataBlock.from_dataframe(hidden_df).align_secid_date(self.y_secid , self.y_date).as_tensor().values}
y = Tensor(y_aligned.values).squeeze(2)[...,:self.labels_n]
# static_dataloader() reads self.hidden_cols when it builds BoosterData , so it has to be set before the call
self.hidden_cols = hidden_df.columns

self.y , _ = self.standardize_y(y , None , None , no_weight = True)

if stage != 'fit':
    w , valid = None , None
    y , _ = self.standardize_y(self.y , None , self.step_idx)
else:
    valid = self.full_valid_sample(x , self.y , self.step_idx)
    y , w = self.standardize_y(self.y , valid , self.step_idx)

self.y[:,self.step_idx] = y[:]
self.static_dataloader(x , y , w , valid)

gc.collect() 
torch.cuda.empty_cache()

In [None]:
import os
import pandas as pd

from src.func import index_intersect
from src.data import DataBlock

# alternative draft of the setup() body : hidden features are aligned to the full label calendar and stored
# in self.datas.x , then everything is cut to the [d0 , d1) date window of the stage
stage : Literal['fit' , 'test' , 'predict'] = 'fit'
param = {'seqlens' : {'day': 30 , '30m': 30 , 'style': 30}}
model_date = 20170103

# a predicting module always works in the 'predict' stage
if self.predict:
    stage = 'predict'

# the early return of setup() cannot be executed at cell level , so it stays commented out
# if self.loader_param == (stage , model_date): return
self.loader_param = (stage , model_date)

assert stage in ['fit' , 'test' , 'predict'] and model_date > 0 , (stage , model_date)

self.stage = stage
x_keys = ['hidden']
y_keys = []
self.seqs = dict.fromkeys(y_keys + x_keys , 1)
self.seq0 = 1
self.seqx = 1
self.seqy = 1

hidden_dates : list[np.ndarray] = []
hidden_blocks : list[DataBlock] = []

# rows of the stored hidden files that belong to the current stage
if stage == 'fit':
    ds_list = ['train' , 'valid']
else:
    ds_list = ['test' , 'predict']

for hidden_key in self.config['data.hidden']:
    # key format : '{model_name}.{model_num}.{model_type}'
    model_name , model_num , model_type = hidden_key.split('.')
    hidden_path = os.path.join(PATH.hidden , model_name , f'hidden.{model_num}.{model_type}.{model_date}.feather')

    df = pd.read_feather(hidden_path)
    df = df.loc[df['dataset'].isin(ds_list)]
    df = df.drop(columns='dataset').set_index(['secid','date'])
    hidden_dates.append(df.index.get_level_values('date').unique().to_numpy())

    # prefix the feature columns with their source key , one aligned block per hidden source
    df.columns = [f'{hidden_key}.{name}' for name in df.columns]
    hidden_blocks.append(DataBlock.from_dataframe(df).align_secid_date(self.datas.y.secid , self.datas.y.date))

db = DataBlock.concat_feature(hidden_blocks)
self.datas.x['hidden'] = db.as_tensor()

# dates present in every hidden source and in the labels ; d0 / d1 bound them inside the label calendar
stage_date = index_intersect(hidden_dates + [self.datas.y.date])[0]
d0 = np.where(self.datas.y.date == stage_date.min())[0][0]
d1 = np.where(self.datas.y.date == stage_date.max())[0][0] + 1

self.day_len = d1 - d0
self.step_len = len(stage_date)
self.date_idx = torch.tensor(match_values(stage_date , self.datas.y.date))
# step positions relative to the start of the window
self.step_idx = self.date_idx - d0

self.y_secid = self.datas.y.secid
self.y_date = self.datas.y.date[d0:d1]

if stage in ['predict' , 'test']:
    self.model_test_dates = stage_date
    self.early_test_dates = stage_date[:0]
elif stage != 'fit':
    raise KeyError(stage)

x = {name:Tensor(block.values)[:,d0:d1] for name , block in self.datas.x.items()}
y = Tensor(self.datas.y.values)[:,d0:d1].squeeze(2)[...,:self.labels_n]

self.y , _ = self.standardize_y(y , None , None , no_weight = True)

if stage == 'fit':
    valid = self.full_valid_sample(x , self.y , self.step_idx)
    y , w = self.standardize_y(self.y , valid , self.step_idx)
else:
    w , valid = None , None
    y , _ = self.standardize_y(self.y , None , self.step_idx)

self.y[:,self.step_idx] = y[:]
# NOTE(review): static_dataloader() reads self.hidden_cols , which this cell never assigns -- confirm it is
# already set on `self` before running
self.static_dataloader(x , y , w , valid)

gc.collect()
torch.cuda.empty_cache()