In [3]:
import torch

# Demonstration: what nn.LSTM does when handed an input with no steps along dim 0.
# The all-False boolean mask drops every slice of the first axis, so `x` ends up
# with shape (0, 5, 4).
net = torch.nn.LSTM(4 , 4)
x = torch.rand(10,5,4)
x = x[torch.zeros(10).to(torch.bool)]
try:
    net(x)
except RuntimeError as err:
    # The failure is the point of this cell: show it, but do not abort "Run All".
    print(f'{type(err).__name__}: {err}')


RuntimeError: Expected sequence length to be larger than 0 in RNN

In [23]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import os , torch

from copy import deepcopy
from dataclasses import dataclass
from src.environ import DIR
# from src.algo.boost.lgbt import LgbtPlot , LgbtWeight
from typing import Any , ClassVar , Literal , Optional
from src.func import match_values, np_nanic_2d , np_nanrankic_2d

@dataclass
class BoosterData:
    raw_x   : pd.DataFrame | np.ndarray | torch.Tensor
    raw_y   : pd.Series    | np.ndarray | torch.Tensor
    secid   : Any = None
    date    : Any = None
    feature : Any = None
    weight_param : Optional[dict] = None

    df_var_sec  : ClassVar[list[str]] = ['SecID','instrument']
    df_var_date : ClassVar[list[str]] = ['TradeDate','datetime']

    def __post_init__(self):
        assert len(self.raw_x) == len(self.raw_y) , f'x and y length must match'
        if isinstance(self.raw_x , torch.Tensor) and isinstance(self.raw_y , torch.Tensor): 
            self.x = self.raw_x.detach().cpu().numpy()
            self.y = self.raw_y.detach().cpu().numpy()
        elif isinstance(self.raw_x , np.ndarray) and isinstance(self.raw_y , np.ndarray): 
            self.x = self.raw_x
            self.y = self.raw_y
        elif isinstance(self.raw_x , pd.DataFrame) and isinstance(self.raw_y , pd.Series): 
            self.var_sec  = [v for v in self.df_var_sec  if v in self.raw_x.index.names][0]
            self.var_date = [v for v in self.df_var_date if v in self.raw_x.index.names][0]
            x = self.raw_x.reset_index().set_index([self.var_sec,self.var_date])
            xarr = xr.Dataset.from_dataframe(x)
 
            xindex = [arr.values for arr in xarr.indexes.values()] + [list(xarr.data_vars)]
            self.x = np.stack([arr.to_numpy() for arr in xarr.data_vars.values()] , -1)
            if self.secid is None : self.secid = xindex[0]
            if self.date  is None : self.date  = xindex[1]
            if self.feature is None : self.feature = xindex[-1]

            yarr = xr.Dataset.from_dataframe(pd.DataFrame(self.raw_y.reset_index().set_index([self.var_sec,self.var_date])))
            self.y = np.stack([arr.to_numpy() for arr in yarr.data_vars.values()] , -1)
        else:
            raise TypeError(f'x and y type must match')
        if self.secid is None : self.secid = np.arange(self.x.shape[0])
        if self.date  is None : self.date  = np.arange(self.x.shape[1])
        if self.feature is None : self.feature = np.array([f'feature.{i}' for i in range(self.x.shape[-1])])
        assert self.x.shape == (len(self.secid) , len(self.date) , len(self.feature))
        self.update_feature()
        if self.weight_param is None: self.weight_param = {'tau':0.75*np.log(0.5)/np.log(0.75) , 'ts_type':'lin' , 'rate':0.5}  

    def update_feature(self , use_feature = None):
        if use_feature is not None:
            assert all(np.isin(use_feature , self.feature)) , np.setdiff1d(use_feature , self.feature)
        self.use_feature = use_feature

    def lgbt_dataset(self , weight_param = None , reference = None):
        return lgb.Dataset(self.X() , self.Y() , weight = self.W(weight_param) , reference = reference)
    
    def X(self): 
        if self.use_feature is None:
            return self.x.reshape(-1,self.x.shape[-1])
        else:
            return self.X_feat(self.use_feature).reshape(-1,len(self.use_feature))

    def Y(self): return self.y.flatten()

    def W(self , weight_param : Optional[dict] = None):
        weight_param = self.weight_param if weight_param is None else weight_param
        if weight_param is None: weight_param = {}
        w = self.calculate_weight(self.y , **weight_param)
        return w.flatten()
    
    def X_feat(self , feature): return self.x[...,match_values(feature , self.feature)]

    @property
    def shape(self): return self.x.shape

    @property
    def nfeat(self): return len(self.feature) if self.use_feature is None else len(self.use_feature)
    
    @classmethod
    def calculate_weight(cls , y : np.ndarray, 
                         cs_type : Optional[Literal['top']] = 'top' ,
                         ts_type : Optional[Literal['lin' , 'exp']] = 'lin' ,
                         **kwargs):
        assert y.ndim == 2 or (y.ndim == 3 and y.shape[-1] == 1) , y.shape
        if y.ndim == 3: y = y[...,0]
        return cls.cs_weight(y , cs_type , **kwargs) * cls.ts_weight(y , ts_type , **kwargs)

    @classmethod
    def cs_weight(cls , y : np.ndarray , cs_type : Optional[Literal['top']] = 'top' , tau : Optional[float] = None , **kwargs):
        w = y * 0 + 1.
        if cs_type is None: return w
        if tau is None : tau = 0.75*np.log(0.5)/np.log(0.75)
        for j in range(w.shape[1]):
            if cs_type == 'top':
                v = y[:,j] * 1.
                v[~np.isnan(v)] = v[~np.isnan(v)].argsort()
                w[:,j] = np.exp((1 - v / np.nanmax(v))*np.log(0.5) / tau)
        return w
    
    @classmethod
    def ts_weight(cls , y : np.ndarray , ts_type : Optional[Literal['lin' , 'exp']] = 'lin' , rate : Optional[float] = None , **kwargs):
        w = y * 0 + 1.
        if ts_type is None: return w
        if rate is None : rate = 0.5
        if ts_type == 'lin':
            w *= np.linspace(rate,1,w.shape[1]).reshape(1,-1)
        elif ts_type == 'exp':
            w *= np.power(2 , -np.arange(w.shape[1])[::-1] / int(rate * w.shape[1])).reshape(1,-1)
        return w

In [24]:
def rand_nan(x , ratio = 0.1):
    """
    Overwrite a random subset of the entries along the first axis of ``x`` with NaN.

    x      : float array; it is modified IN PLACE and also returned
    ratio  : fraction of first-axis entries to blank out; ``int(ratio * len(x))``
             distinct entries are chosen (capped at ``len(x)``)
    return : the same array object as ``x``
    """
    n_nan = min(int(ratio * len(x)) , len(x))
    # replace=False: sampling with replacement would pick duplicates and
    # blank out fewer entries than requested
    ii = np.random.choice(len(x) , n_nan , replace = False)
    x[ii] = np.nan
    return x

# Synthetic train / valid / test arrays of uniform random numbers, each with
# trailing shape (40 , 21), passed through rand_nan to inject NaN entries.
train , valid , test = (
    rand_nan(np.random.rand(n_rows , 40 , 21)) for n_rows in (1000 , 500 , 100)
)

In [27]:

def _read_indexed_csv(path):
    """Read a CSV file, using its first two columns as a two-level row index."""
    return pd.read_csv(path , index_col=[0,1])

# Load the three data splits from the project data directory.
train = _read_indexed_csv(f'{DIR.data}/tree_data/df_train.csv')
valid = _read_indexed_csv(f'{DIR.data}/tree_data/df_valid.csv')
test  = _read_indexed_csv(f'{DIR.data}/tree_data/df_test.csv')


In [17]:
import pandas as pd
# Scratch frame: a single row and a single column holding the value 1.
a = pd.DataFrame(data = [1])