In [9]:
import pandas as pd
import numpy as np
from numba import jit , cuda

import torch
from torch import nn
from sklearn.linear_model import LinearRegression

from recreate import *

# Load the project's GP parameter mapping (defined in `recreate`) and record the
# device that `Ts` moves tensors to.
gp_params = gp_parameters(True)
# Fall back to the CPU when CUDA is not available; a hard-coded "cuda" device
# makes every later `tensor.to(device)` call fail on CPU-only machines.
gp_params['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def Ts(df):
    """Return ``df`` moved to ``gp_params['device']`` when it is a tensor.

    Anything that is not a ``torch.Tensor`` is handed back untouched, as is a
    tensor when no ``'device'`` entry is configured in ``gp_params``.
    """
    if not isinstance(df , torch.Tensor):
        return df
    target = gp_params.get('device')
    # df.share_memory_() # for multiprocessing: move the tensor into shared memory
    return df if target is None else df.to(target)

Device name: NVIDIA GeForce RTX 4090 , Available: True
Pop directory is : "./pop/bendi"


In [10]:
# Read the three input panels for the configured slice date; the first element
# of each `read_gp_data` result is passed through `Ts`.
cs_indus_code , size , cp_raw = (
    Ts(read_gp_data(field , gp_params['slice_date'])[0])
    for field in ('cs_indus_code' , 'size' , 'close_raw')
)
# Inner call: return from t-10 to t.  Outer call (delay of -11): return from t+1 to t+11.
labels = F.ts_delay(F.ts_delaypct(cp_raw , 10) , -11)

In [62]:
def one_hot(x):
    """One-hot encode category codes along a new trailing axis.

    Parameters
    ----------
    x : torch.Tensor or array-like
        Non-negative integer category codes stored as floats; NaN marks a
        missing category. Non-tensor input is converted with ``torch.Tensor``.

    Returns
    -------
    torch.Tensor
        Float tensor of shape ``(*x.shape, m + 1)`` where ``m`` is the largest
        non-NaN code. Positions that were NaN get an all-zero row.
    """
    if not isinstance(x , torch.Tensor): x = torch.Tensor(x)
    nan_index = x.isnan()
    m = int(x[~nan_index].max())
    # Park NaNs in an extra class ``m + 1`` so they can be cast to int64; that
    # class is sliced off again below, leaving all-zero rows for missing codes.
    x_new = torch.where(nan_index , float(m + 1) , x).to(torch.int64)
    # Pass num_classes explicitly so the width does not depend on whether a NaN
    # happened to be present in this particular input.
    dummy = torch.nn.functional.one_hot(x_new , num_classes = m + 2).to(torch.float)[...,:m+1]
    return dummy

def _neutralize_yx(y , x_list = [] , x_group = None , auto_intercept = True , index = None):
    if isinstance(x_list , torch.Tensor): x_list = [x_list]
    if len(x_list) == 0 and x_group is None: return y , None
    elif x_group is None:
        x = torch.stack(x_list,dim = -1)
    elif len(x_list) == 0:
        x = one_hot(x_group)[...,:-1]
    else:
        x = torch.stack(x_list,dim = -1)
        x = torch.cat([x,one_hot(x_group)[...,:-1]],dim=-1)
    if auto_intercept: x = torch.nn.functional.pad(x , (1,0) , value = 1.)
    y = torch.where(y.isinf() , torch.nan , y).unsqueeze(-1)
    x = torch.where(x.isinf() , torch.nan , x)
    if index: y , x = y[index] , x[index]
    return y , x

def betas_torch(x , y):
    """Least-squares coefficients of ``y`` on ``x`` using torch.

    Tries ``torch.linalg.lstsq`` first, then the normal equations
    ``inv(x'x) x'y``. If both fail, prints a message and returns a zero
    coefficient vector of shape ``(x.shape[-1], 1)`` matching ``x``'s dtype
    and device, which leaves ``y`` unchanged when residuals are formed.
    """
    try:
        b = torch.linalg.lstsq(x , y , rcond=None)[0]
    except Exception: # solver failure; Exception (not bare except) lets Ctrl-C through
        try:
            b = torch.linalg.inv(x.T.mm(x)).mm(x.T).mm(y)
        except Exception:
            print('neutralization error!')
            b = torch.zeros(x.shape[-1],1).to(x)
    return b

def betas_np(x , y):
    """Least-squares coefficients of ``y`` on ``x`` using NumPy.

    Tries ``np.linalg.lstsq`` first, then the normal equations
    ``inv(x'x) x'y``. If both fail, prints a message and returns a zero
    coefficient array of shape ``(x.shape[-1], 1)`` with ``x``'s dtype.
    """
    try:
        b = np.linalg.lstsq(x , y , rcond=None)[0]
    except Exception: # 20240215: numpy.linalg.LinAlgError: SVD did not converge in Linear Least Squares
        try:
            b = np.linalg.inv(x.T.dot(x)).dot(x.T).dot(y)
        except Exception:
            print('neutralization error!')
            # ndarrays have no `.to`; build the zero fallback with x's dtype directly.
            b = np.zeros((x.shape[-1],1) , dtype = x.dtype)
    return b

def betas_sk(x , y):
    """Least-squares coefficients of ``y`` on ``x`` using scikit-learn.

    Fits ``LinearRegression(fit_intercept=False)`` and returns the transposed
    ``coef_``. If the fit fails, prints a message and returns a zero
    coefficient array of shape ``(x.shape[-1], 1)`` with ``x``'s dtype.
    """
    try:
        b = LinearRegression(fit_intercept=False).fit(x, y).coef_.T
    except Exception: # 20240215: numpy.linalg.LinAlgError: SVD did not converge in Linear Least Squares
        print('neutralization error!')
        # ndarrays have no `.to`; build the zero fallback with x's dtype directly.
        b = np.zeros((x.shape[-1],1) , dtype = x.dtype)
    return b

def neutralizer(method = 'torch'):
    """Build a ``neutralize(y, x)`` function for the chosen regression backend.

    Parameters
    ----------
    method : {'torch', 'np', 'sk'}
        Selects ``betas_torch``, ``betas_np`` or ``betas_sk`` respectively.

    Returns
    -------
    callable
        ``neutralize(y, x)`` returns the flattened residuals
        ``y - x @ betas`` of regressing ``y`` on ``x``. ``y`` may omit its
        trailing singleton axis.
    """
    assert method in ['torch' , 'np' , 'sk'] , method
    if method == 'np':
        betas_func = betas_np
    elif method == 'sk':
        betas_func = betas_sk
    else:
        betas_func = betas_torch
    def neutralize(y , x):
        # `[..., None]` works for both numpy arrays and torch tensors, whereas
        # `.unsqueeze` exists only on tensors and broke the 'np'/'sk' backends.
        if len(y.shape) == len(x.shape)-1: y = y[..., None]
        return (y - x @ betas_func(x , y)).flatten()
    return neutralize

def neutralize_2d(y , x_list = [] , x_group = None, method = 'torch' , auto_intercept = True , silent= True):  # [tensor (TS*C), tensor (TS*C)]
    """Neutralize ``y`` against regressors, one regression per leading-axis slice.

    Parameters
    ----------
    y : torch.Tensor
        2-D response panel (the original inline note gives the layout as TS*C).
    x_list : torch.Tensor or sequence of torch.Tensor
        Continuous regressors, each stacked onto a new last axis next to ``y``.
    x_group : tensor-like, optional
        Category codes expanded to dummy columns by ``_neutralize_yx``.
    method : {'torch', 'np', 'sk'}
        Regression backend; 'np' and 'sk' run on CPU numpy copies.
    auto_intercept : bool
        Whether an intercept column is added to the design matrix.
    silent : bool
        When false, a progress line is printed every 500 slices.

    Returns
    -------
    torch.Tensor
        Residuals with the same 2-D shape, dtype and device as ``y``. Entries
        whose ``y`` or any regressor is NaN, and whole slices with fewer than
        10 valid entries, are NaN. If there are no regressors, ``y`` itself
        is returned.
    """
    assert method in ['sk' , 'np' , 'torch']
    y , x = _neutralize_yx(y , x_list , x_group , auto_intercept)
    if x is None: return y
    nans  = y.isnan().any(dim=-1) | x.isnan().any(dim=-1)
    zeros = (x.nan_to_num() == 0).all(dim=-2)
    if method == 'torch' and not nans.any() and not zeros.any():
        # fastest, but cannot deal nan's
        model = torch.linalg.lstsq(x , y , rcond=None)
        # Drop the trailing singleton axis so this path returns the same 2-D
        # shape as the per-slice loop below.
        return (y - x @ model[0]).squeeze(-1)

    neutralize = neutralizer(method)
    dev = y.device
    if method in ['sk' , 'np']:
        x , y , nans = (a.cpu().numpy() for a in (x , y , nans))
        resids = np.full_like(y.squeeze(-1) , np.nan)
    else:
        resids = torch.full_like(y.squeeze(-1) , torch.nan)

    for i , (y_ , x_ , nan_) in enumerate(zip(y , x , nans)):
        if i % 500 == 0 and not silent: print('neutralize by tradedate',i)
        valid = ~nan_
        if valid.sum() < 10:  continue  # too few observations for this slice
        x_valid = x_[valid]
        # Detect all-zero columns AFTER removing NaN rows: a dummy column whose
        # only members were filtered out would otherwise stay in the regression
        # and make the design matrix rank deficient.
        keep = (x_valid != 0).any(0)
        resids[i , valid] = neutralize(y_[valid] , x_valid[: , keep])

    # from_numpy keeps the input dtype (torch.Tensor(...) always casts to float32).
    if isinstance(resids , np.ndarray): resids = torch.from_numpy(resids).to(dev)
    return resids

In [46]:
# Benchmark neutralize_2d with the 'torch' backend on the full labels/size/industry inputs.
%timeit neutralize_2d(labels, size, cs_indus_code , 'torch')

330 ms ± 7.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
# Benchmark neutralize_2d with the 'sk' (scikit-learn) backend on the same inputs.
%timeit neutralize_2d(labels, size, cs_indus_code , 'sk')

1.91 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [49]:
# Benchmark neutralize_2d with the 'np' (NumPy) backend on the same inputs.
%timeit neutralize_2d(labels, size, cs_indus_code , 'np')

1.79 s ± 6.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [63]:
# Run all three backends on the first slice only so their residuals can be
# compared side by side.
v1 , v2 , v3 = (
    neutralize_2d(labels[:1], size[:1], cs_indus_code[:1] , backend)
    for backend in ('torch' , 'sk' , 'np')
)
v1 , v2 , v3

(tensor([[-0.0478,  0.0456, -0.0263,  ...,     nan,     nan,     nan]],
        device='cuda:0'),
 tensor([[-0.0478,  0.0456, -0.0263,  ...,     nan,     nan,     nan]],
        device='cuda:0'),
 tensor([[-0.0478,  0.0456, -0.0263,  ...,     nan,     nan,     nan]],
        device='cuda:0'))

In [60]:
# Scratch check: unpack the squares of 1..4 into four names.
a , b , c , d = (n ** 2 for n in (1, 2, 3, 4))

In [61]:
# Display the four unpacked values as a tuple.
a , b , c , d

(1, 4, 9, 16)

In [69]:
# Build the response / design matrix for the first slice only, for manual inspection.
y , x = _neutralize_yx(
    labels[:1] ,
    x_list = size[:1] ,
    x_group = cs_indus_code[:1] ,
)

In [70]:
# Row mask for the first slice: True where the response or any regressor is NaN.
y_missing = y[0].isnan().any(dim=1)
x_missing = x[0].isnan().any(dim=1)
isnan = y_missing | x_missing

In [72]:
# For each regressor column, check whether it is exactly zero in every row
# (reduces over dim 1); the output below flags one such column.
(x == 0).all(1)

tensor([[False, False,  True, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False]])

In [62]:
# Keep only the rows of the first slice that have no NaN in y or x.
# NOTE: this overwrites `y` and `x`, so the cell is not idempotent; re-create
# them with `_neutralize_yx` before re-running it.
y = y[0,~isnan]
x = x[0,~isnan]
# Drop columns that are zero in every remaining row. Test for any non-zero
# entry rather than `x.sum(0) != 0`, which would also drop a column whose
# entries merely cancel out.
x = x[:,(x != 0).any(0)]

In [65]:
# Manual single-slice regression mirroring betas_torch: least squares first,
# then the normal equations, and finally fall back to leaving y unchanged.
try:
    model = torch.linalg.lstsq(x , y , rcond=None)
    resids = (y - x @ model[0]).T  # [1, C]
except Exception: # solver failure; Exception (not bare except) lets Ctrl-C through
    try:
        print("here")
        beta = torch.linalg.inv(x.T.matmul(x)).matmul(x.T).matmul(y)
        resids = (y - x @ beta).T
    except Exception:
        print('neutralization error!')
        # `y.copy_()` requires a source tensor and raised TypeError; clone y
        # and transpose so the fallback has the same [1, C] layout as above.
        resids = y.clone().T

resids

tensor([[-0.0478,  0.0456, -0.0263,  ...,  0.0196,  0.0196, -0.0432]])

In [33]:
# Display `resids`. NOTE(review): this cell's execution count (33) is lower than
# that of the cell computing `resids` above, so the output shown is presumably
# from an earlier kernel state.
resids

tensor([nan, nan, nan,  ..., nan, nan, nan])

In [30]:
# Inspect the design-matrix shape. NOTE(review): the 3-D shape in the output
# was recorded at execution count 30, i.e. presumably before `x` was sliced
# to 2-D by the filtering cell above.
x.shape

torch.Size([1, 5210, 32])

AttributeError: 'builtin_function_or_method' object has no attribute '__dict__'

In [12]:
# Compare the 'torch' and 'sk' results. NOTE(review): execution count 12
# predates the current comparison cell; the mismatch shown here does not
# appear in that cell's later output.
v1 , v2

(tensor([[0.0097, 0.0595, 0.0283,  ...,    nan,    nan,    nan]]),
 tensor([[-0.0478,  0.0456, -0.0263,  ...,     nan,     nan,     nan]]))

In [6]:
# Full-panel run with the 'torch' backend. NOTE(review): execution count 6 is
# earlier than the cell defining the current neutralize_2d, so this output is
# presumably stale.
neutralize_2d(labels, size, cs_indus_code , 'torch')

tensor([[ 0.0097,  0.0595,  0.0283,  ...,     nan,     nan,     nan],
        [ 0.0333,  0.0773, -0.0043,  ...,     nan,     nan,     nan],
        [ 0.0517,  0.0252,  0.0309,  ...,     nan,     nan,     nan],
        ...,
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan]])

In [7]:
# Full-panel run with the 'np' backend. NOTE(review): execution count 7 is
# earlier than the cell defining the current neutralize_2d, so this output is
# presumably stale.
neutralize_2d(labels, size, cs_indus_code , 'np')

tensor([[-0.0478,  0.0456, -0.0263,  ...,     nan,     nan,     nan],
        [-0.0368,  0.0450, -0.0094,  ...,     nan,     nan,     nan],
        [-0.0063,  0.0188,  0.0345,  ...,     nan,     nan,     nan],
        ...,
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan]])

In [37]:
# Display the 'torch' result. NOTE(review): recorded at execution count 37,
# before the current comparison cell ran, so the value shown may be stale.
v1

tensor([[0.0097, 0.0595, 0.0283,  ...,    nan,    nan,    nan]])

In [35]:
# Display the 'sk' result. NOTE(review): recorded at execution count 35,
# before the current comparison cell ran.
v2

tensor([[-0.0478,  0.0456, -0.0263,  ...,     nan,     nan,     nan]])

In [36]:
# Display the 'np' result. NOTE(review): recorded at execution count 36,
# before the current comparison cell ran.
v3

tensor([[-0.0478,  0.0456, -0.0263,  ...,     nan,     nan,     nan]])