In [26]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os , torch

# Training set (with labels): read the feather file, drop the columns that are
# deleted before modelling, parse `date` and order rows by (date, StockID).
fctCombTrain = (
    pd.read_feather('CombStdByZXMkt_All_TrainLabel.feather')
    .drop(columns=['ZX', 'mktVal', 'nextRtnM', 'mktValRank'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['date', 'StockID'])
    .reset_index(drop=True)
)

In [27]:
import torch
from torch import Tensor

import numpy as np
import pandas as pd

from typing import Any

class WeakBooster:
    EPS = 1e-6
    def __init__(self, n_bins : int): 
        self.n_bins = n_bins
    def __repr__(self) -> str: 
        return  f'{self.__class__.__name__}(n_bins={self.n_bins})'

    def fit(self, X : Tensor , y : Tensor , weight : Tensor | None = None):
        if weight is None: weight = torch.ones_like(y) / len(y)
        assert isinstance(X , Tensor) and isinstance(y , Tensor) and isinstance(weight , Tensor) , (X , y , weight)
        assert torch.all(X < self.n_bins) , X.max()
        assert not torch.is_floating_point(X) , X.dtype
        self.n_feat = X.shape[-1]

        pos_wgt , neg_wgt = (weight * (y == 1))[:,None] , (weight * (y == -1))[:,None]
        pos_imp , neg_imp = torch.zeros(self.n_feat , self.n_bins) , torch.zeros(self.n_feat , self.n_bins)

        for ibin in range(self.n_bins):
            where = X == ibin
            pos_imp[:,ibin] = (pos_wgt * where).sum(dim = 0)
            neg_imp[:,ibin] = (neg_wgt * where).sum(dim = 0)
        
        feat_imp = (pos_imp + neg_imp).sum(-1,keepdim=True)
        pos_imp , neg_imp = pos_imp / feat_imp , neg_imp / feat_imp
        
        self.feat_losses = torch.sqrt(pos_imp * neg_imp).sum(1)
        self.feat_idx = self.feat_losses.argmin()
        self.bin_predictions = 0.5 * torch.log((pos_imp[self.feat_idx] + self.EPS) / (neg_imp[self.feat_idx] + self.EPS))
        return self
    
    @property
    def min_feat_loss(self): return self.feat_losses[self.feat_idx]

    def predict(self, X : Tensor):
        assert X.shape[-1] == self.n_feat , (X.shape , self.n_feat)
        assert isinstance(X , Tensor) , X
        assert not torch.is_floating_point(X) , X.dtype
        group = X[:, self.feat_idx]
        return torch.where(group >= 0 , self.bin_predictions[group] , torch.nan)
    
class RealAdaBoost:
    def __init__(self, n_booster = 30, n_bins = 20):
        self.n_booster , self.n_bins = n_booster , n_bins
        self.boosters = [WeakBooster(n_bins) for _ in range(n_booster)]

    def __repr__(self) -> str: return f'{self.__class__.__name__}(n_booster={self.n_booster},n_bins={self.n_bins})'
    def __getitem__(self , i): return self.boosters[i]
    
    def update_weight(self , weight : Tensor , y : Tensor , y_pred : Tensor):
        weight = torch.exp(-y * y_pred.nan_to_num(0)) * weight
        return weight / weight.sum()

    def fit(self, X : Tensor , y : Tensor , init_weight : Tensor | None = None):
        weight = self.init_weight(y) if init_weight is None else init_weight
        weight = weight.to(y.device)
        for i , booster in enumerate(self.boosters):  # 使用 tqdm 显示进度条
            y_pred = booster.fit(X , y , weight).predict(X)
            weight = self.update_weight(weight , y , y_pred)
            if i % 5 == 0: print(f'Round: {i+1}, Estimator Error: {booster.min_feat_loss}, feature_idx: {booster.feat_idx}')
        return self
    
    def predict(self, X):
        return torch.stack([booster.predict(X) for booster in self.boosters] , dim = -1)
    
    @property
    def feat_idx(self): return [booster.feat_idx for booster in self.boosters]
    @property
    def feat_err(self): return [booster.min_feat_loss for booster in self.boosters]

    @staticmethod
    def inputX(x : pd.DataFrame , n_bins : int = 20 , date_colname = 'date'):
        X = torch.tensor(x.groupby(date_colname).rank(pct = True).values) * n_bins
        X[X >= n_bins] = n_bins - 1
        return X.nan_to_num(-1).to(torch.int32)
    
    @staticmethod
    def inputY(y : pd.DataFrame): return torch.tensor(y.values.squeeze())

    @staticmethod
    def init_weight(y , top_weight = False , date_weight = False , component_weight = None , 
                    date_colname = 'date' , secid_colname = 'StockID' , halflife = 12): 
        wgt : Any = pd.DataFrame({'weight':1},index=y.index) if isinstance(y , pd.DataFrame) else torch.ones_like(y)

        if top_weight: 
            where = (y > 0).values if isinstance(y , pd.DataFrame) else y > 0
            wgt[where] = wgt[where] * 2

        if date_weight:
            assert isinstance(wgt , pd.DataFrame) and date_colname in wgt.index.names , wgt
            dates = wgt.index.get_level_values(date_colname).unique()
            d_wgt = np.exp(np.log(0.5) * np.flip(np.arange(len(dates))) / halflife)
            date_wgt = pd.DataFrame({date_colname:dates,'weight':d_wgt}).set_index(date_colname)
            wgt = wgt * date_wgt

        if component_weight is not None:
            assert isinstance(wgt , pd.DataFrame) and secid_colname in wgt.index.names , wgt
            where = wgt.index.get_level_values(secid_colname).isin(component_weight)
            wgt[where] = wgt[where] * 2

        if isinstance(wgt , pd.DataFrame):
            wgt = torch.tensor(wgt.values.squeeze())
        return wgt / wgt.sum()
    
# --- Single-window experiment: fit one RealAdaBoost on one look-back window ---
windowsLen = 24                         # look-back length, in months
MDTs = fctCombTrain['date'].unique()    # distinct dates present in the training table
idx = 24                                # position of the last training month in MDTs

resDetails = {}
resFct = pd.DataFrame()

# Window bounds (inclusive) and the month that would be predicted next.
idtStart, idtEnd, idtPredict = MDTs[idx - windowsLen + 1], MDTs[idx], MDTs[idx + 1]
print(f'\n训练区间：{idtStart} - {idtEnd}')

# Rows inside the window, indexed by (date, StockID), keeping only labelled
# samples whose label is non-zero.
fctCombTrainIdt = fctCombTrain.loc[fctCombTrain['date'].between(idtStart, idtEnd)]
fctCombTrainIdt = fctCombTrainIdt.set_index(['date', 'StockID'])
fctCombTrainIdt = fctCombTrainIdt.dropna(subset=['nextRtnM_Label'])
fctCombTrainIdt = fctCombTrainIdt.loc[fctCombTrainIdt['nextRtnM_Label'] != 0]

# Share of missing values per column.
missingSeries = fctCombTrainIdt.isnull().sum().div(len(fctCombTrainIdt))
if missingSeries['nextRtnM_Label'] > 0.2:
    print('Error: nextRtnM_Label 缺失值比例大于0.2')

# Keep only columns with less than 20% missing values; remaining NaNs are left
# in place for RealAdaBoost.inputX to encode.
fctCombTrainIdt = fctCombTrainIdt[missingSeries.index[missingSeries < 0.2]]
assert isinstance(fctCombTrainIdt , pd.DataFrame)

# Positional split: every column but the last is a feature, the last is the label
# (assumes nextRtnM_Label is the final column - TODO confirm against the feather file).
trainX = RealAdaBoost.inputX(fctCombTrainIdt.iloc[:, :-1])
trainY = RealAdaBoost.inputY(fctCombTrainIdt.iloc[:, -1:])
init_weight = RealAdaBoost.init_weight(fctCombTrainIdt.iloc[:, -1:])

model = RealAdaBoost(n_booster=30, n_bins=20)
model.fit(trainX, trainY, init_weight=init_weight)


训练区间：2005-02-28 00:00:00 - 2007-01-31 00:00:00
Round: 1, Estimator Error: 0.4941096007823944, feature_idx: 129
Round: 6, Estimator Error: 0.49925777316093445, feature_idx: 135
Round: 11, Estimator Error: 0.49936383962631226, feature_idx: 5
Round: 16, Estimator Error: 0.4994628131389618, feature_idx: 33
Round: 21, Estimator Error: 0.49956050515174866, feature_idx: 7
Round: 26, Estimator Error: 0.49958834052085876, feature_idx: 90


RealAdaBoost(n_booster=30,n_bins=20)

In [None]:


# Training set (with labels): drop the columns removed before modelling,
# parse `date`, and order rows by (date, StockID).
fctCombTrain = (
    pd.read_feather(os.path.join('input_data', '因子值CombStdByZXMkt_All_TrainLabel.feather'))
    .drop(columns=['ZX', 'mktVal', 'nextRtnM', 'mktValRank'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['date', 'StockID'])
    .reset_index(drop=True)
)

# Test set (no label column is used): same preparation.
fctCombTest = (
    pd.read_feather(os.path.join('input_data', '因子值CombStdByZXMkt_All.feather'))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .drop(columns=['ZX', 'mktVal', 'nextRtnM'])
    .sort_values(['date', 'StockID'])
    .reset_index(drop=True)
)

MDTs = fctCombTest['date'].unique()    # every date present in the test set
IDs = fctCombTest['StockID'].unique()  # every stock id present in the test set


# %% Training 1 - rolling-window training
#
# For every month t with a full look-back window, fit a RealAdaBoost on the
# window ending at t and score the cross-section of month t+1.
# This cell uses the RealAdaBoost API defined in this notebook:
#   * constructor keyword is `n_booster`,
#   * fit/predict take integer bin tensors built by RealAdaBoost.inputX / inputY,
#   * predict returns one (n_samples, n_booster) tensor; the selected feature
#     indices are exposed through `model.feat_idx`.

windowsLen = 24 # look-back length in months

resDetails = dict()
resFctList = []  # one single-column frame per predicted month; concatenated once after the loop
for idx in tqdm(range(windowsLen-1, len(MDTs)-1)):
    idtEnd = MDTs[idx]
    idtStart = MDTs[idx - windowsLen+1]
    idtPredict = MDTs[idx + 1]
    print(f'\n训练区间：{idtStart} - {idtEnd}')
    fctCombTrainIdt = fctCombTrain[(fctCombTrain['date'] >= idtStart) & (fctCombTrain['date'] <= idtEnd)]
    fctCombTrainIdt = fctCombTrainIdt.set_index(['date', 'StockID']).dropna(subset=['nextRtnM_Label'])
    fctCombTrainIdt = fctCombTrainIdt[fctCombTrainIdt['nextRtnM_Label']!=0]
    # Share of missing values per column
    missingSeries = fctCombTrainIdt.isnull().sum() / len(fctCombTrainIdt)
    if missingSeries['nextRtnM_Label'] > 0.2:
        print('Error: nextRtnM_Label 缺失值比例大于0.2')
        break
    # Drop columns with 20% or more missing values. Remaining NaNs are kept:
    # RealAdaBoost.inputX encodes them as the missing bin (-1).
    fctCombTrainIdt = fctCombTrainIdt.loc[:, missingSeries[missingSeries < 0.2].index]
    # Select features by name so train and test share the same column order
    feature_cols = fctCombTrainIdt.columns.drop('nextRtnM_Label')
    # Training tensors
    trainX = RealAdaBoost.inputX(fctCombTrainIdt[feature_cols])
    trainY = RealAdaBoost.inputY(fctCombTrainIdt[['nextRtnM_Label']])
    # Fit the model
    model = RealAdaBoost(n_booster = 30, n_bins = 20)
    model.fit(trainX, trainY)

    # Predict
    print(f'预测区间：{idtPredict}')
    fctCombTestIdtPredict = fctCombTest[fctCombTest['date']==idtPredict]
    fctCombTestIdtPredict = fctCombTestIdtPredict.set_index(['date', 'StockID'])
    # Same feature columns, in the same order, as used for training (no label)
    fctCombTestIdtPredict = fctCombTestIdtPredict.loc[:, feature_cols]
    # Stock ids of this test cross-section
    IDs = fctCombTestIdtPredict.index.get_level_values(1).tolist()
    # Test tensors and per-booster predictions (NaN where the booster's feature is missing)
    testX = RealAdaBoost.inputX(fctCombTestIdtPredict)
    predictions_matrix = model.predict(testX).detach().cpu().numpy()
    feature_list = [int(i) for i in model.feat_idx]

    # Save details
    feature_names = fctCombTestIdtPredict.columns[feature_list]
    resDetails[idtPredict] = {'feature_names': feature_names, 'predictions_matrix': predictions_matrix}
    # Sum the boosters while ignoring NaNs; a total of exactly 0 (e.g. every
    # booster missing) is reported as NaN, as before.
    y_predict = np.nansum(predictions_matrix, axis=1)
    y_predict[y_predict == 0.0] = np.nan
    resFctList.append(pd.DataFrame(y_predict, index=IDs, columns=[idtPredict]))

# Outer-join all months on stock id in one pass instead of merging inside the loop
resFct = pd.concat(resFctList, axis=1).sort_index() if resFctList else pd.DataFrame()

resFctQS = resFct.T
os.makedirs('output_data', exist_ok=True)
resFctQS.to_csv(os.path.join('output_data', 'RealAdaBoost-原始训练.csv'), encoding='utf-8-sig')
