Create test data matrix $M_{T\times N}$

PBO（Probability of Backtest Overfitting）是定量衡量回测过拟合风险的指标，其计算方式基于 Bailey、Borwein、López de Prado 和 Zhu 在 2017 年提出的组合对称交叉验证（Combinatorially-Symmetric Cross-Validation，简记为 CSCV）框架。

PBO 的计算步骤：
1. 构建矩阵 $M_{T\times N}$，其中第 $n$ 列（$n=1,\dots,N$）表示第 $n$ 组策略在 $T$ 期内的收益率序列。


In [None]:
#########################  PBO #####################################

class ProbabilityOfBacktestOverfitting(object):
    """Estimate the Probability of Backtest Overfitting (PBO) with CSCV.

    The return matrix is cut into ``S`` blocks of consecutive periods. Every
    choice of ``S/2`` blocks is used as a training (in-sample) set and the
    remaining blocks as the matching test (out-of-sample) set. For each split
    the column with the best in-sample Sharpe ratio is located and its
    out-of-sample rank is recorded; PBO is the share of splits in which that
    rank falls in the worse half.

    Attributes set by :meth:`get_w`:
        w   -- ``pd.Series`` of out-of-sample percentile ranks, one per split
               (descending rank, so ``1/N`` is the best column and ``1`` the
               worst).
        PBO -- fraction of entries of ``w`` that are greater than 0.5.
    """

    def __init__(self,returns_df:pd.DataFrame,S:int):
        """Store the inputs.

        Parameters
        ----------
        returns_df : pd.DataFrame
            One column per strategy/backtest. Level 0 of the index identifies
            the period; rows are presumably per-period simple returns
            (assumption -- confirm against the caller).
        S : int
            Number of blocks to cut the periods into. Must be even so that
            the training and test halves have the same number of blocks.

        Raises
        ------
        ValueError
            If ``S`` is odd or smaller than 2.
        """
        if S < 2 or S % 2 != 0:
            raise ValueError(f"S must be an even integer >= 2, got {S!r}")

        self.df = returns_df  # return series
        self.S = S  # number of blocks
        self.w = []
        self.PBO = 0

    def get_group_ids(self):
        """Return, for every row of ``self.df``, the label of its block.

        The distinct level-0 index values are numbered in order of first
        appearance and cut into ``S`` *consecutive* blocks of (near-)equal
        length, so each block keeps its periods contiguous. Rows sharing the
        same level-0 value always land in the same block.

        Returns
        -------
        numpy.ndarray
            Integer labels in ``range(S)``, aligned with the rows of
            ``self.df``.
        """
        periods = self.df.index.get_level_values(0)
        unique_periods = periods.drop_duplicates()
        n_periods = len(unique_periods)
        # position * S // T maps positions 0..T-1 onto blocks 0..S-1 in order.
        block_of_period = pd.Series(
            np.arange(n_periods) * self.S // n_periods, index=unique_periods
        )
        return periods.map(block_of_period).values

    def split_df(self)->dict:
        """Split ``self.df`` into its blocks.

        Returns
        -------
        dict
            ``{block_label: sub-DataFrame}`` with the labels produced by
            :meth:`get_group_ids`.
        """
        labels = self.get_group_ids()
        return {label: part for label, part in self.df.groupby(labels)}

    def get_C_set_id(self)->list:
        """Return every combination of ``S/2`` block labels out of ``S``.

        Each element is a tuple of labels that index the dict returned by
        :meth:`split_df`.
        """
        return list(itertools.combinations(range(self.S), self.S // 2))

    @staticmethod
    def get_complement_set_id(target_set:list,S:int)->list:
        """Return the complement of each combination within ``range(S)``.

        Parameters
        ----------
        target_set : list
            Iterable of label collections (e.g. the tuples from
            :meth:`get_C_set_id`).
        S : int
            Total number of blocks.

        Returns
        -------
        list
            One ``set`` per input combination, in the same order.
        """
        all_labels = set(range(S))
        return [all_labels.difference(combo) for combo in target_set]

    def get_w(self):
        """Compute ``self.w`` and ``self.PBO`` over all train/test splits."""
        train_sets = self.get_C_set_id()
        test_sets = self.get_complement_set_id(train_sets, self.S)
        blocks = self.split_df()

        ranks = []
        for i in tqdm(range(len(train_sets)),desc='获取w'):
            train_df = self.get_j_df(blocks, train_sets, i)
            test_df = self.get_j_df(blocks, test_sets, i)

            # Column with the best Sharpe ratio on the training set.
            best_in_sample = self.get_Sharpe_Ratio(train_df).idxmax()
            # Descending percentile rank on the test set: small = good.
            oos_rank = self.get_Sharpe_Ratio(test_df).rank(ascending=False,pct=True)
            ranks.append(oos_rank[best_in_sample])

        self.w = pd.Series(ranks)
        self.PBO = float((self.w > 0.5).mean())

    @staticmethod
    def get_j_df(df_dict:dict,j:list,num:int)->pd.DataFrame:
        """Concatenate the blocks named by the ``num``-th combination in ``j``.

        Labels are visited in sorted order so the result does not depend on
        set iteration order.
        """
        return pd.concat([df_dict[label] for label in sorted(j[num])])

    # Sharpe ratio
    @staticmethod
    def get_Sharpe_Ratio(df:pd.DataFrame,risk_free:float=0.04,periods_per_year:int=250)->pd.Series:
        """Return the annualised Sharpe ratio of every column.

        Parameters
        ----------
        df : pd.DataFrame
            Per-period simple returns, one column per strategy (assumed;
            the compounding below treats values as ``r`` in ``1 + r``).
        risk_free : float, default 0.04
            Annual risk-free rate subtracted from the annualised return.
        periods_per_year : int, default 250
            Number of periods in a year, used to annualise both the return
            and the volatility.

        Returns
        -------
        pd.Series
            Sharpe ratio indexed by column name.
        """
        # Annualise the growth factor, not the total return: raising
        # (growth - 1) to a fractional power is NaN for any losing strategy.
        growth = (1 + df).prod()
        ann_ret = growth ** (periods_per_year / len(df)) - 1
        ann_vol = df.std(ddof=1) * np.sqrt(periods_per_year)
        return (ann_ret - risk_free) / ann_vol
    
# 画图 显示w的分布
def plot_dist_bar(df:pd.DataFrame):
    """Plot the frequency distribution of the relative rank ``w`` per group.

    Parameters
    ----------
    df : pd.DataFrame
        One column per group; column labels must be integers because they are
        formatted with ``%d``. Values are the relative ranks ``w``
        (presumably one column per ``S`` as produced by ``GetW`` -- confirm
        against the caller).

    The values are binned into intervals of width 0.125, the share of each
    bin is computed within every group, and the shares are drawn as grouped
    bars on a new 12x6 matplotlib figure. Nothing is returned.
    """
    df_ = df.copy()
    df_.columns = ['T/S=%d'%i for i in df_.columns]
    # Long format: one row per (original row, group) pair.
    df_ = df_.stack().reset_index(level=1)
    df_.columns = ['group','w']
    df_['r'] = pd.cut(df_['w'],np.arange(0,1.1,0.125),duplicates='drop')
    # Share of each bin inside its group.
    df_ = df_.groupby('group')['r'].apply(lambda x:x.value_counts() / x.value_counts().sum())
    df_.index.names = ['group','g']
    df_ = df_.reset_index()

    plt.figure(figsize=(12,6))
    plt.title('相对排名分布')
    sns.barplot(x='g',y='r',data=df_,hue='group')
    plt.ylabel('频率')
    # Raw string: '\o' is not a valid escape sequence and warns on Python 3.12+.
    plt.xlabel(r'相对排名$\omega$')
    # Ticks run from -1 to 7 so each bar position carries its bin's upper edge.
    plt.xticks(np.arange(-1,8),[f'{x:.2%}'for x in np.arange(0, 1.1, step=0.125)]) # rotation=90


def GetW(ret:pd.DataFrame,step:list):
    """Run the PBO estimation once for every block count in ``step``.

    Parameters
    ----------
    ret : pd.DataFrame
        Backtest returns: index is the date, columns are the backtest labels.
    step : list
        The different values of ``S`` to evaluate.

    Returns
    -------
    tuple
        ``(w_dict, pbo_dict)``, both keyed by ``S``: the ``w`` series and the
        PBO value of the corresponding estimator.
    """
    fitted = []
    for n_blocks in step:
        estimator = ProbabilityOfBacktestOverfitting(ret, n_blocks)
        estimator.get_w()
        fitted.append((n_blocks, estimator))

    w_dict = {n_blocks: estimator.w for n_blocks, estimator in fitted}
    pbo_dict = {n_blocks: estimator.PBO for n_blocks, estimator in fitted}
    return w_dict, pbo_dict