In [7]:
# Load the pbroech6 table from PostgreSQL into the DataFrame `dfpbroech6`.
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Connection settings. The password is deliberately NOT stored in the
# notebook: it is read from the PBROE_DB_PASSWORD environment variable and,
# when that is unset, requested interactively.
username = "panjinhe"
password = os.environ.get("PBROE_DB_PASSWORD") or getpass.getpass(
    f"PostgreSQL password for {username}: "
)
host = "localhost"
port = "5432"
database = "pbroe"

# Table and schema to query
table_name = 'pbroech6'
schema_name = 'pbroe'

# Inclusive month range ('YYYY-MM') used to filter the "trdmnt" column
start_date = '2005-04'
end_date = '2025-03'

# Build the connection string. User name and password are URL-escaped so that
# special characters such as '@' or ':' cannot corrupt the URL.
connection_string = (
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)

# Create the engine
engine = create_engine(connection_string)

# Filter by date range inside the database rather than after loading.
sql_query = f"""
SELECT * FROM {schema_name}.{table_name}
WHERE "trdmnt" >= '{start_date}' AND "trdmnt" <= '{end_date}'
"""
dfpbroech6 = pd.read_sql_query(sql_query, engine)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that only appends a stray "None" to the output).
dfpbroech6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665563 entries, 0 to 665562
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   stkcd       665563 non-null  object 
 1   trdmnt      665563 non-null  object 
 2   accper      665563 non-null  object 
 3   shortname   665563 non-null  object 
 4   if_st       665563 non-null  int64  
 5   indcd1      665563 non-null  object 
 6   indnme1     665563 non-null  object 
 7   price       665563 non-null  float64
 8   market_cap  665563 non-null  float64
 9   PB          663460 non-null  float64
 10  roe_ttm     636635 non-null  float64
dtypes: float64(4), int64(1), object(6)
memory usage: 55.9+ MB
None


In [12]:
import pandas as pd
import numpy as np

def calculate_rolling_roe_avg(df: pd.DataFrame, years: list = None) -> pd.DataFrame:
    """
    Add multi-year average ROE_ttm columns for every stock-month.

    For each row the function looks up the same stock's `roe_ttm` in the same
    calendar month 1, 2, ... N-1 years earlier and averages those values
    together with the current one. Lags are aligned on the calendar month
    (not on row position), so a stock with missing months never borrows a
    value from the wrong month. Lags that are unavailable are skipped by the
    average (pandas `mean` ignores NaN), which means rows with a short history
    average only the years that exist.

    Parameters:
    - df (pd.DataFrame): must contain the columns 'stkcd', 'trdmnt' and
      'roe_ttm'. 'trdmnt' must be parseable by `pd.to_datetime`.
    - years (list): window lengths in years; defaults to [2, 3, 5].

    Returns:
    - pd.DataFrame: a sorted copy of `df` (by 'stkcd', 'trdmnt', with
      'trdmnt' converted to datetime) plus one column 'roe_{X}yr_avg' per
      requested window X. The input frame is not modified.

    Raises:
    - ValueError: if a required column is missing.
    """
    required_cols = ('stkcd', 'trdmnt', 'roe_ttm')
    if any(col not in df.columns for col in required_cols):
        raise ValueError("输入DataFrame必须包含 'stkcd', 'trdmnt', 和 'roe_ttm' 列")

    # Avoid a shared mutable default argument.
    years = [2, 3, 5] if years is None else list(years)

    df_copy = df.copy()
    df_copy['trdmnt'] = pd.to_datetime(df_copy['trdmnt'])
    df_copy = df_copy.sort_values(by=['stkcd', 'trdmnt']).reset_index(drop=True)

    # Month ordinal (year * 12 + month): a lag of i years is exactly 12 * i.
    month_ordinal = df_copy['trdmnt'].dt.year * 12 + df_copy['trdmnt'].dt.month
    keyed = pd.DataFrame({
        'stkcd': df_copy['stkcd'],
        '_month': month_ordinal,
        'roe_ttm': df_copy['roe_ttm'],
    })
    # One value per (stock, month) so the left merges below cannot add rows.
    lookup = (
        keyed.dropna(subset=['_month'])
        .drop_duplicates(subset=['stkcd', '_month'], keep='last')
    )

    max_shift_years = max(years, default=1)
    lagged = {0: df_copy['roe_ttm']}
    for i in range(1, max_shift_years):
        # Move each observation forward by i years, then join it onto the
        # month it is a lag for.
        shifted = (
            lookup.assign(_month=lookup['_month'] + 12 * i)
            .rename(columns={'roe_ttm': '_lag'})
        )
        aligned = keyed[['stkcd', '_month']].merge(
            shifted, on=['stkcd', '_month'], how='left'
        )
        lagged[i] = pd.Series(aligned['_lag'].to_numpy(), index=df_copy.index)

    for year in years:
        n_terms = max(year, 1)  # a window below 1 year degenerates to roe_ttm itself
        window = pd.concat(
            [lagged[i] for i in range(n_terms)], axis=1, keys=range(n_terms)
        )
        df_copy[f'roe_{year}yr_avg'] = window.mean(axis=1)

    print(f"已成功计算并添加 {len(years)} 个滚动平均列: {[f'roe_{y}yr_avg' for y in years]}")
    return df_copy

# --- 1. Compute the rolling averages ---
# Assumes `dfpbroech6` was loaded by an earlier cell.
dfpbroech6_with_avg = calculate_rolling_roe_avg(dfpbroech6)

# --- 2. Tidy up and save ---
print("\n--- 开始保存数据 ---")

# Rename the new columns to the naming scheme used by the downstream cells.
# `rename` returns a new frame, so `dfpbroech6_with_avg` stays untouched.
rename_dict = {
    'roe_2yr_avg': 'roe_avg_2y',
    'roe_3yr_avg': 'roe_avg_3y',
    'roe_5yr_avg': 'roe_avg_5y'
}
df_to_save = dfpbroech6_with_avg.rename(columns=rename_dict)

# Desired output columns, in order
final_columns_format = [
    'stkcd', 'trdmnt', 'accper', 'shortname', 'if_st', 'indcd1', 'indnme1',
    'price', 'market_cap', 'PB', 'total_shares', 'eps_ttm_core',
    'eps_ttm_total', 'roe_ttm', 'total_equity', 'ar', 'inventory',
    'intangibles', 'onaps', 'arps', 'invps', 'iaps', 'year', 'month',
    'roe_avg_2y', 'roe_avg_3y', 'roe_avg_5y'
]

# Keep only the columns that actually exist, preserving the order above,
# so a missing optional column does not raise.
final_columns_existing = [col for col in final_columns_format if col in df_to_save.columns]

# Convert 'trdmnt' back to a 'YYYY-MM-DD' string. `assign` builds a new frame
# instead of writing into a slice of `df_to_save`, which avoids pandas'
# SettingWithCopyWarning / chained-assignment ambiguity.
df_final_output = df_to_save[final_columns_existing].assign(
    trdmnt=lambda frame: frame['trdmnt'].dt.strftime('%Y-%m-%d')
)

# Save to CSV
output_filename = 'pbroe6.2_ROEavg.csv'
df_final_output.to_csv(output_filename, index=False, encoding='utf-8-sig')

print(f"数据已成功保存到文件: {output_filename}")
print(f"共保存 {len(df_final_output)} 行, {len(df_final_output.columns)} 列。")
print("\n--- 保存后数据预览 ---")
print(df_final_output.head())


已成功计算并添加 3 个滚动平均列: ['roe_2yr_avg', 'roe_3yr_avg', 'roe_5yr_avg']

--- 开始保存数据 ---
数据已成功保存到文件: pbroe6.2_ROEavg.csv
共保存 665563 行, 14 列。

--- 保存后数据预览 ---
    stkcd      trdmnt      accper shortname  if_st indcd1 indnme1  price  \
0  000001  2005-04-01  2004-09-30      深发展A      0    J66  货币金融服务   6.20   
1  000001  2005-05-01  2004-12-31      深发展A      0    J66  货币金融服务   6.01   
2  000001  2005-06-01  2005-03-31      深发展A      0    J66  货币金融服务   5.93   
3  000001  2005-07-01  2005-03-31      深发展A      0    J66  货币金融服务   5.93   
4  000001  2005-08-01  2005-03-31      深发展A      0    J66  货币金融服务   6.23   

     market_cap        PB   roe_ttm  roe_avg_2y  roe_avg_3y  roe_avg_5y  
0  1.206410e+10  0.623302  0.097829    0.097829    0.097829    0.097829  
1  1.169439e+10  0.604200  0.067002    0.067002    0.067002    0.067002  
2  1.153873e+10  0.596158  0.050908    0.050908    0.050908    0.050908  
3  1.153873e+10  1.179310  0.050908    0.050908    0.050908    0.050908  
4  1.212247e+10  1.2389

In [13]:
# 计算残差
import pandas as pd
import numpy as np
import statsmodels.api as sm
from pathlib import Path
import os
from joblib import Parallel, delayed

# =================================================================== #
#                        [1. Configuration]                           #
# =================================================================== #
# Input file: the CSV that holds the multi-year average ROE columns
INPUT_FILE = Path('./pbroe6.2_ROEavg.csv')

# Output directory and file name for the final result; the directory is
# created at import/cell-run time if it does not exist yet
OUTPUT_DIR = Path('./')
OUTPUT_FILENAME = 'pbroe6.2_avg_roe_residuals.csv'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Multi-year average ROE columns to run the regressions on
AVG_ROE_COLS = ['roe_avg_2y', 'roe_avg_3y', 'roe_avg_5y']

# Rolling window lengths (in rows per stock) for the time-series quantile;
# each value appears as "{period}m" in the generated column name
QUANTILE_PERIODS = [10]


# =================================================================== #
#                       【2. 核心计算函数】                         #
# =================================================================== #

def run_regression_for_avg_roe(df: pd.DataFrame, avg_roe_col: str) -> pd.DataFrame:
    """
    Regress ln(PB) on one average-ROE column within every (month, industry)
    group and return the standardized residuals.

    Rows with a missing ROE / PB / industry value or a non-positive PB are
    excluded. Groups with fewer than 10 rows, and groups whose fit raises,
    produce no residuals. When the residual standard deviation is not above
    1e-6 the z-score is set to 0.0.

    Returns a slim frame with the columns 'stkcd', 'trdmnt' and
    'residual_zscore_<suffix>' (suffix = `avg_roe_col` without 'roe_'), or an
    empty DataFrame when no row survives the filters.
    """
    print(f"\n--- 开始为 '{avg_roe_col}' 执行回归 ---")

    sample = df.dropna(subset=[avg_roe_col, 'PB', 'indnme1'])
    sample = sample[sample['PB'] > 0]
    sample = sample.assign(lnPB=np.log(sample['PB']))

    if sample.empty:
        print(f"警告: 为 '{avg_roe_col}' 准备数据后，无有效记录可供回归。")
        return pd.DataFrame()

    def _zscore_residuals(group, roe_col):
        # Too few observations for a meaningful cross-sectional fit.
        if len(group) < 10:
            return group.assign(residual_zscore=np.nan)
        try:
            design = sm.add_constant(group[roe_col])
            fitted = sm.OLS(group['lnPB'], design, missing='drop').fit()
            resid = fitted.resid
            mu, sigma = resid.mean(), resid.std()
            zscore = (resid - mu) / sigma if sigma > 1e-6 else 0.0
            return group.assign(residual_zscore=zscore)
        except Exception:
            # Best effort: a failed group simply yields no residuals.
            return group.assign(residual_zscore=np.nan)

    per_group = sample.groupby(['trdmnt', 'indnme1']).apply(
        _zscore_residuals, roe_col=avg_roe_col, include_groups=False
    )
    per_group = per_group.dropna(subset=['residual_zscore'])

    new_resid_col_name = f"residual_zscore_{avg_roe_col.replace('roe_', '')}"
    per_group = per_group.rename(columns={'residual_zscore': new_resid_col_name})

    print(f"回归完成，为 '{avg_roe_col}' 计算出 {len(per_group)} 条有效残差。")

    # The group keys live in the index after apply; bring 'trdmnt' back as a
    # column and return only the merge keys plus the new residual column.
    flat = per_group.reset_index()
    return flat[['stkcd', 'trdmnt', new_resid_col_name]]


def calculate_quantiles_for_single_stock(stock_df, residual_col, periods):
    """
    Add rolling percentile-rank columns for one stock's residual series.

    The rows are sorted by 'trdmnt'; for every window length in `periods` the
    latest value is ranked (as a fraction) within the trailing window, with
    windows allowed to be shorter at the start (min_periods=1). Each new
    column is named 'residual_quantile_{period}m_{suffix}', where the suffix
    is the part of `residual_col` after its last underscore.

    Returns a sorted copy; the input frame is not modified.
    """
    def _pct_rank_of_latest(window):
        return window.rank(pct=True).iloc[-1]

    suffix = residual_col.split('_')[-1]
    ordered = stock_df.sort_values(by='trdmnt').copy()
    for period in periods:
        trailing = ordered[residual_col].rolling(window=period, min_periods=1)
        ordered[f"residual_quantile_{period}m_{suffix}"] = trailing.apply(
            _pct_rank_of_latest, raw=False
        )
    return ordered


def calculate_time_series_quantiles_parallel(df: pd.DataFrame, residual_col: str, periods: list) -> pd.DataFrame:
    """
    Compute the rolling residual quantiles for every stock, one joblib task
    per stock.

    Only rows where 'stkcd', 'trdmnt' and `residual_col` are all present are
    used. Returns the per-stock results concatenated and sorted by
    ('stkcd', 'trdmnt'), or an empty DataFrame when there is nothing to rank.
    """
    print(f"\n--- 开始为 '{residual_col}' 计算时序分位数 (并行) ---")

    valid_rows = df[['stkcd', 'trdmnt', residual_col]].dropna().copy()
    if valid_rows.empty:
        print(f"警告: 列 '{residual_col}' 没有有效数据，无法计算分位数。")
        return pd.DataFrame()

    # Reported for information only; joblib picks the workers via n_jobs=-1.
    num_cores = os.cpu_count() or 1
    print(f"使用 {num_cores} 个CPU核心进行并行计算。")

    worker = delayed(calculate_quantiles_for_single_stock)
    tasks = [
        worker(one_stock, residual_col, periods)
        for _, one_stock in valid_rows.groupby('stkcd')
    ]
    per_stock = Parallel(n_jobs=-1)(tasks)

    combined = pd.concat(per_stock)
    combined = combined.sort_values(by=['stkcd', 'trdmnt']).reset_index(drop=True)
    print(f"时序分位数计算完成。")
    return combined


# =================================================================== #
#                          【3. 主函数执行】                          #
# =================================================================== #

def main():
    """
    Run the residual pipeline: load INPUT_FILE, then for every column in
    AVG_ROE_COLS compute the regression residuals and their rolling
    time-series quantiles, merge both back onto the main frame, and finally
    write everything to OUTPUT_DIR / OUTPUT_FILENAME.
    """
    print(f"--- 开始处理文件: {INPUT_FILE} ---")
    try:
        df_main = pd.read_csv(INPUT_FILE)
        # Normalise the keys: month as 'YYYY-MM' text, stock code as 6 digits.
        df_main = df_main.assign(
            trdmnt=pd.to_datetime(df_main['trdmnt']).dt.to_period('M').astype(str),
            stkcd=df_main['stkcd'].astype(str).str.zfill(6),
        )
    except FileNotFoundError:
        print(f"错误: 找不到输入文件 {INPUT_FILE}。请检查文件路径。")
        return

    merge_keys = ['stkcd', 'trdmnt']
    for avg_roe_col in AVG_ROE_COLS:
        # Step 1: cross-sectional regression residuals for this ROE column.
        df_residuals = run_regression_for_avg_roe(df_main, avg_roe_col)
        if df_residuals.empty:
            continue

        df_main = df_main.merge(df_residuals, on=merge_keys, how='left')

        # Step 2: rolling quantiles of the residual column just merged in.
        # `df_residuals` carries exactly one residual column, so take it.
        residual_cols = [col for col in df_residuals.columns if 'residual_zscore' in col]
        new_resid_col_name = residual_cols[0]
        df_quantiles = calculate_time_series_quantiles_parallel(
            df_main, new_resid_col_name, QUANTILE_PERIODS
        )
        if df_quantiles.empty:
            continue

        # The residual column is already in df_main; merge only the quantiles.
        df_quantiles = df_quantiles.drop(columns=[new_resid_col_name])
        df_main = df_main.merge(df_quantiles, on=merge_keys, how='left')

    output_path = OUTPUT_DIR / OUTPUT_FILENAME
    try:
        # Keys first, every other column afterwards in its current order.
        other_columns = [col for col in df_main.columns if col not in ['stkcd', 'trdmnt']]
        df_main = df_main[['stkcd', 'trdmnt'] + other_columns]

        df_main.to_csv(output_path, index=False, encoding='utf-8-sig', float_format='%.6f')
        print(f"\n--- 全部计算完成！---")
        print(f"最终数据已保存至: {output_path}")
        print("\n最终文件内容预览:")
        quantile_columns = [col for col in df_main.columns if 'quantile' in col]
        print(df_main.dropna(subset=quantile_columns).tail())
    except Exception as e:
        print(f"保存文件时出错: {e}")

if __name__ == "__main__":
    main()


--- 开始处理文件: pbroe6.2_ROEavg.csv ---

--- 开始为 'roe_avg_2y' 执行回归 ---
回归完成，为 'roe_avg_2y' 计算出 607734 条有效残差。

--- 开始为 'residual_zscore_avg_2y' 计算时序分位数 (并行) ---
使用 192 个CPU核心进行并行计算。
时序分位数计算完成。

--- 开始为 'roe_avg_3y' 执行回归 ---
回归完成，为 'roe_avg_3y' 计算出 607843 条有效残差。

--- 开始为 'residual_zscore_avg_3y' 计算时序分位数 (并行) ---
使用 192 个CPU核心进行并行计算。
时序分位数计算完成。

--- 开始为 'roe_avg_5y' 执行回归 ---
回归完成，为 'roe_avg_5y' 计算出 607854 条有效残差。

--- 开始为 'residual_zscore_avg_5y' 计算时序分位数 (并行) ---
使用 192 个CPU核心进行并行计算。
时序分位数计算完成。

--- 全部计算完成！---
最终数据已保存至: pbroe6.2_avg_roe_residuals.csv

最终文件内容预览:
         stkcd   trdmnt      accper shortname  if_st indcd1           indnme1  \
665558  689009  2024-11  2024-06-30      九号公司      0    C39  计算机、通信和其他电子设备制造业   
665559  689009  2024-12  2024-09-30      九号公司      0    C39  计算机、通信和其他电子设备制造业   
665560  689009  2025-01  2024-09-30      九号公司      0    C39  计算机、通信和其他电子设备制造业   
665561  689009  2025-02  2024-09-30      九号公司      0    C39  计算机、通信和其他电子设备制造业   
665562  689009  2025-03  2024-09-30

In [17]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
from joblib import Parallel, delayed

def build_pbroe6_2_portfolios(
    data_file: Path,
    strategy_configs: dict,
    cs_quantile: float = 0.2,
    ts_threshold: float = 0.1
) -> dict:
    """
    Build per-month stock selections from the residual and quantile signals.

    A stock is selected for a rebalance date when its residual is at or below
    the `cs_quantile` cross-sectional quantile of that date AND its
    time-series quantile is at or below `ts_threshold`. The rebalance date is
    the signal month ('trdmnt') plus one month.

    Args:
        data_file (Path): CSV with the pre-computed signal columns.
        strategy_configs (dict): strategy name -> {'residual': column,
            'quantile': column}.
        cs_quantile (float): cross-sectional residual quantile cut-off.
        ts_threshold (float): time-series quantile cut-off.

    Returns:
        dict: {strategy name: {rebalance date: set of stock codes}}. Dates
        with an empty selection are omitted. An empty dict is returned when
        `data_file` does not exist.
    """
    print(f"--- 步骤 1: 构建 PBROE 6.2 系列投资组合 ---")
    try:
        df = pd.read_csv(data_file)
        df['trdmnt'] = pd.to_datetime(df['trdmnt'])
        # Signals observed in month t are traded in month t + 1.
        df['调入日期'] = df['trdmnt'] + pd.DateOffset(months=1)
        df['stkcd'] = df['stkcd'].astype(str).str.zfill(6)
        print(f"数据文件 '{data_file.name}' 加载成功。")
    except FileNotFoundError:
        print(f"错误: 找不到数据文件 {data_file}。")
        return {}

    all_selections = {}
    for strategy_name, configs in strategy_configs.items():
        resid_col, quant_col = configs['residual'], configs['quantile']
        print(f"  -> 正在为策略 '{strategy_name}' 构建持仓...")

        candidates = df[['调入日期', 'stkcd', resid_col, quant_col]].dropna().copy()

        selections_dict = {}
        for date, month_slice in candidates.groupby('调入日期'):
            residual_cutoff = month_slice[resid_col].quantile(cs_quantile)
            cheap_in_cross_section = set(
                month_slice.loc[month_slice[resid_col] <= residual_cutoff, 'stkcd']
            )
            cheap_in_own_history = set(
                month_slice.loc[month_slice[quant_col] <= ts_threshold, 'stkcd']
            )

            picked = cheap_in_cross_section & cheap_in_own_history
            if picked:
                selections_dict[date] = picked

        all_selections[strategy_name] = selections_dict
        print(f"     已为 {len(selections_dict)} 个调仓日构建投资组合。")

    # Report how many distinct rebalance dates the strategies cover in total.
    total_unique_dates = set().union(
        *(selections.keys() for selections in all_selections.values())
    )
    print(f"\n构建投资组合完成。所有策略共涉及 {len(total_unique_dates)} 个唯一的调仓日期。")

    return all_selections


def _backtest_single_strategy_job(strategy_name: str, selections: dict, backtest_months: pd.DatetimeIndex, returns_df: pd.DataFrame) -> pd.Series:
    """
    【新增】用于并行计算的辅助函数，回测单个策略。
    """
    print(f"  -> 开始回测策略: {strategy_name}")
    portfolio_map = pd.Series(index=backtest_months, dtype='object')
    rebalance_dates = sorted(selections.keys())
    for month in backtest_months:
        applicable_date = next((d for d in reversed(rebalance_dates) if d <= month), None)
        if applicable_date:
            portfolio_map[month] = selections[applicable_date]

    monthly_returns = []
    for month, stocks in portfolio_map.items():
        if not stocks:
            monthly_returns.append(0.0)
            continue

        month_str = month.strftime('%Y-%m')
        current_returns = returns_df[
            (returns_df['Trdmnt'] == month_str) &
            (returns_df['Stkcd'].isin(stocks))
        ]
        avg_return = current_returns['Mretwd'].mean() if not current_returns.empty else 0.0
        monthly_returns.append(avg_return)

    print(f"  <- 完成回测策略: {strategy_name}")
    return pd.Series(monthly_returns, index=backtest_months, name=f"return_{strategy_name}")


def run_multi_strategy_backtest_parallel(all_selections: dict, returns_df: pd.DataFrame, start_date: str, end_date: str) -> pd.DataFrame:
    """
    Back-test every strategy in `all_selections`, one joblib task each.

    The back-test months are the month starts between `start_date` and
    `end_date`. Returns a frame with one 'return_<strategy>' column per
    strategy, indexed by those months.
    """
    print("\n--- 步骤 2: 执行多策略向量化回测 (并行加速) ---")
    month_starts = pd.date_range(start_date, end_date, freq='MS')
    backtest_months = pd.to_datetime(month_starts)

    # Reported for information only; joblib picks the workers via n_jobs=-1.
    num_cores = os.cpu_count() or 1
    print(f"使用 {num_cores} 个CPU核心进行并行回测。")

    run_one = delayed(_backtest_single_strategy_job)
    tasks = [
        run_one(name, picks, backtest_months, returns_df)
        for name, picks in all_selections.items()
    ]
    per_strategy = Parallel(n_jobs=-1)(tasks)

    all_returns_df = pd.concat(per_strategy, axis=1)
    print(f"回测完成，已为 {len(all_selections)} 个策略生成 {len(all_returns_df)} 条月度收益记录。")
    return all_returns_df


def calculate_and_save_performance_multi(
    all_returns_df: pd.DataFrame,
    benchmark_df: pd.DataFrame,
    all_selections: dict,
    risk_free_rate: float,
    returns_output_file: Path,
    performance_output_file: Path
):
    """
    Compute performance metrics for every strategy and write two CSV files.

    Args:
        all_returns_df: monthly returns, one 'return_<strategy>' column per
            strategy, indexed by month.
        benchmark_df: frame with the columns 'date' and 'benchmark_return'.
        all_selections: {strategy: {rebalance date: set of stock codes}};
            used for the turnover estimate.
        risk_free_rate: annual risk-free rate used in the Sharpe ratio.
        returns_output_file: destination of the monthly/cumulative returns.
        performance_output_file: destination of the formatted metric table.

    Raises:
        ValueError: if there are no monthly records to evaluate.

    Notes:
        Missing returns (including months without benchmark data) are treated
        as 0. Turnover is one-sided: the share of the previous portfolio's
        names that were dropped, averaged over rebalances and multiplied by
        12. With no strategies the report contains the benchmark row only.
    """
    print("\n--- 步骤 3: 计算并保存各策略绩效指标 ---")

    results = all_returns_df.join(benchmark_df.set_index('date'), how='left').fillna(0)

    total_months = len(results)
    if total_months == 0:
        raise ValueError("月度收益数据为空，无法计算绩效指标。")

    # Loop-invariant, and also needed after the loop for the benchmark row,
    # so it must not depend on the loop body having run at least once.
    annualized_benchmark_return = (1 + results['benchmark_return']).prod() ** (12 / total_months) - 1

    metric_columns = [
        '策略名称', '年化收益率', '年化波动率', '夏普比率', '最大回撤', '年化换手率',
        '累计收益率', '年化超额收益率', '信息比率', '跟踪误差',
    ]
    all_metrics = []

    for strategy_name, selections in all_selections.items():
        return_col = f'return_{strategy_name}'
        cum_col = f'cum_{strategy_name}'
        results[cum_col] = (1 + results[return_col]).cumprod()

        final_return = results[cum_col].iloc[-1]
        annualized_return = final_return ** (12 / total_months) - 1
        annualized_volatility = results[return_col].std() * np.sqrt(12)
        sharpe_ratio = (annualized_return - risk_free_rate) / annualized_volatility if annualized_volatility != 0 else 0

        # Drawdown relative to the running peak of the cumulative curve.
        rolling_max = results[cum_col].expanding().max()
        drawdown = (results[cum_col] - rolling_max) / rolling_max
        max_drawdown = drawdown.min()

        excess_return = results[return_col] - results['benchmark_return']
        annualized_excess_return = annualized_return - annualized_benchmark_return
        tracking_error = excess_return.std() * np.sqrt(12)
        information_ratio = annualized_excess_return / tracking_error if tracking_error != 0 else 0

        turnover_list = []
        rebalance_dates = sorted(selections.keys())
        for i in range(1, len(rebalance_dates)):
            prev_portfolio = selections.get(rebalance_dates[i-1], set())
            curr_portfolio = selections.get(rebalance_dates[i], set())
            if not prev_portfolio:
                continue
            stocks_sold = len(prev_portfolio - curr_portfolio)
            turnover_list.append(stocks_sold / len(prev_portfolio))
        annual_turnover = np.mean(turnover_list) * 12 if turnover_list else 0.0

        all_metrics.append({
            '策略名称': strategy_name,
            '年化收益率': annualized_return, '年化波动率': annualized_volatility, '夏普比率': sharpe_ratio,
            '最大回撤': max_drawdown, '年化换手率': annual_turnover, '累计收益率': final_return - 1,
            '年化超额收益率': annualized_excess_return, '信息比率': information_ratio, '跟踪误差': tracking_error,
        })

    # Explicit columns keep the table well-formed even with zero strategies.
    performance_df = pd.DataFrame(all_metrics, columns=metric_columns).set_index('策略名称')
    performance_df.loc['基准 (沪深300)', '年化收益率'] = annualized_benchmark_return

    results.to_csv(returns_output_file, encoding='utf-8-sig', float_format='%.6f')
    print(f"月度收益数据已保存至: {returns_output_file}")

    formatted_performance_df = performance_df.copy()
    percent_cols = ['年化收益率', '年化波动率', '最大回撤', '年化换手率', '累计收益率', '年化超额收益率', '跟踪误差']
    for col in percent_cols:
        formatted_performance_df[col] = formatted_performance_df[col].apply(lambda x: f"{x:.2%}" if pd.notna(x) else '-')

    float_cols = ['夏普比率', '信息比率']
    for col in float_cols:
        formatted_performance_df[col] = formatted_performance_df[col].apply(lambda x: f"{x:.2f}" if pd.notna(x) else '-')

    formatted_performance_df.to_csv(performance_output_file, encoding='utf-8-sig')
    print(f"格式化的绩效评估报告已保存至: {performance_output_file}")

    print("\n--- 各策略绩效对比简报 ---")
    print(formatted_performance_df)


def main():
    """
    Entry point of the combined-signal back-test: build the portfolios, load
    the monthly stock returns and the benchmark series, run the back-test and
    write the return and performance reports.
    """
    # --- Configuration ---
    data_dir = Path("E:/PBROE/data")
    ch6_dir = Path("E:/PBROE/ch6")

    signal_file = ch6_dir / 'pbroe6.2_avg_roe_residuals.csv'
    stock_returns_file = data_dir / 'TRDNEW_Mnth.csv'
    benchmark_file = data_dir / 'benchmark_indices.csv'

    returns_output_file = ch6_dir / 'pbroe6.2_returns.csv'
    performance_output_file = ch6_dir / 'pbroe6.2_performance.csv'

    # One strategy per averaging window; each pairs a residual column with
    # the matching rolling-quantile column.
    strategy_configs = {
        f'pbroe6.2_avg_{window}': {
            'residual': f'residual_zscore_avg_{window}',
            'quantile': f'residual_quantile_10m_{window}',
        }
        for window in ('2y', '3y', '5y')
    }

    backtest_start_date = '2010-05-01'
    backtest_end_date = '2025-04-30'
    benchmark_code = '000300'
    risk_free_rate = 0.03

    try:
        all_selections = build_pbroe6_2_portfolios(signal_file, strategy_configs)
        if not all_selections:
            return

        # Monthly stock returns: 6-digit codes, 'YYYY-MM' months, and
        # unparseable or missing returns treated as 0.
        returns_df = pd.read_csv(stock_returns_file)
        returns_df['Stkcd'] = returns_df['Stkcd'].astype(str).str.zfill(6)
        returns_df['Trdmnt'] = pd.to_datetime(returns_df['Trdmnt']).dt.strftime('%Y-%m')
        returns_df['Mretwd'] = pd.to_numeric(returns_df['Mretwd'], errors='coerce').fillna(0)

        # Benchmark series: keep only the chosen index code.
        all_benchmarks_df = pd.read_csv(benchmark_file)
        is_benchmark = all_benchmarks_df['Indexcd'].astype(str).str.zfill(6) == benchmark_code
        benchmark_df = all_benchmarks_df[is_benchmark].copy()
        benchmark_df['date'] = pd.to_datetime(benchmark_df['Month'], format='%Y-%m')
        benchmark_df = benchmark_df.rename(columns={'Idxrtn': 'benchmark_return'})
        benchmark_df = benchmark_df[['date', 'benchmark_return']]

        all_returns_df = run_multi_strategy_backtest_parallel(
            all_selections, returns_df, backtest_start_date, backtest_end_date
        )

        calculate_and_save_performance_multi(
            all_returns_df, benchmark_df, all_selections, risk_free_rate,
            returns_output_file, performance_output_file
        )
        print("\n--- PBROE 6.2 (基于多年平均ROE) 策略回测完成！ ---")

    except Exception as e:
        import traceback
        print(f"\n执行过程中出现严重错误: {e}")
        traceback.print_exc()

if __name__ == "__main__":
    main()

--- 步骤 1: 构建 PBROE 6.2 系列投资组合 ---
数据文件 'pbroe6.2_avg_roe_residuals.csv' 加载成功。
  -> 正在为策略 'pbroe6.2_avg_2y' 构建持仓...
     已为 231 个调仓日构建投资组合。
  -> 正在为策略 'pbroe6.2_avg_3y' 构建持仓...
     已为 231 个调仓日构建投资组合。
  -> 正在为策略 'pbroe6.2_avg_5y' 构建持仓...
     已为 231 个调仓日构建投资组合。

构建投资组合完成。所有策略共涉及 231 个唯一的调仓日期。

--- 步骤 2: 执行多策略向量化回测 (并行加速) ---
使用 192 个CPU核心进行并行回测。
回测完成，已为 3 个策略生成 180 条月度收益记录。

--- 步骤 3: 计算并保存各策略绩效指标 ---
月度收益数据已保存至: E:\PBROE\ch6\pbroe6.2_returns.csv
格式化的绩效评估报告已保存至: E:\PBROE\ch6\pbroe6.2_performance.csv

--- 各策略绩效对比简报 ---
                  年化收益率   年化波动率  夏普比率     最大回撤    年化换手率    累计收益率 年化超额收益率  \
策略名称                                                                       
pbroe6.2_avg_2y  16.01%  27.02%  0.48  -36.53%  704.76%  827.46%  14.62%   
pbroe6.2_avg_3y  17.32%  27.12%  0.53  -36.17%  709.62%  997.78%  15.93%   
pbroe6.2_avg_5y  17.14%  26.95%  0.52  -35.82%  712.51%  972.84%  15.75%   
基准 (沪深300)        1.39%       -     -        -        -        -       -   

                 信息比

In [14]:
# 残差单独
import pandas as pd
import numpy as np
from pathlib import Path
import os
from joblib import Parallel, delayed

def build_pbroe6_2_pure_residual_portfolios(
    data_file: Path,
    strategy_configs: dict,
    cs_quantile: float = 0.1
) -> dict:
    """
    Build per-month stock selections from the cross-sectional residual only.

    A stock is selected for a rebalance date when its residual is at or below
    the `cs_quantile` quantile of that date's residuals. The rebalance date
    is the signal month ('trdmnt') plus one month.

    Args:
        data_file (Path): CSV with the pre-computed signal columns.
        strategy_configs (dict): strategy name -> {'residual': column}.
        cs_quantile (float): cross-sectional residual quantile cut-off.

    Returns:
        dict: {strategy name: {rebalance date: set of stock codes}}. Dates
        with an empty selection are omitted. An empty dict is returned when
        `data_file` does not exist.
    """
    print(f"--- 步骤 1: 构建 PBROE 6.2 (纯残差) 系列投资组合 ---")
    try:
        df = pd.read_csv(data_file)
        df['trdmnt'] = pd.to_datetime(df['trdmnt'])
        # Signals observed in month t are traded in month t + 1.
        df['调入日期'] = df['trdmnt'] + pd.DateOffset(months=1)
        df['stkcd'] = df['stkcd'].astype(str).str.zfill(6)
        print(f"数据文件 '{data_file.name}' 加载成功。")
    except FileNotFoundError:
        print(f"错误: 找不到数据文件 {data_file}。")
        return {}

    all_selections = {}
    for strategy_name, configs in strategy_configs.items():
        resid_col = configs['residual']
        print(f"  -> 正在为策略 '{strategy_name}' 构建持仓...")

        # Only the residual column is required; no time-series quantile.
        candidates = df[['调入日期', 'stkcd', resid_col]].dropna().copy()

        selections_dict = {}
        for date, month_slice in candidates.groupby('调入日期'):
            residual_cutoff = month_slice[resid_col].quantile(cs_quantile)
            picked = set(
                month_slice.loc[month_slice[resid_col] <= residual_cutoff, 'stkcd']
            )
            if picked:
                selections_dict[date] = picked

        all_selections[strategy_name] = selections_dict
        print(f"     已为 {len(selections_dict)} 个调仓日构建投资组合。")

    # Report how many distinct rebalance dates the strategies cover in total.
    total_unique_dates = set().union(
        *(selections.keys() for selections in all_selections.values())
    )
    print(f"\n构建投资组合完成。所有策略共涉及 {len(total_unique_dates)} 个唯一的调仓日期。")

    return all_selections


def _backtest_single_strategy_job(strategy_name: str, selections: dict, backtest_months: pd.DatetimeIndex, returns_df: pd.DataFrame) -> pd.Series:
    """
    用于并行计算的辅助函数，回测单个策略。
    """
    print(f"  -> 开始回测策略: {strategy_name}")
    portfolio_map = pd.Series(index=backtest_months, dtype='object')
    rebalance_dates = sorted(selections.keys())
    for month in backtest_months:
        applicable_date = next((d for d in reversed(rebalance_dates) if d <= month), None)
        if applicable_date:
            portfolio_map[month] = selections[applicable_date]

    monthly_returns = []
    for month, stocks in portfolio_map.items():
        if not stocks:
            monthly_returns.append(0.0)
            continue

        month_str = month.strftime('%Y-%m')
        current_returns = returns_df[
            (returns_df['Trdmnt'] == month_str) &
            (returns_df['Stkcd'].isin(stocks))
        ]
        avg_return = current_returns['Mretwd'].mean() if not current_returns.empty else 0.0
        monthly_returns.append(avg_return)

    print(f"  <- 完成回测策略: {strategy_name}")
    return pd.Series(monthly_returns, index=backtest_months, name=f"return_{strategy_name}")


def run_multi_strategy_backtest_parallel(all_selections: dict, returns_df: pd.DataFrame, start_date: str, end_date: str) -> pd.DataFrame:
    """
    Back-test every strategy in `all_selections`, one joblib task each.

    The back-test months are the month starts between `start_date` and
    `end_date`. Returns a frame with one 'return_<strategy>' column per
    strategy, indexed by those months.
    """
    print("\n--- 步骤 2: 执行多策略向量化回测 (并行加速) ---")
    month_starts = pd.date_range(start_date, end_date, freq='MS')
    backtest_months = pd.to_datetime(month_starts)

    # Reported for information only; joblib picks the workers via n_jobs=-1.
    num_cores = os.cpu_count() or 1
    print(f"使用 {num_cores} 个CPU核心进行并行回测。")

    run_one = delayed(_backtest_single_strategy_job)
    tasks = [
        run_one(name, picks, backtest_months, returns_df)
        for name, picks in all_selections.items()
    ]
    per_strategy = Parallel(n_jobs=-1)(tasks)

    all_returns_df = pd.concat(per_strategy, axis=1)
    print(f"回测完成，已为 {len(all_selections)} 个策略生成 {len(all_returns_df)} 条月度收益记录。")
    return all_returns_df


def calculate_and_save_performance_multi(
    all_returns_df: pd.DataFrame,
    benchmark_df: pd.DataFrame,
    all_selections: dict,
    risk_free_rate: float,
    returns_output_file: Path,
    performance_output_file: Path
):
    """
    Compute performance metrics for every strategy and write two CSV files.

    Args:
        all_returns_df: monthly returns, one 'return_<strategy>' column per
            strategy, indexed by month.
        benchmark_df: frame with the columns 'date' and 'benchmark_return'.
        all_selections: {strategy: {rebalance date: set of stock codes}};
            used for the turnover estimate.
        risk_free_rate: annual risk-free rate used in the Sharpe ratio.
        returns_output_file: destination of the monthly/cumulative returns.
        performance_output_file: destination of the formatted metric table.

    Raises:
        ValueError: if there are no monthly records to evaluate.

    Notes:
        Missing returns (including months without benchmark data) are treated
        as 0. Turnover is one-sided: the share of the previous portfolio's
        names that were dropped, averaged over rebalances and multiplied by
        12. With no strategies the report contains the benchmark row only.
    """
    print("\n--- 步骤 3: 计算并保存各策略绩效指标 ---")

    results = all_returns_df.join(benchmark_df.set_index('date'), how='left').fillna(0)

    total_months = len(results)
    if total_months == 0:
        raise ValueError("月度收益数据为空，无法计算绩效指标。")

    # Loop-invariant, and also needed after the loop for the benchmark row,
    # so it must not depend on the loop body having run at least once.
    annualized_benchmark_return = (1 + results['benchmark_return']).prod() ** (12 / total_months) - 1

    metric_columns = [
        '策略名称', '年化收益率', '年化波动率', '夏普比率', '最大回撤', '年化换手率',
        '累计收益率', '年化超额收益率', '信息比率', '跟踪误差',
    ]
    all_metrics = []

    for strategy_name, selections in all_selections.items():
        return_col = f'return_{strategy_name}'
        cum_col = f'cum_{strategy_name}'
        results[cum_col] = (1 + results[return_col]).cumprod()

        final_return = results[cum_col].iloc[-1]
        annualized_return = final_return ** (12 / total_months) - 1
        annualized_volatility = results[return_col].std() * np.sqrt(12)
        sharpe_ratio = (annualized_return - risk_free_rate) / annualized_volatility if annualized_volatility != 0 else 0

        # Drawdown relative to the running peak of the cumulative curve.
        rolling_max = results[cum_col].expanding().max()
        drawdown = (results[cum_col] - rolling_max) / rolling_max
        max_drawdown = drawdown.min()

        excess_return = results[return_col] - results['benchmark_return']
        annualized_excess_return = annualized_return - annualized_benchmark_return
        tracking_error = excess_return.std() * np.sqrt(12)
        information_ratio = annualized_excess_return / tracking_error if tracking_error != 0 else 0

        turnover_list = []
        rebalance_dates = sorted(selections.keys())
        for i in range(1, len(rebalance_dates)):
            prev_portfolio = selections.get(rebalance_dates[i-1], set())
            curr_portfolio = selections.get(rebalance_dates[i], set())
            if not prev_portfolio:
                continue
            stocks_sold = len(prev_portfolio - curr_portfolio)
            turnover_list.append(stocks_sold / len(prev_portfolio))
        annual_turnover = np.mean(turnover_list) * 12 if turnover_list else 0.0

        all_metrics.append({
            '策略名称': strategy_name,
            '年化收益率': annualized_return, '年化波动率': annualized_volatility, '夏普比率': sharpe_ratio,
            '最大回撤': max_drawdown, '年化换手率': annual_turnover, '累计收益率': final_return - 1,
            '年化超额收益率': annualized_excess_return, '信息比率': information_ratio, '跟踪误差': tracking_error,
        })

    # Explicit columns keep the table well-formed even with zero strategies.
    performance_df = pd.DataFrame(all_metrics, columns=metric_columns).set_index('策略名称')
    performance_df.loc['基准 (沪深300)', '年化收益率'] = annualized_benchmark_return

    results.to_csv(returns_output_file, encoding='utf-8-sig', float_format='%.6f')
    print(f"月度收益数据已保存至: {returns_output_file}")

    formatted_performance_df = performance_df.copy()
    percent_cols = ['年化收益率', '年化波动率', '最大回撤', '年化换手率', '累计收益率', '年化超额收益率', '跟踪误差']
    for col in percent_cols:
        formatted_performance_df[col] = formatted_performance_df[col].apply(lambda x: f"{x:.2%}" if pd.notna(x) else '-')

    float_cols = ['夏普比率', '信息比率']
    for col in float_cols:
        formatted_performance_df[col] = formatted_performance_df[col].apply(lambda x: f"{x:.2f}" if pd.notna(x) else '-')

    formatted_performance_df.to_csv(performance_output_file, encoding='utf-8-sig')
    print(f"格式化的绩效评估报告已保存至: {performance_output_file}")

    print("\n--- 各策略绩效对比简报 ---")
    print(formatted_performance_df)


def main():
    """Configure paths and parameters, then run portfolio construction, backtest and performance reporting."""
    import traceback

    # --- Configuration ---
    data_dir = Path("E:/PBROE/data")
    ch6_dir = Path("E:/PBROE/ch6")

    # Input files.
    residuals_path = ch6_dir / 'pbroe6.2_avg_roe_residuals.csv'
    stock_returns_path = data_dir / 'TRDNEW_Mnth.csv'
    benchmark_path = data_dir / 'benchmark_indices.csv'

    # Output files, named after the pure-residual strategy family.
    returns_out_path = ch6_dir / 'pbroe6.2_pure_residual_returns.csv'
    performance_out_path = ch6_dir / 'pbroe6.2_pure_residual_performance.csv'

    # Each strategy is described only by the residual column it uses.
    strategy_configs = {
        'pbroe6.2_pure_resid_2y': {'residual': 'residual_zscore_avg_2y'},
        'pbroe6.2_pure_resid_3y': {'residual': 'residual_zscore_avg_3y'},
        'pbroe6.2_pure_resid_5y': {'residual': 'residual_zscore_avg_5y'},
    }

    backtest_start = '2010-05-01'
    backtest_end = '2025-04-30'
    benchmark_code = '000300'
    risk_free_rate = 0.03

    try:
        selections_by_strategy = build_pbroe6_2_pure_residual_portfolios(residuals_path, strategy_configs)
        if not selections_by_strategy:
            # Nothing to backtest.
            return

        # Stock returns: zero-padded 6-character codes, 'YYYY-MM' month keys,
        # and numeric returns with unparseable/missing values replaced by 0.
        monthly_returns = pd.read_csv(stock_returns_path).assign(
            Stkcd=lambda frame: frame['Stkcd'].astype(str).str.zfill(6),
            Trdmnt=lambda frame: pd.to_datetime(frame['Trdmnt']).dt.strftime('%Y-%m'),
            Mretwd=lambda frame: pd.to_numeric(frame['Mretwd'], errors='coerce').fillna(0),
        )

        # Benchmark: keep only the rows of the chosen index and the two columns used downstream.
        index_table = pd.read_csv(benchmark_path)
        index_codes = index_table['Indexcd'].astype(str).str.zfill(6)
        benchmark_df = index_table[index_codes == benchmark_code].copy()
        benchmark_df['date'] = pd.to_datetime(benchmark_df['Month'], format='%Y-%m')
        benchmark_df = benchmark_df.rename(columns={'Idxrtn': 'benchmark_return'})
        benchmark_df = benchmark_df[['date', 'benchmark_return']]

        strategy_returns = run_multi_strategy_backtest_parallel(
            selections_by_strategy, monthly_returns, backtest_start, backtest_end
        )

        calculate_and_save_performance_multi(
            strategy_returns,
            benchmark_df,
            selections_by_strategy,
            risk_free_rate,
            returns_out_path,
            performance_out_path,
        )
        print("\n--- PBROE 6.2 (纯残差) 策略回测完成！ ---")

    except Exception as exc:
        # Report the failure with a full traceback instead of letting it propagate.
        print(f"\n执行过程中出现严重错误: {exc}")
        traceback.print_exc()

# Run the pipeline when this code is executed directly (not when imported as a module).
if __name__ == "__main__":
    main()

--- 步骤 1: 构建 PBROE 6.2 (纯残差) 系列投资组合 ---
数据文件 'pbroe6.2_avg_roe_residuals.csv' 加载成功。
  -> 正在为策略 'pbroe6.2_pure_resid_2y' 构建持仓...
     已为 240 个调仓日构建投资组合。
  -> 正在为策略 'pbroe6.2_pure_resid_3y' 构建持仓...
     已为 240 个调仓日构建投资组合。
  -> 正在为策略 'pbroe6.2_pure_resid_5y' 构建持仓...
     已为 240 个调仓日构建投资组合。

构建投资组合完成。所有策略共涉及 240 个唯一的调仓日期。

--- 步骤 2: 执行多策略向量化回测 (并行加速) ---
使用 192 个CPU核心进行并行回测。
回测完成，已为 3 个策略生成 180 条月度收益记录。

--- 步骤 3: 计算并保存各策略绩效指标 ---
月度收益数据已保存至: E:\PBROE\ch6\pbroe6.2_pure_residual_returns.csv
格式化的绩效评估报告已保存至: E:\PBROE\ch6\pbroe6.2_pure_residual_performance.csv

--- 各策略绩效对比简报 ---
                         年化收益率   年化波动率  夏普比率     最大回撤    年化换手率    累计收益率  \
策略名称                                                                      
pbroe6.2_pure_resid_2y  14.72%  25.66%  0.46  -34.53%  210.72%  684.77%   
pbroe6.2_pure_resid_3y  14.63%  25.58%  0.45  -34.60%  210.63%  675.38%   
pbroe6.2_pure_resid_5y  14.78%  25.59%  0.46  -34.31%  211.73%  690.31%   
基准 (沪深300)               1.39%       -     -   

In [18]:
import pandas as pd
import subprocess
from pathlib import Path
import sys

def prepare_data_for_backtest(source_file: Path, output_file: Path) -> None:
    """
    Load the strategy data, add the rebalancing-date column, and write a
    temporary CSV ready for the backtester.

    The output contains every input column plus '调入日期', which is 'trdmnt'
    shifted forward by one calendar month. 'stkcd' is left-padded with zeros
    to 6 characters; rows with a missing code keep a missing code.

    Args:
        source_file (Path): The original data file; must contain 'trdmnt' and 'stkcd'.
        output_file (Path): The path for the temporary, processed file.

    Raises:
        SystemExit: With exit code 1 if the source file is missing or any
            read/processing/write step fails (a message is printed first).
    """
    print(f"--- 步骤 1: 预处理数据文件 '{source_file.name}' ---")

    # The "file not found" handler is scoped to the read step only, so that a
    # FileNotFoundError raised later (e.g. while writing the output) is not
    # misreported as a missing source file.
    try:
        # Read codes as text. With dtype inference, a single missing code turns
        # the column into floats and padding then produces values like '0001.0'.
        df = pd.read_csv(source_file, dtype={'stkcd': str})
    except FileNotFoundError:
        print(f"错误: 找不到源数据文件 {source_file}。程序终止。")
        sys.exit(1)  # Exit the script if data is not found
    except Exception as e:
        print(f"预处理数据时发生错误: {e}")
        sys.exit(1)

    try:
        df['trdmnt'] = pd.to_datetime(df['trdmnt'])

        # Key step: the generic backtest script expects this column name.
        df['调入日期'] = df['trdmnt'] + pd.DateOffset(months=1)

        # Pad codes to 6 characters; missing codes stay missing instead of
        # becoming the bogus string '000nan'.
        df['stkcd'] = df['stkcd'].str.zfill(6)

        # Save the processed data to the temporary file.
        df.to_csv(output_file, index=False, encoding='utf-8-sig')
        print(f"数据已处理并保存到临时文件: '{output_file}'")
    except Exception as e:
        print(f"预处理数据时发生错误: {e}")
        sys.exit(1)


def main():
    """
    Preprocess the holdings data, run the generic backtest script as a
    subprocess, and always remove the temporary holdings file afterwards.

    The function prints progress and error messages and returns None.
    Failures of the child process are reported rather than raised.
    """
    # --- 1. Configuration ---
    # Make sure the paths below are correct for your machine.

    # Path of the generic backtest script.
    BACKTEST_SCRIPT_PATH = Path("E:/PBROE/backtest/backtest.py")

    # Root paths for data files and output directories.
    DATA_ROOT_PATH = Path("E:/PBROE/data")
    CH6_PATH = Path("E:/PBROE/ch6")

    # Input files.
    SOURCE_DATA_FILE = CH6_PATH / 'pbroe6.2_avg_roe_residuals.csv'
    RETURNS_FILE = DATA_ROOT_PATH / 'TRDNEW_Mnth.csv'
    BENCHMARK_FILE = DATA_ROOT_PATH / 'benchmark_indices.csv'

    # Output directory.
    OUTPUT_DIR = CH6_PATH / 'pbroe6.2_pure_residual_results'

    # Temporary file holding the preprocessed data.
    TEMP_HOLDINGS_FILE = CH6_PATH / 'temp_pbroe6.2_holdings_for_backtest.csv'

    # Residual columns passed to backtest.py via --volatility-cols.
    # NOTE(review): presumably the generic script treats each as a factor to
    # test; confirm against backtest.py.
    RESIDUAL_COLUMNS_TO_TEST = [
        'residual_zscore_avg_2y',
        'residual_zscore_avg_3y',
        'residual_zscore_avg_5y'
    ]

    # Backtest parameters. Numeric values are strings because they are passed
    # as command-line arguments.
    BACKTEST_START_DATE = '2010-05-01'
    BACKTEST_END_DATE = '2025-04-30'
    BENCHMARK_CODE = '000300'
    RISK_FREE_RATE = '0.03'
    # Cross-sectional quantile handed to the backtest script.
    QUANTILE = '0.1'

    # Launch the child with the interpreter running this kernel so it uses the
    # same environment and installed packages; a bare "python" is resolved from
    # PATH and may be a different installation.
    python_executable = sys.executable or "python"

    # --- 2. Preprocess the data ---
    prepare_data_for_backtest(SOURCE_DATA_FILE, TEMP_HOLDINGS_FILE)

    # From here on the temporary file exists, so everything (including the
    # early return for a missing script) runs under the cleanup `finally`.
    try:
        # --- 3. Build and run the command line ---
        print("\n--- 步骤 2: 构建并执行对通用回测脚本的调用 ---")

        # Check that the backtest script exists.
        if not BACKTEST_SCRIPT_PATH.exists():
            print(f"错误: 找不到回测脚本 '{BACKTEST_SCRIPT_PATH}'。请检查路径。")
            return

        # Build the command as a list (no shell involved, no quoting issues).
        command = [
            python_executable,
            str(BACKTEST_SCRIPT_PATH),
            "--holdings-file", str(TEMP_HOLDINGS_FILE),
            "--returns-file", str(RETURNS_FILE),
            "--benchmark-file", str(BENCHMARK_FILE),
            "--output-dir", str(OUTPUT_DIR),
            "--volatility-cols", *RESIDUAL_COLUMNS_TO_TEST,  # unpack the list
            "--start-date", BACKTEST_START_DATE,
            "--end-date", BACKTEST_END_DATE,
            "--benchmark-code", BENCHMARK_CODE,
            "--risk-free-rate", RISK_FREE_RATE,
            "--quantile", QUANTILE
        ]

        print("将要执行以下命令:")
        # Quote arguments containing spaces, for readability only.
        print(" ".join(f'"{c}"' if " " in c else c for c in command))

        # `check=True` raises CalledProcessError on a non-zero exit code.
        # Output is not captured: the child writes to the inherited
        # stdout/stderr, and `text`/`encoding` only matter for piped streams.
        subprocess.run(command, check=True, text=True, encoding='utf-8')
        print("\n--- PBROE 6.2 (纯残差) 策略回测完成！ ---")
    except FileNotFoundError:
        print(f"\n错误: 找不到 Python 解释器 '{python_executable}'。请确保 Python 已被添加到系统 PATH 中。")
    except subprocess.CalledProcessError as e:
        print(f"\n错误: 回测脚本执行失败，返回码 {e.returncode}。")
        print("请检查回测脚本的输出日志以获取详细信息。")
    finally:
        # --- 4. Remove the temporary file ---
        print("\n--- 步骤 3: 清理临时文件 ---")
        try:
            TEMP_HOLDINGS_FILE.unlink()
            print(f"临时文件 '{TEMP_HOLDINGS_FILE}' 已被成功删除。")
        except OSError as e:
            print(f"删除临时文件时出错: {e}")


# Run the wrapper when this code is executed directly (not when imported as a module).
if __name__ == "__main__":
    main()


--- 步骤 1: 预处理数据文件 'pbroe6.2_avg_roe_residuals.csv' ---
数据已处理并保存到临时文件: 'E:\PBROE\ch6\temp_pbroe6.2_holdings_for_backtest.csv'

--- 步骤 2: 构建并执行对通用回测脚本的调用 ---
将要执行以下命令:
python E:\PBROE\backtest\backtest.py --holdings-file E:\PBROE\ch6\temp_pbroe6.2_holdings_for_backtest.csv --returns-file E:\PBROE\data\TRDNEW_Mnth.csv --benchmark-file E:\PBROE\data\benchmark_indices.csv --output-dir E:\PBROE\ch6\pbroe6.2_pure_residual_results --volatility-cols residual_zscore_avg_2y residual_zscore_avg_3y residual_zscore_avg_5y --start-date 2010-05-01 --end-date 2025-04-30 --benchmark-code 000300 --risk-free-rate 0.03 --quantile 0.1

--- PBROE 6.2 (纯残差) 策略回测完成！ ---

--- 步骤 3: 清理临时文件 ---
临时文件 'E:\PBROE\ch6\temp_pbroe6.2_holdings_for_backtest.csv' 已被成功删除。
