# Alpha Research — 真实 A 股数据因子回测

**数据流：**
```
DataRepository (rawdata/repository/)
    └─► RepositoryDataLoader.load()
            └─► MarketData（T×N 宽表，与 MockData 完全同构）
                    └─► AlphaOps 因子构造 → VectorEngine 回测
```

**前提条件：** 已运行 `data_download.ipynb`，并将股票数据下载到 `rawdata/repository/`

## 0. 环境准备

In [None]:
# Print the working directory so the relative paths used by this notebook can be checked.
import os
print('当前路径:', os.getcwd())

# If the working directory is wrong, uncomment and edit the line below:
# os.chdir(r'D:\2026_claude\StockFrame')

# Suppress every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 100

# Root logger: WARNING and above only, formatted as "time [logger] LEVEL: message".
import logging
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
)

## 1. 导入框架

In [None]:
# Framework imports: the repository-backed data loader, the factor operator
# library (aliased to `op`), and the vectorized backtest engine.
from data_loader import RepositoryDataLoader, MarketData
from quant_alpha_engine.ops import AlphaOps as op
from quant_alpha_engine.backtest import VectorEngine
from quant_alpha_engine.backtest.performance import Performance

# Confirmation banner listing the main components that were imported.
print('✅ 导入成功')
print('   - RepositoryDataLoader : 真实数据加载器')
print('   - AlphaOps             : 因子算子库')
print('   - VectorEngine         : 矩阵式回测引擎')

## 2. 配置参数

In [None]:
# ── Data settings ─────────────────────────────────────────────
REPO_DIR   = './rawdata/repository'   # DataRepository root directory
SYMBOLS    = None                     # None = load every stock found in the repository
                                      # or pass an explicit list: ['600519', '000001', '000858']
START_DATE = '2022-01-01'             # backtest start date
END_DATE   = '2026-02-22'             # backtest end date (None = today)
ADJ_TYPE   = 'hfq'                    # 'hfq' (backward-adjusted) | 'qfq' (forward-adjusted) | 'raw' (unadjusted)

# ── Shared backtest settings ──────────────────────────────────
REBALANCE_FREQ = 5        # rebalance interval in trading days; 5 = weekly
TOP_N          = 10       # number of stocks held
WEIGHT_METHOD  = 'equal'  # 'equal' | 'factor_weighted'
COST_RATE      = 0.0015   # one-way transaction cost (0.15%)

# ── Preprocessing parameters ──────────────────────────────────
# delay=1 (recommended): factor from day T-1 -> position opened on day T -> earns day T's return; no look-ahead
DELAY    = 1
DECAY    = 0
INDUSTRY = None   # None = no industry neutralization; may be set to data.industry after load()

# Echo the effective configuration so every run records its settings.
print(f'数据配置: {START_DATE} ~ {END_DATE}, adj={ADJ_TYPE}')
print(f'回测配置: rebalance_freq={REBALANCE_FREQ}, top_n={TOP_N}, '
      f'weight={WEIGHT_METHOD}, cost={COST_RATE}')
print(f'预处理:   delay={DELAY}, decay={DECAY}, industry={INDUSTRY}')

## 3. 加载真实市场数据

**RepositoryDataLoader** 从 DataRepository 读取已下载的 CSV 数据，
输出与 MockData 完全兼容的 `MarketData` 容器（T×N 宽表矩阵）。

In [None]:
# Create the loader on top of the local data repository.
loader = RepositoryDataLoader(
    repo_dir     = REPO_DIR,
    limit_up_pct = 0.099,   # limit-up/limit-down detection threshold (0.199 can be used for STAR Market / ChiNext)
)

# Load the configured universe and date range into a MarketData container.
data = loader.load(
    symbols          = SYMBOLS,
    start_date       = START_DATE,
    end_date         = END_DATE,
    adj_type         = ADJ_TYPE,
    fill_suspended   = True,    # True = fill prices on suspended days with the previous value
    min_trading_days = 20,      # minimum number of valid trading days; stocks with fewer are dropped
)

data.print_summary()

In [None]:
# Pull out the commonly used fields (same usage as with MockData)
close    = data.close          # (T×N) adjusted close price
open_    = data.open           # (T×N) adjusted open price
high     = data.high           # (T×N) adjusted high price
low      = data.low            # (T×N) adjusted low price
volume   = data.volume         # (T×N) traded volume (unadjusted)
hfq_fac  = data.hfq_factor     # (T×N) backward-adjustment factor (always kept)
industry = data.industry       # Series: stock_code → industry_name
is_susp  = data.is_suspended   # (T×N) bool, True = suspended
is_limit = data.is_limit       # (T×N) bool, True = limit-up/limit-down

# Quick sanity report: matrix shape, date span, universe, flag ratios, industry counts.
print(f'收盘价矩阵: {close.shape}  (交易日 × 股票)')
print(f'日期范围:   {close.index[0].date()} ~ {close.index[-1].date()}')
print(f'股票列表:   {list(close.columns)}')
print(f'停牌比例:   {is_susp.values.mean()*100:.2f}%')
print(f'涨跌停比例: {is_limit.values.mean()*100:.2f}%')
print(f'\n行业分布:')
print(industry.value_counts())

In [None]:
# Inspect the close-price matrix (last 5 rows)
close.tail()

## 4. 构建 Alpha 因子

所有算子均基于 (T×N) DataFrame 向量化计算，可自由嵌套。

In [None]:
# ─── Factor 1: short-term reversal ─────────────────────────────
# Idea: the larger the recent drop, the stronger the expected mean reversion
# Formula: Rank(-Ts_Delta(close, 5))
factor_reversal = op.Rank(-op.Ts_Delta(close, 5))
print('✅ 因子1 (反转动量):  Rank(-Ts_Delta(close, 5))')

# ─── Factor 2: price-volume divergence ─────────────────────────
# Idea: rising on shrinking volume is treated as a strong signal, so stocks with
#       negative volume/price correlation rank higher
# Formula: Rank(-Ts_Corr(volume, close, 10))
factor_volprice = op.Rank(-op.Ts_Corr(volume, close, window=10))
print('✅ 因子2 (量价背离):  Rank(-Ts_Corr(volume, close, 10))')

# ─── Factor 3: momentum ────────────────────────────────────────
# Idea: rank by the price change over the past 20 days (momentum continuation)
# Formula: Rank(Ts_Delta(close, 20))
factor_momentum = op.Rank(op.Ts_Delta(close, 20))
print('✅ 因子3 (价格动量):  Rank(Ts_Delta(close, 20))')

# ─── Factor 4: volatility (low-volatility anomaly) ─────────────
# Idea: low-volatility stocks are expected to perform better over the long run
# Formula: Rank(-Ts_Std(close/close.shift(1) - 1, 20))
returns     = close / close.shift(1) - 1
factor_lowvol = op.Rank(-op.Ts_Std(returns, 20))
print('✅ 因子4 (低波动率):  Rank(-Ts_Std(ret, 20))')

## 5. 单因子回测

In [None]:
# Backtest of factor 1 (reversal) with the shared configuration constants.
print('=' * 55, '  回测：反转动量因子', '=' * 55, sep='\n')

engine1 = VectorEngine(
    factor=factor_reversal,
    close=close,
    is_suspended=is_susp,
    is_limit=is_limit,
    rebalance_freq=REBALANCE_FREQ,
    top_n=TOP_N,
    weight_method=WEIGHT_METHOD,
    cost_rate=COST_RATE,
    delay=DELAY,
    decay=DECAY,
    industry=INDUSTRY,
)
# Keep the result: later cells read result1.metrics and result1.nav.
result1 = engine1.run()
result1.print_summary()

In [None]:
# Backtest of factor 2 (price-volume divergence) with the shared configuration constants.
print('=' * 55, '  回测：量价背离因子', '=' * 55, sep='\n')

engine2 = VectorEngine(
    factor=factor_volprice,
    close=close,
    is_suspended=is_susp,
    is_limit=is_limit,
    rebalance_freq=REBALANCE_FREQ,
    top_n=TOP_N,
    weight_method=WEIGHT_METHOD,
    cost_rate=COST_RATE,
    delay=DELAY,
    decay=DECAY,
    industry=INDUSTRY,
)
# Keep the result: later cells read result2.metrics and result2.nav.
result2 = engine2.run()
result2.print_summary()
result2.plot()

In [None]:
# Backtest of factor 3 (price momentum) with the shared configuration constants.
print('=' * 55, '  回测：价格动量因子', '=' * 55, sep='\n')

engine3 = VectorEngine(
    factor=factor_momentum,
    close=close,
    is_suspended=is_susp,
    is_limit=is_limit,
    rebalance_freq=REBALANCE_FREQ,
    top_n=TOP_N,
    weight_method=WEIGHT_METHOD,
    cost_rate=COST_RATE,
    delay=DELAY,
    decay=DECAY,
    industry=INDUSTRY,
)
# Keep the result: later cells read result3.metrics and result3.nav.
result3 = engine3.run()
result3.print_summary()

In [None]:
# Backtest of factor 4 (low volatility) with the shared configuration constants.
print('=' * 55, '  回测：低波动率因子', '=' * 55, sep='\n')

engine4 = VectorEngine(
    factor=factor_lowvol,
    close=close,
    is_suspended=is_susp,
    is_limit=is_limit,
    rebalance_freq=REBALANCE_FREQ,
    top_n=TOP_N,
    weight_method=WEIGHT_METHOD,
    cost_rate=COST_RATE,
    delay=DELAY,
    decay=DECAY,
    industry=INDUSTRY,
)
# Keep the result: later cells read result4.metrics and result4.nav.
result4 = engine4.run()
result4.print_summary()

## 6. 因子绩效横向对比

In [None]:
# Side-by-side metrics table: one row per factor, one column per metric.
comparison = pd.DataFrame({
    '反转动量': result1.metrics,
    '量价背离': result2.metrics,
    '价格动量': result3.metrics,
    '低波动率': result4.metrics,
}).T


def _as_percent(v):
    """Render a ratio as a two-decimal percentage; missing values become 'N/A'."""
    if pd.isna(v):
        return 'N/A'
    return f'{v*100:.2f}%'


def _as_number(v):
    """Render a plain number with four decimals; missing values become 'N/A'."""
    if pd.isna(v):
        return 'N/A'
    return f'{v:.4f}'


# Preferred metric columns, restricted to those the results actually provide.
key_cols = [
    name
    for name in (
        '年化收益率', '年化波动率', 'Sharpe_Ratio', '最大回撤',
        'IC_Mean', 'ICIR', 'IC_胜率', '日均换手率', 'Fitness',
    )
    if name in comparison.columns
]
display_df = comparison[key_cols].copy()

# Split the selected columns by how they should be rendered.
pct_cols = [
    name
    for name in ('年化收益率', '年化波动率', '最大回撤', 'IC_胜率', '日均换手率')
    if name in display_df.columns
]
num_cols = [
    name
    for name in ('Sharpe_Ratio', 'IC_Mean', 'ICIR', 'Fitness')
    if name in display_df.columns
]

for col in pct_cols:
    display_df[col] = display_df[col].apply(_as_percent)
for col in num_cols:
    display_df[col] = display_df[col].apply(_as_number)

print('\n=== 四因子绩效对比 ===')
print(display_df.to_string())

In [None]:
# Dark-themed chart overlaying the NAV curves of the four single-factor backtests.
text_color = '#ECF0F1'
frame_color = '#0F3460'
figure_bg = '#1A1A2E'
label_style = dict(color=text_color, fontsize=10)

fig, ax = plt.subplots(figsize=(14, 6), facecolor=figure_bg)
ax.set_facecolor('#16213E')
ax.tick_params(colors=text_color)
for spine in ax.spines.values():
    spine.set_color(frame_color)

# label -> (NAV series, line colour)
navs = {
    '反转动量': (result1.nav, '#FF4B4B'),
    '量价背离': (result2.nav, '#2ECC71'),
    '价格动量': (result3.nav, '#3498DB'),
    '低波动率': (result4.nav, '#F39C12'),
}
for label, (nav, color) in navs.items():
    ax.plot(nav.index, nav.values, color=color, linewidth=1.8, label=label)

# Faint dashed reference line at NAV = 1.0.
ax.axhline(1.0, color='white', linewidth=0.6, alpha=0.4, linestyle='--')

ax.set_title('四因子净值曲线对比（真实 A 股数据）', color=text_color, fontsize=13, pad=10)
ax.set_xlabel('日期', **label_style)
ax.set_ylabel('净值', **label_style)
ax.legend(fontsize=9, facecolor=figure_bg, edgecolor=frame_color, labelcolor=text_color)
ax.grid(True, color='#2F3640', alpha=0.5, linewidth=0.5)

plt.tight_layout()
plt.show()

## 7. 多因子合成

In [None]:
# Weighted composite alpha. The weights are 0.3 / 0.3 / 0.2 / 0.2 (not equal),
# and each single factor is passed through op.Rank again before being combined.
alpha_combo = (
    op.Rank(factor_reversal) * 0.3
    + op.Rank(factor_volprice) * 0.3
    + op.Rank(factor_momentum) * 0.2
    + op.Rank(factor_lowvol) * 0.2
)

print(
    '=' * 55,
    '  回测：多因子合成 Alpha',
    '=' * 55,
    '  0.3×Rank(反转) + 0.3×Rank(量价) + 0.2×Rank(动量) + 0.2×Rank(低波动)',
    sep='\n',
)

# Same engine settings as the single-factor backtests.
engine_combo = VectorEngine(
    factor=alpha_combo,
    close=close,
    is_suspended=is_susp,
    is_limit=is_limit,
    rebalance_freq=REBALANCE_FREQ,
    top_n=TOP_N,
    weight_method=WEIGHT_METHOD,
    cost_rate=COST_RATE,
    delay=DELAY,
    decay=DECAY,
    industry=INDUSTRY,
)
result_combo = engine_combo.run()
result_combo.print_summary()
result_combo.plot()

## 8. 加载市值 / 财务数据（可选）

市值和财务数据可用于构造基本面因子，或用于市值加权。

In [None]:
# Market-value data (for a size factor / small-cap effect).
mv_data = loader.load_market_value(
    symbols=data.symbols,
    start_date=START_DATE,
    end_date=END_DATE,
)

if mv_data is None:
    # The loader returned nothing: point the user at the download notebook.
    print('⚠️  仓库中暂无市值数据，请先运行 data_download.ipynb 下载 market_value 数据')
else:
    print('市值数据字段:', list(mv_data.keys()))
    if 'circulating_market_cap' in mv_data:
        circ_mv = mv_data['circulating_market_cap']
        print(f'流通市值矩阵: {circ_mv.shape}')
        print(circ_mv.tail(3))

        # Example small-cap factor: ranks the negated circulating market cap.
        factor_smallcap = op.Rank(-circ_mv)
        print('\n✅ 小市值因子构建成功: Rank(-circulating_market_cap)')

In [None]:
# Financial data (quarterly-report granularity, suited to fundamental factors).
fin_data = loader.load_financial(
    symbols=data.symbols,
    start_date=START_DATE,
    end_date=END_DATE,
)

if fin_data is None:
    # The loader returned nothing: point the user at the download notebook.
    print('⚠️  仓库中暂无财务数据，请先运行 data_download.ipynb 下载 financial 数据')
else:
    # Show at most the first ten field names.
    print('财务数据字段:', list(fin_data.keys())[:10], '...')
    # Example: preview the ROE table when that field is available.
    if 'roe' in fin_data:
        roe_wide = fin_data['roe']    # (T_quarterly × N)
        print(f'ROE 宽表: {roe_wide.shape}')
        print(roe_wide.tail(3))

## 9. 自定义因子研究区域

在此 cell 编写你的因子逻辑，然后调用 `VectorEngine` 进行回测。

In [None]:
# ═══════════════════════════════════════════════════════════════
#  Build your own alpha factor here
# ═══════════════════════════════════════════════════════════════

# Example: composite technical factor
#   Idea: 10-day price change, cross-sectionally ranked, smoothed with a
#   linearly decaying weighted average (d=5), then cross-sectionally Z-scored.
#   NOTE(review): the original note mentioned industry neutralization, but no
#   op.Neutralize call appears in this expression — confirm whether one is intended.
my_factor = op.ZScore(
    op.Decay_Linear(
        op.Rank(op.Ts_Delta(close, 10)),
        d=5
    )
)

print('因子构建完成，维度:', my_factor.shape)
print('因子预览（后5行）:')
# Last expression of the cell, so the notebook displays the final 5 rows.
my_factor.tail()

In [None]:
# Backtest the custom factor with the same shared settings as the other runs.
my_engine = VectorEngine(
    factor=my_factor,
    close=close,
    is_suspended=is_susp,
    is_limit=is_limit,
    rebalance_freq=REBALANCE_FREQ,
    top_n=TOP_N,
    weight_method=WEIGHT_METHOD,
    cost_rate=COST_RATE,
    delay=DELAY,
    decay=DECAY,
    industry=INDUSTRY,
)
my_result = my_engine.run()

# Text summary first, then the result's built-in chart.
my_result.print_summary()
my_result.plot()

---

## 算子速查表

```python
from quant_alpha_engine.ops import AlphaOps as op

# 时序类
op.Ts_Sum(df, window)        # 滑动求和
op.Ts_Mean(df, window)       # 滑动均值（移动平均）
op.Ts_Max(df, window)        # 滑动最大值
op.Ts_Min(df, window)        # 滑动最小值
op.Ts_Std(df, window)        # 滑动标准差
op.Ts_Delta(df, period)      # df - df.shift(period)
op.Ts_Delay(df, period)      # df.shift(period) 数据滞后
op.Ts_Rank(df, window)       # 窗口内时序百分比排名 [0,1]
op.Ts_Corr(df1, df2, window) # 滚动相关系数 [-1,1]

# 截面类（每日横截面操作）
op.Rank(df)                  # 截面百分比排名 [0,1]
op.ZScore(df)                # 截面 Z-Score 标准化
op.Scale(df, a=1)            # 截面绝对值之和缩放至 a

# 特殊类
op.Decay_Linear(df, d)       # 线性加权衰减移动平均
op.Neutralize(df, industry)  # 行业中性化（OLS残差法）
```

## 复权方式说明

| 方式 | 公式 | 适用场景 |
|------|------|----------|
| `hfq` （后复权）| `price × hfq_factor` | **推荐**，跨期价格可比，因子计算无偏 |
| `qfq` （前复权）| `price × hfq_factor / hfq_factor.iloc[-1]` | 最新价位直观，但每次除权除息后历史价格都会被重新调整（历史截面随时间漂移） |
| `raw` （不复权）| 原始价格 | 仅用于与行情软件对比核验 |