In [1]:
import pandas as pd
import numpy as np

In [2]:
# 读数据
raw = pd.read_csv('A_share_indicators.csv', encoding='utf-8-sig')
meta = raw.iloc[:4].copy()
data = raw.iloc[4:].copy()

In [3]:
# 处理日期
data.rename(columns={data.columns[0]: 'date'}, inplace=True)
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')

In [4]:
# 处理列名
col_map = {
    # macro
    '中国:CPI:当月同比': 'cpi_yoy_off',
    '中国:PPI:全部工业品:当月同比': 'ppi_yoy_off',
    '中国:M2': 'm2_lvl',
    '中国:M2:同比': 'm2_yoy',
    '中国:社会融资规模存量': 'social_financing_lvl',
    '中国:社会融资规模存量:同比': 'social_financing_yoy',
    '中国:制造业PMI': 'pmi_manufacturing',
    '中国:消费者信心指数': 'consumer_confidence',
    '中国:利润总额:制造业:累计值': 'industrial_profit_cum',
    '中国:社会消费品零售总额:累计值': 'retail_sales_cum',
    '中国:固定资产投资完成额:累计值': 'fix_invest_cum',
    '中国:固定资产投资完成额:累计同比': 'fix_invest_cum_yoy_off',
    '中国:固定资产投资完成额:基础设施建设:累计值': 'infra_invest_cum',
    '中国:固定资产投资完成额:基础设施建设:累计同比': 'infra_invest_cum_yoy_off',
    '中国:房地产开发投资完成额:累计值': 'real_estate_invest_cum',
    '中国:房地产开发投资完成额:累计同比': 'real_estate_invest_yoy_off',

    # macro - financial
    '中国:银行间质押式回购加权利率:7天': 'repo_rate_7d',
    '中国:贷款市场报价利率(LPR):1年': 'lpr_1y',
    '中国:贷款市场报价利率(LPR):5年': 'lpr_5y',
    '美国:国债收益率:10年': 'us_bond_10y',
    '中国:国债收益率:10年': 'cn_bond_10y',
    '沪股通:当日资金净流入(人民币)': 'sh_connect_flow',
    '深股通:当日资金净流入(人民币)': 'sz_connect_flow',
    '美元指数': 'usd_index',
    '标准普尔500波动率指数(VIX)': 'vix',

    # meso
    '中国:高炉开工率(247家)': 'blast_furnace_rate',
    '中国:钢材综合价格指数': 'steel_price_index',
    '中国:平均价:铜(1#):有色市场': 'copper_price',
    '中国:白酒批发价格总指数:旬定基价格指数': 'liquor_price_index',
    '中信行业指数:半导体': 'semiconductor_index',
    '市净率:申万行业指数:银行': 'bank_sector_pe',
    '中国:不良贷款比例:商业银行': 'bank_npl_ratio',
    '中国:证券公司:营业收入:代理买卖证券业务净收入': 'securities_revenue_lvl',
    '期货结算价(连续):WTI原油': 'wti_oil',

    # micro
    '美国:标准普尔500指数': 'sp500',
    '沪深300指数': 'hs300_index',
    '沪深300指数:换手率': 'hs300_turnover',
    '市盈率:沪深300指数': 'hs300_pe',
    '市净率:沪深300指数': 'hs300_pb',
    '中国:融资融券余额': 'margin_balance',
    '深市:融资融券余额': 'sz_margin_balance',
    '沪市:融资融券余额': 'sh_margin_balance'
}

data = data.rename(columns=col_map)

data = data.apply(pd.to_numeric, errors='coerce')

In [5]:
# 交易日日历
trade_dates = data['hs300_index'].dropna().index
panel = pd.DataFrame(index=trade_dates)
panel.index.name = 'date'

In [6]:
# 处理频率
freq_cn = meta.iloc[0]
zh2en = {'日':'D', '周':'W', '月':'M', '季':'Q', '年':'A'}
freq_map = { col_map.get(cn, cn): zh2en.get(freq_cn[cn], 'UNK')
             for cn in col_map if cn in freq_cn }

In [7]:
# 前向填充
def ffill_with_stale(dst, ser, name):
    mask = ser.notna()
    dst[name] = ser.ffill()
    group = mask.cumsum()
    dst[f'{name}_stale'] = dst.groupby(group).cumcount()

for col, freq in freq_map.items():
    s = data[col].reindex(panel.index)
    if freq == 'D':
        panel[col] = s.ffill()
        panel[f'{col}_stale'] = 0
    else:
        ffill_with_stale(panel, s, col)

panel = panel.dropna(axis=1, how='all')

In [8]:
def safe_log(s):
    if isinstance(s, (pd.Series, pd.Index)):
        return np.log(s.where(s > 0))
    else:
        return np.log(s) if s > 0 else np.nan

In [9]:
# 派生特征：宏观特征

# PPI-CPI 剪刀差
panel['ppi_cpi_gap'] = panel['ppi_yoy_off'] - panel['cpi_yoy_off']

# 利率 & 流动性
panel['cn_10y_repo_spread'] = panel['cn_bond_10y'] - panel['repo_rate_7d']
panel['us_cn_spread']       = panel['us_bond_10y'] - panel['cn_bond_10y']
panel['vix_ma5_ratio']      = panel['vix'] / panel['vix'].rolling(5).mean()
panel['daily_ret'] = panel['usd_index'].pct_change()
panel['usd_index_ret5'] = panel['daily_ret'].rolling(5).sum()


# 实体经济景气程度
cum_cols = ['fix_invest_cum', 'real_estate_invest_cum', 'infra_invest_cum']
for col in cum_cols:
    panel[f'{col}_inc']      = panel[col] - panel[col].shift(1)
    panel[f'{col}_inc_yoy']  = panel[f'{col}_inc'] / panel[f'{col}_inc'].shift(252) - 1

# PMI动量
panel['macro_momentum'] = (panel['pmi_manufacturing'] - 50) / 10

In [10]:
# 派生特征：中观（行业）特征

# 高炉利用率
panel['blast_furnace_rate_mom'] = panel['blast_furnace_rate'].diff()

# 大宗商品 20 日 log-收益
for col in ['wti_oil', 'steel_price_index', 'copper_price', 'semiconductor_index']:
    panel[f'{col}_ret20'] = safe_log(panel[col]).diff(20)

# 行业估值、券商营收 YoY
panel['securities_revenue_yoy'] = safe_log(panel['securities_revenue_lvl']).diff(252)

In [11]:
# 派生特征：微观（交易）特征

# 价格动量 & 波动
panel['hs300_ret1'] = panel['hs300_index'].pct_change()
panel['hs300_vol20'] = panel['hs300_ret1'].rolling(20).std() * np.sqrt(252)

# 技术指标
def rsi(series, period=14):
    delta = series.diff()
    gain  = delta.clip(lower=0).rolling(period).mean()
    loss  = (-delta.clip(upper=0)).rolling(period).mean()
    rs = gain / loss
    return 100 - 100 / (1 + rs)

panel['rsi_14'] = rsi(panel['hs300_ret1'])

# MACD
ema12 = panel['hs300_index'].ewm(span=12, adjust=False).mean()
ema26 = panel['hs300_index'].ewm(span=26, adjust=False).mean()
panel['macd']        = ema12 - ema26
panel['macd_signal'] = panel['macd'].ewm(span=9, adjust=False).mean()
panel['macd_diff']   = panel['macd'] - panel['macd_signal']

# Bollinger-band position
mid  = panel['hs300_index'].rolling(20).mean()
std  = panel['hs300_index'].rolling(20).std()
upper = mid + 2*std
lower = (mid - 2*std).clip(lower=1e-6)
panel['bb_position'] = (panel['hs300_index'] - lower) / (upper - lower)

# 量能、情绪、估值
panel['turnover_ma10_ratio'] = panel['hs300_turnover'] / panel['hs300_turnover'].rolling(10).mean()
panel['margin_change_rate']  = panel['margin_balance'].pct_change()
panel['pe_pb_ratio']         = panel['hs300_pe'] / panel['hs300_pb']
panel['pe_percentile']       = panel['hs300_pe'].rolling(252).rank(pct=True)

# Parkinson 波幅
hl_log = safe_log(panel['hs300_index']).rolling(1).apply(
    lambda x: safe_log(x.max()/x.min()) if x.count()==1 else np.nan)
panel['hl_vol20'] = hl_log.rolling(20).std()

# 分布特征
panel['return_skew60']  = panel['hs300_ret1'].rolling(60).skew()
panel['return_kurt60']  = panel['hs300_ret1'].rolling(60).kurt()

In [12]:
panel = panel.loc['2015-01-05':].copy()
panel = panel.replace([np.inf, -np.inf], np.nan)
panel = panel.ffill().bfill()

train_end = '2023-12-31'
val_end   = '2024-12-31'

no_scale_kw = [
    'returns', 'ret', 'pct', 'ratio', 'percentile', 'rsi',
    'change_rate', 'bb_position', 'macd_diff',
    'stale', 'skew', 'kurtosis',
    'gap', 'spread', 'vol', 'hl_vol'
]

In [13]:
no_scale_cols = [c for c in panel.columns
                 if any(k in c for k in no_scale_kw)]
scale_cols    = [c for c in panel.columns
                 if (c not in no_scale_cols)]

In [14]:
# 训练集参数
panel = panel.sort_index()
train_slice = panel.loc[panel.index <= train_end, scale_cols]

mean = train_slice.mean()
std  = train_slice.std(ddof=0).replace(0, 1e-8)

scale_params = pd.DataFrame({'mean': mean, 'std': std})

In [15]:
# Scaling
panel_scaled          = panel.copy()
panel_scaled[scale_cols] = (panel[scale_cols] - mean) / std
panel_scaled[scale_cols] = panel_scaled[scale_cols].clip(-10, 10)

panel.sort_index(ascending=False)

Unnamed: 0_level_0,cpi_yoy_off,cpi_yoy_off_stale,ppi_yoy_off,ppi_yoy_off_stale,m2_lvl,m2_lvl_stale,m2_yoy,m2_yoy_stale,social_financing_lvl,social_financing_lvl_stale,...,macd_signal,macd_diff,bb_position,turnover_ma10_ratio,margin_change_rate,pe_pb_ratio,pe_percentile,hl_vol20,return_skew60,return_kurt60
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-05-23,-0.1000,0,-2.7000,0,3251739.32,0,8.0,0,424.00,0,...,0.000000,0.000000,0.212280,1.206950,0.001204,10.786260,0.373016,0.0,3.863122,23.917334
2025-05-22,-0.1000,1,-2.7000,1,3251739.32,1,8.0,1,424.00,1,...,0.504016,2.016064,0.212280,1.206950,0.001204,10.791367,0.373016,0.0,3.863122,23.917334
2025-05-21,-0.1000,2,-2.7000,2,3251739.32,2,8.0,2,424.00,2,...,1.336504,3.329953,0.212280,1.206950,0.001204,10.798122,0.373016,0.0,3.863122,23.917334
2025-05-20,-0.1000,3,-2.7000,3,3251739.32,3,8.0,3,424.00,3,...,2.037653,2.804594,0.212280,1.206950,0.000895,10.788413,0.373016,0.0,3.863122,23.917334
2025-05-19,-0.1000,4,-2.7000,4,3251739.32,4,8.0,4,424.00,4,...,2.279636,0.967932,0.212280,1.206950,-0.001364,10.793457,0.373016,0.0,3.863122,23.917334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-01-09,1.3758,52,-4.5603,52,1275332.78,52,11.6,52,127.58,52,...,-26.275054,15.203704,0.639645,1.272653,-0.010260,9.234870,0.607143,0.0,1.767561,8.099336
2015-01-08,1.3758,53,-4.5603,53,1275332.78,53,11.6,53,127.58,53,...,-22.432013,15.372164,0.655754,1.062302,-0.006582,9.272908,0.617063,0.0,1.737286,8.123372
2015-01-07,1.3758,54,-4.5603,54,1275332.78,54,11.6,54,127.58,54,...,-17.364264,20.270997,0.863589,1.159999,-0.002936,9.235620,0.698413,0.0,1.625004,7.411890
2015-01-06,1.3758,55,-4.5603,55,1275332.78,55,11.6,55,127.58,55,...,-11.798546,22.262872,0.818994,1.460606,-0.011159,9.252832,0.694444,0.0,1.659685,7.581101


In [16]:
import json

panel_scaled.to_pickle('processed_data.pkl')
scale_params_6d = scale_params.round(6)      
scale_params_6d.to_json('data_metadata.json', orient='index')