In [1]:
from scipy.stats import pearsonr, entropy
import numpy as np
import pandas as pd
import akshare as ak
import baostock as bs
from utils import get_index_components, get_csindex_components, get_history_k
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows',100)

## 思路
1. PE:按市盈率(`pe_ttm`)筛选
2. Pearson 相关性:个股收益率与上证指数(`sh.000001`)收益率之间的相关系数
3. 线性回归斜率:个股收益率对上证指数收益率做线性回归得到的斜率(beta)

In [2]:
# Preview the constituent codes returned by the helper when called with its defaults.
get_csindex_components()

array(['sh.688008', 'sh.688009', 'sh.688012', 'sh.688036', 'sh.688041',
       'sh.688047', 'sh.688072', 'sh.688082', 'sh.688111', 'sh.688126',
       'sh.688169', 'sh.688187', 'sh.688188', 'sh.688223', 'sh.688256',
       'sh.688271', 'sh.688303', 'sh.688349', 'sh.688396', 'sh.688472',
       'sh.688506', 'sh.688599', 'sh.688728', 'sh.688777', 'sh.688981',
       'sz.300014', 'sz.300122', 'sz.300124', 'sz.300142', 'sz.300223',
       'sz.300274', 'sz.300308', 'sz.300316', 'sz.300347', 'sz.300394',
       'sz.300408', 'sz.300433', 'sz.300442', 'sz.300450', 'sz.300502',
       'sz.300628', 'sz.300661', 'sz.300750', 'sz.300751', 'sz.300759',
       'sz.300760', 'sz.300782', 'sz.300832', 'sz.300919', 'sz.301269'],
      dtype=object)

In [3]:
def get_corr(stock_list, stt_date, end_date, market='sh.000001', min_samples=122):
    """
    Compute the Pearson correlation between each stock's returns and the
    market index's returns.

    Parameters
    ----------
    stock_list : iterable of str
        Stock codes to evaluate.
    stt_date, end_date : str
        Date range forwarded unchanged to ``get_history_k``.
    market : str
        Code of the market index used as the reference series.
    min_samples : int
        Minimum number of paired (stock, market) observations required.

    Returns
    -------
    list
        One value per entry of ``stock_list``, in the same order. The value is
        0 when fewer than ``max(min_samples, 2)`` paired observations exist.

    Notes
    -----
    ``get_history_k`` is a project helper; this function relies only on the
    ``date``, ``code`` and ``pct_chg`` columns of the frame it returns.
    """
    df_stock = get_history_k(stock_list, stt_date, end_date)
    df_market = get_history_k([market], stt_date, end_date).rename(columns={"pct_chg": "pct_chg_m"})
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    df_market['pct_chg_m'] = df_market['pct_chg_m'].ffill().bfill()
    market_ret = df_market[['date', 'pct_chg_m']]

    # pearsonr needs at least two points regardless of the caller's threshold.
    required = max(min_samples, 2)

    corr_list = []
    for s in stock_list:
        df_s = df_stock[df_stock['code'] == s][['date', 'pct_chg']].copy()
        # Fill gaps per stock so values never leak from one stock into another.
        df_s['pct_chg'] = df_s['pct_chg'].ffill().bfill()
        # Keep only dates present in both series; unmatched dates would
        # otherwise inject NaN into the correlation.
        df_merge = (
            df_s.merge(market_ret, on='date', how='inner')
            .dropna(subset=['pct_chg', 'pct_chg_m'])
        )
        if len(df_merge) < required:
            corr_ = 0
        else:
            v_s = df_merge['pct_chg'].values
            v_m = df_merge['pct_chg_m'].values
            corr_ = float(pearsonr(v_s, v_m).statistic)
        corr_list.append(corr_)
    return corr_list

# Quick checks of get_corr; the multi-stock variant is kept commented out.
# get_corr(['sh.688008', 'sh.688009', 'sh.688012'],'2023-01-01','2024-08-31')
get_corr(['sz.300103'], '2024-01-01','2024-11-20')

login success!
logout success!
login success!
logout success!


[0.3435829163919291]

In [4]:
def get_slope(stock_list, stt_date, end_date, market='sh.000001', anomaly_q=0.98, min_samples=122):
    """
    Regress each stock's returns on the market index's returns and return the
    fitted slope (beta) and intercept (alpha).

    Parameters
    ----------
    stock_list : iterable of str
        Stock codes to evaluate.
    stt_date, end_date : str
        Date range forwarded unchanged to ``get_history_k``.
    market : str
        Code of the market index used as the regressor.
    anomaly_q : float
        Quantile of ``|pct_chg|`` per stock; observations whose absolute
        return exceeds it are excluded from the fit.
    min_samples : int
        Minimum number of paired (stock, market) observations required.

    Returns
    -------
    (list, list)
        ``(slope_list, intercept_list)``, each aligned with ``stock_list``.
        Both entries are 0 for a stock with fewer than ``max(min_samples, 2)``
        paired observations, or fewer than two points left after trimming.

    Notes
    -----
    ``get_history_k`` is a project helper; this function relies only on the
    ``date``, ``code`` and ``pct_chg`` columns of the frame it returns.
    """
    # Market index series, renamed so it can sit next to the stock column.
    df_market = get_history_k([market], stt_date, end_date)
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    df_market['pct_chg'] = df_market['pct_chg'].ffill().bfill()
    market_ret = df_market[['date', 'pct_chg']].rename(columns={'pct_chg': 'pct_chg_m'})
    df_stock = get_history_k(stock_list, stt_date, end_date)

    # A degree-1 fit needs at least two points regardless of the caller's threshold.
    required = max(min_samples, 2)

    slope_list = []
    intercept_list = []
    for s in stock_list:
        df_s = df_stock[df_stock['code'] == s][['date', 'pct_chg']].copy()
        # Fill gaps per stock so values never leak from one stock into another.
        df_s['pct_chg'] = df_s['pct_chg'].ffill().bfill()
        # Keep only dates present in both series; unmatched dates would
        # otherwise feed NaN into the least-squares fit.
        df_merge = (
            df_s.merge(market_ret, on='date', how='inner')
            .dropna(subset=['pct_chg', 'pct_chg_m'])
        )

        slope, intercept = 0, 0
        if len(df_merge) >= required:
            # Trim the largest absolute stock moves before fitting.
            thresh = df_merge['pct_chg'].abs().quantile(anomaly_q)
            df_fit = df_merge[df_merge['pct_chg'].abs() <= thresh]
            if len(df_fit) >= 2:
                x = df_fit['pct_chg_m'].values
                y = df_fit['pct_chg'].values
                # polyfit returns the coefficients as [slope, intercept].
                slope, intercept = (float(c) for c in np.polyfit(x, y, 1))
        slope_list.append(slope)
        intercept_list.append(intercept)

    return slope_list, intercept_list

# Sample call: returns (slopes, intercepts) for three stocks against the default market index.
get_slope(['sh.688008', 'sh.688009', 'sh.688012'],'2024-01-01','2024-10-31')

login success!
logout success!
login success!
logout success!


([1.1682724633299075, 1.0546690400831076, 0.9200799520217955],
 [-0.03659805217486814, 0.06545144000491784, -0.12228545413840136])

In [5]:
def get_kld(stock_list, stt_date, end_date, market='sh.000001', bins=50, min_samples=122):
    """
    Compute the KL divergence of each stock's return distribution from the
    market index's return distribution.

    Both distributions are estimated as histograms over a shared set of bin
    edges, and the divergence is ``entropy(stock_hist, market_hist)``, i.e.
    KL(stock || market).

    Parameters
    ----------
    stock_list : iterable of str
        Stock codes to evaluate.
    stt_date, end_date : str
        Date range forwarded unchanged to ``get_history_k``.
    market : str
        Code of the market index used as the reference distribution.
    bins : int
        Number of histogram bins spanning the combined value range.
    min_samples : int
        Minimum number of stock observations required.

    Returns
    -------
    list
        One value per entry of ``stock_list``, in the same order. The sentinel
        9999 is used when the stock has fewer than ``max(min_samples, 1)``
        usable observations or the market series is empty.

    Notes
    -----
    ``get_history_k`` is a project helper; this function relies only on the
    ``code`` and ``pct_chg`` columns of the frame it returns.
    """
    # Market index returns; Series.ffill()/bfill() replace the deprecated
    # fillna(method=...) spelling, and dropna() removes an all-NaN series.
    df_market = get_history_k([market], stt_date, end_date)
    market_returns = df_market['pct_chg'].ffill().bfill().dropna().values

    df_stock = get_history_k(stock_list, stt_date, end_date)

    # A histogram of zero observations is meaningless, whatever the threshold.
    required = max(min_samples, 1)

    kl_list = []
    for s in stock_list:
        # Fill gaps per stock so values never leak from one stock into another.
        stock_returns = (
            df_stock[df_stock['code'] == s]['pct_chg'].ffill().bfill().dropna().values
        )
        if len(stock_returns) < required or len(market_returns) == 0:
            kl_div = 9999
        else:
            range_max = max(stock_returns.max(), market_returns.max())
            range_min = min(stock_returns.min(), market_returns.min())

            # Market distribution over the combined range.
            market_hist, bin_edges = np.histogram(
                market_returns, bins=bins, range=(range_min, range_max), density=True
            )
            # Reuse the same edges for the stock; `range` is ignored by NumPy
            # when explicit edges are given, so it is not passed here.
            stock_hist, _ = np.histogram(stock_returns, bins=bin_edges, density=True)

            # The small offset keeps empty bins from producing log(0) or division by zero.
            kl_div = float(entropy(stock_hist + 1e-10, market_hist + 1e-10))
        kl_list.append(kl_div)

    return kl_list

# Sample call: one KL-divergence value per stock against the default market index.
get_kld(['sh.688008', 'sh.688009', 'sh.688012'],'2024-01-01','2024-10-31')

login success!
logout success!
login success!
logout success!


[4.464426743137549, 2.8101924814932464, 2.8668645462127915]

In [6]:
# Observation date for the snapshot, and the look-back window for the metrics.
watch_date = '2024-11-27'
stt_date = '2024-01-01'
end_date = watch_date

# Stock universe; alternative universes are kept commented out.
# ChiNext board
# stock_list = get_index_components()
stock_list = get_csindex_components()
# stock_list = get_csindex_components('000300')
# stock_list = ['sz.000001','sz.000002']

# Snapshot rows for the observation date.
df_snapshot = get_history_k(stock_list, watch_date, watch_date)

# Build the metrics keyed by stock code and join on `code`. Assigning the
# lists positionally would misalign, or fail on a length mismatch, whenever a
# stock has no row on `watch_date` or the snapshot order differs from
# `stock_list`.
corr_list = get_corr(stock_list, stt_date, end_date)
beta_list, alpha_list = get_slope(stock_list, stt_date, end_date)
df_metrics = pd.DataFrame({
    'code': list(stock_list),
    'corr': corr_list,
    'beta': beta_list,
    'alpha': alpha_list,
    # 'kld': get_kld(stock_list, stt_date, end_date),
}).drop_duplicates('code')

df_res = df_snapshot.merge(df_metrics, on='code', how='left')

df_res

login success!
logout success!
login success!


In [17]:
# Single-stock check with a tighter outlier cutoff (anomaly_q=0.9) and a
# lower sample requirement (min_samples=42) than the function defaults.
get_slope(['sz.300413'],'2024-07-01', '2024-11-27',anomaly_q=0.9,min_samples=42)

login success!
logout success!
login success!
logout success!


([1.2213381976916489], [0.13984466506896429])

In [9]:
# Screen: pe_ttm strictly between 0 and 30 with beta above 1.5, ordered by beta ascending.
(
    df_res[['code', 'pe_ttm', 'corr', 'beta', 'alpha']]
    .query("pe_ttm>0 and pe_ttm<30 and beta>1.5")
    .sort_values('beta')
)


Unnamed: 0,code,pe_ttm,corr,beta,alpha
0,sz.300759,26.110839,0.679881,1.500344,-0.10411
0,sz.000733,19.818876,0.643685,1.513428,-0.267083
0,sh.601888,26.183102,0.737669,1.544647,-0.220867
0,sz.002271,27.739582,0.677215,1.561379,-0.209316
0,sh.603799,16.254035,0.670019,1.58211,-0.144457
0,sz.300014,26.252388,0.71418,1.613781,-0.12392
0,sz.300413,16.987376,0.75465,1.614594,-0.053663
0,sh.603659,28.463791,0.599452,1.684209,-0.200334
0,sz.300442,25.597937,0.637042,1.798784,-0.044853


In [17]:
# Show the result row(s) for one specific stock code.
df_res.loc[df_res['code'].eq('sh.601336')]

Unnamed: 0,date,code,open,high,low,close,preclose,volume,amount,adjustflag,...,pct_chg,pe_ttm,ps_ttm,pcf_ncf_ttm,pb_mrq,is_st,corr,beta,alpha,kld
0,2024-11-20,sh.601336,62.097109,64.417082,61.605375,63.206661,62.046675,21032684,1048032000.0,1,...,1.8695,7.87823,1.340639,24.192895,1.717757,0,0.761284,1.367672,0.081557,2.593346


In [8]:
# Order the full result table by pe_ttm, lowest first (ascending is the default).
df_res.sort_values(by='pe_ttm')

Unnamed: 0,date,code,open,high,low,close,preclose,volume,amount,adjustflag,...,pct_chg,pe_ttm,ps_ttm,pcf_ncf_ttm,pb_mrq,is_st,corr,beta,alpha,kld
0,2024-11-20,sh.688256,456.0,479.65,449.01,475.0,461.0,9330080,4328962000.0,1,...,3.0369,-259.138173,264.783851,-244.801632,38.591205,0,0.481804,1.355326,0.293206,6.694667
0,2024-11-20,sh.688303,31.017118,31.338482,30.385473,31.061445,31.006037,7931781,220894400.0,1,...,0.1787,-133.149165,6.358324,-3.510797,1.434533,0,0.527468,1.121999,-0.254985,3.130423
0,2024-11-20,sh.688599,25.716825,25.80205,25.258737,25.429188,25.812704,17933932,429405400.0,1,...,-1.4858,-132.626731,0.545186,48.466446,1.83178,0,0.612699,1.323233,-0.305746,4.771022
0,2024-11-20,sh.688047,146.91,153.0,144.9,150.44,145.0,5983424,895990600.0,1,...,3.7517,-129.650361,143.919499,-371.146397,18.787104,0,0.604875,1.359248,-0.118652,4.79661
0,2024-11-20,sh.688126,23.060881,23.481993,22.870378,23.100987,23.221305,21791512,504009600.0,1,...,-0.5181,-112.5318,19.30491,-100.893684,4.889739,0,0.522164,1.275613,-0.148985,3.269046
0,2024-11-20,sz.300316,341.507849,344.839169,335.797016,341.507849,343.697002,17981305,642765100.0,1,...,-0.6369,11.73586,2.473046,-165.743142,2.7573,0,0.665524,1.350124,-0.275805,5.08969
0,2024-11-20,sz.300274,804.57886,812.927901,792.199247,797.861241,816.478643,35458120,2954184000.0,1,...,-2.2802,17.559532,2.274508,26.27238,5.045724,0,0.592795,1.023036,-0.023142,2.877254
0,2024-11-20,sh.688169,618.435044,618.435044,610.775527,612.562748,620.477582,2299445,497082900.0,1,...,-1.2756,18.440294,4.000019,265.657703,3.242635,0,0.54024,0.906633,0.075967,3.937928
0,2024-11-20,sh.688036,145.494291,146.605759,140.631617,144.058644,146.960811,12259867,1132649000.0,1,...,-1.9748,19.15258,1.508946,-34.759727,5.245357,0,0.561225,0.988433,0.037747,3.758485
0,2024-11-20,sh.688187,50.506274,51.026421,50.256603,50.818362,50.70393,4308116,209688000.0,1,...,0.2257,19.402864,2.87786,-87.914774,1.700859,0,0.62387,1.190289,0.024604,3.301292


In [33]:
# NOTE(review): leftover scratch cell. A bare name only displays the function
# object; the saved output (a ValueError about unequal lengths) cannot come from
# this expression, so the cell appears to have been edited after it last ran.
pearsonr

ValueError: x and y must have the same length.

In [11]:
# NOTE(review): interactive help lookup left over from development; consider
# removing it from the finished notebook.
?np.corrcoef

[0;31mSignature:[0m
[0mnp[0m[0;34m.[0m[0mcorrcoef[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mx[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrowvar[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mddof[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return Pearson product-moment correlation coefficients.

Please refer to the documentation for `cov` for more detail.  The
relationship between the correlation coefficient matrix, `R`, and the
covariance matrix, `C`, is

.. math:: R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} C_{jj} } }

The v