In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import polars as pl

from typing import Literal

from src.factor.calculator import StockFactorCalculator
from src.data import DATAVENDOR

def get_yoy_latest(numerator : str , date : int , lastn : int = 20 , **kwargs):
    '''
    Latest year-over-year change of a financial expression, one value per secid.

    numerator : expression handed to DATAVENDOR.get_fin_hist (statement@field@fin_type)
    date      : as-of date forwarded to get_fin_hist
    lastn     : third positional argument of get_fin_hist
                (presumably the number of report periods to load -- TODO confirm)
    kwargs    : forwarded unchanged to get_fin_hist
    '''
    history = DATAVENDOR.get_fin_hist(numerator , date , lastn , **kwargs)
    # only the first column of the returned frame is used
    first_column = history.iloc[:, 0]
    return calc_yoy_latest(first_column)

def get_qoq_latest(numerator : str , date : int , lastn : int = 20 , **kwargs):
    '''
    Latest quarter-over-quarter change of a financial expression, one value per secid.

    numerator : expression handed to DATAVENDOR.get_fin_hist (statement@field@fin_type)
    date      : as-of date forwarded to get_fin_hist
    lastn     : third positional argument of get_fin_hist
                (presumably the number of report periods to load -- TODO confirm)
    kwargs    : forwarded unchanged to get_fin_hist
    '''
    history = DATAVENDOR.get_fin_hist(numerator , date , lastn , **kwargs)
    # only the first column of the returned frame is used
    first_column = history.iloc[:, 0]
    return calc_qoq_latest(first_column)

def get_compound_growth(numerator: str , date: int , n_year : int = 5 , **kwargs):
    '''
    Annualized simple growth between the first and the last available observation of each secid.

    Simple (not compounded) growth is used on purpose: compounding cannot deal with
    growth below -100%.

    numerator : expression handed to DATAVENDOR.get_fin_hist
    date      : as-of date forwarded to get_fin_hist
    n_year    : 4 * n_year + 1 periods are requested from get_fin_hist
    kwargs    : forwarded to get_fin_hist, as the sibling getters do
                (previously accepted but silently ignored)

    Returns a Series indexed by secid. A secid with a single usable observation yields NaN;
    a secid with no usable observation is absent from the result.
    '''
    hist = DATAVENDOR.get_fin_hist(numerator , date , 4*n_year + 1 , pivot = False , **kwargs).iloc[:,0]

    # groupby.first()/last() skip NA independently per column. Without dropping missing values
    # first, the selected end_date and the selected value could come from different rows and the
    # annualization span below would not match the two values being compared.
    # sort_index() makes "first"/"last" mean earliest/latest period within each secid.
    df = hist.dropna().sort_index().reset_index('end_date',drop=False)
    df = pd.concat([df.groupby('secid').first() , df.groupby('secid').last()], axis=0)
    val = df.columns[-1]

    # quarter counter: year * 4 + quarter-of-year (end_date is treated as a yyyymmdd integer)
    df['qtrs'] = (df['end_date'] // 10000) * 4 + df['end_date'] % 10000 // 300
    df = df.set_index('end_date',append=True).sort_index()

    # simple growth scaled to a per-year rate: pct_change * 4 / quarters elapsed
    growth = df.groupby('secid')[val].pct_change() * 4 / df.groupby('secid')['qtrs'].diff()
    return growth.groupby('secid').last()

def get_reg_growth(numerator: str , date: int , n_year : int = 5 , **kwargs):
    '''
    Regression-based growth: per secid, the OLS slope of the mean-normalized series against
    a 1..k step counter, divided by the mean of the regressed points.

    numerator : expression handed to DATAVENDOR.get_fin_hist
    date      : as-of date forwarded to get_fin_hist
    n_year    : n_year * 4 + 1 periods are requested from get_fin_hist
    kwargs    : forwarded unchanged to get_fin_hist

    Returns a pandas Series indexed by secid; NaN where the regression raises.
    '''
    def _std_beta(args) -> pl.Series:
        # every 4th observation counted back from the latest one, restored to original order
        yearly = args[0].to_numpy()[::-4][::-1]
        steps = np.arange(1, len(yearly) + 1)
        try:
            fitted = sm.OLS(yearly, sm.add_constant(steps)).fit()
            slope = fitted.params[1] / yearly.mean()
        except Exception:
            # any failure of the fit (or of reading the slope) is mapped to NaN
            slope = np.nan
        return pl.Series([slope], dtype=pl.Float64)

    series = DATAVENDOR.get_fin_hist(numerator , date , n_year * 4 + 1 , pivot = False ,**kwargs).iloc[:,0]
    col = str(series.name)
    value = pl.col(col)

    frame = pl.from_pandas(series.to_frame() , include_index=True)
    # scale each secid's values by that secid's mean
    frame = frame.with_columns((value / value.mean().over('secid')).alias(col))
    # infinite ratios are replaced by 0, then rows containing nulls are dropped
    frame = frame.with_columns(pl.when(value.is_infinite()).then(0).otherwise(value).alias(col))
    frame = frame.drop_nulls()

    ordered = frame.sort(['secid','end_date'])
    per_secid = ordered.group_by('secid', maintain_order=True).agg(
        pl.map_groups(exprs=[col], function=_std_beta))
    return per_secid.to_pandas().set_index('secid').iloc[:,0]

def get_yoy_zscore(numerator : str , date : int , n_last : int = 20 , **kwargs):
    '''
    Z-score of the latest year-over-year growth of a financial expression, per secid:
    (latest yoy - mean of yoy) / std of yoy.

    The previous implementation standardized the raw level of the series and never computed
    a year-over-year change, which contradicted both the function name and the factor
    descriptions of its callers ("standardized score of single-quarter YoY growth").

    numerator : expression handed to DATAVENDOR.get_fin_hist
    date      : as-of date forwarded to get_fin_hist
    n_last    : number of yoy observations intended for the z-score window
    kwargs    : forwarded unchanged to get_fin_hist

    Returns a Series indexed by secid.
    '''
    # calc_yoy needs a base 4 periods back, so 4 extra periods are requested to keep n_last
    # yoy observations (assumes the third argument of get_fin_hist is a period count -- TODO confirm)
    level = DATAVENDOR.get_fin_hist(numerator , date , n_last + 4 , pivot = False , **kwargs).iloc[:,0]
    yoy = calc_yoy(level)

    # groupby last/mean/std all skip NaN, so periods without a usable yoy base do not contribute
    grp = yoy.groupby('secid')
    return (grp.last() - grp.mean()) / grp.std()

def calc_yoy(data : pd.Series):
    '''
    Year-over-year growth of a (secid , end_date)-indexed series: (x_t - x_{t-4}) / |x_{t-4}|.

    data : Series whose MultiIndex has levels named 'secid' and 'end_date', in that order.

    Returns a Series on the same index as `data`. The result is NaN where the value itself,
    or its base 4 periods earlier, is missing, and where the ratio is infinite (zero base).
    '''
    secids = data.index.get_level_values('secid').unique()
    # unique() keeps order of first appearance; if the first secid lacks an early period the
    # dates would come out non-chronological and shift(4) would pick the wrong base period.
    end_dates = data.index.get_level_values('end_date').unique().sort_values()

    # complete secid x end_date grid so that a positional shift equals a shift in periods
    full_index = pd.MultiIndex.from_product([secids , end_dates] , names = ['secid' , 'end_date'])
    df_yoy = data.reindex(full_index)
    df_yoy_base = df_yoy.groupby('secid').shift(4)
    df_yoy = (df_yoy - df_yoy_base) / df_yoy_base.abs()

    # back to the caller's index; keep NaN where the input was NaN and neutralize +-inf
    df_yoy = df_yoy.reindex(data.index).where(~data.isna() , np.nan).replace([np.inf , -np.inf] , np.nan)
    return df_yoy

def calc_qoq(data : pd.Series):
    '''
    Quarter-over-quarter growth of a (secid , end_date)-indexed series: (x_t - x_{t-1}) / |x_{t-1}|.

    data : Series whose MultiIndex has levels named 'secid' and 'end_date', in that order.

    Returns a Series on the same index as `data`. The result is NaN where the value itself,
    or its base one period earlier, is missing, and where the ratio is infinite (zero base).
    '''
    secids = data.index.get_level_values('secid').unique()
    # unique() keeps order of first appearance; if the first secid lacks an early period the
    # dates would come out non-chronological and shift(1) would pick the wrong base period.
    end_dates = data.index.get_level_values('end_date').unique().sort_values()

    # complete secid x end_date grid so that a positional shift equals a shift in periods
    full_index = pd.MultiIndex.from_product([secids , end_dates] , names = ['secid' , 'end_date'])
    df_qoq = data.reindex(full_index)
    df_qoq_base = df_qoq.groupby('secid').shift(1)
    df_qoq = (df_qoq - df_qoq_base) / df_qoq_base.abs()

    # back to the caller's index; keep NaN where the input was NaN and neutralize +-inf
    df_qoq = df_qoq.reindex(data.index).where(~data.isna() , np.nan).replace([np.inf , -np.inf] , np.nan)
    return df_qoq

def calc_yoy_latest(data : pd.Series):
    '''Most recent non-missing year-over-year growth (see calc_yoy) for each secid.'''
    yoy = calc_yoy(data)
    observed = yoy.dropna()
    return observed.groupby('secid').last()

def calc_qoq_latest(data : pd.Series):
    '''Most recent non-missing quarter-over-quarter growth (see calc_qoq) for each secid.'''
    qoq = calc_qoq(data)
    observed = qoq.dropna()
    return observed.groupby('secid').last()

def calc_trend(data : pd.Series):
    '''
    Per-secid linear trend of a (secid , end_date)-indexed series: the OLS slope of the values
    against a 1..k step counter, divided by the mean of those values.

    data : Series with index levels 'secid' and 'end_date'; an unnamed (falsy-named) Series
           is renamed to 'data' first.

    Returns a pandas Series indexed by secid; NaN where the regression raises.
    '''
    def _trend(args) -> pl.Series:
        obs = args[0].to_numpy()
        steps = np.arange(1, len(obs) + 1)
        try:
            # NOTE(review): the slope is scaled by the plain mean, so a negative mean flips
            # its sign -- confirm this is intended.
            slope = sm.OLS(obs, sm.add_constant(steps)).fit().params[1] / obs.mean()
        except Exception:
            # any failure of the fit (or of reading the slope) is mapped to NaN
            slope = np.nan
        return pl.Series([slope], dtype=pl.Float64)

    if not data.name:
        data = data.rename('data')
    col = str(data.name)
    value = pl.col(col)

    frame = pl.from_pandas(data.to_frame() , include_index=True)
    # infinite values are replaced by 0, then rows containing nulls are dropped
    frame = frame.with_columns(pl.when(value.is_infinite()).then(0).otherwise(value).alias(col))
    frame = frame.drop_nulls()

    ordered = frame.sort(['secid','end_date'])
    per_secid = ordered.group_by('secid', maintain_order=True).agg(
        pl.map_groups(exprs=[col], function=_trend))
    return per_secid.to_pandas().set_index('secid').iloc[:,0]



src.basic.INSTANCE_RECORD can be accessed to check ['trainer', 'account']
Basic module imported!


In [2]:
class gp_ta_qtr_qoq(StockFactorCalculator):
    '''Growth factor: latest quarter-over-quarter change of the quarterly gp / ta expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '毛利润/总资产环比变化值'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_qoq_latest.'''
        expression = 'gp@qtr/ta@qtr'
        return get_qoq_latest(expression , date)
    
class gp_ta_qtr_yoy(StockFactorCalculator):
    '''Growth factor: latest year-over-year change of the quarterly gp / ta expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '毛利润/总资产同比变化值'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_yoy_latest.'''
        expression = 'gp@qtr/ta@qtr'
        return get_yoy_latest(expression , date)

class gp_ttm_rank_delta(StockFactorCalculator):
    '''
    Growth factor: change, versus 4 rows earlier within the same secid, of the within-industry
    percentile rank of the 'gp@ttm' value.
    '''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '毛利润TTM行业内分位数之差'

    def calc_factor(self, date: int):
        '''
        Evaluate the factor as of `date`.

        Loads 5 periods of 'gp@ttm', attaches an industry label, ranks the value in percent
        within each (end_date , indus) group, and returns the latest non-missing difference
        between a rank and the rank 4 rows earlier for the same secid.
        '''
        df = DATAVENDOR.get_fin_hist('gp@ttm' , date , 5)
        # Use the frame's own value column instead of a hard-coded vendor field name, so the
        # ranked column is always the one returned by the final .iloc[:,0].
        value_col = df.columns[0]
        df = DATAVENDOR.INFO.add_indus(df , date , 'unknown')

        df[value_col] = df.groupby(['end_date' , 'indus'])[value_col].rank(pct=True)
        df = df.drop(columns = ['indus'])
        # NOTE(review): shift(4) is positional within a secid; it equals "one year earlier"
        # only when no period is missing for that secid -- confirm against get_fin_hist.
        return (df - df.groupby('secid').shift(4)).dropna().groupby('secid').last().iloc[:,0]

class gp_ta_qoq_trend(StockFactorCalculator):
    '''Growth factor: trend of the quarter-over-quarter change of the quarterly gp / ta expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '毛利润/总资产环比变化趋势'

    def calc_factor(self, date: int):
        '''Load 20 periods of gp@qtr/ta@qtr as of `date`, take calc_qoq, then calc_trend.'''
        history = DATAVENDOR.get_fin_hist('gp@qtr/ta@qtr' , date , 20)
        ratio = history.iloc[:, 0]
        return calc_trend(calc_qoq(ratio))

class gp_ta_yoy_trend(StockFactorCalculator):
    '''Growth factor: trend of the year-over-year change of the quarterly gp / ta expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '毛利润/总资产同比变化趋势'

    def calc_factor(self, date: int):
        '''Load 20 periods of gp@qtr/ta@qtr as of `date`, take calc_yoy, then calc_trend.'''
        history = DATAVENDOR.get_fin_hist('gp@qtr/ta@qtr' , date , 20)
        ratio = history.iloc[:, 0]
        return calc_trend(calc_yoy(ratio))

class liab_yoy(StockFactorCalculator):
    '''Growth factor: latest year-over-year change rate of the 'liab@qtr' expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '总负债同比变化率'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_yoy_latest.'''
        expression = 'liab@qtr'
        return get_yoy_latest(expression , date)

class npro_czscore(StockFactorCalculator):
    '''Growth factor: get_yoy_zscore applied to the 'npro@qtr' expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = 'npro单季度同比增速的标准化得分'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_yoy_zscore.'''
        expression = 'npro@qtr'
        return get_yoy_zscore(expression , date)

class dedt_equ_qtr_qoq(StockFactorCalculator):
    '''Growth factor: latest quarter-over-quarter change of the quarterly dedt / equ expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '扣非归母净利润/净资产环比变化值'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_qoq_latest.'''
        expression = 'dedt@qtr/equ@qtr'
        return get_qoq_latest(expression , date)
    
class dedt_equ_qtr_yoy(StockFactorCalculator):
    '''Growth factor: latest year-over-year change of the quarterly dedt / equ expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '扣非归母净利润/净资产同比变化值'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_yoy_latest.'''
        expression = 'dedt@qtr/equ@qtr'
        return get_yoy_latest(expression , date)

class dedt_ttm_yoy(StockFactorCalculator):
    '''Growth factor: latest year-over-year change rate of the 'dedt@ttm' expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '扣非归母净利润TTM同比变化率'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_yoy_latest.'''
        expression = 'dedt@ttm'
        return get_yoy_latest(expression , date)

class dedt_qtr_yoy(StockFactorCalculator):
    '''Growth factor: latest year-over-year change rate of the 'dedt@qtr' expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = '扣非归母净利润单季度同比变化率'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_yoy_latest.'''
        expression = 'dedt@qtr'
        return get_yoy_latest(expression , date)

class npro_op_zscore(StockFactorCalculator):
    '''Growth factor: get_yoy_zscore applied to the 'npro@qtr/oper_np@qtr' expression.'''
    init_date = 20070101
    category0 = 'fundamental'
    category1 = 'growth'
    description = 'npro_op单季度同比增速的标准化得分'

    def calc_factor(self, date: int):
        '''Evaluate the factor as of `date` by delegating to get_yoy_zscore.'''
        expression = 'npro@qtr/oper_np@qtr'
        return get_yoy_zscore(expression , date)
    

# Evaluation date used by the ad-hoc checks in later cells (integer, apparently yyyymmdd like init_date above).
date = 20241203


In [10]:
# Inspect the industry lookup for `date` (assigned in an earlier cell); the recorded output is an
# object array of labels such as 'bank' and 'electronic'.
DATAVENDOR.INFO.get_indus(date)

array([['bank'],
       ['bank'],
       ['bank'],
       ...,
       ['electronic'],
       ['electronic'],
       ['electronic']], dtype=object)

In [1]:
# Build the factor hierarchy object and list the distinct file_name values in its factor table
# (the recorded output shows entries such as 'fundamental\\growth_yoyzscore').
from src.api import FactorAPI
hier = FactorAPI.factor_hierarchy()
hier.factor_df()['file_name'].unique()


src.basic.INSTANCE_RECORD can be accessed to check ['trainer', 'account']
Basic module imported!


array(['behavior\\behavior_anndt', 'behavior\\behavior_ff3',
       'behavior\\behavior_slice', 'behavior\\correlation_beta',
       'behavior\\correlation_vp', 'behavior\\liquidity_cap',
       'behavior\\liquidity_cv', 'behavior\\liquidity_illiquid',
       'behavior\\liquidity_moneyflow', 'behavior\\liquidity_turnover',
       'behavior\\momentum_classic', 'behavior\\momentum_mdr',
       'behavior\\momentum_phigh', 'behavior\\momentum_weivol',
       'behavior\\volatility_exret', 'behavior\\volatility_ret',
       'behavior\\volatility_skew', 'fundamental\\earning_classic',
       'fundamental\\earning_sue', 'fundamental\\earning_tsreg',
       'fundamental\\growth_acc', 'fundamental\\growth_classic',
       'fundamental\\growth_long', 'fundamental\\growth_rankdelta',
       'fundamental\\growth_trend', 'fundamental\\growth_yoyzscore',
       'fundamental\\quality', 'fundamental\\valuation'], dtype=object)

In [2]:
# Test-calculate the factors with multi-threading on; the recorded log prints one
# "<factor> calculated , valid_num is N" line per factor.
# A leftover commented-out argument (file_name = 'fundamental/growth_yoyzscore') suggests the run
# can be restricted to a single factor file -- TODO confirm against test_calc_all_factors.
factor_values = hier.test_calc_all_factors(multi_thread = True)

ff_alpha_1m calculated , valid_num is 5360
ff_mom_1m calculated , valid_num is 5360
ff_alpha_2m calculated , valid_num is 5361
ff_mom_2m calculated , valid_num is 5361
anndt_phigh calculated , valid_num is 5348
ff_r2_1m calculated , valid_num is 5360
ff_r2_2m calculated , valid_num is 5361
ff_r2_3m calculated , valid_num is 5366
ff_alpha_3m calculated , valid_num is 5366
ff_mom_3m calculated , valid_num is 5366
ff_reskurt_1m calculated , valid_num is 5358
ff_reskurt_2m calculated , valid_num is 5359
ff_reskurt_3m calculated , valid_num is 5362
mom_aaa calculated , valid_num is 5151
mom_aog calculated , valid_num is 5143
ff_resskew_1m calculated , valid_num is 5358
ff_resskew_2m calculated , valid_num is 5359
ff_resskew_3m calculated , valid_num is 5362
ff_alpha_6m calculated , valid_num is 5404
ff_r2_6m calculated , valid_num is 5404
ff_mom_6m calculated , valid_num is 5404
ff_resvol_1m calculated , valid_num is 5360
ff_resskew_6m calculated , valid_num is 5400
ff_resvol_2m calculated 

In [6]:
# Correlation between two computed factors; the recorded value (~0.9998) indicates that
# npro_qoq and net_margin_qoq are nearly identical on this run.
factor_values['npro_qoq'].corr(factor_values['net_margin_qoq'])


0.9997635020368824

In [5]:
# Instantiate the expense_sales_qtr factor and evaluate it for a single date.
# NOTE(review): the displayed head/tail values match those recorded for expense_sales_ttm in the
# next cell -- verify that the qtr and ttm variants really use different inputs.
hier.get_factor('expense_sales_qtr')().calc_factor(20241203)

secid
1              NaN
2         0.069183
3              NaN
4         0.644270
5         0.394658
            ...   
920060    0.049546
920066    0.026504
920088    0.176229
920099    0.077616
920118    0.034347
Name: expense_sales_qtr, Length: 5740, dtype: float64

In [6]:
# Instantiate the expense_sales_ttm factor and evaluate it for a single date.
# NOTE(review): the displayed head/tail values match those recorded for expense_sales_qtr in the
# previous cell (only the Length differs) -- verify that the two variants use different inputs.
hier.get_factor('expense_sales_ttm')().calc_factor(20241203)

secid
1              NaN
2         0.069183
3              NaN
4         0.644270
5         0.394658
            ...   
920060    0.049546
920066    0.026504
920088    0.176229
920099    0.077616
920118    0.034347
Name: expense_sales_ttm, Length: 5772, dtype: float64

In [7]:
# Instantiate the ff_alpha_1m factor and evaluate it for a single date; the recorded output is a
# float Series indexed by secid.
hier.get_factor('ff_alpha_1m')().calc_factor(20241203)


secid
1         0.001472
2        -0.003134
4        -0.006647
6        -0.001304
7        -0.004757
            ...   
920088   -0.001742
920099    0.000018
920111    0.012260
920118    0.010361
920128   -0.002829
Name: ff_alpha_1m, Length: 5372, dtype: float64

In [7]:
# Refresh the job list for file_name 'liquidity_illiq', then fetch the jobs into `df`.
# (Presumably the name is matched against factor file names; the recorded output lists only
# illiq_* factors -- TODO confirm the matching rule.)
hier.update_jobs(file_name = 'liquidity_illiq')
df = hier.jobs()


array([illiq_12m(from20070101,behavior,liquidity)[0dates],
       illiq_1m(from20070101,behavior,liquidity)[0dates],
       illiq_2m(from20070101,behavior,liquidity)[0dates],
       illiq_3m(from20070101,behavior,liquidity)[0dates],
       illiq_6m(from20070101,behavior,liquidity)[0dates]], dtype=object)

In [5]:
# Fetch the current jobs and show the distinct entries of the 'factor' column
# (relies on the job list prepared by an earlier update_jobs call).
df = hier.jobs()
df['factor'].unique()


array([illiq_12m(from20070101,behavior,liquidity)[0dates],
       illiq_1m(from20070101,behavior,liquidity)[0dates],
       illiq_2m(from20070101,behavior,liquidity)[0dates],
       illiq_3m(from20070101,behavior,liquidity)[0dates],
       illiq_6m(from20070101,behavior,liquidity)[0dates]], dtype=object)

In [6]:
# Run the update; the recorded log prints "Factor : <name> at date <date> deploy successful"
# for each factor/date pair, starting at 20070104.
hier.Update()

Factor : illiq_12m at date 20070104 deploy successful
Factor : illiq_1m at date 20070104 deploy successful
Factor : illiq_2m at date 20070104 deploy successful
Factor : illiq_3m at date 20070104 deploy successful
Factor : illiq_6m at date 20070104 deploy successful
Factor : illiq_12m at date 20070105 deploy successful
Factor : illiq_1m at date 20070105 deploy successful
Factor : illiq_2m at date 20070105 deploy successful
Factor : illiq_3m at date 20070105 deploy successful
Factor : illiq_6m at date 20070105 deploy successful
Factor : illiq_12m at date 20070108 deploy successful
Factor : illiq_1m at date 20070108 deploy successful
Factor : illiq_2m at date 20070108 deploy successful
Factor : illiq_3m at date 20070108 deploy successful
Factor : illiq_6m at date 20070108 deploy successful
Factor : illiq_12m at date 20070109 deploy successful
Factor : illiq_1m at date 20070109 deploy successful
Factor : illiq_2m at date 20070109 deploy successful
Factor : illiq_3m at date 20070109 deploy 