In [8]:
import polars as pl
import numpy as np
import warnings
import numpy as np
from scipy.stats import kendalltau, rankdata, spearmanr
from scipy import stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from typing import Union, List
from tqdm import tqdm
# import matplotlib.pyplot as plt
# from eval_utils import calculate_statistics
print(f"Polars version: {pl.__version__}")

from eval_utils import calculate_statistics, plot_array

warnings.filterwarnings("ignore")

Polars version: 1.6.0


In [None]:
# Input parquet with the raw factor table. Alternative inputs kept for reference:
#   "data/linear_compound_factor.parquet"
#   "data/linear_compound_factor_boris.parquet"
factor_file_path = "data/result_hour_alpha101.parquet"
factor_data = pl.read_parquet(factor_file_path)

# Keep only single factors: drop every column whose name mentions "compound".
single_factor_columns = [
    name for name in factor_data.columns
    if 'compound' not in name.lower()
]
factor_data = factor_data.select(single_factor_columns)

factor_data

In [None]:

def smooth_factor_once(input_df: pl.DataFrame, column_name: str, window: int = 20, lag: int = 1, method: str = 'mean', custom_func=None) -> pl.DataFrame:
    """
    Append one rolling-smoothed version of ``column_name`` to ``input_df``.

    The new column is named ``rolling_{column_name}_{method}_{window}``. All
    rolling statistics are requested with ``min_periods=1``, so the leading rows
    are computed on a shorter, growing window instead of being null.

    The expression is evaluated on the frame exactly as given: no grouping and
    no sorting is applied here, so the caller is responsible for passing rows in
    the intended order.

    Parameters:
        input_df (pl.DataFrame): frame that contains ``column_name``.
        column_name (str): column to smooth.
        window (int): maximum rolling window size (also the EMA span). Default 20.
        lag (int): lag reserved for the self-covariance / self-correlation
            methods, which are not implemented yet. Default 1.
        method (str): one of 'mean', 'std', 'median', 'min', 'max', 'sum', 'var',
            'quantile' (rolling median via linear interpolation), 'ema',
            'zscore', 'skew', 'kurtosis', 'iqr', 'custom'.
        custom_func (callable, optional): function applied to each window when
            ``method='custom'``; it must reduce a window to a single value.

    Returns:
        pl.DataFrame: ``input_df`` with the smoothed column appended.

    Raises:
        ValueError: ``method='custom'`` without ``custom_func``, or an unknown method.
        NotImplementedError: for 'self_covariance', 'self_correlation' and 'mad',
            which are still TODO.
    """
    new_column_name = f"rolling_{column_name}_{method}_{window}"
    source = pl.col(column_name)

    if method == 'custom':
        if custom_func is None:
            raise ValueError("当 method='custom' 时，必须提供 custom_func")
        # rolling_map is the Polars 1.x name of the removed rolling_apply.
        smooth_expr = source.rolling_map(custom_func, window, min_periods=1)
    elif method == 'ema':
        smooth_expr = source.ewm_mean(span=window, min_periods=1)
    elif method == 'zscore':
        rolling_mean = source.rolling_mean(window, min_periods=1)
        rolling_std = source.rolling_std(window, min_periods=1)
        smooth_expr = (source - rolling_mean) / rolling_std
    elif method in ['skew', 'kurtosis', 'iqr']:
        # Each helper receives one window and must hand back a plain float.
        if method == 'skew':
            func = lambda x: float(stats.skew(x.to_numpy(), nan_policy='omit'))
        elif method == 'kurtosis':
            func = lambda x: float(stats.kurtosis(x.to_numpy(), nan_policy='omit'))
        else:  # iqr
            func = lambda x: float(np.percentile(x.to_numpy(), 75) - np.percentile(x.to_numpy(), 25))
        smooth_expr = source.rolling_map(func, window, min_periods=1)
    elif method in ['mean', 'std', 'median', 'min', 'max', 'sum', 'var']:
        # These map one-to-one onto the built-in Expr.rolling_<method> functions.
        smooth_expr = getattr(source, f"rolling_{method}")(window, min_periods=1)
    elif method == 'quantile':
        # `interpolation` controls how the quantile is interpolated; accepted values
        # are "linear", "lower", "higher", "nearest" and "midpoint".
        smooth_expr = source.rolling_quantile(0.5, interpolation='linear', window_size=window, min_periods=1)
    elif method in ('self_covariance', 'self_correlation', 'mad'):
        # Explicit error instead of `assert 0`, which disappears under `python -O`.
        raise NotImplementedError(f"todo: method '{method}' is not implemented yet")
    else:
        raise ValueError(f"未实现的方法: {method}")
    return input_df.with_columns(smooth_expr.alias(new_column_name))

In [None]:

def custom_rolling_rank(series: pl.Series, window: int) -> pl.Series:
    """
    Rolling rank of the newest observation inside its trailing window.

    For every position, the values of the trailing window (at most ``window``
    long, at least one value) are ranked in descending order with ties sharing
    the average rank, and the rank of the window's last element is returned.
    A result of 1 therefore means the newest value is the largest in its window.

    ``smooth_factor`` calls this with a ``pl.col(...)`` expression rather than a
    Series; only ``rolling_map`` is required of ``series``.
    """
    def _rank_of_latest(values: pl.Series) -> float:
        descending_ranks = values.rank(method="average", descending=True)
        last_position = descending_ranks.len() - 1
        return descending_ranks[last_position]

    return series.rolling_map(_rank_of_latest, window_size=window, min_periods=1)

def smooth_factor(
    input_df: pl.DataFrame,
    factor_name: str,
    window: int = 20,
    method: Union[str, List[str], None] = None,
) -> pl.DataFrame:
    """
    Append rolling/derived features of ``factor_name``, computed per symbol.

    The frame is grouped by ``symbol``; each group is sorted by ``open_time`` and
    the selected feature columns are added. ``input_df`` must therefore contain
    the columns ``symbol``, ``open_time`` and ``factor_name``.

    Parameters:
        input_df: source frame.
        factor_name: column the features are derived from.
        window: rolling window size (also used as the EWM span).
        method: which features to compute.
            * ``None`` (default) computes the full default set, in this order:
              mean, sum, std, skew, var, quantile_50, self_corr_lag1,
              self_corr_lag2, self_cov_lag1, self_cov_lag2, ewm_mean, ewm_std,
              ewm_var, range, zscore, diff_1, diff_2, pct_change_1,
              pct_change_2, rank.
            * a name or a list of names computes only those features. Besides
              the default names, "median", "min" and "max" may be requested.

    Returns:
        pl.DataFrame: all input columns plus the requested feature columns.
        Rolling features are named ``rolling_{factor_name}_{feature}_{window}``;
        the diff / pct_change features are named ``{factor_name}_{feature}`` and
        do not depend on ``window``.

    Raises:
        ValueError: if ``method`` names a feature that is not available.
    """
    factor = pl.col(factor_name)

    def rolling_name(feature: str) -> str:
        return f"rolling_{factor_name}_{feature}_{window}"

    # Insertion order below is the output column order of the default set.
    default_features = {
        "mean": factor.rolling_mean(window_size=window, min_periods=1).alias(rolling_name("mean")),
        "sum": factor.rolling_sum(window_size=window, min_periods=1).alias(rolling_name("sum")),
        "std": factor.rolling_std(window_size=window, min_periods=1).alias(rolling_name("std")),
        "skew": factor.rolling_skew(window_size=window).alias(rolling_name("skew")),
        "var": factor.rolling_var(window_size=window, min_periods=1).alias(rolling_name("var")),
        "quantile_50": factor.rolling_quantile(
            window_size=window, min_periods=1, quantile=0.5
        ).alias(rolling_name("quantile_50")),
        # Rolling correlation / covariance of the factor with its own lagged copy.
        "self_corr_lag1": pl.rolling_corr(
            factor, factor.shift(1), window_size=window, min_periods=1
        ).alias(rolling_name("self_corr_lag1")),
        "self_corr_lag2": pl.rolling_corr(
            factor, factor.shift(2), window_size=window, min_periods=1
        ).alias(rolling_name("self_corr_lag2")),
        "self_cov_lag1": pl.rolling_cov(
            factor, factor.shift(1), window_size=window, min_periods=1
        ).alias(rolling_name("self_cov_lag1")),
        "self_cov_lag2": pl.rolling_cov(
            factor, factor.shift(2), window_size=window, min_periods=1
        ).alias(rolling_name("self_cov_lag2")),
        "ewm_mean": factor.ewm_mean(span=window, adjust=True).alias(rolling_name("ewm_mean")),
        "ewm_std": factor.ewm_std(span=window, adjust=True).alias(rolling_name("ewm_std")),
        "ewm_var": factor.ewm_var(span=window, adjust=True).alias(rolling_name("ewm_var")),
        # range: rolling max - rolling min
        "range": (
            factor.rolling_max(window_size=window, min_periods=1)
            - factor.rolling_min(window_size=window, min_periods=1)
        ).alias(rolling_name("range")),
        # z-score of the current value against its rolling mean / std
        "zscore": (
            (factor - factor.rolling_mean(window_size=window, min_periods=1))
            / factor.rolling_std(window_size=window, min_periods=1)
        ).alias(rolling_name("zscore")),
        # differences
        "diff_1": factor.diff().alias(f'{factor_name}_diff_1'),
        "diff_2": factor.diff(2).alias(f'{factor_name}_diff_2'),
        # percentage changes
        "pct_change_1": factor.pct_change().alias(f'{factor_name}_pct_change_1'),
        "pct_change_2": factor.pct_change(2).alias(f'{factor_name}_pct_change_2'),
        # rank of the newest value inside its window
        "rank": custom_rolling_rank(factor, window).alias(rolling_name("rank")),
    }
    # Only produced on explicit request, so the default output stays unchanged.
    optional_features = {
        "median": factor.rolling_median(window_size=window, min_periods=1).alias(rolling_name("median")),
        "min": factor.rolling_min(window_size=window, min_periods=1).alias(rolling_name("min")),
        "max": factor.rolling_max(window_size=window, min_periods=1).alias(rolling_name("max")),
    }

    if method is None:
        selected = list(default_features.values())
    else:
        requested = [method] if isinstance(method, str) else list(method)
        available = {**default_features, **optional_features}
        unknown = [name for name in requested if name not in available]
        if unknown:
            raise ValueError(
                f"unsupported method(s) {unknown}; available: {list(available)}"
            )
        selected = [available[name] for name in requested]

    result_df = input_df.group_by("symbol").map_groups(
        lambda group: group.sort("open_time").with_columns(selected)
    )

    return result_df

In [None]:
# Candidate inputs: every column that is neither an identifier/time column nor an
# auto-correlation feature; only the first ten are processed.
excluded_columns = ['open_time', 'close_time', 'symbol']
factor_name = [
    col for col in factor_data.columns
    if col not in excluded_columns and 'auto_corr' not in col
][:10]
print (f'factor_name: {factor_name}')

# Derive the rolling features of each factor for three window sizes, in this order.
roll_df = factor_data.clone()
for each_factor in tqdm(factor_name, desc="Processing factors"):
    for rolling_window in (6, 20, 40):
        roll_df = smooth_factor(roll_df, each_factor, window=rolling_window)

roll_df

In [9]:
# Replace roll_df with the rolling-factor table stored on disk.
roll_df = pl.read_parquet('data/rolling_factors.parquet')
roll_df

symbol,open_time,close_time,close,volume,count,return,amihud,return_skew,return_kurtosis,alpha13,alpha15,alpha16,alpha24,alpha25,alpha26,alpha28,alpha30,alpha34,alpha35,alpha36,alpha38,alpha40,alpha44,alpha45,alpha46,alpha47,alpha50,alpha51,alpha54,alpha55,alpha62,alpha64,alpha71,alpha74,alpha81,alpha94,…,quote_volume_market_share_pct_diff_1,quote_volume_market_share_pct_diff_2,quote_volume_market_share_pct_pct_change_1,quote_volume_market_share_pct_pct_change_2,rolling_quote_volume_market_share_pct_rank_6,rolling_quote_volume_market_share_pct_mean_20,rolling_quote_volume_market_share_pct_sum_20,rolling_quote_volume_market_share_pct_std_20,rolling_quote_volume_market_share_pct_skew_20,rolling_quote_volume_market_share_pct_var_20,rolling_quote_volume_market_share_pct_quantile_50_20,rolling_quote_volume_market_share_pct_self_corr_lag1_20,rolling_quote_volume_market_share_pct_self_corr_lag2_20,rolling_quote_volume_market_share_pct_self_cov_lag1_20,rolling_quote_volume_market_share_pct_self_cov_lag2_20,rolling_quote_volume_market_share_pct_ewm_mean_20,rolling_quote_volume_market_share_pct_ewm_std_20,rolling_quote_volume_market_share_pct_ewm_var_20,rolling_quote_volume_market_share_pct_range_20,rolling_quote_volume_market_share_pct_zscore_20,rolling_quote_volume_market_share_pct_rank_20,rolling_quote_volume_market_share_pct_mean_40,rolling_quote_volume_market_share_pct_sum_40,rolling_quote_volume_market_share_pct_std_40,rolling_quote_volume_market_share_pct_skew_40,rolling_quote_volume_market_share_pct_var_40,rolling_quote_volume_market_share_pct_quantile_50_40,rolling_quote_volume_market_share_pct_self_corr_lag1_40,rolling_quote_volume_market_share_pct_self_corr_lag2_40,rolling_quote_volume_market_share_pct_self_cov_lag1_40,rolling_quote_volume_market_share_pct_self_cov_lag2_40,rolling_quote_volume_market_share_pct_ewm_mean_40,rolling_quote_volume_market_share_pct_ewm_std_40,rolling_quote_volume_market_share_pct_ewm_var_40,rolling_quote_volume_market_share_pct_range_40,
rolling_quote_volume_market_share_pct_zscore_40,rolling_quote_volume_market_share_pct_rank_40
str,datetime[ms],datetime[ms],f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,i64,i64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""WAVESUSDT""",2020-08-12 00:00:00,2020-08-12 23:59:59.999,2.169,2.2259e7,121054,0.0,0.0,,,-0.001773,-0.921434,-0.001755,-2.0804,0.497414,-0.0,-5.1585e-7,0.178982,0.317386,300.0,3.023907,-0.015818,0.533451,0.997737,0.132855,-2.0804,-0.24872,-0.931393,-2.0804,-1.192573,0.974516,0,-1,0.0,0,0,-0.327235,…,,,,,1.0,0.777309,0.777309,,,,0.777309,,,,,0.777309,0.0,0.0,0.0,,1.0,0.777309,0.777309,,,,0.777309,,,,,0.777309,0.0,0.0,0.0,,1.0
"""WAVESUSDT""",2020-08-13 00:00:00,2020-08-13 23:59:59.999,3.3433,6.5613e7,511154,54.140157,0.0,,,-0.001543,-0.821704,-0.001521,-3.2547,0.003368,-0.0,-8.5045e-7,0.067527,0.03922,930.0,0.952982,-0.787594,0.60436,0.822964,-0.139378,-1.0,-0.384068,-0.465718,-1.1743,-0.093413,0.929627,0,-1,0.0,0,0,-0.452243,…,1.48603,,1.911764,,1.0,1.520324,3.040647,1.050782,,1.104143,2.263339,,,,,1.557474,1.050782,1.104143,1.48603,0.707107,1.0,1.520324,3.040647,1.050782,,1.104143,2.263339,,,,,1.538899,1.050782,1.104143,1.48603,0.707107,1.0
"""WAVESUSDT""",2020-08-14 00:00:00,2020-08-14 23:59:59.999,3.505,4.8895e7,469609,4.836539,0.0,,,-0.001599,-0.749431,-0.001586,-3.4164,0.011174,0.666667,-4.9450e-7,0.008498,0.065373,728.0,1.253999,-0.783944,0.688842,0.770302,0.139503,-1.0,-0.515913,-0.357465,-0.1617,-0.325404,0.933671,0,-1,0.0,0,0,-0.262461,…,0.568797,2.054827,0.251309,2.643515,1.0,1.957594,5.872783,1.060985,,1.125688,2.263339,1.0,,0.422625,,2.025522,1.045304,1.092661,2.054827,0.824273,1.0,1.957594,5.872783,1.060985,,1.125688,2.263339,1.0,,0.422625,,1.991707,1.053364,1.109575,2.054827,0.824273,1.0
"""WAVESUSDT""",2020-08-15 00:00:00,2020-08-15 23:59:59.999,3.399,2.6652e7,231202,-3.024251,0.0,1.681406,-1.5,-0.001989,-1.024367,-0.001976,-3.3104,0.945185,0.322749,-5.0204e-7,0.02957,0.958114,260.0,2.318267,-0.227765,0.754355,0.692007,-0.138159,-1.0,-0.600473,-0.357465,0.106,-0.383057,0.9062,0,-1,0.0,0,0,-0.536565,…,-1.46279,-0.893993,-0.516497,-0.394989,3.0,1.810532,7.242128,0.91486,,0.836968,2.263339,-0.388519,-1.0,-0.303964,-1.086875,1.836094,0.899901,0.809822,2.054827,-0.482245,3.0,1.810532,7.242128,0.91486,,0.836968,2.263339,-0.388519,-1.0,-0.303964,-1.086875,1.824258,0.907053,0.822745,2.054827,-0.482245,3.0
"""WAVESUSDT""",2020-08-16 00:00:00,2020-08-16 23:59:59.999,4.0495,3.4530e7,426109,19.137982,0.0,1.914915,3.705905,-0.980136,-1.507557,-0.966695,-3.9609,0.009701,-0.0,-6.6882e-7,0.012597,0.217296,1350.0,1.145454,-0.77939,0.782476,-0.501783,-0.139852,-1.0,-0.627125,-0.530143,-0.6505,-0.296826,0.852072,0,-1,0.0,0,0,-0.565948,…,0.917416,-0.545374,0.669967,-0.192566,2.0,1.905778,9.52889,0.820418,,0.673085,2.263339,-0.400694,-0.589167,-0.222029,-0.462096,1.945107,0.789194,0.622828,2.054827,0.464378,2.0,1.905778,9.52889,0.820418,,0.673085,2.263339,-0.400694,-0.589167,-0.222029,-0.462096,1.926234,0.804609,0.647396,2.054827,0.464378,2.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ONEUSDT""",2024-07-31 00:00:00,2024-07-31 23:59:59.999,0.01311,3.36746823e8,39837,-3.532009,8.1709e-7,1.084441,8.706674,-0.532271,-1.757822,-0.300959,-0.00115,0.594017,-0.272166,4.0321e-8,0.152624,0.782062,-0.0,2.472668,-0.049905,0.005085,-0.0,0.0,1.0,-0.14208,-0.789129,0.00048,-0.026599,-0.187959,0,-1,4.0,0,0,0.0,…,0.000543,0.001564,0.050689,0.161385,2.0,0.012361,0.247222,0.002358,2.185106,0.000006,0.012084,0.045902,-0.213878,2.5484e-7,-0.000001,0.012223,0.002309,0.000005,0.010859,-0.469501,14.0,0.013053,0.522123,0.002152,1.128543,0.000005,0.01258,0.198469,-0.091639,9.2548e-7,-4.2376e-7,0.013532,0.00422,0.000018,0.010859,-0.835982,33.0
"""ONEUSDT""",2024-08-01 00:00:00,2024-08-01 23:59:59.999,0.01313,5.61881623e8,52867,0.152555,6.7583e-7,1.08568,8.712784,-0.256134,-0.964312,-0.101496,-0.00117,0.491344,-0.272166,-1.5923e-8,0.167483,0.540746,476.0,2.057779,-0.044507,0.011827,-0.0,0.0,1.0,-0.192472,-0.789129,-0.00002,-0.890016,-0.018898,0,-1,4.0,0,0,0.0,…,0.001764,0.002307,0.156737,0.215371,2.0,0.012382,0.247638,0.002362,2.147826,0.000006,0.012084,0.036459,-0.221368,2.0307e-7,-0.000001,0.012299,0.002209,0.000005,0.010859,0.269252,6.0,0.013012,0.520498,0.002137,1.200358,0.000005,0.01258,0.179591,-0.107349,8.2575e-7,-4.9697e-7,0.013507,0.004117,0.000017,0.010859,0.002557,17.0
"""ONEUSDT""",2024-08-02 00:00:00,2024-08-02 23:59:59.999,0.01213,5.24499687e8,54106,-7.616146,7.6960e-7,1.086088,8.722216,-0.211114,-0.41679,-0.096364,-0.00017,0.654401,0.045835,-7.4262e-8,0.187669,0.344094,-0.0,1.767522,-0.01705,0.018759,-0.0,0.0,1.0,-0.067691,-0.591382,0.001,-0.181911,0.318925,0,-1,6.0,0,0,0.0,…,-0.001905,-0.000141,-0.146306,-0.0125,4.0,0.012333,0.246667,0.002378,2.15856,0.000006,0.012053,0.0292,-0.203351,1.6405e-7,-0.000001,0.012186,0.002131,0.000005,0.010859,-0.512951,14.0,0.012896,0.515825,0.002109,1.339237,0.000004,0.01258,0.15749,-0.129195,7.0949e-7,-5.8624e-7,0.01339,0.004049,0.000016,0.010859,-0.845272,33.0
"""ONEUSDT""",2024-08-03 00:00:00,2024-08-03 23:59:59.999,0.01135,4.49787626e8,41948,-6.430338,7.8529e-7,1.086434,8.712854,-0.212449,-0.892663,-0.096065,-0.0,0.635433,0.045835,-1.0399e-7,0.192041,0.814201,-0.0,1.782878,-0.009055,0.022396,-0.0,0.0,1.0,-0.109594,-0.47062,0.00078,-0.265619,0.239846,0,-1,8.0,0,0,0.0,…,0.000686,-0.001219,0.061717,-0.093618,2.0,0.012248,0.244961,0.002365,2.300194,0.000006,0.011878,0.037997,-0.209806,2.1371e-7,-0.000001,0.012149,0.00203,0.000004,0.010859,-0.189799,11.0,0.012881,0.515257,0.002114,1.345889,0.000004,0.01258,0.179199,-0.124929,7.9880e-7,-5.6429e-7,0.013312,0.003964,0.000016,0.010859,-0.511905,29.0


In [10]:
class FactorStatResult:
    """Plain container for the four performance figures returned by ``factor_stats``."""

    def __init__(self, ann_return, sharpe, maxdd, calmar_ratio):
        # The values are stored unchanged under attributes of the same names.
        (
            self.ann_return,
            self.sharpe,
            self.maxdd,
            self.calmar_ratio,
        ) = (ann_return, sharpe, maxdd, calmar_ratio)

def factor_stats(n, pnl:pl.Series):
    """
    Summarise a per-period PnL series.

    Parameters:
        n: annualisation factor; the mean is scaled by ``n`` and the Sharpe
            ratio by ``sqrt(n)``.
        pnl: per-period returns. They are accumulated additively
            (``cum_sum() + 1``) to build the net-value curve.

    Returns:
        FactorStatResult with
            ann_return   = n * mean(pnl)
            sharpe       = sqrt(n) * mean(pnl) / std(pnl), NaN when the standard
                           deviation is zero or undefined
            maxdd        = largest relative drop of the net value below its
                           running maximum (replaced by 1e-6 when it is exactly 0)
            calmar_ratio = ann_return / maxdd
        For an empty ``pnl`` all four fields are NaN.
    """
    nan = float("nan")
    if pnl.len() == 0:
        return FactorStatResult(nan, nan, nan, nan)

    net_value = pnl.cum_sum() + 1.0
    mean_pnl = pnl.mean()
    volatility = pnl.std()

    # A constant PnL has zero volatility (and a single observation has none at
    # all); dividing would raise, so report the Sharpe ratio as undefined.
    if volatility is None or volatility == 0:
        sharpe = nan
    else:
        sharpe = n ** 0.5 * mean_pnl / volatility
    ann_return = n * mean_pnl

    maxdd = (-(net_value / net_value.cum_max() - 1)).max()
    if maxdd == 0:
        print("警告：检测到零回撤")
        # Use a tiny non-zero stand-in so the Calmar ratio stays finite.
        maxdd = 1e-6

    calmar_ratio = ann_return / maxdd
    return FactorStatResult(ann_return, sharpe, maxdd, calmar_ratio)


In [11]:
def analyse_single_factor(
    input_df: pl.DataFrame,
    factor_name: str,
    ic_method: str = "kendall",
    commission: float = 5 / 100000,
    percentage: float = 0.5,
    periods_per_year: int = 365,
):
    """
    Evaluate one factor cross-sectionally: IC statistics and long/short performance.

    ``input_df`` must contain the columns ``open_time``, ``symbol``, ``close``
    and ``factor_name``. Prices and factor values are pivoted to
    (open_time x symbol) matrices; the target is the next-period return
    ``close[t+1] / close[t] - 1``.

    Parameters:
        input_df: long-format frame with one row per (open_time, symbol).
        factor_name: factor column to evaluate.
        ic_method: per-timestamp correlation measure.
            "kendall"  - Kendall tau (default)
            "spearman" - Spearman correlation
            "pearson"  - IC is the Pearson correlation of raw values, rank IC
                         the Pearson correlation of ranks
            Kendall and Spearman only depend on ranks, so for those two the
            rank IC equals the IC.
        commission: one-way fee rate; ``2 * commission`` is deducted from every
            period of the long/short PnL.
        percentage: tail fraction. Longs are symbols whose factor is above the
            ``1 - percentage`` cross-sectional quantile, shorts those below the
            ``percentage`` quantile.
        periods_per_year: annualisation factor passed to ``factor_stats``.

    Returns:
        (ic_stat, rank_ic_stat, ret_stat_result): the outputs of
        ``calculate_statistics`` for the IC and rank-IC series, and the
        ``FactorStatResult`` of whichever direction (long-short or short-long)
        has the higher Sharpe ratio.

    Raises:
        ValueError: unknown ``ic_method`` or ``percentage`` outside (0, 1).
    """
    if ic_method not in ("kendall", "spearman", "pearson"):
        raise ValueError(f"unknown ic_method: {ic_method!r}")
    if not 0 < percentage < 1:
        raise ValueError(f"percentage must be in (0, 1), got {percentage}")

    close = (
        input_df[["open_time", "symbol", "close"]]
        .pivot(index="open_time", on="symbol", values="close")
        .sort("open_time")
    )
    factors = (
        input_df[["open_time", "symbol", factor_name]]
        .pivot(index="open_time", on="symbol", values=factor_name)
        .sort("open_time")
    )

    # First pivot column is open_time, the rest are one column per symbol.
    symbol_list = close.columns[1:]

    # Forward return of every symbol.
    ret = close.clone()
    ret[symbol_list] = ret[symbol_list].shift(-1) / ret[symbol_list] - 1

    # Align columns and timestamps of the return matrix with the factor matrix.
    ret = ret[factors.columns]
    timestamps = factors[["open_time"]]
    ret = ret.join(timestamps, how="inner", on=["open_time"]).sort(by=["open_time"])
    factors = factors.sort(by=["open_time"])

    factors_np = factors[symbol_list].to_numpy()
    ret_np = ret[symbol_list].to_numpy()
    num_timestamps = factors_np.shape[0]

    # NaN marks timestamps with fewer than two valid (factor, return) pairs,
    # for the IC and the rank IC alike.
    ic_values = np.full(num_timestamps, np.nan)
    rank_ic_values = np.full(num_timestamps, np.nan)

    for i in range(num_timestamps):
        factor_values = factors_np[i, :]
        returns = ret_np[i, :]

        # Keep only symbols where both the factor and the return are present.
        valid_indices = ~(np.isnan(factor_values) | np.isnan(returns))
        factor_values_valid = factor_values[valid_indices]
        returns_valid = returns[valid_indices]
        if len(factor_values_valid) <= 1:
            continue

        if ic_method == "pearson":
            ic_values[i] = np.corrcoef(factor_values_valid, returns_valid)[0, 1]
            rank_ic_values[i] = np.corrcoef(
                rankdata(factor_values_valid), rankdata(returns_valid)
            )[0, 1]
        elif ic_method == "spearman":
            rho = spearmanr(factor_values_valid, returns_valid).correlation
            # Ranking the inputs first does not change a rank correlation.
            ic_values[i] = rho
            rank_ic_values[i] = rho
        else:
            tau, _ = kendalltau(factor_values_valid, returns_valid)
            # Kendall tau depends only on ordering and ties, which ranking
            # preserves, so recomputing it on ranks would give the same number.
            ic_values[i] = tau
            rank_ic_values[i] = tau

    ic_stat: dict = calculate_statistics(ic_values)
    rank_ic_stat: dict = calculate_statistics(rank_ic_values)

    # Long/short portfolios from the per-timestamp cross-sectional quantiles.
    quantiles = np.nanquantile(factors_np, [1 - percentage, percentage], axis=1)
    long_quantile = quantiles[0][:, None]   # shape (T, 1): broadcasts over symbols
    short_quantile = quantiles[1][:, None]

    long_mask = factors_np > long_quantile
    short_mask = factors_np < short_quantile

    # Equal-weighted leg return before fees. Missing returns contribute 0 to the
    # numerator; an empty leg (0 / 0) is mapped to a return of 0.
    # NOTE(review): the denominator counts every selected symbol, including those
    # whose return is missing - confirm this is the intended weighting.
    with np.errstate(divide="ignore", invalid="ignore"):
        long = np.nan_to_num(
            np.nan_to_num(ret_np * long_mask).sum(axis=1) / long_mask.sum(axis=1)
        )
        short = np.nan_to_num(
            np.nan_to_num(ret_np * short_mask).sum(axis=1) / short_mask.sum(axis=1)
        )

    # Returns after fees for both directions of the spread.
    long_short = pl.Series(long - short - 2 * commission)
    short_long = pl.Series(short - long - 2 * commission)

    long_short_stat_result: FactorStatResult = factor_stats(periods_per_year, long_short)
    short_long_stat_result: FactorStatResult = factor_stats(periods_per_year, short_long)
    if long_short_stat_result.sharpe > short_long_stat_result.sharpe:
        ret_stat_result = long_short_stat_result
    else:
        ret_stat_result = short_long_stat_result

    return ic_stat, rank_ic_stat, ret_stat_result

In [12]:
# Every derived feature column carries "rolling" in its name.
all_column = [name for name in roll_df.columns if "rolling" in name]
print (f'all_column num: {len (all_column)}')

results = []        # one row of metrics per factor
factor_names = []   # row labels, parallel to `results`
column_names = []   # metric labels, collected while processing the first factor only
have_column_name = False

for each_factor in tqdm(all_column, desc="Analyzing factors"):
    ic_stat_dict, rank_ic_stat_dict, ret_stat_result = analyse_single_factor(roll_df, each_factor)

    factor_results = []
    # IC statistics first, then rank-IC statistics; "Count" and "T-Value" entries are skipped.
    for prefix, stat_dict in (("IC\n", ic_stat_dict), ("Rank IC\n", rank_ic_stat_dict)):
        for key, value in stat_dict.items():
            if "Count" in key or "T-Value" in key:
                continue
            factor_results.append(value)
            if not have_column_name:
                column_names.append(prefix + key)

    # Performance figures go last in every row.
    factor_results += [ret_stat_result.sharpe, ret_stat_result.calmar_ratio]
    have_column_name = True

    results.append(factor_results)
    factor_names.append(each_factor)

# print (f'result df shape: {results.shape}')



all_column num: 2160


Analyzing factors:  19%|█▉        | 417/2160 [04:20<25:46,  1.13it/s]

In [None]:
# Rows whose length differs from the first row are replaced by an all-zero row,
# so that the collection can later be turned into a rectangular array.
std_len = len(results[0])
single_zero_list = [0] * std_len
print (single_zero_list)

tmp_results = [
    row if len(row) == std_len else single_zero_list
    for row in results
]
print (len (results), len (tmp_results))
# results = tmp_results

In [None]:
# Rectangular metric matrix: one row per factor, one column per statistic.
results_array = np.array(tmp_results)

# The last two entries of every row are the Sharpe and Calmar ratios.
metric_columns = column_names + ['sharpe', 'calmar']
results_df = pd.DataFrame(results_array, index=factor_names, columns=metric_columns)

In [None]:
def color_scale(data):
    """
    Build a frame of CSS background colours for ``Styler.apply(..., axis=None)``.

    Each numeric column is scaled independently between its own minimum and
    maximum: positive values get a red background whose opacity grows with the
    value, negative values a blue background whose opacity grows as the value
    decreases, and zeros (or NaN) a white background. The opacity is at most 0.6.
    A column whose values are all equal is entirely white; non-numeric columns
    get no style.

    Returns a DataFrame of CSS strings with the same index and columns as ``data``.
    """
    white = 'background-color: white'

    def scale_column(col):
        if not pd.api.types.is_numeric_dtype(col):
            styles = [''] * len(col)
        else:
            vmin, vmax = col.min(), col.max()
            if vmin == vmax:
                styles = [white] * len(col)
            else:
                span = vmax - vmin
                styles = [
                    f'background-color: rgba(255, 0, 0, {0.6 * (x-vmin)/span})' if x > 0 else
                    f'background-color: rgba(0, 0, 255, {0.6 * (vmax-x)/span})' if x < 0 else
                    white
                    for x in col
                ]
        # Return a Series carrying the column's own index: a bare list would make
        # DataFrame.apply fall back to positional row labels, and the styles
        # would then be lost for any frame with a non-default index.
        return pd.Series(styles, index=col.index)

    return pd.DataFrame(data.apply(scale_column), index=data.index, columns=data.columns)

# Colour every cell through color_scale and show numbers with four decimals.
styled_df = results_df.style.apply(color_scale, axis=None).format("{:.4f}")

# Keep an HTML copy of the styled table for later viewing.
styled_df.to_html('factor_analysis_results.html')
results_df

In [None]:

# Filter for sharpe > 1 and sort by sharpe in descending order
# Rows with a Sharpe ratio above 1, highest first.
sharpe_above_one = results_df['sharpe'] > 1
filtered_and_sorted_df = results_df.loc[sharpe_above_one].sort_values(by='sharpe', ascending=False)

display(filtered_and_sorted_df)
# print("List of all factor names:")
# for factor_name in filtered_and_sorted_df.index:
#     print(factor_name)



In [None]:
from factor_utils import calculate_and_plot_factor_correlations
# BASIC_FACTOR_COLUMNS = ['open', 'high', 'low', 'close', 'volume', 'quote_volume']

# Convert filtered_and_sorted_df to a Polars DataFrame
# Move the filtered table to Polars; reset_index() exposes the factor names
# (the pandas index) as the 'index' column.
filtered_and_sorted_pl = pl.from_pandas(filtered_and_sorted_df.reset_index())
factor_columns = filtered_and_sorted_pl['index'].to_list()

# First six columns of roll_df followed by the selected factor columns.
leading_columns = roll_df.columns[:6]
filted_factor_df = roll_df.select(leading_columns + factor_columns)

# Replace null and NaN by 0 so that the correlation can be computed.
zero_filled_factors = [
    pl.col(name).fill_null(0).fill_nan(0) for name in factor_columns
]
filted_factor_df = filted_factor_df.with_columns(zero_filled_factors)

# Pairwise correlation matrix of the selected factors.
filted_factor_df[factor_columns].corr()

In [None]:
from factor_utils import select_low_correlation_factors
# Show the candidate factor list, then pass it to the selection helper imported
# from factor_utils.
# NOTE(review): the helper is not visible here - presumably it keeps factors whose
# pairwise correlation stays below 0.5 using a greedy pass; confirm in factor_utils.
print (f'factor_columns: {factor_columns}')
select_low_correlation_factors(filted_factor_df, factor_columns, correlation_threshold=0.5, method='greedy')