In [97]:
import re
import pandas as pd
from gplearn.tests.test_genetic import *
from gplearn._program import _Program
from gplearn.genetic import _all_func_dictionary

In [98]:
# Load the parquet file whose column labels are later used as the feature
# names (via `df.columns.to_list()`) when expressions are parsed.
df = pd.read_parquet(
    path='/home/pennymax/proj/MFM/CrossSection/v1_basic/results/basic_2023-11-22_22:15:18/fct_man_ta_proc.parquet',
)

In [99]:
def convert_expression_to_gp_program(expression: str, function_set: dict, feature_names: list) -> "_Program | None":
    """Rebuild a gplearn ``_Program`` from a prefix-notation expression string.

    The expression (e.g. ``'sub(a, ts_min_3(b))'``) is split into name/number
    tokens; parentheses and commas are ignored because the token order alone
    defines the flattened program. Each token is resolved, in this order, as:

    1. a key of ``function_set``  -> the mapped function object,
    2. an entry of ``feature_names`` -> its (first) position in the list,
    3. a decimal literal, optionally negative -> a ``float`` constant.

    Parameters
    ----------
    expression : str
        Prefix-notation expression to parse.
    function_set : dict
        Maps function names to function objects exposing an ``arity``
        attribute.
    feature_names : list
        Feature names; a token matching an entry becomes that entry's index.

    Returns
    -------
    _Program or None
        The constructed program, or ``None`` (after printing a message) when a
        token cannot be resolved.

    Notes
    -----
    Any exception raised by the ``_Program`` constructor itself propagates to
    the caller.
    """
    ## convert expression to list of function objects, feature indices and constants
    # A leading '-' is kept attached to its token so that negative constants
    # such as '-0.500' keep their sign instead of being read as '0.500'.
    tokens = re.findall(r'-?[\w.]+', expression)

    # Name -> index lookup built once; setdefault keeps the first occurrence,
    # which is what list.index() would return for duplicated names.
    feature_index = {}
    for position, name in enumerate(feature_names):
        if isinstance(name, str):  # tokens are always str; also skips unhashable entries
            feature_index.setdefault(name, position)

    # Plain decimal literals only ('3', '-0.5', '1.', '.25'). A strict pattern
    # is used rather than float() so words like 'nan' / 'inf' stay unknown.
    number_pattern = r'-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)'

    program = []
    for token in tokens:
        if token in function_set:
            program.append(function_set[token])
        elif token in feature_index:
            program.append(feature_index[token])
        elif re.fullmatch(number_pattern, token):
            # Always stored as float: upstream gplearn interprets an int node
            # as a feature index, so an int here would silently turn the
            # constant into a column reference (confirm for this fork).
            program.append(float(token))
        else:
            print(f'!! unknown token found! {token}')
            return None

    ## get arity dict: arity -> list of functions with that arity
    arities = {}
    for function in function_set.values():
        arities.setdefault(function.arity, []).append(function)

    ## construct _Program obj
    # NOTE(review): function_set is forwarded as the dict it was given; upstream
    # gplearn documents this parameter as a list of functions - confirm that
    # this fork accepts a dict if the program is ever mutated or regenerated.
    params = {
            'function_set': function_set,
            'arities': arities,
            'n_features': len(feature_names),
            'feature_names': feature_names,
            'init_depth': (2, 6),
            ## must inputs
            'init_method': 'half and half',
            'const_range': (-1.0, 1.0),
            'metric': 'mean absolute error',
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.1,
            'random_state': check_random_state(415),
            }
    gp = _Program(program=program, **params)
    return gp


# Example round trip: parse one expression and print the rebuilt program so the
# two strings can be compared by eye.
# Another expression that can be tried here:
#   'sub(mul(BCKRET1_MOM_5, SPOTVOLUME_TSIs_13_25_13), premIdx_close)'
exp = 'sub(SPOT_COPC_11_14_10, ts_correlation_20(SPOTTAKERBUYQUOVOL_PPOh_12_26_9, TRDCNT_TSI_13_25_13))'

print(f'ori expression: {exp}')
# Feature names are the data frame's column labels, in column order.
feature_names = df.columns.to_list()
gp = convert_expression_to_gp_program(exp, _all_func_dictionary, feature_names)
print(f'_Program print: {gp}')

ori expression: sub(SPOT_COPC_11_14_10, ts_correlation_20(SPOTTAKERBUYQUOVOL_PPOh_12_26_9, TRDCNT_TSI_13_25_13))
_Program print: sub(SPOT_COPC_11_14_10, ts_correlation_20(SPOTTAKERBUYQUOVOL_PPOh_12_26_9, TRDCNT_TSI_13_25_13))


In [100]:
# Round-trip every stored expression through the converter and compare the
# rebuilt program's string form with the original text.
dffct = pd.read_csv('/home/pennymax/proj/MFM/CrossSection/v1_basic/results/basic_2023-11-22_22:15:18/best_programs.csv')
ori_exps = dffct['Expression'].to_list()
gp_exps = []
for exp in ori_exps:
    gp = convert_expression_to_gp_program(exp, _all_func_dictionary, feature_names)
    # A failed conversion returns None; str(None) is 'None', which simply shows
    # up as a mismatch in the comparison below.
    gp_exps.append(str(gp))
dfcomp = pd.DataFrame({
    'ori_exps': ori_exps,
    'gp_exps': gp_exps,
})
dfcomp['is_same'] = dfcomp['ori_exps'] == dfcomp['gp_exps']
display(dfcomp)
# Count of expressions reproduced exactly; int() keeps the cell output a plain
# Python integer rather than a NumPy scalar.
int(dfcomp['is_same'].sum())

Unnamed: 0,ori_exps,gp_exps,is_same
0,"min(ts_min_3(TZS_Low), netbuyquo3)","min(ts_min_3(TZS_Low), netbuyquo3)",True
1,"sub(ts_min_10(TZS_spot_Close), premIdx_close)","sub(ts_min_10(TZS_spot_Close), premIdx_close)",True
2,"min(SPOT_PGO_14, ts_min_5(RVGIs_14_4))","min(SPOT_PGO_14, ts_min_5(RVGIs_14_4))",True
3,"add(ts_min_3(TZS_spot_Low), NETBUYQUO1_TSI_13_...","add(ts_min_3(TZS_spot_Low), NETBUYQUO1_TSI_13_...",True
4,"mul(mul(QQE_14_5_4.236_RSIMA, KST_10_15_20_30_...","mul(mul(QQE_14_5_4.236_RSIMA, KST_10_15_20_30_...",True
...,...,...,...
674,ts_min_5(ts_mean_5(SPOT_SMIs_5_20_5)),ts_min_5(ts_mean_5(SPOT_SMIs_5_20_5)),True
675,ts_mean_3(ts_min_5(QQE_14_5_4.236_RSIMA)),ts_mean_3(ts_min_5(QQE_14_5_4.236_RSIMA)),True
676,ts_min_20(ts_max_3(CCI_14_0.015)),ts_min_20(ts_max_3(CCI_14_0.015)),True
677,"min(ts_mean_5(SPOT_SMIs_5_20_5), SPOT_TSI_13_2...","min(ts_mean_5(SPOT_SMIs_5_20_5), SPOT_TSI_13_2...",True


679