# Functionality Demo — gplearn 3D mod

## Load Data and Packages

In [1]:
import warnings
import numpy as np
import pandas as pd
import gplearn.genetic as genetic

# Notebook-wide configuration: seed NumPy's legacy global RNG, relax the pandas
# display limits, and silence warnings so the demo output stays compact.
np.random.seed(10)

for option_name, option_value in (
    ('display.max_columns', None),                # no cap on displayed columns
    ('expand_frame_repr', True),                  # wrap wide frames over several lines
    ('display.unicode.ambiguous_as_wide', True),  # width handling for East Asian text
):
    pd.set_option(option_name, option_value)

# NOTE: this suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')


In [2]:
# Read the target array from disk and report its dimensions.
Y = np.load("./data/Y.npy")
print(np.shape(Y))

(728, 4984)


In [3]:
# Read the feature array from disk and report its dimensions.
X = np.load("./data/X.npy")
print(np.shape(X))

(728, 6, 4984)


In [4]:
# Human-readable labels for the input features, used when programs are printed.
# NOTE(review): presumably ordered like the length-6 second axis of X — confirm.
feature_names = [
    "open",
    "close",
    "high",
    "low",
    "vwap",
    "volume",
]

## Set Sample Weights and Function Sets

In [5]:
# Per-row weights along the first axis of X: the leading `max_samples` share of
# rows gets weight 1.0 and the trailing remainder gets weight 0.0.
max_samples = 0.8
num_div = int(X.shape[0] * max_samples)  # index of the first zero-weighted row
sample_weight = (np.arange(X.shape[0]) < num_div).astype(float)

In [6]:
# Start from every function name registered in the library, then drop the
# ones listed in `remove_list`.
remove_list = ['tan','sin','cos','neg']
function_set_all = list(genetic._all_func_dictionary)
function_set = list(
    filter(lambda func_name: func_name not in remove_list, function_set_all)
)

## Train GP Model

In [7]:
# Show the names registered in the library's extra fitness-metric map.
list(genetic._extra_fitness_map)

['rank_ic', 'rank_icir', 'quantile_max', 'quantile_mono']

In [22]:
# Configure a deliberately small symbolic transformer so the demo runs quickly.
gp_sample = genetic.SymbolicTransformer(
    # --- search budget and selection ---
    generations=2,
    population_size=20,
    tournament_size=20,
    hall_of_fame=10,
    n_components=10,
    # --- program construction ---
    init_depth=(1,4),
    function_set=function_set,
    feature_names=feature_names,
    const_range=None,
    # --- fitness and complexity penalty ---
    metric="rank_ic",
    parsimony_coefficient="auto",
    # --- genetic operator probabilities ---
    p_crossover=0.4,
    p_subtree_mutation=0.01,
    p_hoist_mutation=0.001,
    p_point_mutation=0.01,
    p_point_replace=0.4,
    # --- sampling, logging, reproducibility, parallelism ---
    max_samples=max_samples,
    verbose=1,
    random_state=0,
    n_jobs=-3,  # NOTE(review): presumably "all cores but two" (joblib convention) — confirm
)

In [16]:
# Re-check the dimensions of the feature and target arrays before fitting.
print(*(array.shape for array in (X, Y)))

(728, 6, 4984) (728, 4984)


In [23]:
# Run the evolutionary search on the 3-D feature array and 2-D target array.
# NOTE(review): the meaning of `baseline` and `need_parallel` is defined by the
# modified library and is not visible here — confirm against its documentation.
gp_sample.fit_3D(
    X,
    Y,
    sample_weight=sample_weight,
    baseline=0.02,
    need_parallel=True,
)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
   0     4.75        0.0091857        6        0.0338718        0.0401776     14.90s
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式生成完成
正在生成表达式0/1 0.00% | 表达式

In [24]:
# Tabulate the programs reported by the fitted transformer: keep one row per
# distinct expression, rank by fitness (best first), save to CSV, and display.
best_result = gp_sample.show_program_simple(baseline=True)
res = (
    pd.DataFrame(best_result)
    .drop_duplicates(subset="Expression")
    .sort_values(by='Fitness', ascending=False)
)
res.to_csv("./Demo_Result.csv", index=True, index_label='idx')
res

Unnamed: 0,Expression,Fitness,OOB Fitness
2,"div(ts_std_40(ts_correlation_40(close, close))...",0.033872,0.040178
3,"div(div(ts_std_40(ts_correlation_40(close, clo...",0.033676,0.043974
1,ts_std_5(vwap),0.025999,0.030634
0,"add(ts_min_60(vwap), ts_max_40(volume))",0.025963,0.029541


In [25]:
# Disabled alternative report: uses `show_program` (which also takes X, Y and
# the sample weights) instead of `show_program_simple`, and writes GP_demo2.csv.
# NOTE(review): the Chinese column labels below are presumably the keys that
# `show_program` emits ("expression" / "training-set RankIC") — verify before
# re-enabling, and keep them unchanged if so.
# best_result = gp_sample.show_program(X, Y,sample_weight=sample_weight,baseline=True)
# res = pd.DataFrame(best_result).drop_duplicates(subset="表达式").sort_values(by='训练集RankIC',ascending = False)
# res.to_csv("./GP_demo2.csv",index=False)
# res

## Visualize Result

In [26]:
# Pick one evolved program by position and show its expression string.
# NOTE(review): index 2 is hard-coded; it matches the top-ranked row of the
# results table in the recorded run — re-check after re-training.
target_fac = gp_sample._total_program[2]
str(target_fac)

'div(ts_std_40(ts_correlation_40(close, close)), volume)'

In [27]:
# Print the Graphviz DOT source describing the selected program's tree.
dot_source = target_fac.export_graphviz()
print(dot_source)

digraph program {
node [style=filled]
0 [label="div", fillcolor="#136ed4"] ;
1 [label="ts_std_40", fillcolor="#136ed4"] ;
2 [label="ts_correlation_40", fillcolor="#136ed4"] ;
3 [label="close", fillcolor="#60a6f6"] ;
4 [label="close", fillcolor="#60a6f6"] ;
2 -> 4 ;
2 -> 3 ;
1 -> 2 ;
5 [label="volume", fillcolor="#60a6f6"] ;
0 -> 5 ;
0 -> 1 ;
}
