diff --git a/README.md b/README.md
index d56a9e95d0..2d67d77b31 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Recent released features
 
 | Feature | Status |
 | -- | ------ |
+| GCN/GATs_indus models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1476) on Mar 24, 2023 |
 | Release Qlib v0.9.0 | :octocat: [Released](https://github.com/microsoft/qlib/releases/tag/v0.9.0) on Dec 9, 2022 |
 | RL Learning Framework | :hammer: :chart_with_upwards_trend: Released on Nov 10, 2022. [#1332](https://github.com/microsoft/qlib/pull/1332), [#1322](https://github.com/microsoft/qlib/pull/1322), [#1316](https://github.com/microsoft/qlib/pull/1316),[#1299](https://github.com/microsoft/qlib/pull/1299),[#1263](https://github.com/microsoft/qlib/pull/1263), [#1244](https://github.com/microsoft/qlib/pull/1244), [#1169](https://github.com/microsoft/qlib/pull/1169), [#1125](https://github.com/microsoft/qlib/pull/1125), [#1076](https://github.com/microsoft/qlib/pull/1076)|
 | HIST and IGMTF models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1040) on Apr 10, 2022 |
@@ -355,7 +356,8 @@ Here is a list of models built on `Qlib`.
 - [ADD based on pytorch (Hongshun Tang, et al.2020)](examples/benchmarks/ADD/)
 - [IGMTF based on pytorch (Wentao Xu, et al.2021)](examples/benchmarks/IGMTF/)
 - [HIST based on pytorch (Wentao Xu, et al.2021)](examples/benchmarks/HIST/)
-
+- [GCN based on pytorch (Thomas N. Kipf, et al.2016)](examples/benchmarks/GCN/)
+
 Your PR of new Quant models is highly welcomed.
 
 The performance of each model on the `Alpha158` and `Alpha360` dataset can be found [here](examples/benchmarks/README.md).
diff --git a/examples/benchmarks/GATs/README.md b/examples/benchmarks/GATs/README.md
index f432b6c5ba..0018a3d185 100644
--- a/examples/benchmarks/GATs/README.md
+++ b/examples/benchmarks/GATs/README.md
@@ -2,4 +2,5 @@
 * Graph Attention Networks(GATs) leverage masked self-attentional layers on graph-structured data. The nodes in stacked layers have different weights and they are able to attend over their neighborhoods’ features, without requiring any kind of costly matrix operation (such as inversion) or depending on knowing the graph structure upfront.
 * This code used in Qlib is implemented with PyTorch by ourselves.
-* Paper: Graph Attention Networks https://arxiv.org/pdf/1710.10903.pdf
\ No newline at end of file
+* Paper: Graph Attention Networks https://arxiv.org/pdf/1710.10903.pdf
+`stable_ind.csv` contains the industry that each instrument belongs to, collected in 2008. If you want to run `GATs` with stock connections, please run `workflow_config_gats_indus_{Dataset}.yaml`.
\ No newline at end of file
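Both the GATs_indus and GCN models load this file with `pd.read_csv(industrial_data_path, index_col=0)` and look up the column named by `industry_col`, treating NaN as "no industry" (-1). A minimal sketch of the layout the loaders assume — the instrument codes and numeric industry labels below are hypothetical:

```python
import pandas as pd

# Instruments as the index, one numeric industry code per column; NaN entries
# are mapped to -1 by the models' get_industry()/fillna logic.
ind = pd.DataFrame(
    {"industry_citic": [21.0, 27.0, 21.0]},  # hypothetical industry codes
    index=pd.Index(["SH600000", "SH600009", "SZ000001"], name="instrument"),
)
ind.to_csv("stable_ind.csv")
```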
diff --git a/examples/benchmarks/GATs/workflow_config_gats_indus_Alpha158.yaml b/examples/benchmarks/GATs/workflow_config_gats_indus_Alpha158.yaml
new file mode 100644
index 0000000000..47d6c13545
--- /dev/null
+++ b/examples/benchmarks/GATs/workflow_config_gats_indus_Alpha158.yaml
@@ -0,0 +1,102 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+experiment_name: gats_add_ind_ts
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors:
+        - class: FilterCol
+          kwargs:
+              fields_group: feature
+              col_list: ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10",
+                         "ROC60", "RESI10", "VSTD5", "RSQR60", "CORR60", "WVMA60", "STD5",
+                         "RSQR20", "CORD60", "CORD10", "CORR20", "KLOW"
+                        ]
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            signal:
+                - <MODEL>
+                - <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: GATs_ADD_IND
+        module_path: qlib.contrib.model.pytorch_gats_add_ind_ts
+        kwargs:
+            d_feat: 20
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.7
+            n_epochs: 200
+            lr: 0.00001
+            early_stop: 10
+            metric: loss
+            loss: mse
+            base_model: LSTM
+            model_path: "benchmarks/LSTM/csi300_lstm_ts.pkl"
+            GPU: 0
+            industrial_data_path: '~/stable_ind.csv'
+            industry_col: 'industry_citic'
+            smooth_perplexity: 1
+    dataset:
+        class: TSDatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+            step_len: 20
+    record:
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              model: <MODEL>
+              dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              ana_long_short: False
+              ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              config: *port_analysis_config
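This config runs end to end with `qrun workflow_config_gats_indus_Alpha158.yaml`. Alternatively, a minimal sketch (assuming the `cn_data` bundle and the pretrained LSTM weights referenced by `model_path` are in place) of driving the same task from Python:

```python
import yaml

import qlib
from qlib.utils import init_instance_by_config

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region="cn")

with open("workflow_config_gats_indus_Alpha158.yaml") as f:
    config = yaml.safe_load(f)

# instantiate the `task` block's model and dataset sections
model = init_instance_by_config(config["task"]["model"])
dataset = init_instance_by_config(config["task"]["dataset"])

model.fit(dataset)
pred = model.predict(dataset)  # pd.Series indexed by (datetime, instrument)
```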
diff --git a/examples/benchmarks/GATs/workflow_config_gats_indus_Alpha360.yaml b/examples/benchmarks/GATs/workflow_config_gats_indus_Alpha360.yaml
new file mode 100644
index 0000000000..fc289b88c3
--- /dev/null
+++ b/examples/benchmarks/GATs/workflow_config_gats_indus_Alpha360.yaml
@@ -0,0 +1,93 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors:
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            signal:
+                - <MODEL>
+                - <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: GATs_ADD_IND
+        module_path: qlib.contrib.model.pytorch_gats_add_ind
+        kwargs:
+            d_feat: 6
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.7
+            n_epochs: 200
+            lr: 0.00001
+            early_stop: 20
+            metric: loss
+            loss: mse
+            base_model: LSTM
+            model_path: "benchmarks/LSTM/model_lstm_csi300.pkl"
+            GPU: 0
+            industrial_data_path: '~/stable_ind.csv'
+            industry_col: 'industry_citic'
+            #smooth_perplexity: 1
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha360
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record:
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              model: <MODEL>
+              dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              ana_long_short: False
+              ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              config: *port_analysis_config
diff --git a/examples/benchmarks/GCN/README.md b/examples/benchmarks/GCN/README.md
new file mode 100644
index 0000000000..21172962fa
--- /dev/null
+++ b/examples/benchmarks/GCN/README.md
@@ -0,0 +1,4 @@
+# GCN
+* The adjacency matrix is built from `stable_ind.csv`, which contains the industry that each instrument belongs to, collected in 2008.
+* This code used in Qlib is implemented with PyTorch by ourselves.
+* Paper: Graph Convolutional Networks https://arxiv.org/pdf/1609.02907.pdf
\ No newline at end of file
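To make the graph construction concrete, a standalone sketch of how `pytorch_gcn.py` (later in this patch) assembles the matrix that the graph convolution multiplies with, mirroring its `adjacent_matrix`/`get_input_data` logic — the industry codes here are toy values; the real ones come from `stable_ind.csv`:

```python
import numpy as np
import torch

industries = np.array([3, 3, 7, 7, 7])  # toy per-stock industry codes for one day
# stocks sharing an industry are connected
same_ind = (industries.reshape(-1, 1) == industries.reshape(1, -1)).astype(np.float32)

adjacent_coef = 0.01
# scale the industry block and add identity self-loops, as get_input_data does
A = torch.tensor(same_ind) * adjacent_coef + torch.eye(len(industries))
```

With a small `adjacent_coef`, each stock's own hidden state stays dominant while same-industry neighbors contribute a weak smoothing term.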
diff --git a/examples/benchmarks/GCN/requirements.txt b/examples/benchmarks/GCN/requirements.txt
new file mode 100644
index 0000000000..bfdf94156e
--- /dev/null
+++ b/examples/benchmarks/GCN/requirements.txt
@@ -0,0 +1,4 @@
+pandas==1.1.2
+numpy==1.21.0
+scikit_learn==0.23.2
+torch==1.7.0
diff --git a/examples/benchmarks/GCN/workflow_config_gcn_Alpha158.yaml b/examples/benchmarks/GCN/workflow_config_gcn_Alpha158.yaml
new file mode 100644
index 0000000000..31d8eff32c
--- /dev/null
+++ b/examples/benchmarks/GCN/workflow_config_gcn_Alpha158.yaml
@@ -0,0 +1,102 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+experiment_name: workflow
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors:
+        - class: FilterCol
+          kwargs:
+              fields_group: feature
+              col_list: ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10",
+                         "ROC60", "RESI10", "VSTD5", "RSQR60", "CORR60", "WVMA60", "STD5",
+                         "RSQR20", "CORD60", "CORD10", "CORR20", "KLOW"
+                        ]
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            signal:
+                - <MODEL>
+                - <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: GCN
+        module_path: qlib.contrib.model.pytorch_gcn_ts
+        kwargs:
+            d_feat: 20
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.5
+            n_epochs: 200
+            lr: 5e-5
+            early_stop: 20
+            metric: loss
+            loss: mse
+            base_model: LSTM
+            model_path: "benchmarks/LSTM/csi300_lstm_ts.pkl"
+            GPU: 0
+            industrial_data_path: '~/stable_ind.csv'
+            industry_col: 'industry_citic'
+            adjacent_coef: 0.01
+    dataset:
+        class: TSDatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+            step_len: 20
+    record:
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              model: <MODEL>
+              dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              ana_long_short: False
+              ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              config: *port_analysis_config
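A note on `d_feat`: the non-`_ts` models receive each stock as one flat `[F*T]` vector and recover the time axis themselves, which is why the Alpha360 configs set `d_feat: 6` (6 raw features over 60 days) while the Alpha158 configs feed the 20 filtered columns through `TSDatasetH` with `step_len: 20`. A toy, shapes-only sketch of the reshaping done in the non-`_ts` models' `forward`:

```python
import torch

N, F, T = 4, 6, 60               # stocks, features per step, time steps
x = torch.randn(N, F * T)        # one flattened row per stock (Alpha360 layout)
x = x.reshape(N, F, -1)          # [N, F, T]
x = x.permute(0, 2, 1)           # [N, T, F], the layout the RNN expects
```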
diff --git a/examples/benchmarks/GCN/workflow_config_gcn_Alpha360.yaml b/examples/benchmarks/GCN/workflow_config_gcn_Alpha360.yaml
new file mode 100644
index 0000000000..13d009f1a5
--- /dev/null
+++ b/examples/benchmarks/GCN/workflow_config_gcn_Alpha360.yaml
@@ -0,0 +1,94 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors:
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            signal:
+                - <MODEL>
+                - <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: GCN
+        module_path: qlib.contrib.model.pytorch_gcn
+        kwargs:
+            d_feat: 6
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.5
+            n_epochs: 200
+            lr: 5e-5
+            early_stop: 20
+            metric: loss
+            loss: mse
+            base_model: LSTM
+            model_path: "benchmarks/LSTM/model_lstm_csi300.pkl"
+            GPU: 0
+            industrial_data_path: '~/stable_ind.csv'
+            industry_col: 'industry_citic'
+            adjacent_coef: 0.01
+
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha360
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record:
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              model: <MODEL>
+              dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              ana_long_short: False
+              ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              config: *port_analysis_config
diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md
index 24d3f59023..1cc9110604 100644
--- a/examples/benchmarks/README.md
+++ b/examples/benchmarks/README.md
@@ -24,50 +24,54 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
 ### Alpha158 dataset
 
-| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
-|------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
-| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 |
-| TabNet(Sercan O. Arik, et al.) | Alpha158 | 0.0204±0.01 | 0.1554±0.07 | 0.0333±0.00 | 0.2552±0.05 | 0.0227±0.04 | 0.3676±0.54 | -0.1089±0.08 |
-| Transformer(Ashish Vaswani, et al.) | Alpha158 | 0.0264±0.00 | 0.2053±0.02 | 0.0407±0.00 | 0.3273±0.02 | 0.0273±0.02 | 0.3970±0.26 | -0.1101±0.02 |
-| GRU(Kyunghyun Cho, et al.) | Alpha158(with selected 20 features) | 0.0315±0.00 | 0.2450±0.04 | 0.0428±0.00 | 0.3440±0.03 | 0.0344±0.02 | 0.5160±0.25 | -0.1017±0.02 |
-| LSTM(Sepp Hochreiter, et al.) | Alpha158(with selected 20 features) | 0.0318±0.00 | 0.2367±0.04 | 0.0435±0.00 | 0.3389±0.03 | 0.0381±0.03 | 0.5561±0.46 | -0.1207±0.04 |
-| Localformer(Juyong Jiang, et al.) | Alpha158 | 0.0356±0.00 | 0.2756±0.03 | 0.0468±0.00 | 0.3784±0.03 | 0.0438±0.02 | 0.6600±0.33 | -0.0952±0.02 |
-| SFM(Liheng Zhang, et al.) | Alpha158 | 0.0379±0.00 | 0.2959±0.04 | 0.0464±0.00 | 0.3825±0.04 | 0.0465±0.02 | 0.5672±0.29 | -0.1282±0.03 |
-| ALSTM (Yao Qin, et al.) | Alpha158(with selected 20 features) | 0.0362±0.01 | 0.2789±0.06 | 0.0463±0.01 | 0.3661±0.05 | 0.0470±0.03 | 0.6992±0.47 | -0.1072±0.03 |
-| GATs (Petar Velickovic, et al.) | Alpha158(with selected 20 features) | 0.0349±0.00 | 0.2511±0.01 | 0.0462±0.00 | 0.3564±0.01 | 0.0497±0.01 | 0.7338±0.19 | -0.0777±0.02 |
-| TRA(Hengxu Lin, et al.) | Alpha158(with selected 20 features) | 0.0404±0.00 | 0.3197±0.05 | 0.0490±0.00 | 0.4047±0.04 | 0.0649±0.02 | 1.0091±0.30 | -0.0860±0.02 |
-| Linear | Alpha158 | 0.0397±0.00 | 0.3000±0.00 | 0.0472±0.00 | 0.3531±0.00 | 0.0692±0.00 | 0.9209±0.00 | -0.1509±0.00 |
-| TRA(Hengxu Lin, et al.) | Alpha158 | 0.0440±0.00 | 0.3535±0.05 | 0.0540±0.00 | 0.4451±0.03 | 0.0718±0.02 | 1.0835±0.35 | -0.0760±0.02 |
-| CatBoost(Liudmila Prokhorenkova, et al.) | Alpha158 | 0.0481±0.00 | 0.3366±0.00 | 0.0454±0.00 | 0.3311±0.00 | 0.0765±0.00 | 0.8032±0.01 | -0.1092±0.00 |
-| XGBoost(Tianqi Chen, et al.) | Alpha158 | 0.0498±0.00 | 0.3779±0.00 | 0.0505±0.00 | 0.4131±0.00 | 0.0780±0.00 | 0.9070±0.00 | -0.1168±0.00 |
-| TFT (Bryan Lim, et al.) | Alpha158(with selected 20 features) | 0.0358±0.00 | 0.2160±0.03 | 0.0116±0.01 | 0.0720±0.03 | 0.0847±0.02 | 0.8131±0.19 | -0.1824±0.03 |
-| MLP | Alpha158 | 0.0376±0.00 | 0.2846±0.02 | 0.0429±0.00 | 0.3220±0.01 | 0.0895±0.02 | 1.1408±0.23 | -0.1103±0.02 |
-| LightGBM(Guolin Ke, et al.) | Alpha158 | 0.0448±0.00 | 0.3660±0.00 | 0.0469±0.00 | 0.3877±0.00 | 0.0901±0.00 | 1.0164±0.00 | -0.1038±0.00 |
-| DoubleEnsemble(Chuheng Zhang, et al.) | Alpha158 | 0.0521±0.00 | 0.4223±0.01 | 0.0502±0.00 | 0.4117±0.01 | 0.1158±0.01 | 1.3432±0.11 | -0.0920±0.01 |
+| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
+|------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
+| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 |
+| TabNet(Sercan O. Arik, et al.) | Alpha158 | 0.0204±0.01 | 0.1554±0.07 | 0.0333±0.00 | 0.2552±0.05 | 0.0227±0.04 | 0.3676±0.54 | -0.1089±0.08 |
+| Transformer(Ashish Vaswani, et al.) | Alpha158 | 0.0264±0.00 | 0.2053±0.02 | 0.0407±0.00 | 0.3273±0.02 | 0.0273±0.02 | 0.3970±0.26 | -0.1101±0.02 |
+| GCN(Thomas N. Kipf, et al.) | Alpha158 | 0.0407±0.00 | 0.3389±0.03 | 0.0536±0.00 | 0.4277±0.02 | 0.0301±0.01 | 0.4956±0.22 | -0.0865±0.01 |
+| GRU(Kyunghyun Cho, et al.) | Alpha158(with selected 20 features) | 0.0315±0.00 | 0.2450±0.04 | 0.0428±0.00 | 0.3440±0.03 | 0.0344±0.02 | 0.5160±0.25 | -0.1017±0.02 |
+| LSTM(Sepp Hochreiter, et al.) | Alpha158(with selected 20 features) | 0.0318±0.00 | 0.2367±0.04 | 0.0435±0.00 | 0.3389±0.03 | 0.0381±0.03 | 0.5561±0.46 | -0.1207±0.04 |
+| Localformer(Juyong Jiang, et al.) | Alpha158 | 0.0356±0.00 | 0.2756±0.03 | 0.0468±0.00 | 0.3784±0.03 | 0.0438±0.02 | 0.6600±0.33 | -0.0952±0.02 |
+| SFM(Liheng Zhang, et al.) | Alpha158 | 0.0379±0.00 | 0.2959±0.04 | 0.0464±0.00 | 0.3825±0.04 | 0.0465±0.02 | 0.5672±0.29 | -0.1282±0.03 |
+| ALSTM (Yao Qin, et al.) | Alpha158(with selected 20 features) | 0.0362±0.01 | 0.2789±0.06 | 0.0463±0.01 | 0.3661±0.05 | 0.0470±0.03 | 0.6992±0.47 | -0.1072±0.03 |
+| GATs (Petar Velickovic, et al.) 
| Alpha158(with selected 20 features) | 0.0349±0.00 | 0.2511±0.01 | 0.0462±0.00 | 0.3564±0.01 | 0.0497±0.01 | 0.7338±0.19 | -0.0777±0.02 | +| GATs_INDUS | Alpha158 | 0.0438±0.00 | 0.3215±0.01 | 0.0534±0.00 | 0.3957±0.01 | 0.0559±0.01 | 0.8081±0.14 | -0.0819±0.01 | +| TRA(Hengxu Lin, et al.) | Alpha158(with selected 20 features) | 0.0404±0.00 | 0.3197±0.05 | 0.0490±0.00 | 0.4047±0.04 | 0.0649±0.02 | 1.0091±0.30 | -0.0860±0.02 | +| Linear | Alpha158 | 0.0397±0.00 | 0.3000±0.00 | 0.0472±0.00 | 0.3531±0.00 | 0.0692±0.00 | 0.9209±0.00 | -0.1509±0.00 | +| TRA(Hengxu Lin, et al.) | Alpha158 | 0.0440±0.00 | 0.3535±0.05 | 0.0540±0.00 | 0.4451±0.03 | 0.0718±0.02 | 1.0835±0.35 | -0.0760±0.02 | +| CatBoost(Liudmila Prokhorenkova, et al.) | Alpha158 | 0.0481±0.00 | 0.3366±0.00 | 0.0454±0.00 | 0.3311±0.00 | 0.0765±0.00 | 0.8032±0.01 | -0.1092±0.00 | +| XGBoost(Tianqi Chen, et al.) | Alpha158 | 0.0498±0.00 | 0.3779±0.00 | 0.0505±0.00 | 0.4131±0.00 | 0.0780±0.00 | 0.9070±0.00 | -0.1168±0.00 | +| TFT (Bryan Lim, et al.) | Alpha158(with selected 20 features) | 0.0358±0.00 | 0.2160±0.03 | 0.0116±0.01 | 0.0720±0.03 | 0.0847±0.02 | 0.8131±0.19 | -0.1824±0.03 | +| MLP | Alpha158 | 0.0376±0.00 | 0.2846±0.02 | 0.0429±0.00 | 0.3220±0.01 | 0.0895±0.02 | 1.1408±0.23 | -0.1103±0.02 | +| LightGBM(Guolin Ke, et al.) | Alpha158 | 0.0448±0.00 | 0.3660±0.00 | 0.0469±0.00 | 0.3877±0.00 | 0.0901±0.00 | 1.0164±0.00 | -0.1038±0.00 | +| DoubleEnsemble(Chuheng Zhang, et al.) | Alpha158 | 0.0521±0.00 | 0.4223±0.01 | 0.0502±0.00 | 0.4117±0.01 | 0.1158±0.01 | 1.3432±0.11 | -0.0920±0.01 | ### Alpha360 dataset -| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | -|-------------------------------------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| -| Transformer(Ashish Vaswani, et al.) | Alpha360 | 0.0114±0.00 | 0.0716±0.03 | 0.0327±0.00 | 0.2248±0.02 | -0.0270±0.03 | -0.3378±0.37 | -0.1653±0.05 | -| TabNet(Sercan O. Arik, et al.) | Alpha360 | 0.0099±0.00 | 0.0593±0.00 | 0.0290±0.00 | 0.1887±0.00 | -0.0369±0.00 | -0.3892±0.00 | -0.2145±0.00 | -| MLP | Alpha360 | 0.0273±0.00 | 0.1870±0.02 | 0.0396±0.00 | 0.2910±0.02 | 0.0029±0.02 | 0.0274±0.23 | -0.1385±0.03 | -| Localformer(Juyong Jiang, et al.) | Alpha360 | 0.0404±0.00 | 0.2932±0.04 | 0.0542±0.00 | 0.4110±0.03 | 0.0246±0.02 | 0.3211±0.21 | -0.1095±0.02 | -| CatBoost((Liudmila Prokhorenkova, et al.) | Alpha360 | 0.0378±0.00 | 0.2714±0.00 | 0.0467±0.00 | 0.3659±0.00 | 0.0292±0.00 | 0.3781±0.00 | -0.0862±0.00 | -| XGBoost(Tianqi Chen, et al.) | Alpha360 | 0.0394±0.00 | 0.2909±0.00 | 0.0448±0.00 | 0.3679±0.00 | 0.0344±0.00 | 0.4527±0.02 | -0.1004±0.00 | -| DoubleEnsemble(Chuheng Zhang, et al.) | Alpha360 | 0.0390±0.00 | 0.2946±0.01 | 0.0486±0.00 | 0.3836±0.01 | 0.0462±0.01 | 0.6151±0.18 | -0.0915±0.01 | -| LightGBM(Guolin Ke, et al.) | Alpha360 | 0.0400±0.00 | 0.3037±0.00 | 0.0499±0.00 | 0.4042±0.00 | 0.0558±0.00 | 0.7632±0.00 | -0.0659±0.00 | -| TCN(Shaojie Bai, et al.) | Alpha360 | 0.0441±0.00 | 0.3301±0.02 | 0.0519±0.00 | 0.4130±0.01 | 0.0604±0.02 | 0.8295±0.34 | -0.1018±0.03 | -| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0497±0.00 | 0.3829±0.04 | 0.0599±0.00 | 0.4736±0.03 | 0.0626±0.02 | 0.8651±0.31 | -0.0994±0.03 | -| LSTM(Sepp Hochreiter, et al.) 
| Alpha360 | 0.0448±0.00 | 0.3474±0.04 | 0.0549±0.00 | 0.4366±0.03 | 0.0647±0.03 | 0.8963±0.39 | -0.0875±0.02 |
-| ADD | Alpha360 | 0.0430±0.00 | 0.3188±0.04 | 0.0559±0.00 | 0.4301±0.03 | 0.0667±0.02 | 0.8992±0.34 | -0.0855±0.02 |
-| GRU(Kyunghyun Cho, et al.) | Alpha360 | 0.0493±0.00 | 0.3772±0.04 | 0.0584±0.00 | 0.4638±0.03 | 0.0720±0.02 | 0.9730±0.33 | -0.0821±0.02 |
-| AdaRNN(Yuntao Du, et al.) | Alpha360 | 0.0464±0.01 | 0.3619±0.08 | 0.0539±0.01 | 0.4287±0.06 | 0.0753±0.03 | 1.0200±0.40 | -0.0936±0.03 |
-| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0476±0.00 | 0.3508±0.02 | 0.0598±0.00 | 0.4604±0.01 | 0.0824±0.02 | 1.1079±0.26 | -0.0894±0.03 |
-| TCTS(Xueqing Wu, et al.) | Alpha360 | 0.0508±0.00 | 0.3931±0.04 | 0.0599±0.00 | 0.4756±0.03 | 0.0893±0.03 | 1.2256±0.36 | -0.0857±0.02 |
-| TRA(Hengxu Lin, et al.) | Alpha360 | 0.0485±0.00 | 0.3787±0.03 | 0.0587±0.00 | 0.4756±0.03 | 0.0920±0.03 | 1.2789±0.42 | -0.0834±0.02 |
-| IGMTF(Wentao Xu, et al.) | Alpha360 | 0.0480±0.00 | 0.3589±0.02 | 0.0606±0.00 | 0.4773±0.01 | 0.0946±0.02 | 1.3509±0.25 | -0.0716±0.02 |
-| HIST(Wentao Xu, et al.) | Alpha360 | 0.0522±0.00 | 0.3530±0.01 | 0.0667±0.00 | 0.4576±0.01 | 0.0987±0.02 | 1.3726±0.27 | -0.0681±0.01 |
+| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
+|------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
+| Transformer(Ashish Vaswani, et al.) | Alpha360 | 0.0114±0.00 | 0.0716±0.03 | 0.0327±0.00 | 0.2248±0.02 | -0.0270±0.03 | -0.3378±0.37 | -0.1653±0.05 |
+| TabNet(Sercan O. Arik, et al.) | Alpha360 | 0.0099±0.00 | 0.0593±0.00 | 0.0290±0.00 | 0.1887±0.00 | -0.0369±0.00 | -0.3892±0.00 | -0.2145±0.00 |
+| MLP | Alpha360 | 0.0273±0.00 | 0.1870±0.02 | 0.0396±0.00 | 0.2910±0.02 | 0.0029±0.02 | 0.0274±0.23 | -0.1385±0.03 |
+| Localformer(Juyong Jiang, et al.) | Alpha360 | 0.0404±0.00 | 0.2932±0.04 | 0.0542±0.00 | 0.4110±0.03 | 0.0246±0.02 | 0.3211±0.21 | -0.1095±0.02 |
+| CatBoost(Liudmila Prokhorenkova, et al.) | Alpha360 | 0.0378±0.00 | 0.2714±0.00 | 0.0467±0.00 | 0.3659±0.00 | 0.0292±0.00 | 0.3781±0.00 | -0.0862±0.00 |
+| XGBoost(Tianqi Chen, et al.) | Alpha360 | 0.0394±0.00 | 0.2909±0.00 | 0.0448±0.00 | 0.3679±0.00 | 0.0344±0.00 | 0.4527±0.02 | -0.1004±0.00 |
+| GCN(Thomas N. Kipf, et al.) | Alpha360 | 0.0407±0.00 | 0.3382±0.02 | 0.0539±0.00 | 0.4291±0.02 | 0.0358±0.01 | 0.5734±0.21 | -0.0867±0.01 |
+| DoubleEnsemble(Chuheng Zhang, et al.) | Alpha360 | 0.0390±0.00 | 0.2946±0.01 | 0.0486±0.00 | 0.3836±0.01 | 0.0462±0.01 | 0.6151±0.18 | -0.0915±0.01 |
+| LightGBM(Guolin Ke, et al.) | Alpha360 | 0.0400±0.00 | 0.3037±0.00 | 0.0499±0.00 | 0.4042±0.00 | 0.0558±0.00 | 0.7632±0.00 | -0.0659±0.00 |
+| TCN(Shaojie Bai, et al.) | Alpha360 | 0.0441±0.00 | 0.3301±0.02 | 0.0519±0.00 | 0.4130±0.01 | 0.0604±0.02 | 0.8295±0.34 | -0.1018±0.03 |
+| GATs_INDUS | Alpha360 | 0.0476±0.00 | 0.3605±0.02 | 0.0567±0.00 | 0.4366±0.01 | 0.0615±0.02 | 0.8696±0.25 | -0.0830±0.01 |
+| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0497±0.00 | 0.3829±0.04 | 0.0599±0.00 | 0.4736±0.03 | 0.0626±0.02 | 0.8651±0.31 | -0.0994±0.03 |
+| LSTM(Sepp Hochreiter, et al.) | Alpha360 | 0.0448±0.00 | 0.3474±0.04 | 0.0549±0.00 | 0.4366±0.03 | 0.0647±0.03 | 0.8963±0.39 | -0.0875±0.02 |
+| ADD | Alpha360 | 0.0430±0.00 | 0.3188±0.04 | 0.0559±0.00 | 0.4301±0.03 | 0.0667±0.02 | 0.8992±0.34 | -0.0855±0.02 |
+| GRU(Kyunghyun Cho, et al.) 
| Alpha360 | 0.0493±0.00 | 0.3772±0.04 | 0.0584±0.00 | 0.4638±0.03 | 0.0720±0.02 | 0.9730±0.33 | -0.0821±0.02 | +| AdaRNN(Yuntao Du, et al.) | Alpha360 | 0.0464±0.01 | 0.3619±0.08 | 0.0539±0.01 | 0.4287±0.06 | 0.0753±0.03 | 1.0200±0.40 | -0.0936±0.03 | +| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0476±0.00 | 0.3508±0.02 | 0.0598±0.00 | 0.4604±0.01 | 0.0824±0.02 | 1.1079±0.26 | -0.0894±0.03 | +| TCTS(Xueqing Wu, et al.) | Alpha360 | 0.0508±0.00 | 0.3931±0.04 | 0.0599±0.00 | 0.4756±0.03 | 0.0893±0.03 | 1.2256±0.36 | -0.0857±0.02 | +| TRA(Hengxu Lin, et al.) | Alpha360 | 0.0485±0.00 | 0.3787±0.03 | 0.0587±0.00 | 0.4756±0.03 | 0.0920±0.03 | 1.2789±0.42 | -0.0834±0.02 | +| IGMTF(Wentao Xu, et al.) | Alpha360 | 0.0480±0.00 | 0.3589±0.02 | 0.0606±0.00 | 0.4773±0.01 | 0.0946±0.02 | 1.3509±0.25 | -0.0716±0.02 | +| HIST(Wentao Xu, et al.) | Alpha360 | 0.0522±0.00 | 0.3530±0.01 | 0.0667±0.00 | 0.4576±0.01 | 0.0987±0.02 | 1.3726±0.27 | -0.0681±0.01 | - The selected 20 features are based on the feature importance of a lightgbm-based model. diff --git a/qlib/contrib/model/__init__.py b/qlib/contrib/model/__init__.py index 5d4d5f2e69..963eeb022d 100644 --- a/qlib/contrib/model/__init__.py +++ b/qlib/contrib/model/__init__.py @@ -34,8 +34,10 @@ from .pytorch_sfm import SFM_Model from .pytorch_tcn import TCN from .pytorch_add import ADD + from .pytorch_gats_add_ind import GATs_ADD_IND + from .pytorch_gcn import GCN - pytorch_classes = (ALSTM, GATs, GRU, LSTM, DNNModelPytorch, TabnetModel, SFM_Model, TCN, ADD) + pytorch_classes = (ALSTM, GATs, GRU, LSTM, DNNModelPytorch, TabnetModel, SFM_Model, TCN, ADD, GATs_ADD_IND, GCN) except ModuleNotFoundError: pytorch_classes = () print("ModuleNotFoundError. PyTorch models are skipped (optional: maybe installing pytorch can fix it).") diff --git a/qlib/contrib/model/pytorch_gats_add_ind.py b/qlib/contrib/model/pytorch_gats_add_ind.py new file mode 100644 index 0000000000..d5663e4911 --- /dev/null +++ b/qlib/contrib/model/pytorch_gats_add_ind.py @@ -0,0 +1,463 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +from typing import Text, Union +import copy + +from ...utils import get_or_create_path +from ...log import get_module_logger +import torch +import torch.nn as nn +import torch.optim as optim + +from .pytorch_utils import count_parameters +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from ...contrib.model.pytorch_lstm import LSTMModel +from ...contrib.model.pytorch_gru import GRUModel + + +class GATs_ADD_IND(Model): + """GATs_ADD_IND Model + + Parameters + ---------- + lr : float + learning rate + d_feat : int + input dimensions for each time step + metric : str + the evaluation metric used in early stop + optimizer : str + optimizer name + GPU : int + the GPU ID used for training + """ + + def __init__( + self, + d_feat=6, + hidden_size=64, + num_layers=2, + dropout=0.0, + n_epochs=200, + lr=0.001, + metric="", + early_stop=20, + loss="mse", + base_model="GRU", + model_path=None, + optimizer="adam", + GPU=0, + seed=None, + industrial_data_path="~/industry_data.csv", + industry_col="industry", + smooth_perplexity=1, + **kwargs + ): + # Set logger. + self.logger = get_module_logger("GATs_ADD_IND") + self.logger.info("GATs_ADD_IND pytorch version...") + + # set hyper-parameters. 
+ self.d_feat = d_feat + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.base_model = base_model + self.model_path = model_path + self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.seed = seed + + self.industrial_data_path = industrial_data_path + self.industrial = pd.read_csv(industrial_data_path, index_col=0) + self.industry_col = industry_col + self.smooth_perplexity = smooth_perplexity + + self.logger.info( + "GATs_ADD_IND parameters setting:" + "\nd_feat : {}" + "\nhidden_size : {}" + "\nnum_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nbase_model : {}" + "\nmodel_path : {}" + "\ndevice : {}" + "\nuse_GPU : {}" + "\nseed : {}" + "\nindustry_col: {}" + "\nsmooth_perplexity: {}".format( + d_feat, + hidden_size, + num_layers, + dropout, + n_epochs, + lr, + metric, + early_stop, + optimizer.lower(), + loss, + base_model, + model_path, + self.device, + self.use_gpu, + seed, + industry_col, + smooth_perplexity, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.GAT_model = GATModel_ADD_IND( + d_feat=self.d_feat, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + dropout=self.dropout, + base_model=self.base_model, + smooth_perplexity=smooth_perplexity, + ) + self.logger.info("model:\n{:}".format(self.GAT_model)) + self.logger.info("model size: {:.4f} MB".format(count_parameters(self.GAT_model))) + + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.GAT_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.GAT_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.GAT_model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def get_daily_inter(self, df, shuffle=False): + # organize the train data into daily batches + daily_count = df.groupby(level=0).size().values + daily_index = np.roll(np.cumsum(daily_count), 1) + daily_index[0] = 0 + if shuffle: + # shuffle data + daily_shuffle = list(zip(daily_index, daily_count)) + np.random.shuffle(daily_shuffle) + daily_index, daily_count = zip(*daily_shuffle) + return daily_index, daily_count + + def get_industry(self, date, instruments): + table = self.industrial.loc[:, self.industry_col] + + instrument_to_industry = {} + + for id, instrument in enumerate(instruments): + industry = table.get(instrument, -1) + if np.isnan(industry): + industry = -1 + instrument_to_industry[instrument] = industry + return instrument_to_industry + + def adjacent_matrix(self, groups): + # get the adjacent matrix + industries = [] + for i, count in groups: + 
for j in range(count): + industries.append(i) + industries = np.array(industries) + adjacent_matrix = (industries.reshape(-1, 1) == industries.reshape(1, -1)).astype(np.float32) + return adjacent_matrix + + def get_input_data(self, data_x, data_y, data_x_values, data_y_values, idx, count): + instruments = [] + index = [] + for i in range(idx, idx + count): + index.append(i) + instruments.append(data_x.index[i][1]) + + instrument_to_idx = {instrument: idx for idx, instrument in zip(index, instruments)} + + date = str(data_x.index[idx][0])[:10] + instrument_to_industry = self.get_industry(date, instruments) + group_by_industry = list(instrument_to_industry.items()) + group_by_industry.sort(key=lambda x: x[1]) + # for each industry, get the starting index and the number of instruments + groups = [] + now_industry = None + index = [] + for id, (instrument, industry) in enumerate(group_by_industry): + if industry != now_industry: + now_industry = industry + groups.append((id, 1)) + else: + groups[-1] = (groups[-1][0], groups[-1][1] + 1) + index.append(instrument_to_idx[instrument]) + index = np.array(index) + + label = data_y_values[index] if data_y_values is not None else None + + return data_x_values[index], label, torch.tensor(self.adjacent_matrix(groups)).to(self.device), index + + def train_epoch(self, x_train, y_train): + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) + self.GAT_model.train() + + # organize the train data into daily batches + daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True) + + for idx, count in zip(daily_index, daily_count): + feature, label, groups, _ = self.get_input_data( + x_train, y_train, x_train_values, y_train_values, idx, count + ) + + feature = torch.from_numpy(feature).float().to(self.device) + label = torch.from_numpy(label).float().to(self.device) + + pred = self.GAT_model(feature, groups) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.GAT_model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, data_y): + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.GAT_model.eval() + + scores = [] + losses = [] + + # organize the test data into daily batches + daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False) + + for idx, count in zip(daily_index, daily_count): + feature, label, groups, _ = self.get_input_data(data_x, data_y, x_values, y_values, idx, count) + + feature = torch.from_numpy(feature).float().to(self.device) + label = torch.from_numpy(label).float().to(self.device) + + pred = self.GAT_model(feature, groups) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, + ) + if df_train.empty or df_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + x_train, y_train = df_train["feature"], df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + save_path = get_or_create_path(save_path) + stop_steps = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # load 
pretrained base_model + if self.base_model == "LSTM": + pretrained_model = LSTMModel() + elif self.base_model == "GRU": + pretrained_model = GRUModel() + else: + raise ValueError("unknown base model name `%s`" % self.base_model) + + if self.model_path is not None: + self.logger.info("Loading pretrained model...") + pretrained_model.load_state_dict(torch.load(self.model_path, map_location=self.device)) + + model_dict = self.GAT_model.state_dict() + pretrained_dict = { + k: v for k, v in pretrained_model.state_dict().items() if k in model_dict # pylint: disable=E1135 + } + model_dict.update(pretrained_dict) + self.GAT_model.load_state_dict(model_dict) + self.logger.info("Loading pretrained model Done...") + + # train + self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.GAT_model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.GAT_model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare(segment, col_set="feature") + final_index = x_test.index + self.GAT_model.eval() + x_values = x_test.values + preds = [] + + # organize the data into daily batches + daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False) + + for idx, count in zip(daily_index, daily_count): + feature, label, groups, index = self.get_input_data(x_test, None, x_values, None, idx, count) + + feature = torch.from_numpy(feature).float().to(self.device) + + with torch.no_grad(): + pred = self.GAT_model(feature, groups).detach().cpu().numpy() + + decoder = [0] * count + for i, id in enumerate(index): + decoder[id - idx] = i + pred = pred[decoder] + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=final_index) + + +class GATModel_ADD_IND(nn.Module): + def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU", smooth_perplexity=1.0): + super().__init__() + + if base_model == "GRU": + self.rnn = nn.GRU( + input_size=d_feat, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout, + ) + elif base_model == "LSTM": + self.rnn = nn.LSTM( + input_size=d_feat, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout, + ) + else: + raise ValueError("unknown base model name `%s`" % base_model) + + self.hidden_size = hidden_size + self.d_feat = d_feat + self.transformation = nn.Linear(self.hidden_size, self.hidden_size) + self.a = nn.Parameter(torch.randn(self.hidden_size * 2, 1)) + self.a.requires_grad = True + self.fc = nn.Linear(self.hidden_size, self.hidden_size) + self.fc_out = nn.Linear(hidden_size, 1) + self.leaky_relu = nn.LeakyReLU() + 
self.softmax = nn.Softmax(dim=1)
+        self.smooth_perplexity = smooth_perplexity
+
+    def cal_attention(self, x, y, adj):
+        x = self.transformation(x)
+        y = self.transformation(y)
+
+        sample_num = x.shape[0]
+        dim = x.shape[1]
+        e_x = x.expand(sample_num, sample_num, dim)
+        e_y = torch.transpose(e_x, 0, 1)
+        attention_in = torch.cat((e_x, e_y), 2).view(-1, dim * 2)
+        self.a_t = torch.t(self.a)
+        attention_out = self.a_t.mm(torch.t(attention_in)).view(sample_num, sample_num)
+        attention_out = self.leaky_relu(attention_out)
+        attention_out /= self.smooth_perplexity
+        attention_out = torch.exp(attention_out) * adj
+        att_weight = attention_out / torch.sum(attention_out, dim=1, keepdim=True)
+        return att_weight
+
+    def forward(self, x, adj):
+        # x: [N, F*T]
+        x = x.reshape(len(x), self.d_feat, -1)  # [N, F, T]
+        x = x.permute(0, 2, 1)  # [N, T, F]
+        out, _ = self.rnn(x)
+        hidden = out[:, -1, :]  # [N, H]
+
+        att_weight = self.cal_attention(hidden, hidden, adj)
+        hidden = att_weight.mm(hidden) + hidden
+        hidden = self.fc(hidden)
+        hidden = self.leaky_relu(hidden)
+        return self.fc_out(hidden).squeeze()
diff --git a/qlib/contrib/model/pytorch_gats_add_ind_ts.py b/qlib/contrib/model/pytorch_gats_add_ind_ts.py
new file mode 100644
index 0000000000..7c3ebce82f
--- /dev/null
+++ b/qlib/contrib/model/pytorch_gats_add_ind_ts.py
@@ -0,0 +1,451 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import pandas as pd
+import copy
+
+from ...utils import get_or_create_path
+from ...log import get_module_logger
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from torch.utils.data import Sampler
+
+from .pytorch_utils import count_parameters
+from ...model.base import Model
+from ...data.dataset.handler import DataHandlerLP
+from ...contrib.model.pytorch_lstm import LSTMModel
+from ...contrib.model.pytorch_gru import GRUModel
+
+
+class DailyBatchSampler(Sampler):
+    def __init__(self, data_source):
+        self.data_source = data_source
+        # calculate number of samples in each batch
+        self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values
+        self.daily_index = np.roll(np.cumsum(self.daily_count), 1)  # calculate begin index of each batch
+        self.daily_index[0] = 0
+
+    def __iter__(self):
+        for idx, count in zip(self.daily_index, self.daily_count):
+            yield np.arange(idx, idx + count)
+
+    def __len__(self):
+        return len(self.data_source)
+
+
+class GATs_ADD_IND(Model):
+    """GATs_ADD_IND Model
+
+    Parameters
+    ----------
+    lr : float
+        learning rate
+    d_feat : int
+        input dimensions for each time step
+    metric : str
+        the evaluation metric used in early stop
+    optimizer : str
+        optimizer name
+    GPU : int
+        the GPU ID used for training
+    """
+
+    def __init__(
+        self,
+        d_feat=20,
+        hidden_size=64,
+        num_layers=2,
+        dropout=0.0,
+        n_epochs=200,
+        lr=0.001,
+        metric="",
+        early_stop=20,
+        loss="mse",
+        base_model="GRU",
+        model_path=None,
+        optimizer="adam",
+        GPU=0,
+        n_jobs=10,
+        seed=None,
+        industrial_data_path="~/industry_data.csv",
+        industry_col="industry",
+        smooth_perplexity=1,
+        **kwargs
+    ):
+        # Set logger.
+        self.logger = get_module_logger("GATs_ADD_IND")
+        self.logger.info("GATs_ADD_IND pytorch version...")
+
+        # set hyper-parameters.
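+        # n_jobs: number of DataLoader workers for the daily batch sampler; the
+        # industry CSV and smooth_perplexity knobs behave as in the non-`_ts` variant.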
+ self.d_feat = d_feat + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.base_model = base_model + self.model_path = model_path + self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.n_jobs = n_jobs + self.seed = seed + self.industrial_data_path = industrial_data_path + self.industrial = pd.read_csv(industrial_data_path, index_col=0) + self.industry_col = industry_col + self.industry = self.industrial[self.industry_col] + self.industry = self.industry.fillna(-1) + self.smooth_perplexity = smooth_perplexity + + self.logger.info( + "GATs_ADD_IND parameters setting:" + "\nd_feat : {}" + "\nhidden_size : {}" + "\nnum_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nbase_model : {}" + "\nmodel_path : {}" + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}" + "\nindustry_col: {}" + "\nsmooth_perplexity: {}".format( + d_feat, + hidden_size, + num_layers, + dropout, + n_epochs, + lr, + metric, + early_stop, + optimizer.lower(), + loss, + base_model, + model_path, + GPU, + self.use_gpu, + seed, + industry_col, + smooth_perplexity, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.GAT_model = GATModel_ADD_IND( + d_feat=self.d_feat, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + dropout=self.dropout, + base_model=self.base_model, + smooth_perplexity=self.smooth_perplexity, + ) + self.logger.info("model:\n{:}".format(self.GAT_model)) + self.logger.info("model size: {:.4f} MB".format(count_parameters(self.GAT_model))) + + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.GAT_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.GAT_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.GAT_model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def get_daily_inter(self, df, shuffle=False): + # organize the train data into daily batches + daily_count = df.groupby(level=0).size().values + daily_index = np.roll(np.cumsum(daily_count), 1) + daily_index[0] = 0 + if shuffle: + # shuffle data + daily_shuffle = list(zip(daily_index, daily_count)) + np.random.shuffle(daily_shuffle) + daily_index, daily_count = zip(*daily_shuffle) + return daily_index, daily_count + + def train_epoch(self, data_loader): + self.GAT_model.train() + + for data in data_loader: + data = data.squeeze() + feature = data[:, :, 0:-2].to(self.device) + label = data[:, -1, -2].to(self.device) + ind = data[:, -1, -1] + + # there is no nan in ind + assert torch.isnan(ind).sum() == 0 + + # adjacent_matrix = 
torch.ones((feature.shape[0], feature.shape[0])).float().to(self.device)
+            adjacent_matrix = (ind.reshape(-1, 1) == ind.reshape(1, -1)).float().to(self.device)
+
+            pred = self.GAT_model(feature.float(), adjacent_matrix)
+            loss = self.loss_fn(pred, label)
+
+            self.train_optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(self.GAT_model.parameters(), 3.0)
+            self.train_optimizer.step()
+
+    def test_epoch(self, data_loader):
+        self.GAT_model.eval()
+
+        scores = []
+        losses = []
+
+        for data in data_loader:
+            data = data.squeeze()
+            feature = data[:, :, 0:-2].to(self.device)
+            # feature[torch.isnan(feature)] = 0
+            label = data[:, -1, -2].to(self.device)
+            ind = data[:, -1, -1]
+            adjacent_matrix = (ind.reshape(-1, 1) == ind.reshape(1, -1)).float().to(self.device)
+
+            pred = self.GAT_model(feature.float(), adjacent_matrix)
+            loss = self.loss_fn(pred, label)
+            losses.append(loss.item())
+
+            score = self.metric_fn(pred, label)
+            scores.append(score.item())
+
+        return np.mean(losses), np.mean(scores)
+
+    def fit(
+        self,
+        dataset,
+        evals_result=dict(),
+        save_path=None,
+    ):
+        dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+        dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+        if dl_train.empty or dl_valid.empty:
+            raise ValueError("Empty data from dataset, please check your dataset config.")
+
+        dl_train.config(fillna_type="ffill+bfill")  # process nan brought by dataloader
+        dl_valid.config(fillna_type="ffill+bfill")  # process nan brought by dataloader
+
+        dl_train_index = dl_train.data_index.get_level_values(0)
+        dl_valid_index = dl_valid.data_index.get_level_values(0)
+
+        dl_train_ind = np.zeros(dl_train.data_arr.shape[0])
+        dl_valid_ind = np.zeros(dl_valid.data_arr.shape[0])
+
+        for col_name in dl_train.idx_df.columns:
+            col_data = dl_train.idx_df[col_name]
+            for val in col_data:
+                if not np.isnan(val):
+                    dl_train_ind[int(val)] = self.industry.get(col_name, -1)
+        for col_name in dl_valid.idx_df.columns:
+            col_data = dl_valid.idx_df[col_name]
+            for val in col_data:
+                if not np.isnan(val):
+                    dl_valid_ind[int(val)] = self.industry.get(col_name, -1)
+
+        dl_train.data_arr = np.concatenate([dl_train.data_arr, dl_train_ind[:, None]], axis=1)
+        dl_valid.data_arr = np.concatenate([dl_valid.data_arr, dl_valid_ind[:, None]], axis=1)
+
+        sampler_train = DailyBatchSampler(dl_train)
+        sampler_valid = DailyBatchSampler(dl_valid)
+
+        train_loader = DataLoader(dl_train, sampler=sampler_train, num_workers=self.n_jobs, drop_last=True)
+        valid_loader = DataLoader(dl_valid, sampler=sampler_valid, num_workers=self.n_jobs, drop_last=True)
+
+        save_path = get_or_create_path(save_path)
+
+        stop_steps = 0
+        train_loss = 0
+        best_score = -np.inf
+        best_epoch = 0
+        evals_result["train"] = []
+        evals_result["valid"] = []
+
+        # load pretrained base_model
+        if self.base_model == "LSTM":
+            pretrained_model = LSTMModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers)
+        elif self.base_model == "GRU":
+            pretrained_model = GRUModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers)
+        else:
+            raise ValueError("unknown base model name `%s`" % self.base_model)
+
+        if self.model_path is not None:
+            self.logger.info("Loading pretrained model...")
+            pretrained_model.load_state_dict(torch.load(self.model_path, map_location=self.device))
+
+        model_dict = self.GAT_model.state_dict()
+        pretrained_dict = {
+            k: v for k, v in pretrained_model.state_dict().items() if k in model_dict  # pylint: disable=E1135
+        }
+        model_dict.update(pretrained_dict)
+        self.GAT_model.load_state_dict(model_dict)
+        self.logger.info("Loading pretrained model Done...")
+
+        # train
+        self.logger.info("training...")
+        self.fitted = True
+
+        for step in range(self.n_epochs):
+            self.logger.info("Epoch%d:", step)
+            self.logger.info("training...")
+            self.train_epoch(train_loader)
+            self.logger.info("evaluating...")
+            train_loss, train_score = self.test_epoch(train_loader)
+            val_loss, val_score = self.test_epoch(valid_loader)
+            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+            evals_result["train"].append(train_score)
+            evals_result["valid"].append(val_score)
+
+            if val_score > best_score:
+                best_score = val_score
+                stop_steps = 0
+                best_epoch = step
+                best_param = copy.deepcopy(self.GAT_model.state_dict())
+            else:
+                stop_steps += 1
+                if stop_steps >= self.early_stop:
+                    self.logger.info("early stop")
+                    break
+
+        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+        self.GAT_model.load_state_dict(best_param)
+        torch.save(best_param, save_path)
+
+        if self.use_gpu:
+            torch.cuda.empty_cache()
+
+    def predict(self, dataset):
+        if not self.fitted:
+            raise ValueError("model is not fitted yet!")
+
+        dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
+        dl_test.config(fillna_type="ffill+bfill")
+
+        dl_test_ind = np.zeros(dl_test.data_arr.shape[0])
+
+        for col_name in dl_test.idx_df.columns:
+            col_data = dl_test.idx_df[col_name]
+            for val in col_data:
+                if not np.isnan(val):
+                    dl_test_ind[int(val)] = self.industry.get(col_name, -1)
+
+        dl_test.data_arr = np.concatenate([dl_test.data_arr, dl_test_ind[:, None]], axis=1)
+
+        sampler_test = DailyBatchSampler(dl_test)
+        test_loader = DataLoader(dl_test, sampler=sampler_test, num_workers=self.n_jobs)
+        self.GAT_model.eval()
+        preds = []
+
+        for data in test_loader:
+            data = data.squeeze()
+            feature = data[:, :, 0:-2].to(self.device)
+            ind = data[:, -1, -1].to(self.device)
+            adjacent_matrix = (ind.reshape(-1, 1) == ind.reshape(1, -1)).float().to(self.device)
+
+            with torch.no_grad():
+                pred = self.GAT_model(feature.float(), adjacent_matrix).detach().cpu().numpy()
+
+            preds.append(pred)
+
+        return pd.Series(np.concatenate(preds), index=dl_test.get_index())
+
+
+class GATModel_ADD_IND(nn.Module):
+    def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU", smooth_perplexity=1.0):
+        super().__init__()
+
+        if base_model == "GRU":
+            self.rnn = nn.GRU(
+                input_size=d_feat,
+                hidden_size=hidden_size,
+                num_layers=num_layers,
+                batch_first=True,
+                dropout=dropout,
+            )
+        elif base_model == "LSTM":
+            self.rnn = nn.LSTM(
+                input_size=d_feat,
+                hidden_size=hidden_size,
+                num_layers=num_layers,
+                batch_first=True,
+                dropout=dropout,
+            )
+        else:
+            raise ValueError("unknown base model name `%s`" % base_model)
+
+        self.hidden_size = hidden_size
+        self.d_feat = d_feat
+        self.transformation = nn.Linear(self.hidden_size, self.hidden_size)
+        self.a = nn.Parameter(torch.randn(self.hidden_size * 2, 1))
+        self.a.requires_grad = True
+        self.fc = nn.Linear(self.hidden_size, self.hidden_size)
+        self.fc_out = nn.Linear(hidden_size, 1)
+        self.leaky_relu = nn.LeakyReLU()
+        self.softmax = nn.Softmax(dim=1)
+        self.smooth_perplexity = smooth_perplexity
+
+    def cal_attention(self, x, y, adj):
+        x = self.transformation(x)
+        y = self.transformation(y)
+
+        sample_num = x.shape[0]
+        dim = x.shape[1]
+        e_x = x.expand(sample_num, sample_num, dim)
+        e_y = torch.transpose(e_x, 0, 1)
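+        # e_x[i, j] holds stock j's hidden state and e_y[i, j] stock i's, so the
+        # concatenation below builds one feature row per ordered pair of stocks.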
attention_in = torch.cat((e_x, e_y), 2).view(-1, dim * 2) + self.a_t = torch.t(self.a) + attention_out = self.a_t.mm(torch.t(attention_in)).view(sample_num, sample_num) + attention_out = self.leaky_relu(attention_out) + attention_out /= self.smooth_perplexity + attention_out = torch.exp(attention_out) * adj + att_weight = attention_out / torch.sum(attention_out, dim=1, keepdim=True) + return att_weight + + def forward(self, x, adj): + out, _ = self.rnn(x) + hidden = out[:, -1, :] # [N, H] + att_weight = self.cal_attention(hidden, hidden, adj) + hidden = att_weight.mm(hidden) + hidden + hidden = self.fc(hidden) + hidden = self.leaky_relu(hidden) + return self.fc_out(hidden).squeeze() diff --git a/qlib/contrib/model/pytorch_gcn.py b/qlib/contrib/model/pytorch_gcn.py new file mode 100644 index 0000000000..d7d867a36d --- /dev/null +++ b/qlib/contrib/model/pytorch_gcn.py @@ -0,0 +1,480 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +from typing import Text, Union +import copy + +from torch.nn.functional import relu + +from ...utils import get_or_create_path +from ...log import get_module_logger +import torch +import torch.nn as nn +import torch.optim as optim + +from .pytorch_utils import count_parameters +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from ...contrib.model.pytorch_lstm import LSTMModel +from ...contrib.model.pytorch_gru import GRUModel + + +class GCN(Model): + """GCN Model + + Parameters + ---------- + lr : float + learning rate + d_feat : int + input dimensions for each time step + metric : str + the evaluation metric used in early stop + optimizer : str + optimizer name + GPU : int + the GPU ID used for training + """ + + def __init__( + self, + d_feat=6, + hidden_size=64, + num_layers=2, + dropout=0.0, + n_epochs=200, + lr=0.001, + metric="", + early_stop=20, + loss="mse", + base_model="GRU", + model_path=None, + optimizer="adam", + GPU=0, + seed=None, + industrial_data_path="~/industry_data.csv", + industry_col="industry", + smooth_perplexity=1, + adjacent_coef=0.01, + **kwargs + ): + # Set logger. + self.logger = get_module_logger("GCN") + self.logger.info("GCN pytorch version...") + + # set hyper-parameters. 
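+        # adjacent_coef: weight of same-industry edges relative to the identity
+        # self-loops that get_input_data() adds (A * adjacent_coef + I).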
+ self.d_feat = d_feat + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.base_model = base_model + self.model_path = model_path + self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.seed = seed + + self.industrial_data_path = industrial_data_path + self.industrial = pd.read_csv(industrial_data_path, index_col=0) + self.industry_col = industry_col + self.adjacent_coef = adjacent_coef + + self.logger.info( + "GCN parameters setting:" + "\nd_feat : {}" + "\nhidden_size : {}" + "\nnum_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nbase_model : {}" + "\nmodel_path : {}" + "\ndevice : {}" + "\nuse_GPU : {}" + "\nseed : {}".format( + d_feat, + hidden_size, + num_layers, + dropout, + n_epochs, + lr, + metric, + early_stop, + optimizer.lower(), + loss, + base_model, + model_path, + self.device, + self.use_gpu, + seed, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.GCN_model = GCNModel( + d_feat=self.d_feat, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + dropout=self.dropout, + base_model=self.base_model, + ) + self.logger.info("model:\n{:}".format(self.GCN_model)) + self.logger.info("model size: {:.4f} MB".format(count_parameters(self.GCN_model))) + + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.GCN_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.GCN_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.GCN_model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def get_daily_inter(self, df, shuffle=False): + # organize the train data into daily batches + daily_count = df.groupby(level=0).size().values + daily_index = np.roll(np.cumsum(daily_count), 1) + daily_index[0] = 0 + if shuffle: + # shuffle data + daily_shuffle = list(zip(daily_index, daily_count)) + np.random.shuffle(daily_shuffle) + daily_index, daily_count = zip(*daily_shuffle) + return daily_index, daily_count + + def get_industry(self, date, instruments): + table = self.industrial.loc[:, self.industry_col] + + instrument_to_industry = {} + + for id, instrument in enumerate(instruments): + industry = table.get(instrument, -1) + if np.isnan(industry): + industry = -1 + instrument_to_industry[instrument] = industry + return instrument_to_industry + + def adjacent_matrix(self, groups): + # extract the adjacent matrix + industries = [] + for i, (idx, count) in enumerate(groups): + for j in range(count): + industries.append(i) + industries = np.array(industries) + adjacent_matrix = (industries.reshape(-1, 1) == 
+ return adjacent_matrix
+
+ def get_input_data(self, data_x, data_y, data_x_values, data_y_values, idx, count):
+ # collect the row positions and instrument codes of one trading day
+ instruments = []
+ index = []
+ for i in range(idx, idx + count):
+ index.append(i)
+ instruments.append(data_x.index[i][1])
+
+ instrument_to_idx = {inst: pos for pos, inst in zip(index, instruments)}
+
+ date = str(data_x.index[idx][0])[:10]
+ instrument_to_industry = self.get_industry(date, instruments)
+ group_by_industry = list(instrument_to_industry.items())
+ group_by_industry.sort(key=lambda x: x[1])
+ # for each industry, get the starting index and the number of instruments
+ groups = []
+ now_industry = None
+ index = []
+ for i, (instrument, industry) in enumerate(group_by_industry):
+ if industry != now_industry:
+ now_industry = industry
+ groups.append((i, 1))
+ else:
+ groups[-1] = (groups[-1][0], groups[-1][1] + 1)
+ index.append(instrument_to_idx[instrument])
+ index = np.array(index)
+
+ label = data_y_values[index] if data_y_values is not None else None
+
+ # scale the industry links by `adjacent_coef` and add self-loops
+ A = torch.tensor(self.adjacent_matrix(groups)) * self.adjacent_coef
+
+ return data_x_values[index], label, (A + torch.eye(A.shape[0])).to(self.device), index
+
+ def train_epoch(self, x_train, y_train):
+ x_train_values = x_train.values
+ y_train_values = np.squeeze(y_train.values)
+ self.GCN_model.train()
+
+ # organize the train data into daily batches
+ daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True)
+
+ for idx, count in zip(daily_index, daily_count):
+ feature, label, groups, _ = self.get_input_data(
+ x_train, y_train, x_train_values, y_train_values, idx, count
+ )
+
+ feature = torch.from_numpy(feature).float().to(self.device)
+ label = torch.from_numpy(label).float().to(self.device)
+
+ pred = self.GCN_model(feature, groups)
+ loss = self.loss_fn(pred, label)
+
+ self.train_optimizer.zero_grad()
+ loss.backward()
+ torch.nn.utils.clip_grad_value_(self.GCN_model.parameters(), 3.0)
+ self.train_optimizer.step()
+
+ def test_epoch(self, data_x, data_y):
+ # prepare test data
+ x_values = data_x.values
+ y_values = np.squeeze(data_y.values)
+
+ self.GCN_model.eval()
+
+ scores = []
+ losses = []
+
+ # organize the test data into daily batches
+ daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False)
+
+ for idx, count in zip(daily_index, daily_count):
+ feature, label, groups, _ = self.get_input_data(data_x, data_y, x_values, y_values, idx, count)
+
+ feature = torch.from_numpy(feature).float().to(self.device)
+ label = torch.from_numpy(label).float().to(self.device)
+
+ pred = self.GCN_model(feature, groups)
+ loss = self.loss_fn(pred, label)
+ losses.append(loss.item())
+
+ score = self.metric_fn(pred, label)
+ scores.append(score.item())
+
+ return np.mean(losses), np.mean(scores)
+
+ def fit(
+ self,
+ dataset: DatasetH,
+ evals_result=dict(),
+ save_path=None,
+ ):
+ df_train, df_valid, df_test = dataset.prepare(
+ ["train", "valid", "test"],
+ col_set=["feature", "label"],
+ data_key=DataHandlerLP.DK_L,
+ )
+ if df_train.empty or df_valid.empty:
+ raise ValueError("Empty data from dataset, please check your dataset config.")
+
+ x_train, y_train = df_train["feature"], df_train["label"]
+ x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+ save_path = get_or_create_path(save_path)
+ stop_steps = 0
+ best_score = -np.inf
+ best_epoch = 0
+ evals_result["train"] = []
+ evals_result["valid"] = []
+
+ # load pretrained base_model
+ if self.base_model == "LSTM":
+ pretrained_model = LSTMModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers)
+ elif self.base_model == "GRU":
+ pretrained_model = GRUModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers)
+ else:
+ raise ValueError("unknown base model name `%s`" % self.base_model)
+
+ if self.model_path is not None:
+ self.logger.info("Loading pretrained model...")
+ pretrained_model.load_state_dict(torch.load(self.model_path, map_location=self.device))
+
+ model_dict = self.GCN_model.state_dict()
+ pretrained_dict = {
+ k: v for k, v in pretrained_model.state_dict().items() if k in model_dict # pylint: disable=E1135
+ }
+ model_dict.update(pretrained_dict)
+ self.GCN_model.load_state_dict(model_dict)
+ self.logger.info("Loading pretrained model Done...")
+
+ # train
+ self.logger.info("training...")
+ self.fitted = True
+
+ for step in range(self.n_epochs):
+ self.logger.info("Epoch%d:", step)
+ self.logger.info("training...")
+ self.train_epoch(x_train, y_train)
+ self.logger.info("evaluating...")
+ train_loss, train_score = self.test_epoch(x_train, y_train)
+ val_loss, val_score = self.test_epoch(x_valid, y_valid)
+ self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+ evals_result["train"].append(train_score)
+ evals_result["valid"].append(val_score)
+
+ if val_score > best_score:
+ best_score = val_score
+ stop_steps = 0
+ best_epoch = step
+ best_param = copy.deepcopy(self.GCN_model.state_dict())
+ else:
+ stop_steps += 1
+ if stop_steps >= self.early_stop:
+ self.logger.info("early stop")
+ break
+
+ self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+ self.GCN_model.load_state_dict(best_param)
+ torch.save(best_param, save_path)
+
+ if self.use_gpu:
+ torch.cuda.empty_cache()
+
+ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
+ if not self.fitted:
+ raise ValueError("model is not fitted yet!")
+
+ x_test = dataset.prepare(segment, col_set="feature")
+ final_index = x_test.index
+ self.GCN_model.eval()
+ x_values = x_test.values
+ preds = []
+
+ # organize the data into daily batches
+ daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)
+
+ for idx, count in zip(daily_index, daily_count):
+ feature, _, groups, index = self.get_input_data(x_test, None, x_values, None, idx, count)
+
+ feature = torch.from_numpy(feature).float().to(self.device)
+
+ with torch.no_grad():
+ pred = self.GCN_model(feature, groups).detach().cpu().numpy()
+
+ # `index` holds the original row positions of the industry-sorted
+ # samples; `decoder` inverts that permutation so the predictions
+ # line up with `final_index`
+ decoder = [0] * count
+ for i, orig_pos in enumerate(index):
+ decoder[orig_pos - idx] = i
+ pred = pred[decoder]
+ preds.append(pred)
+
+ return pd.Series(np.concatenate(preds), index=final_index)
+
+
+class GCNLayer(nn.Module):
+ def __init__(self, in_dim, out_dim, activation=relu):
+ super(GCNLayer, self).__init__()
+ self.linear = nn.Linear(in_dim, out_dim)
+ self.activation = activation
+
+ def forward(self, x, adj):
+ # x: [N, in_dim]
+ x = torch.matmul(adj, x)
+ # x: [N, in_dim]
+ x = self.linear(x)
+ # x: [N, out_dim]
+ if self.activation is not None:
+ x = self.activation(x)
+ return x
+
+
+class GCNModel(nn.Module):
+ def __init__(
+ self,
+ d_feat=6,
+ hidden_size=64,
+ num_layers=2,
+ dropout=0.0,
+ base_model="GRU",
+ base_model_trainable=True,
+ gcn_num_layers=2,
+ adj_coef=0.01,
+ ):
+ super().__init__()
+
+ if base_model == "GRU":
+ self.rnn = nn.GRU(
+ input_size=d_feat,
+ hidden_size=hidden_size,
+ num_layers=num_layers,
+ batch_first=True,
+ dropout=dropout,
+ )
+ elif base_model == "LSTM":
+ self.rnn = nn.LSTM(
+ input_size=d_feat,
+ hidden_size=hidden_size,
+ num_layers=num_layers,
+ batch_first=True,
+ dropout=dropout,
+ )
+ else:
+ raise ValueError("unknown base model name `%s`" % base_model)
+
+ if not base_model_trainable:
+ for param in self.rnn.parameters():
+ param.requires_grad = False
+
+ self.hidden_size = hidden_size
+ self.d_feat = d_feat
+ self.fc_out = nn.Linear(hidden_size, 1)
+ self.adj_coef = adj_coef
+
+ self.GCN = torch.nn.ModuleList(
+ [GCNLayer(in_dim=hidden_size, out_dim=hidden_size, activation=relu) for _ in range(gcn_num_layers)]
+ )
+
+ def forward(self, x, adj):
+ # x: [N, F*T]
+ x = x.reshape(x.shape[0], self.d_feat, -1) # [N, F, T]
+ x = x.permute(0, 2, 1) # [N, T, F]
+ out, _ = self.rnn(x)
+ hidden = out[:, -1, :]
+
+ # symmetrically normalize the adjacency matrix: D^(-1/2) A D^(-1/2)
+ adj = adj / torch.sqrt(adj.sum(axis=1, keepdim=True)) / torch.sqrt(adj.sum(axis=0, keepdim=True))
+ for gcn in self.GCN:
+ hidden = gcn(hidden, adj)
+
+ return self.fc_out(hidden).squeeze()
+
+
+if __name__ == "__main__":
+ pass
diff --git a/qlib/contrib/model/pytorch_gcn_ts.py b/qlib/contrib/model/pytorch_gcn_ts.py
new file mode 100644
index 0000000000..9ac5052d19
--- /dev/null
+++ b/qlib/contrib/model/pytorch_gcn_ts.py
@@ -0,0 +1,464 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import pandas as pd
+import copy
+
+from torch.nn.functional import relu
+
+from ...model import Model
+from ...utils import get_or_create_path
+from ...log import get_module_logger
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from torch.utils.data import Sampler
+
+from .pytorch_utils import count_parameters
+from ...data.dataset.handler import DataHandlerLP
+from ...contrib.model.pytorch_lstm import LSTMModel
+from ...contrib.model.pytorch_gru import GRUModel
+
+
+class DailyBatchSampler(Sampler):
+ def __init__(self, data_source):
+ self.data_source = data_source
+ # calculate the number of samples in each daily batch
+ self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values
+ self.daily_index = np.roll(np.cumsum(self.daily_count), 1) # calculate the begin index of each batch
+ self.daily_index[0] = 0
+
+ def __iter__(self):
+ for idx, count in zip(self.daily_index, self.daily_count):
+ yield np.arange(idx, idx + count)
+
+ def __len__(self):
+ return len(self.data_source)
+
+
+class GCN(Model):
+ """GCN Model
+
+ Parameters
+ ----------
+ lr : float
+ learning rate
+ d_feat : int
+ input dimensions for each time step
+ metric : str
+ the evaluation metric used in early stop
+ optimizer : str
+ optimizer name
+ GPU : int
+ the GPU ID used for training
+ """
+
+ def __init__(
+ self,
+ d_feat=6,
+ hidden_size=64,
+ num_layers=2,
+ dropout=0.0,
+ n_epochs=200,
+ lr=0.001,
+ metric="",
+ early_stop=20,
+ loss="mse",
+ base_model="GRU",
+ model_path=None,
+ optimizer="adam",
+ GPU=0,
+ n_jobs=10,
+ seed=None,
+ industrial_data_path="~/industry_data.csv",
+ industry_col="industry_citic",
+ adjacent_coef=0.01,
+ **kwargs
+ ):
+ # Set logger.
+ self.logger = get_module_logger("GCN")
+ self.logger.info("GCN pytorch version...")
+
+ # set hyper-parameters.
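+ # Note: `industrial_data_path` is expected to point to a CSV indexed by
+ # instrument code whose `industry_col` column gives each stock's industry
+ # id; stocks sharing an id are linked in the GCN adjacency matrix.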
+ self.d_feat = d_feat
+ self.hidden_size = hidden_size
+ self.num_layers = num_layers
+ self.dropout = dropout
+ self.n_epochs = n_epochs
+ self.lr = lr
+ self.metric = metric
+ self.early_stop = early_stop
+ self.optimizer = optimizer.lower()
+ self.loss = loss
+ self.base_model = base_model
+ self.model_path = model_path
+ self.n_jobs = n_jobs
+ self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
+ self.seed = seed
+
+ self.industrial_data_path = industrial_data_path
+ self.industrial = pd.read_csv(industrial_data_path, index_col=0)
+ self.industry_col = industry_col
+ self.adjacent_coef = adjacent_coef
+
+ # per-instrument industry id; instruments without one are mapped to -1
+ self.industry = self.industrial[self.industry_col].fillna(-1)
+
+ self.logger.info(
+ "GCN parameters setting:"
+ "\nd_feat : {}"
+ "\nhidden_size : {}"
+ "\nnum_layers : {}"
+ "\ndropout : {}"
+ "\nn_epochs : {}"
+ "\nlr : {}"
+ "\nmetric : {}"
+ "\nearly_stop : {}"
+ "\noptimizer : {}"
+ "\nloss_type : {}"
+ "\nbase_model : {}"
+ "\nmodel_path : {}"
+ "\ndevice : {}"
+ "\nuse_GPU : {}"
+ "\nseed : {}"
+ "\nindustry_col: {}".format(
+ d_feat,
+ hidden_size,
+ num_layers,
+ dropout,
+ n_epochs,
+ lr,
+ metric,
+ early_stop,
+ optimizer.lower(),
+ loss,
+ base_model,
+ model_path,
+ self.device,
+ self.use_gpu,
+ seed,
+ industry_col,
+ )
+ )
+
+ if self.seed is not None:
+ np.random.seed(self.seed)
+ torch.manual_seed(self.seed)
+
+ self.GCN_model = GCNModel(
+ d_feat=self.d_feat,
+ hidden_size=self.hidden_size,
+ num_layers=self.num_layers,
+ dropout=self.dropout,
+ base_model=self.base_model,
+ )
+ self.logger.info("model:\n{:}".format(self.GCN_model))
+ self.logger.info("model size: {:.4f} MB".format(count_parameters(self.GCN_model)))
+
+ if optimizer.lower() == "adam":
+ self.train_optimizer = optim.Adam(self.GCN_model.parameters(), lr=self.lr)
+ elif optimizer.lower() == "gd":
+ self.train_optimizer = optim.SGD(self.GCN_model.parameters(), lr=self.lr)
+ else:
+ raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+
+ self.fitted = False
+ self.GCN_model.to(self.device)
+
+ @property
+ def use_gpu(self):
+ return self.device != torch.device("cpu")
+
+ def mse(self, pred, label):
+ loss = (pred - label) ** 2
+ return torch.mean(loss)
+
+ def loss_fn(self, pred, label):
+ mask = ~torch.isnan(label)
+
+ if self.loss == "mse":
+ return self.mse(pred[mask], label[mask])
+
+ raise ValueError("unknown loss `%s`" % self.loss)
+
+ def metric_fn(self, pred, label):
+ mask = torch.isfinite(label)
+
+ if self.metric in ("", "loss"):
+ return -self.loss_fn(pred[mask], label[mask])
+
+ raise ValueError("unknown metric `%s`" % self.metric)
+
+ def get_daily_inter(self, df, shuffle=False):
+ # organize the train data into daily batches
+ daily_count = df.groupby(level=0).size().values
+ daily_index = np.roll(np.cumsum(daily_count), 1)
+ daily_index[0] = 0
+ if shuffle:
+ # shuffle the order of the daily batches
+ daily_shuffle = list(zip(daily_index, daily_count))
+ np.random.shuffle(daily_shuffle)
+ daily_index, daily_count = zip(*daily_shuffle)
+ return daily_index, daily_count
+
+ def train_epoch(self, data_loader):
+ self.GCN_model.train()
+
+ for data in data_loader:
+ # one batch holds one trading day: data is [N, T, F + 2], where the
+ # last two channels are the label and the industry id appended in fit()
+ data = data.squeeze()
+ feature = data[:, :, 0:-2].to(self.device)
+ label = data[:, -1, -2].to(self.device)
+ ind = data[:, -1, -1]
+
+ # connect stocks that share an industry id, scale the links by
+ # `adjacent_coef` and add self-loops on the diagonal
+ adjacent_matrix = (ind.reshape(-1, 1) == ind.reshape(1, -1)).float().to(self.device)
+
+ adjacent_matrix = adjacent_matrix * self.adjacent_coef + torch.eye(adjacent_matrix.shape[0]).to(self.device)
+
+ pred = self.GCN_model(feature.float(), adjacent_matrix)
+ loss = self.loss_fn(pred, label)
+
+ self.train_optimizer.zero_grad()
+ loss.backward()
+ torch.nn.utils.clip_grad_value_(self.GCN_model.parameters(), 3.0)
+ self.train_optimizer.step()
+
+ def test_epoch(self, data_loader):
+ self.GCN_model.eval()
+
+ scores = []
+ losses = []
+
+ for data in data_loader:
+ data = data.squeeze()
+ feature = data[:, :, 0:-2].to(self.device)
+ label = data[:, -1, -2].to(self.device)
+ ind = data[:, -1, -1]
+ adjacent_matrix = (ind.reshape(-1, 1) == ind.reshape(1, -1)).float().to(self.device)
+ adjacent_matrix = adjacent_matrix * self.adjacent_coef + torch.eye(adjacent_matrix.shape[0]).to(self.device)
+
+ pred = self.GCN_model(feature.float(), adjacent_matrix)
+ loss = self.loss_fn(pred, label)
+ losses.append(loss.item())
+
+ score = self.metric_fn(pred, label)
+ scores.append(score.item())
+
+ return np.mean(losses), np.mean(scores)
+
+ def fit(
+ self,
+ dataset,
+ evals_result=dict(),
+ save_path=None,
+ ):
+ dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+ dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+ if dl_train.empty or dl_valid.empty:
+ raise ValueError("Empty data from dataset, please check your dataset config.")
+
+ dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
+ dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
+
+ # `idx_df` maps (datetime, instrument) to row positions in `data_arr`,
+ # with NaN where an instrument has no sample on that day; write each
+ # stock's industry id into the corresponding rows
+ dl_train_ind = np.zeros(dl_train.data_arr.shape[0])
+ dl_valid_ind = np.zeros(dl_valid.data_arr.shape[0])
+
+ for col_name in dl_train.idx_df.columns:
+ col_data = dl_train.idx_df[col_name]
+ for val in col_data:
+ if not np.isnan(val):
+ dl_train_ind[int(val)] = self.industry.get(col_name, -1)
+ for col_name in dl_valid.idx_df.columns:
+ col_data = dl_valid.idx_df[col_name]
+ for val in col_data:
+ if not np.isnan(val):
+ dl_valid_ind[int(val)] = self.industry.get(col_name, -1)
+
+ # append the industry id as one extra column after the features and label
+ dl_train.data_arr = np.concatenate([dl_train.data_arr, dl_train_ind[:, None]], axis=1)
+ dl_valid.data_arr = np.concatenate([dl_valid.data_arr, dl_valid_ind[:, None]], axis=1)
+
+ sampler_train = DailyBatchSampler(dl_train)
+ sampler_valid = DailyBatchSampler(dl_valid)
+
+ train_loader = DataLoader(dl_train, sampler=sampler_train, num_workers=self.n_jobs, drop_last=True)
+ valid_loader = DataLoader(dl_valid, sampler=sampler_valid, num_workers=self.n_jobs, drop_last=True)
+
+ save_path = get_or_create_path(save_path)
+
+ stop_steps = 0
+ train_loss = 0
+ best_score = -np.inf
+ best_epoch = 0
+ evals_result["train"] = []
+ evals_result["valid"] = []
+
+ # load pretrained base_model
+ if self.base_model == "LSTM":
+ pretrained_model = LSTMModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers)
+ elif self.base_model == "GRU":
+ pretrained_model = GRUModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers)
+ else:
+ raise ValueError("unknown base model name `%s`" % self.base_model)
+
+ if self.model_path is not None:
+ self.logger.info("Loading pretrained model...")
+ pretrained_model.load_state_dict(torch.load(self.model_path, map_location=self.device))
+
+ model_dict = self.GCN_model.state_dict()
+ pretrained_dict = {
+ k: v for k, v in pretrained_model.state_dict().items() if k in model_dict # pylint: disable=E1135
+ }
+ model_dict.update(pretrained_dict)
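+ # At this point the state dict holds the pretrained weights wherever
+ # parameter names match (e.g. the `rnn.*` encoder); the GCN layers do
+ # not exist in the base model and keep their fresh initialization.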
+ self.GCN_model.load_state_dict(model_dict)
+ self.logger.info("Loading pretrained model Done...")
+
+ # train
+ self.logger.info("training...")
+ self.fitted = True
+
+ for step in range(self.n_epochs):
+ self.logger.info("Epoch%d:", step)
+ self.logger.info("training...")
+ self.train_epoch(train_loader)
+ self.logger.info("evaluating...")
+ train_loss, train_score = self.test_epoch(train_loader)
+ val_loss, val_score = self.test_epoch(valid_loader)
+ self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+ evals_result["train"].append(train_score)
+ evals_result["valid"].append(val_score)
+
+ if val_score > best_score:
+ best_score = val_score
+ stop_steps = 0
+ best_epoch = step
+ best_param = copy.deepcopy(self.GCN_model.state_dict())
+ else:
+ stop_steps += 1
+ if stop_steps >= self.early_stop:
+ self.logger.info("early stop")
+ break
+
+ self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+ self.GCN_model.load_state_dict(best_param)
+ torch.save(best_param, save_path)
+
+ if self.use_gpu:
+ torch.cuda.empty_cache()
+
+ def predict(self, dataset):
+ if not self.fitted:
+ raise ValueError("model is not fitted yet!")
+
+ dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
+ dl_test.config(fillna_type="ffill+bfill")
+
+ # append the industry id column, mirroring the preprocessing in fit()
+ dl_test_ind = np.zeros(dl_test.data_arr.shape[0])
+
+ for col_name in dl_test.idx_df.columns:
+ col_data = dl_test.idx_df[col_name]
+ for val in col_data:
+ if not np.isnan(val):
+ dl_test_ind[int(val)] = self.industry.get(col_name, -1)
+
+ dl_test.data_arr = np.concatenate([dl_test.data_arr, dl_test_ind[:, None]], axis=1)
+
+ sampler_test = DailyBatchSampler(dl_test)
+ test_loader = DataLoader(dl_test, sampler=sampler_test, num_workers=self.n_jobs)
+ self.GCN_model.eval()
+ preds = []
+
+ for data in test_loader:
+ data = data.squeeze()
+ feature = data[:, :, 0:-2].to(self.device)
+ ind = data[:, -1, -1].to(self.device)
+ adjacent_matrix = (ind.reshape(-1, 1) == ind.reshape(1, -1)).float().to(self.device)
+ adjacent_matrix = adjacent_matrix * self.adjacent_coef + torch.eye(adjacent_matrix.shape[0]).to(self.device)
+
+ with torch.no_grad():
+ pred = self.GCN_model(feature.float(), adjacent_matrix).detach().cpu().numpy()
+
+ preds.append(pred)
+
+ return pd.Series(np.concatenate(preds), index=dl_test.get_index())
+
+
+class GCNLayer(nn.Module):
+ def __init__(self, in_dim, out_dim, activation=relu):
+ super(GCNLayer, self).__init__()
+ self.linear = nn.Linear(in_dim, out_dim)
+ self.activation = activation
+
+ def forward(self, x, adj):
+ # x: [N, in_dim]
+ x = torch.matmul(adj, x)
+ # x: [N, in_dim]
+ x = self.linear(x)
+ # x: [N, out_dim]
+ if self.activation is not None:
+ x = self.activation(x)
+ return x
+
+
+class GCNModel(nn.Module):
+ def __init__(
+ self,
+ d_feat=6,
+ hidden_size=64,
+ num_layers=2,
+ dropout=0.0,
+ base_model="GRU",
+ base_model_trainable=True,
+ gcn_num_layers=2,
+ adj_coef=0.01,
+ ):
+ super().__init__()
+
+ if base_model == "GRU":
+ self.rnn = nn.GRU(
+ input_size=d_feat,
+ hidden_size=hidden_size,
+ num_layers=num_layers,
+ batch_first=True,
+ dropout=dropout,
+ )
+ elif base_model == "LSTM":
+ self.rnn = nn.LSTM(
+ input_size=d_feat,
+ hidden_size=hidden_size,
+ num_layers=num_layers,
+ batch_first=True,
+ dropout=dropout,
+ )
+ else:
+ raise ValueError("unknown base model name `%s`" % base_model)
+
+ if not base_model_trainable:
+ for param in self.rnn.parameters():
+ param.requires_grad = False
+
+ self.hidden_size = hidden_size
+ self.d_feat = d_feat
+ self.fc_out = nn.Linear(hidden_size, 1)
+ self.adj_coef = adj_coef
+
+ self.GCN = torch.nn.ModuleList(
+ [GCNLayer(in_dim=hidden_size, out_dim=hidden_size, activation=relu) for _ in range(gcn_num_layers)]
+ )
+
+ def forward(self, x, adj):
+ # x: [N, T, F]; the TS dataloader already delivers time-major windows,
+ # so no reshape is needed here
+ out, _ = self.rnn(x)
+ hidden = out[:, -1, :]
+
+ # symmetrically normalize the adjacency matrix: D^(-1/2) A D^(-1/2)
+ adj = adj / torch.sqrt(adj.sum(axis=1, keepdim=True)) / torch.sqrt(adj.sum(axis=0, keepdim=True))
+ for gcn in self.GCN:
+ hidden = gcn(hidden, adj)
+
+ return self.fc_out(hidden).squeeze()
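+
+
+if __name__ == "__main__":
+ # Minimal smoke test (illustrative sketch, not wired into the Qlib
+ # workflow): 8 synthetic stocks over 20 time steps with 6 features,
+ # split into two industries, using the same adjacency construction
+ # as train_epoch above with adjacent_coef = 0.01.
+ torch.manual_seed(0)
+ x = torch.randn(8, 20, 6)
+ ind = torch.tensor([0, 0, 0, 1, 1, 1, 1, 0])
+ adj = (ind.reshape(-1, 1) == ind.reshape(1, -1)).float()
+ adj = adj * 0.01 + torch.eye(8)
+ model = GCNModel(d_feat=6, hidden_size=64, num_layers=2, base_model="GRU")
+ print(model(x, adj).shape) # expected: torch.Size([8])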