In [1]:
import sys
from pathlib import Path
import qlib
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha360
from qlib.utils import init_instance_by_config
from qlib.contrib.strategy import TopkDropoutStrategy
from qlib.contrib.report import analysis_model, analysis_position
# from qlib.contrib.evaluate import (
#     backtest as normal_backtest,
#     risk_analysis,
# )
from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict
from qlib.data.dataset.loader import QlibDataLoader
from qlib.contrib.data.handler import Alpha158   #Alpha158内置指标体系
from qlib.data.dataset.loader import QlibDataLoader
import qlib
from qlib.contrib.data.handler import Alpha158   #Alpha158内置指标体系


# Directory that holds the raw market data.
provider_uri = "./qlib_data/cn_data"
# Initialise qlib against that directory with the REG_CN region setting.
qlib.init(region=REG_CN, provider_uri=provider_uri)
# Instrument universe passed to the data handler, and the index code that the
# backtest configuration further down uses as its benchmark.
market = "csi100"
benchmark = "SH000300"

# Keyword arguments for the data handler.
data_handler_config = {
    # Overall data range, and the sub-range handed over as the "fit" window.
    "start_time": "2008-01-01",
    "end_time": "2020-08-01",
    "fit_start_time": "2008-01-01",
    "fit_end_time": "2014-12-31",
    # Alternative, much shorter date range (currently disabled):
    #   "start_time": "2020-01-01",
    #   "end_time": "2020-02-21",
    #   "fit_start_time": "2020-01-01",
    #   "fit_end_time": "2020-01-31",
    "instruments": market,
    "freq": "day",
    # Processors listed for inference data; both target the "feature" group.
    "infer_processors": [
        {
            "class": "RobustZScoreNorm",
            "kwargs": {"fields_group": "feature", "clip_outlier": True},
        },
        {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
    ],
    # Processors listed for learning data.
    "learn_processors": [
        {"class": "DropnaLabel"},
        # Cross-sectional rank normalisation of the prediction target.
        {"class": "CSRankNorm", "kwargs": {"fields_group": "label"}},
    ],
    # Prediction target: the next day's return.
    "label": ["Ref($close, -1) / $close - 1"],
}
    

# Task configuration: the model to build and the dataset it is trained on.
task = {
    "model": {
        # Model class and the module it is loaded from.
        "class": "TransGANModel",
        "module_path": "qlib.contrib.model.transgan111",
        # Model hyper-parameters.
        # NOTE(review): hidden_size (4) is not a multiple of nhead (5). If
        # TransGANModel hands these straight to a standard multi-head
        # attention layer that combination would be rejected -- confirm how
        # the model uses them.
        "kwargs": {
            "hidden_size": 4,
            "num_layers": 2,
            "nhead": 5,
            "learning_rate": 0.001,
            "n_epochs": 2,
            "batch_size": 128,
            "optimizer_betas": (0.9, 0.999),
            "evaluation_epoch_num": 5,
        },
    },
    "dataset": {
        # Dataset class and the module it is loaded from. The trailing "H"
        # stands for (data) Handler: a dataset backed by a data handler.
        "class": "TSDatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            # Data handler that feeds the dataset, with its own kwargs.
            "handler": {
                "module_path": "qlib.contrib.data.handler",
                "class": "Alpha158",
                "kwargs": data_handler_config,
            },
            # How the date range is split between training, validation and test.
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
                # Alternative, much shorter split (currently disabled):
                #   "train": ("2020-01-01", "2020-01-31"),
                #   "valid": ("2020-01-31", "2020-02-20"),
                #   "test": ("2020-02-20", "2020-02-21"),
            },
        },
    },
}

# Instantiate the model object from the "model" section of the task config.
model = init_instance_by_config(task["model"])

# Instantiate the factor dataset from the "dataset" section of the task config: the features (factors) and label values computed from the raw market data.
dataset = init_instance_by_config(task["dataset"])  # class is task["dataset"]["class"], i.e. TSDatasetH


# Start an experiment run named "train_model" and train the model inside it.
with R.start(experiment_name="train_model"):
    R.log_params(**flatten_dict(task))  # log the flattened task config as the run's parameters
    model.fit(dataset)
    R.save_objects(trained_model=model)  # store the fitted model on the run under the name "trained_model"
    rid = R.get_recorder().id  # keep this run's recorder id; the backtest section uses it to reload the model

###################################
# prediction, backtest & analysis
###################################
# Configuration handed to PortAnaRecord. It has three sections: the executor
# that steps through the backtest, the strategy that trades on the model's
# scores, and the backtest settings themselves.
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy",
        "kwargs": {
            # `signal` replaces the separate `model` / `dataset` kwargs, which
            # qlib's signal strategies only accept as a deprecated
            # compatibility path (they emit a DeprecationWarning and are
            # converted to exactly this (model, dataset) pair).
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        # Same date range as the dataset's "test" segment.
        "start_time": "2017-01-01",
        "end_time": "2020-08-01",
        "account": 100000000,
        "benchmark": benchmark,
        # Settings forwarded to the simulated exchange.
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# Backtest and analysis, run as a separate experiment named "backtest_analysis".
with R.start(experiment_name="backtest_analysis"):
    # Fetch the training run's recorder by id and reload the model saved there.
    recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = recorder.load_object("trained_model")

    # Prediction. `recorder` is deliberately rebound here to the recorder of
    # the current (backtest_analysis) run, so the records below -- and the
    # `recorder.load_object(...)` calls after this block -- refer to this run,
    # not the training run.
    recorder = R.get_recorder()
    ba_rid = recorder.id  # id of the backtest run; not used again in the visible code
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()  # presumably writes the "pred.pkl" / "label.pkl" artifacts loaded after this block -- confirm

    # Backtest & analysis driven by port_analysis_config; "day" is passed as the third positional argument.
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()


# Load the prediction scores stored on the experiment recorder as "pred.pkl".
pred_df = recorder.load_object("pred.pkl")

# Load the label values stored as "label.pkl" and give the column a plain name.
label_df = recorder.load_object("label.pkl")
label_df.columns = ["label"]

# Put predictions and labels side by side, then align the rows to the label index.
pred_label = pd.concat([pred_df, label_df], axis=1, sort=True)
pred_label = pred_label.reindex(label_df.index)

print(pred_label)

# import numpy as np
# # 计算预测值和标签值的正负号
# pred_sign = np.sign(pred_label["score"])
# label_sign = np.sign(pred_label["label"])

# # 统计正负号相同的数量
# same_sign_count = np.sum(pred_sign == label_sign)
# print(same_sign_count)

# # 计算出两列数据正负号相同的比例
# corr_pct = (pred_label['score'] * pred_label['label'] > 0).mean()
# print(corr_pct)

ModuleNotFoundError. CatBoostModel are skipped. (optional: maybe installing CatBoostModel can fix it.)
ModuleNotFoundError. XGBModel is skipped(optional: maybe installing xgboost can fix it).


  from .autonotebook import tqdm as notebook_tqdm
[31253:MainThread](2023-05-19 22:24:49,054) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[31253:MainThread](2023-05-19 22:24:49,060) INFO - qlib.workflow - [expm.py:31] - experiment manager uri is at file:/home/shared/qlib-main/mlruns
[31253:MainThread](2023-05-19 22:24:49,061) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[31253:MainThread](2023-05-19 22:24:49,063) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/shared/qlib-main/qlib_data/cn_data')}
[31253:MainThread](2023-05-19 22:24:49,306) INFO - qlib.TransGANModel - [transgan111.py:137] - Naive TransGAN:
batch_size : 128
device : cuda:3
[31253:MainThread](2023-05-19 22:25:10,198) INFO - qlib.timer - [log.py:117] - Time cost: 18.577s | Loading data Done
[31253:MainThread](2023-05-19 22:25:23,286) INFO - qlib.timer - [log.py:117] - Time cost: 12.209s | Robus

模型训练参数不一样


[31253:MainThread](2023-05-19 22:27:44,974) INFO - qlib.TransGANModel - [transgan111.py:378] - train -1.036893, valid -0.552810
[31253:MainThread](2023-05-19 22:27:44,985) INFO - qlib.TransGANModel - [transgan111.py:359] - Epoch1:
[31253:MainThread](2023-05-19 22:27:44,987) INFO - qlib.TransGANModel - [transgan111.py:360] - Training...


test acc on clean examples (%): 50.314
test acc on FGM adversarial examples (%): 48.909
test acc on PGD adversarial examples (%): 50.314


[31253:MainThread](2023-05-19 22:29:21,332) INFO - qlib.TransGANModel - [transgan111.py:375] - evaluating...


模型训练参数不一样


[31253:MainThread](2023-05-19 22:30:05,483) INFO - qlib.TransGANModel - [transgan111.py:378] - train -1.037501, valid -0.537657
[31253:MainThread](2023-05-19 22:30:05,491) INFO - qlib.TransGANModel - [transgan111.py:393] - best score: -0.537657 @ 1


test acc on clean examples (%): 50.527
test acc on FGM adversarial examples (%): 48.696
test acc on PGD adversarial examples (%): 50.527


[31253:MainThread](2023-05-19 22:30:05,722) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | waiting `async_log` Done
[31253:MainThread](2023-05-19 22:30:05,725) INFO - qlib.workflow - [expm.py:316] - <mlflow.tracking.client.MlflowClient object at 0x7f3f05ebd190>
[31253:MainThread](2023-05-19 22:30:05,730) INFO - qlib.workflow - [exp.py:260] - Experiment 2 starts running ...
[31253:MainThread](2023-05-19 22:30:05,752) INFO - qlib.workflow - [recorder.py:339] - Recorder 82b3b3e73b584abbbb91868de1123710 starts running under Experiment 2 ...
Not a git repository
To compare two paths outside a working tree:
usage: git diff [--no-index] <path> <path>
[31253:MainThread](2023-05-19 22:30:05,857) INFO - qlib.workflow - [recorder.py:372] - Fail to log the uncommitted code of $CWD when run `git diff`
fatal: 不是一个 git 仓库（或者直至挂载点 / 的任何父目录）
停止在文件系统边界（未设置 GIT_DISCOVERY_ACROSS_FILESYSTEM）。
[31253:MainThread](2023-05-19 22:30:05,968) INFO - qlib.workflow - [recorder.py:372] - Fail to log the unco

'The following are prediction results of the TransGANModel model.'
                          score
datetime   instrument          
2017-01-03 SH600000   -0.022733
           SH600010    0.004297
           SH600015   -0.028968
           SH600016   -0.040432
           SH600018   -0.020757


[31253:MainThread](2023-05-19 22:30:16,829) INFO - qlib.backtest caller - [__init__.py:94] - Create new exchange
backtest loop: 100%|██████████| 871/871 [00:16<00:00, 52.26it/s]
[31253:MainThread](2023-05-19 22:30:55,306) INFO - qlib.workflow - [record_temp.py:499] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[31253:MainThread](2023-05-19 22:30:55,318) INFO - qlib.workflow - [record_temp.py:524] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[31253:MainThread](2023-05-19 22:30:55,375) INFO - qlib.timer - [log.py:117] - Time cost: 0.041s | waiting `async_log` Done


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000477
std                0.012295
annualized_return  0.113561
information_ratio  0.598699
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean              -0.000161
std                0.003875
annualized_return -0.038318
information_ratio -0.641018
max_drawdown      -0.218130
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean              -0.000351
std                0.003876
annualized_return -0.083545
information_ratio -1.397230
max_drawdown      -0.310917
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0
                          score     label
datetime   instrument                    
2017-01-03 SH600000   -0.022733  0.001834
           SH600010    0.004297  0.000000
         