In [1]:
import sys
from pathlib import Path
import qlib
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha360
from qlib.utils import init_instance_by_config
from qlib.contrib.strategy import TopkDropoutStrategy
from qlib.contrib.report import analysis_model, analysis_position
# from qlib.contrib.evaluate import (
#     backtest as normal_backtest,
#     risk_analysis,
# )
from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict
from qlib.data.dataset.loader import QlibDataLoader
from qlib.contrib.data.handler import Alpha158   #Alpha158内置指标体系
from qlib.data.dataset.loader import QlibDataLoader
import qlib
from qlib.contrib.data.handler import Alpha158   #Alpha158内置指标体系
provider_uri = "./qlib_data/cn_data"  # 原始行情数据存放目录
qlib.init(provider_uri=provider_uri, region=REG_CN)  # 初始化
market = "csi100"
benchmark = "SH000300"

 #数据处理器参数配置
data_handler_config = {
    "start_time": "2020-01-01",
    "end_time": "2020-02-21",
    "fit_start_time": "2020-01-01",  # 模型跑数据的开始时间
    "fit_end_time": "2020-01-31",
    "instruments": market,
    "infer_processors" : [
                                    {'class': 'FilterCol',##过滤器
                                     'kwargs': {'fields_group': 'feature', 'col_list': ["KMID","KLOW","OPEN0"]
                                     }},
                                    {'class': 'RobustZScoreNorm', # RobustZScoreNorm和Fillna，用于进行标准化和填充缺失值。
                                     'kwargs': {'fields_group': 'feature', 'clip_outlier': True}},
                                    {'class': 'Fillna', 'kwargs': {'fields_group': 'feature'}}],
    "learn_processors": [{'class': 'DropnaLabel'}, #DropnaLabel删除标注中含有缺失值的样本。

                                    # 对预测的目标进行截面排序处理  DropnaLabel 和 CSRankNorm 预处理器，用于对标签进行处理。
                                    {'class': 'CSRankNorm', 'kwargs': {'fields_group': 'label'}}],

                                    # 预测的目标
                                    'label': ["Ref($close, -2) / Ref($close, -1) - 1"] ,#下一日收益率, Ref($close, -1)表示下一日收盘价
                                    #'label': ['Ref($close, -1)'],
}
    


# 任务参数配置
task = {
    "model": {  # 模型参数配置
        # 模型类
        "class": "TransGANModel",
        # 模型类所在模块
        "module_path": "qlib.contrib.model.transgan",
        "kwargs": {  # 模型超参数配置
            "hidden_size": 4,
            "num_layers" : 2,
            "nhead" : 2,
            "learning_rate" : 0.001,
            "num_epochs" : 3,
            "batch_size": 256,
            "optimizer_betas" :(0.9,0.999),
            "evaluation_epoch_num" :5 ,
            
        }, 
    },
    "dataset": {  # 　因子库数据集参数配置
        # 数据集类，是Dataset with Data(H)andler的缩写，即带数据处理器的数据集
        "class": "TSDatasetH",
        # 数据集类所在模块
        "module_path": "qlib.data.dataset",
        "kwargs": {  # 数据集参数配置
            "handler": {  # 数据集使用的数据处理器配置
                #"class": "Alpha158",  # 数据处理器类，继承自DataHandlerLP
                "module_path": "qlib.contrib.data.handler",  # 数据处理器类所在模块
                "class": "Alpha158",
                "kwargs": data_handler_config,  # 数据处理器参数配置
            },
             "segments": {  # 数据集划分标准
                "train": ("2020-01-01", "2020-01-31"),  # 此时段的数据为训练集
                "valid": ("2020-01-31", "2020-02-20"),  # 此时段的数据为验证集
                "test": ("2020-02-20", "2020-02-21"),  # 此时段的数据为测试集
            },
        },
    },

}

# 实例化模型对象
model = init_instance_by_config(task["model"])

# 实例化因子库数据集，从基础行情数据计算出的包含所有特征（因子）和标签值的数据集。
dataset = init_instance_by_config(task["dataset"])  # DatasetH


ModuleNotFoundError. CatBoostModel are skipped. (optional: maybe installing CatBoostModel can fix it.)
ModuleNotFoundError. XGBModel is skipped(optional: maybe installing xgboost can fix it).


  from .autonotebook import tqdm as notebook_tqdm
[36462:MainThread](2023-04-11 10:09:08,440) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[36462:MainThread](2023-04-11 10:09:08,445) INFO - qlib.workflow - [expm.py:31] - experiment manager uri is at file:/home/shared/qlib-main/mlruns
[36462:MainThread](2023-04-11 10:09:08,446) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[36462:MainThread](2023-04-11 10:09:08,448) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/shared/qlib-main/qlib_data/cn_data')}
[36462:MainThread](2023-04-11 10:09:08,487) INFO - qlib.TransGANModel - [transgan.py:87] - Naive TransGAN:
batch_size : 256
device : cuda:3
[36462:MainThread](2023-04-11 10:09:16,824) INFO - qlib.timer - [log.py:117] - Time cost: 5.942s | Loading data Done
[36462:MainThread](2023-04-11 10:09:16,835) INFO - qlib.timer - [log.py:117] - Time cost: 0.002s | FilterCol D

In [2]:
# start exp to train model
with R.start(experiment_name="train_model"):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)
    rid = R.get_recorder().id

[36462:MainThread](2023-04-11 10:09:16,914) INFO - qlib.workflow - [expm.py:316] - <mlflow.tracking.client.MlflowClient object at 0x7f2d04046670>
[36462:MainThread](2023-04-11 10:09:16,921) INFO - qlib.workflow - [exp.py:260] - Experiment 1 starts running ...
[36462:MainThread](2023-04-11 10:09:17,216) INFO - qlib.workflow - [recorder.py:339] - Recorder aeaccffd8da14f62bb43409c7ede57b0 starts running under Experiment 1 ...
Not a git repository
To compare two paths outside a working tree:
usage: git diff [--no-index] <path> <path>
[36462:MainThread](2023-04-11 10:09:17,302) INFO - qlib.workflow - [recorder.py:372] - Fail to log the uncommitted code of $CWD when run `git diff`
fatal: 不是一个 git 仓库（或者直至挂载点 / 的任何父目录）
停止在文件系统边界（未设置 GIT_DISCOVERY_ACROSS_FILESYSTEM）。
[36462:MainThread](2023-04-11 10:09:17,386) INFO - qlib.workflow - [recorder.py:372] - Fail to log the uncommitted code of $CWD when run `git status`
Not a git repository
To compare two paths outside a working tree:
usage: git diff

Generator and discriminator are initialized


[36462:MainThread](2023-04-11 10:09:26,081) INFO - qlib.TransGANModel - [transgan.py:348] - Epoch [2/3]
[36462:MainThread](2023-04-11 10:09:26,084) INFO - qlib.TransGANModel - [transgan.py:349] - Training...


Generator Loss: 0.8520, Discriminator Loss: 1.2923
KMID | Real:0.4562 / Generated:-0.5304
KLOW | Real:-0.8988 / Generated:0.0860
OPEN0 | Real:-0.4531 / Generated:-0.8034
Generator and discriminator are initialized


[36462:MainThread](2023-04-11 10:09:27,066) INFO - qlib.TransGANModel - [transgan.py:348] - Epoch [3/3]
[36462:MainThread](2023-04-11 10:09:27,068) INFO - qlib.TransGANModel - [transgan.py:349] - Training...


Generator Loss: 0.7529, Discriminator Loss: 1.3872
KMID | Real:-2.7210 / Generated:0.5889
KLOW | Real:0.9312 / Generated:1.0020
OPEN0 | Real:2.7878 / Generated:0.4040
Generator and discriminator are initialized
Generator Loss: 0.7463, Discriminator Loss: 1.3932
KMID | Real:-2.0270 / Generated:0.8597
KLOW | Real:0.1017 / Generated:0.7554
OPEN0 | Real:2.0625 / Generated:0.1562


[36462:MainThread](2023-04-11 10:09:28,403) INFO - qlib.TransGANModel - [transgan.py:366] - training...
[36462:MainThread](2023-04-11 10:09:28,405) INFO - qlib.TransGANModel - [transgan.py:370] - Epoch0:
[36462:MainThread](2023-04-11 10:09:28,407) INFO - qlib.TransGANModel - [transgan.py:371] - training...
[36462:MainThread](2023-04-11 10:09:29,549) INFO - qlib.TransGANModel - [transgan.py:374] - evaluating...
[36462:MainThread](2023-04-11 10:09:31,503) INFO - qlib.TransGANModel - [transgan.py:378] - train -1.038126, valid -1.614328
[36462:MainThread](2023-04-11 10:09:31,511) INFO - qlib.TransGANModel - [transgan.py:370] - Epoch1:
[36462:MainThread](2023-04-11 10:09:31,512) INFO - qlib.TransGANModel - [transgan.py:371] - training...
[36462:MainThread](2023-04-11 10:09:32,419) INFO - qlib.TransGANModel - [transgan.py:374] - evaluating...
[36462:MainThread](2023-04-11 10:09:34,101) INFO - qlib.TransGANModel - [transgan.py:378] - train -1.316004, valid -1.409114
[36462:MainThread](2023-04

In [3]:
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2017-01-01",
        "end_time": "2020-08-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = recorder.load_object("trained_model")

    # prediction
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[36462:MainThread](2023-04-11 10:09:36,967) INFO - qlib.workflow - [expm.py:316] - <mlflow.tracking.client.MlflowClient object at 0x7f2d040460a0>
[36462:MainThread](2023-04-11 10:09:36,975) INFO - qlib.workflow - [exp.py:260] - Experiment 2 starts running ...
[36462:MainThread](2023-04-11 10:09:37,177) INFO - qlib.workflow - [recorder.py:339] - Recorder bf9c0be204d847b280931da68e6bc805 starts running under Experiment 2 ...
Not a git repository
To compare two paths outside a working tree:
usage: git diff [--no-index] <path> <path>
[36462:MainThread](2023-04-11 10:09:37,295) INFO - qlib.workflow - [recorder.py:372] - Fail to log the uncommitted code of $CWD when run `git diff`
fatal: 不是一个 git 仓库（或者直至挂载点 / 的任何父目录）
停止在文件系统边界（未设置 GIT_DISCOVERY_ACROSS_FILESYSTEM）。
[36462:MainThread](2023-04-11 10:09:37,415) INFO - qlib.workflow - [recorder.py:372] - Fail to log the uncommitted code of $CWD when run `git status`
Not a git repository
To compare two paths outside a working tree:
usage: git diff

'The following are prediction results of the TransGANModel model.'
                          score
datetime   instrument          
2020-02-20 SH600000    0.176197
           SH600009   -0.873383
           SH600010    0.268904
           SH600011   -0.563702
           SH600015   -0.351168


backtest loop: 100%|██████████| 871/871 [00:01<00:00, 462.72it/s]
[36462:MainThread](2023-04-11 10:09:51,782) INFO - qlib.workflow - [record_temp.py:499] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[36462:MainThread](2023-04-11 10:09:51,795) INFO - qlib.workflow - [record_temp.py:524] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[36462:MainThread](2023-04-11 10:09:51,836) INFO - qlib.timer - [log.py:117] - Time cost: 0.018s | waiting `async_log` Done


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000477
std                0.012295
annualized_return  0.113561
information_ratio  0.598699
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean              -0.000335
std                0.011010
annualized_return -0.079657
information_ratio -0.468960
max_drawdown      -0.413520
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean              -0.000335
std                0.011010
annualized_return -0.079845
information_ratio -0.470063
max_drawdown      -0.414206
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


In [4]:

# 从实验记录器加载保存在pkl文件中的预测结果数据
pred_df = recorder.load_object("pred.pkl")

# 从实验记录器加载保存在pkl文件中的标签数据
label_df = recorder.load_object("label.pkl")
label_df.columns = ['label']

# 构造预测值和标签值并列的df
pred_label = pd.concat([pred_df, label_df], axis=1, sort=True).reindex(label_df.index)

print(pred_label)

                          score     label
datetime   instrument                    
2020-02-20 SH600000    0.176197 -0.012389
           SH600009   -0.873383 -0.038657
           SH600010    0.268904 -0.008197
           SH600011   -0.563702 -0.014000
           SH600015   -0.351168 -0.005563
...                         ...       ...
2020-02-21 SZ002594   -0.209338  0.070242
           SZ002736    0.184976 -0.002357
           SZ300015    0.699010 -0.007494
           SZ300059   -0.334402  0.069132
           SZ300498    0.193667  0.006787

[200 rows x 2 columns]


In [5]:
import numpy as np
# 计算预测值和标签值的正负号
pred_sign = np.sign(pred_label["score"])
label_sign = np.sign(pred_label["label"])

# 统计正负号相同的数量
same_sign_count = np.sum(pred_sign == label_sign)
same_sign_count

88

In [6]:
# 计算出两列数据正负号相同的比例
corr_pct = (pred_label['score'] * pred_label['label'] > 0).mean()
corr_pct

0.44