In [1]:
import qlib
import pandas as pd
from qlib.constant import REG_CN
from qlib.utils import init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict
import numpy as np
import alphalens
from typing import Tuple

In [2]:
# qlib init
provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
qlib.init(provider_uri=provider_uri, region=REG_CN)

[70971:MainThread](2023-02-28 21:41:31,180) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[70971:MainThread](2023-02-28 21:41:31,183) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[70971:MainThread](2023-02-28 21:41:31,184) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/Users/jersonliao/.qlib/qlib_data/cn_data')}


### define init dataset function

In [3]:
def init_dataset(market, label: Tuple[list, list], segments):
    data_handler_config = {
        "start_time": segments["train"][0],
        "end_time": segments["test"][0],
        "fit_start_time": segments["train"][0],
        "fit_end_time": segments["train"][1],
        "instruments": market,
        "label": label,
        "feature": {
            'kbar': {},  # whether to use some hard-code kbar features
            "price": {
                "windows": [0],
                "feature": ["OPEN", "HIGH", "LOW", "VWAP"],
            },
            'volume': {  # whether to use raw volume features
                'windows': [0, 1, 2],  # use volume at n days ago
            },
            'rolling': {  # whether to use rolling operator based features
                'windows': [5, 10, 20, 30, 60, 90, 120, 150],  # rolling windows size
            }
        }
    }
    dataset_config = {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": segments,
        }
    }
    dataset = init_instance_by_config(dataset_config)
    return dataset

### define init and train model function

In [4]:
def init_model_and_train(dataset,label_column, experiment_name: str, model_type: str):
    lgb_model = {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": {
            "loss": "mse",
            "colsample_bytree": 0.8879,
            "learning_rate": 0.0421,
            "subsample": 0.8789,
            "lambda_l1": 205.6999,
            "lambda_l2": 580.9768,
            "max_depth": 8,
            "num_leaves": 210,
            "num_threads": 20,
            "label_column":label_column,
        },
    }
    densemble_model = {
        "class": "DEnsembleModel",
        "module_path": "qlib.contrib.model.double_ensemble",
        "kwargs": {
            "base_model": "gbm",
            "loss": "mse",
            "num_models": 3,
            "enable_sr": True,
            "enable_fs": True,
            "alpha1": 1,
            "alpha2": 1,
            "bins_sr": 10,
            "bins_fs": 5,
            "decay": 0.5,
            "sample_ratios": [0.8, 0.7, 0.6, 0.5, 0.4],
            "sub_weights": [1, 1, 1],
            "epochs": 28,
            "colsample_bytree": 0.8879,
            "learning_rate": 0.2,
            "subsample": 0.8789,
            "lambda_l1": 205.6999,
            "lambda_l2": 580.9768,
            "max_depth": 8,
            "num_leaves": 210,
            "num_threads": 20,
            "verbosity": -1
        }
    }
    # model initiaiton
    model_config = lgb_model if model_type=="lgb" else densemble_model
    model = init_instance_by_config(model_config)

    # start exp to train model
    with R.start(experiment_name=experiment_name):
        R.log_params(**flatten_dict({"model":model_config}))
        model.fit(dataset)
        R.save_objects(trained_model=model)
        rid = R.get_recorder().id
    return model, rid


### define prediction, backtest & analysis function

In [5]:
def port_analysis(experiment_name, benchmark, dataset, model, topk, n_drop, hold_thresh, start_date, end_date):
    port_analysis_config = {
        "executor": {
            "class": "SimulatorExecutor",
            "module_path": "qlib.backtest.executor",
            "kwargs": {
                "time_per_step": "day",
                "generate_portfolio_metrics": True,
            },
        },
        "strategy": {
            "class": "TopkDropoutStrategy",
            "module_path": "qlib.contrib.strategy.signal_strategy",
            "kwargs": {
                "model": model,
                "dataset": dataset,
                "topk": topk,
                "n_drop": n_drop,
                "hold_thresh": hold_thresh,
            },
        },
        "backtest": {
            "start_time": start_date,
            "end_time": end_date,
            "account": 100000000,
            "benchmark": benchmark,
            "exchange_kwargs": {
                "freq": "day",
                "limit_threshold": 0.095,
                "deal_price": "close",
                "open_cost": 0.0005,
                "close_cost": 0.0015,
                "min_cost": 5,
            },
        },
    }

    # backtest and analysis
    with R.start(experiment_name=experiment_name):
        # prediction
        recorder = R.get_recorder()
        sr = SignalRecord(model, dataset, recorder)
        sr.generate()
        # backtest & analysis
        par = PortAnaRecord(recorder, port_analysis_config, "day")
        par.generate()
    return recorder.id

### prepare dataset

In [6]:
segments = {
    "train": ("2008-01-01", "2018-12-31"),
    "valid": ("2019-01-01", "2020-12-31"),
    "test": ("2021-01-01", "2022-12-31"),
}
label = ([
    f"Ref($close, -2)/Ref($close, -1) - 1", f"Ref($close, -10)/Ref($close, -1) - 1",
    f"Ref($close, -30)/Ref($close, -1) - 1", f"Ref($close, -60)/Ref($close, -1) - 1"
], ["LABEL2", "LABEL10", "LABE30", "LABEL60"])
dataset = init_dataset("csi300", label, segments)

[70971:MainThread](2023-02-28 21:42:09,756) INFO - qlib.timer - [log.py:128] - Time cost: 28.717s | Loading data Done
[70971:MainThread](2023-02-28 21:42:10,322) INFO - qlib.timer - [log.py:128] - Time cost: 0.230s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].groupby("datetime", group_keys=False).apply(self.zscore_func)
[70971:MainThread](2023-02-28 21:42:12,535) INFO - qlib.timer - [log.py:128] - Time cost: 2.213s | CSZScoreNorm Done
[70971:MainThread](2023-02-28 21:42:12,548) INFO - qlib.timer - [log.py:128] - Time cost: 2.791s | fit & process data Done
[70971:MainThread](2023-02-28 21:42:12,548) INFO - qlib.timer - [log.py:128] - Time cost: 31.509s | Init data Done


In [7]:
ana_recid_map = {}

for ref_days, hold_days in [(2, 1), (10, 30)]:
    name = f"ref{ref_days}_h{hold_days}"
    model, train_recid = init_model_and_train(dataset, f"LABEL{ref_days}", "train", "lgb")
    ana_recid = port_analysis("backtest_analysis", "SH000300", dataset, model, 50, 5, hold_days, "2021-01-01",
                              "2022-12-31")
    ana_recid_map[name] = ana_recid


[70971:MainThread](2023-02-28 21:45:54,200) INFO - qlib.workflow - [exp.py:258] - Experiment 5 starts running ...
[70971:MainThread](2023-02-28 21:45:54,257) INFO - qlib.workflow - [recorder.py:341] - Recorder 4f119f0d68834cd4b1cfe594455ff72b starts running under Experiment 5 ...


Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.990748	valid's l2: 0.995745
[40]	train's l2: 0.987355	valid's l2: 0.995734
[60]	train's l2: 0.984981	valid's l2: 0.995834


[70971:MainThread](2023-02-28 21:46:06,568) INFO - qlib.timer - [log.py:128] - Time cost: 0.053s | waiting `async_log` Done
[70971:MainThread](2023-02-28 21:46:06,571) INFO - qlib.workflow - [exp.py:258] - Experiment 1 starts running ...
[70971:MainThread](2023-02-28 21:46:06,576) INFO - qlib.workflow - [recorder.py:341] - Recorder 6b8664ed4c4747a0b7f5fbb3a42e21f2 starts running under Experiment 1 ...


Early stopping, best iteration is:
[26]	train's l2: 0.989592	valid's l2: 0.995679


[70971:MainThread](2023-02-28 21:46:06,857) INFO - qlib.workflow - [record_temp.py:196] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 1
[70971:MainThread](2023-02-28 21:46:06,869) INFO - qlib.backtest caller - [__init__.py:93] - Create new exchange


'The following are prediction results of the LGBModel model.'
Empty DataFrame
Columns: [score]
Index: []




backtest loop:   0%|          | 0/485 [00:00<?, ?it/s]

  return np.nanmean(self.data)
[70971:MainThread](2023-02-28 21:46:15,686) INFO - qlib.workflow - [record_temp.py:505] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 1
[70971:MainThread](2023-02-28 21:46:15,692) INFO - qlib.workflow - [record_temp.py:530] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 1
[70971:MainThread](2023-02-28 21:46:15,699) INFO - qlib.timer - [log.py:128] - Time cost: 0.005s | waiting `async_log` Done
[70971:MainThread](2023-02-28 21:46:15,702) INFO - qlib.workflow - [exp.py:258] - Experiment 5 starts running ...
[70971:MainThread](2023-02-28 21:46:15,709) INFO - qlib.workflow - [recorder.py:341] - Recorder 5c65cf54236e4c35b23fe806183e7366 starts running under Experiment 5 ...


'The following are analysis results of benchmark return(1day).'
                       risk
mean              -0.000537
std                0.012287
annualized_return -0.127806
information_ratio -0.674256
max_drawdown      -0.473380
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000537
std                0.012287
annualized_return  0.127806
information_ratio  0.674256
max_drawdown      -0.175821
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean               0.000537
std                0.012287
annualized_return  0.127806
information_ratio  0.674256
max_drawdown      -0.175821
'The following are analysis results of indicators(1day).'
     value
ffr    NaN
pa     NaN
pos    NaN
Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.982366	valid's l2: 0.993613
[40]	train's l2: 0.974201	valid's l2: 0.992974
[60]	train's l2: 0.968

[70971:MainThread](2023-02-28 21:46:30,185) INFO - qlib.timer - [log.py:128] - Time cost: 0.077s | waiting `async_log` Done
[70971:MainThread](2023-02-28 21:46:30,188) INFO - qlib.workflow - [exp.py:258] - Experiment 1 starts running ...
[70971:MainThread](2023-02-28 21:46:30,195) INFO - qlib.workflow - [recorder.py:341] - Recorder 78f7551136db4d95b4ae02ae9c144277 starts running under Experiment 1 ...
[70971:MainThread](2023-02-28 21:46:30,254) INFO - qlib.workflow - [record_temp.py:196] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 1
[70971:MainThread](2023-02-28 21:46:30,261) INFO - qlib.backtest caller - [__init__.py:93] - Create new exchange


Early stopping, best iteration is:
[45]	train's l2: 0.972675	valid's l2: 0.99296
'The following are prediction results of the LGBModel model.'
Empty DataFrame
Columns: [score]
Index: []




backtest loop:   0%|          | 0/485 [00:00<?, ?it/s]

  return np.nanmean(self.data)
[70971:MainThread](2023-02-28 21:46:39,166) INFO - qlib.workflow - [record_temp.py:505] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 1
[70971:MainThread](2023-02-28 21:46:39,170) INFO - qlib.workflow - [record_temp.py:530] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 1
[70971:MainThread](2023-02-28 21:46:39,177) INFO - qlib.timer - [log.py:128] - Time cost: 0.005s | waiting `async_log` Done


'The following are analysis results of benchmark return(1day).'
                       risk
mean              -0.000537
std                0.012287
annualized_return -0.127806
information_ratio -0.674256
max_drawdown      -0.473380
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000537
std                0.012287
annualized_return  0.127806
information_ratio  0.674256
max_drawdown      -0.175821
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean               0.000537
std                0.012287
annualized_return  0.127806
information_ratio  0.674256
max_drawdown      -0.175821
'The following are analysis results of indicators(1day).'
     value
ffr    NaN
pa     NaN
pos    NaN


# analyze graphs

In [8]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D

report_df_map = {}
for name,ana_recid in ana_recid_map.items():
    recorder = R.get_recorder(recorder_id=ana_recid, experiment_name="backtest_analysis")
    report_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
    report_df_map[name]=report_df

In [11]:
report_df_map.keys()

dict_keys(['ref2_h1', 'ref10_h30'])

## analysis position

### report

In [None]:
analysis_position.report_graph(report_df_map)

In [1]:
import qlib
from qlib.constant import REG_CN
from qlib.utils import init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord, SigAnaRecord
from qlib.tests.data import GetData
from qlib.tests.config import CSI300_BENCH, CSI300_GBDT_TASK
from qlib.contrib.report import analysis_model, analysis_position

provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
qlib.init(provider_uri=provider_uri, region=REG_CN)

rid_list = ["efc65143bb6b48c5906b2da02ff93ac2","ea48476370b944af9b4e274644033252"]

report_df_map = {}
for idx, rid in enumerate(rid_list):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="experiment")
    report_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
    report_df_map[f"{idx}"] = report_df


[86219:MainThread](2023-02-28 22:54:20,861) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[86219:MainThread](2023-02-28 22:54:20,863) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[86219:MainThread](2023-02-28 22:54:20,863) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/Users/jersonliao/.qlib/qlib_data/cn_data')}


In [None]:
analysis_position.report_graph(report_df_map)