In [None]:
#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT License.
import os
import random
from multiprocessing import Pool

import lightgbm as lgb
import numpy as np
import pandas as pd

import qlib
from qlib.config import REG_CN, HIGH_FREQ_CONFIG
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
    backtest as normal_backtest,
    risk_analysis,
)
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.data import D
from qlib.data.filter import NameDFilter
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.data.dataset.handler import DataHandlerLP
from qlib.utils import flatten_dict
from qlib.tests.data import GetData

# Qlib configuration

In [None]:
# Tunable constants for the instrument sub-sampling below.
SEED = 710
N_SAMPLE_INSTRUMENTS = 150

QLIB_INIT_CONFIG = {**HIGH_FREQ_CONFIG}

# The dataset must exist before qlib is initialised and queried, otherwise a
# fresh "Restart & Run All" fails on the first D.* call below.
provider_uri = QLIB_INIT_CONFIG.get("provider_uri")
if not exists_qlib_data(provider_uri):
    print(f"Qlib data is not found in {provider_uri}")
    GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN)

qlib.init(**QLIB_INIT_CONFIG)

# Resolve the 'all' market into a concrete list of 1min instruments.
all_instruments = D.list_instruments(
    instruments=D.instruments(market='all'), freq='1min', as_list=True
)

# Randomly select instruments to boost the training efficiency.
# Sorting first makes the seeded sample independent of the order in which the
# provider happens to return instruments; capping the sample size avoids a
# ValueError when fewer than N_SAMPLE_INSTRUMENTS instruments are available.
random.seed(SEED)
instruments = random.sample(
    sorted(all_instruments),
    min(N_SAMPLE_INSTRUMENTS, len(all_instruments)),
)

# train model configuration


In [None]:
MARKET = 'ALL'
BENCHMARK = "SH000300"

# Time boundaries of the experiment. Train, valid and test are consecutive,
# non-overlapping windows inside [start_time, end_time].
start_time = "2020-09-15 00:00:00"
end_time = "2021-01-18 16:00:00"
train_end_time = "2020-11-15 16:00:00"
valid_start_time = "2020-11-16 00:00:00"
valid_end_time = "2020-11-30 16:00:00"
test_start_time = "2020-12-01 00:00:00"

# Keyword arguments for the Alpha158 handler (1min frequency).
# Processors are fitted on the training window only (fit_start_time/fit_end_time).
data_handler_config = {
    "start_time": start_time,
    "end_time": end_time,
    "fit_start_time": start_time,
    "fit_end_time": train_end_time,
    "freq": "1min",
    "instruments": instruments,
    "learn_processors": [
        {"class": "DropnaLabel"},
    ],
    "infer_processors": [
        {
            "class": "RobustZScoreNorm",
            "kwargs": {
                "fields_group": "feature",
                "clip_outlier": True,
            },
        },
        {
            "class": "Fillna",
            "kwargs": {
                "fields_group": "feature",
            },
        },
    ],
    # Label expression: next bar's close relative to the current close, minus 1.
    "label": ["Ref($close, -1) / $close - 1"],
}


# Full task description: the model and the dataset are both instantiated from
# this dictionary via init_instance_by_config below.
task = {
    "model": {
        "class": "HF_LGBModel",
        # Loaded from a local file; the path is relative to the working directory.
        "module_path": "highfreq_gdbt_model.py",
        "kwargs": {
            # NOTE(review): the objective is binary while the label above is a
            # return - presumably HF_LGBModel converts it; confirm in that file.
            "objective": 'binary',
            "metric": ['binary_logloss', 'auc'],
            "verbosity": -1,
            "learning_rate": 0.01,
            "max_depth": 8,
            "num_leaves": 150,
            "lambda_l1": 1.5,
            "lambda_l2": 1,
            "num_threads": 20,
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                "train": (start_time, train_end_time),
                # Start validation at valid_start_time (not train_end_time) so
                # the train and valid windows do not share a boundary timestamp.
                "valid": (valid_start_time, valid_end_time),
                "test": (
                    test_start_time,
                    end_time,
                ),
            },
        },
    },
}

# Download the 1min dataset if it is not available locally.
provider_uri = QLIB_INIT_CONFIG.get("provider_uri")
if not exists_qlib_data(provider_uri):
    print(f"Qlib data is not found in {provider_uri}")
    GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN)

dataset = init_instance_by_config(task["dataset"])
model = init_instance_by_config(task["model"])

# Train model and run the signal test


In [None]:
# Open the "train_model" experiment; parameter logging, fitting and the signal
# test all happen inside this context.
with R.start(experiment_name="train_model"):
    # Log the task config as flat key/value parameters.
    flat_task_params = flatten_dict(task)
    R.log_params(**flat_task_params)

    model.fit(dataset)

    # NOTE(review): 0.1 is passed positionally; its meaning is defined by the
    # model's hf_signal_test (presumably a threshold) - confirm in the model file.
    model.hf_signal_test(dataset, 0.1)