Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update high freq demo #358

Merged
merged 6 commits into from
Apr 14, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
65 changes: 65 additions & 0 deletions examples/highfreq/workflow_config_High_Freq_Tree_Alpha158.yaml
@@ -0,0 +1,65 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/yahoo_cn_1min"
region: cn
market: &market ['SH605222', 'SZ002796', 'SZ002246', 'SZ000713', 'SZ002820', 'SH601328', 'SZ000668', 'SH603359', 'SZ002144', 'SH600195', 'SH603685', 'SH603386', 'SZ002586', 'SZ000573', 'SZ000605', 'SZ002842', 'SH600068', 'SZ300547', 'SZ000926', 'SZ002036', 'SZ002161', 'SH600715', 'SZ300427', 'SZ002573', 'SZ300142', 'SH605116', 'SZ002951', 'SH600276', 'SZ002437', 'SH603355', 'SZ002893', 'SH600584']
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Waiting for @zhupr to update the instruments list

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated: python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn

start_time: &start_time "2020-09-15 00:00:00"
end_time: &end_time "2021-01-18 16:00:00"
train_end_time: &train_end_time "2020-11-15 16:00:00"
valid_start_time: &valid_start_time "2020-11-16 00:00:00"
valid_end_time: &valid_end_time "2020-11-30 16:00:00"
test_start_time: &test_start_time "2020-12-01 00:00:00"
data_handler_config: &data_handler_config
start_time: *start_time
end_time: *end_time
fit_start_time: *start_time
fit_end_time: *train_end_time
instruments: *market
freq: '1min'
infer_processors:
- class: 'RobustZScoreNorm'
kwargs:
fields_group: 'feature'
clip_outlier: false
- class: "Fillna"
kwargs:
fields_group: 'feature'
learn_processors:
- class: 'DropnaLabel'
- class: 'CSRankNorm'
kwargs:
fields_group: 'label'
label: ["Ref($close, -2) / Ref($close, -1) - 1"]

task:
model:
class: "HFLGBModel"
module_path: "qlib.contrib.model.highfreq_gdbt_model"
kwargs:
objective: 'binary'
metric: ['binary_logloss','auc']
verbosity: -1
learning_rate: 0.01
max_depth: 8
num_leaves: 150
lambda_l1: 1.5
lambda_l2: 1
num_threads: 20
dataset:
class: "DatasetH"
module_path: "qlib.data.dataset"
kwargs:
handler:
class: "Alpha158"
module_path: "qlib.contrib.data.handler"
kwargs: *data_handler_config
segments:
train: [*start_time, *train_end_time]
valid: [*train_end_time, *valid_end_time]
test: [*test_start_time, *end_time]
record:
- class: "SignalRecord"
module_path: "qlib.workflow.record_temp"
kwargs: {}
- class: "HFSignalRecord"
module_path: "qlib.workflow.record_temp"
kwargs: {}
53 changes: 53 additions & 0 deletions qlib/contrib/eva/alpha.py
Expand Up @@ -8,6 +8,59 @@
from typing import Tuple


def calc_long_short_prec(
pred: pd.Series, label: pd.Series, date_col="datetime", quantile: float = 0.2, dropna=False, is_alpha=False
) -> Tuple[pd.Series, pd.Series]:
"""
calculate the precision for long and short operation


:param pred/label: index is **pd.MultiIndex**, index name is **[datetime, instruments]**; columns names is **[score]**.

.. code-block:: python
score
datetime instrument
2020-12-01 09:30:00 SH600068 0.553634
SH600195 0.550017
SH600276 0.540321
SH600584 0.517297
SH600715 0.544674
label :
label
date_col :
date_col

Returns
-------
(pd.Series, pd.Series)
long precision and short precision in time level
"""
if is_alpha:
label = label - label.mean(level=date_col)
if int(1 / quantile) >= len(label.index.get_level_values(1).unique()):
you-n-g marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("Need more instruments to calculate precision")

df = pd.DataFrame({"pred": pred, "label": label})
if dropna:
df.dropna(inplace=True)

group = df.groupby(level=date_col)

N = lambda x: int(len(x) * quantile)
# find the top/low quantile of prediction and treat them as long and short target
long = group.apply(lambda x: x.nlargest(N(x), columns="pred").label).reset_index(level=0, drop=True)
you-n-g marked this conversation as resolved.
Show resolved Hide resolved
short = group.apply(lambda x: x.nsmallest(N(x), columns="pred").label).reset_index(level=0, drop=True)

groupll = long.groupby(date_col)
you-n-g marked this conversation as resolved.
Show resolved Hide resolved
l_dom = groupll.apply(lambda x: x > 0)
l_c = groupll.count()

groups = short.groupby(date_col)
s_dom = groups.apply(lambda x: x < 0)
s_c = groups.count()
return (l_dom.groupby(date_col).sum() / l_c), (s_dom.groupby(date_col).sum() / s_c)


def calc_ic(pred: pd.Series, label: pd.Series, date_col="datetime", dropna=False) -> Tuple[pd.Series, pd.Series]:
"""calc_ic.

Expand Down
157 changes: 157 additions & 0 deletions qlib/contrib/model/highfreq_gdbt_model.py
@@ -0,0 +1,157 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import numpy as np
import pandas as pd
import lightgbm as lgb

from qlib.model.base import ModelFT
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
import warnings


class HFLGBModel(ModelFT):
"""LightGBM Model for high frequency prediction"""

def __init__(self, loss="mse", **kwargs):
if loss not in {"mse", "binary"}:
raise NotImplementedError
self.params = {"objective": loss, "verbosity": -1}
self.params.update(kwargs)
self.model = None

def _cal_signal_metrics(self, y_test, l_cut, r_cut):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you use calc_prec here?

"""
Calcaute the signal metrics by daily level
"""
up_pre, down_pre = [], []
up_alpha_ll, down_alpha_ll = [], []
for date in y_test.index.get_level_values(0).unique():
df_res = y_test.loc[date].sort_values("pred")
if int(l_cut * len(df_res)) < 10:
warnings.warn("Warning: threhold is too low or instruments number is not enough")
continue
top = df_res.iloc[: int(l_cut * len(df_res))]
bottom = df_res.iloc[int(r_cut * len(df_res)) :]

down_precision = len(top[top[top.columns[0]] < 0]) / (len(top))
up_precision = len(bottom[bottom[top.columns[0]] > 0]) / (len(bottom))

down_alpha = top[top.columns[0]].mean()
up_alpha = bottom[bottom.columns[0]].mean()

up_pre.append(up_precision)
down_pre.append(down_precision)
up_alpha_ll.append(up_alpha)
down_alpha_ll.append(down_alpha)

return (
np.array(up_pre).mean(),
np.array(down_pre).mean(),
np.array(up_alpha_ll).mean(),
np.array(down_alpha_ll).mean(),
)

def hf_signal_test(self, dataset: DatasetH, threhold=0.2):
"""
Test the sigal in high frequency test set
"""
if self.model == None:
raise ValueError("Model hasn't been trained yet")
df_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
df_test.dropna(inplace=True)
x_test, y_test = df_test["feature"], df_test["label"]
# Convert label into alpha
y_test[y_test.columns[0]] = y_test[y_test.columns[0]] - y_test[y_test.columns[0]].mean(level=0)

res = pd.Series(self.model.predict(x_test.values), index=x_test.index)
y_test["pred"] = res

up_p, down_p, up_a, down_a = self._cal_signal_metrics(y_test, threhold, 1 - threhold)
print("===============================")
print("High frequency signal test")
print("===============================")
print("Test set precision: ")
print("Positive precision: {}, Negative precision: {}".format(up_p, down_p))
print("Test Alpha Average in test set: ")
print("Positive average alpha: {}, Negative average alpha: {}".format(up_a, down_a))

def _prepare_data(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)

x_train, y_train = df_train["feature"], df_train["label"]
x_valid, y_valid = df_train["feature"], df_valid["label"]
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
l_name = df_train["label"].columns[0]
# Convert label into alpha
df_train["label"][l_name] = df_train["label"][l_name] - df_train["label"][l_name].mean(level=0)
df_valid["label"][l_name] = df_valid["label"][l_name] - df_valid["label"][l_name].mean(level=0)
mapping_fn = lambda x: 0 if x < 0 else 1
df_train["label_c"] = df_train["label"][l_name].apply(mapping_fn)
df_valid["label_c"] = df_valid["label"][l_name].apply(mapping_fn)
x_train, y_train = df_train["feature"], df_train["label_c"].values
x_valid, y_valid = df_valid["feature"], df_valid["label_c"].values
else:
raise ValueError("LightGBM doesn't support multi-label training")

dtrain = lgb.Dataset(x_train.values, label=y_train)
dvalid = lgb.Dataset(x_valid.values, label=y_valid)
return dtrain, dvalid

def fit(
self,
dataset: DatasetH,
num_boost_round=1000,
early_stopping_rounds=50,
verbose_eval=20,
evals_result=dict(),
**kwargs
):
dtrain, dvalid = self._prepare_data(dataset)
self.model = lgb.train(
self.params,
dtrain,
num_boost_round=num_boost_round,
valid_sets=[dtrain, dvalid],
valid_names=["train", "valid"],
early_stopping_rounds=early_stopping_rounds,
verbose_eval=verbose_eval,
evals_result=evals_result,
**kwargs
)
evals_result["train"] = list(evals_result["train"].values())[0]
evals_result["valid"] = list(evals_result["valid"].values())[0]

def predict(self, dataset):
if self.model is None:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
return pd.Series(self.model.predict(x_test.values), index=x_test.index)

def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20):
"""
finetune model

Parameters
----------
dataset : DatasetH
dataset for finetuning
num_boost_round : int
number of round to finetune model
verbose_eval : int
verbose level
"""
# Based on existing model and finetune by train more rounds
dtrain, _ = self._prepare_data(dataset)
self.model = lgb.train(
self.params,
dtrain,
num_boost_round=num_boost_round,
init_model=self.model,
valid_sets=[dtrain],
valid_names=["train"],
verbose_eval=verbose_eval,
)
56 changes: 55 additions & 1 deletion qlib/workflow/record_temp.py
Expand Up @@ -13,7 +13,7 @@
from ..utils import init_instance_by_config, get_module_by_module_path
from ..log import get_module_logger
from ..utils import flatten_dict
from ..contrib.eva.alpha import calc_ic, calc_long_short_return
from ..contrib.eva.alpha import calc_ic, calc_long_short_return, calc_long_short_prec
from ..contrib.strategy.strategy import BaseStrategy

logger = get_module_logger("workflow", "INFO")
Expand Down Expand Up @@ -156,6 +156,60 @@ def load(self, name="pred.pkl"):
return super().load(name)


class HFSignalRecord(SignalRecord):
"""
This is the Signal Analysis Record class that generates the analysis results such as IC and IR. This class inherits the ``RecordTemp`` class.
"""

artifact_path = "hg_sig_analysis"

def __init__(self, recorder, **kwargs):
super().__init__(recorder=recorder)

def generate(self):
pred = self.load("pred.pkl")
raw_label = self.load("label.pkl")
long_pre, short_pre = calc_long_short_prec(pred.iloc[:, 0], raw_label.iloc[:, 0], is_alpha=True)
ic, ric = calc_ic(pred.iloc[:, 0], raw_label.iloc[:, 0])
metrics = {
"IC": ic.mean(),
"ICIR": ic.mean() / ic.std(),
"Rank IC": ric.mean(),
"Rank ICIR": ric.mean() / ric.std(),
"Long precision": long_pre.mean(),
"Short precision": short_pre.mean(),
}
objects = {"ic.pkl": ic, "ric.pkl": ric}
objects.update({"long_pre.pkl": long_pre, "short_pre.pkl": short_pre})
long_short_r, long_avg_r = calc_long_short_return(pred.iloc[:, 0], raw_label.iloc[:, 0])
metrics.update(
{
"Long-Short Average Return": long_short_r.mean(),
"Long-Short Average Sharpe": long_short_r.mean() / long_short_r.std(),
}
)
objects.update(
{
"long_short_r.pkl": long_short_r,
"long_avg_r.pkl": long_avg_r,
}
)
self.recorder.log_metrics(**metrics)
self.recorder.save_objects(**objects, artifact_path=self.get_path())
pprint(metrics)

def list(self):
paths = [
self.get_path("ic.pkl"),
self.get_path("ric.pkl"),
self.get_path("long_pre.pkl"),
self.get_path("short_pre.pkl"),
self.get_path("long_short_r.pkl"),
self.get_path("long_avg_r.pkl"),
]
return paths


class SigAnaRecord(SignalRecord):
"""
This is the Signal Analysis Record class that generates the analysis results such as IC and IR. This class inherits the ``RecordTemp`` class.
Expand Down