Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add future trading date collector #405

Merged
merged 1 commit into from Apr 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions qlib/data/data.py
Expand Up @@ -522,6 +522,9 @@ def load_calendar(self, freq, future):
# if future calendar not exists, return current calendar
if not os.path.exists(fname):
get_module_logger("data").warning(f"{freq}_future.txt not exists, return current calendar!")
get_module_logger("data").warning(
"You can get future calendar by referring to the following document: https://github.com/microsoft/qlib/blob/main/scripts/data_collector/contrib/README.md"
)
fname = self._uri_cal.format(freq)
else:
fname = self._uri_cal.format(freq)
Expand Down
24 changes: 24 additions & 0 deletions scripts/data_collector/contrib/README.md
@@ -0,0 +1,24 @@
# Get future trading days

> `D.calendar(future=True)` will be used

## Requirements

```bash
pip install -r requirements.txt
```

## Collect Data

```bash
# collect future trading dates; writes ~/.qlib/qlib_data/cn_data/calendars/day_future.txt
python future_trading_date_collector.py --qlib_dir ~/.qlib/qlib_data/cn_data --freq day
```

## Parameters

- qlib_dir: qlib data directory
- freq: value from [`day`, `1min`], default `day`



87 changes: 87 additions & 0 deletions scripts/data_collector/contrib/future_trading_date_collector.py
@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import sys
from typing import List
from pathlib import Path

import fire
import numpy as np
import pandas as pd
from loguru import logger

# get data from baostock
import baostock as bs

CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))


from data_collector.utils import generate_minutes_calendar_from_daily


def read_calendar_from_qlib(qlib_dir: Path) -> pd.DataFrame:
Copy link
Collaborator

@you-n-g you-n-g Apr 27, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will it be better to use backed storage if it is merged?

calendar_path = qlib_dir.joinpath("calendars").joinpath("day.txt")
if not calendar_path.exists():
return pd.DataFrame()
return pd.read_csv(calendar_path, header=None)


def write_calendar_to_qlib(qlib_dir: Path, date_list: List[str], freq: str = "day"):
    """Write the future calendar to ``<qlib_dir>/calendars/<freq>_future.txt``.

    Parameters
    ----------
    qlib_dir: Path
        qlib data directory
    date_list: List[str]
        calendar entries to write
    freq: str
        frequency used to build the file name, by default day
    """
    calendars_dir = qlib_dir.joinpath("calendars")
    # The collector tolerates a missing daily calendar, so the folder itself may
    # not exist yet; np.savetxt does not create parent directories.
    calendars_dir.mkdir(parents=True, exist_ok=True)
    calendar_path = str(calendars_dir.joinpath(f"{freq}_future.txt"))

    np.savetxt(calendar_path, date_list, fmt="%s", encoding="utf-8")
    logger.info(f"write future calendars success: {calendar_path}")


def generate_qlib_calendar(date_list: List[str], freq: str) -> List[str]:
    """Convert a daily trading-date list into the calendar for ``freq``.

    Parameters
    ----------
    date_list: List[str]
        daily trading dates
    freq: str
        value from ["day", "1min"]

    Returns
    -------
    List[str]
        ``date_list`` itself for "day"; for "1min", the minute timestamps
        formatted as "%Y-%m-%d %H:%M:%S"

    Raises
    ------
    ValueError
        if ``freq`` is not supported
    """
    if freq == "day":
        return date_list
    if freq == "1min":
        minutes = generate_minutes_calendar_from_daily(date_list, freq=freq).tolist()
        return [pd.Timestamp(_minute).strftime("%Y-%m-%d %H:%M:%S") for _minute in minutes]
    raise ValueError(f"Unsupported freq: {freq}")


def future_calendar_collector(qlib_dir: [str, Path], freq: str = "day"):
    """get future calendar

    Logs in to baostock, queries the trading dates from January 1st of the
    last year found in the qlib daily calendar (or of the current year when no
    daily calendar exists) up to December 31st of the current year, and writes
    them to ``<qlib_dir>/calendars/<freq>_future.txt``.

    Parameters
    ----------
    qlib_dir: str or Path
        qlib data directory
    freq: str
        value from ["day", "1min"], by default day

    Raises
    ------
    FileNotFoundError
        if ``qlib_dir`` does not exist
    """
    qlib_dir = Path(qlib_dir).expanduser().resolve()
    if not qlib_dir.exists():
        raise FileNotFoundError(str(qlib_dir))

    lg = bs.login()
    if lg.error_code != "0":
        logger.error(f"login error: {lg.error_msg}")
        return
    try:
        # read daily calendar
        daily_calendar = read_calendar_from_qlib(qlib_dir)
        end_year = pd.Timestamp.now().year
        if daily_calendar.empty:
            start_year = end_year
        else:
            start_year = pd.Timestamp(daily_calendar.iloc[-1, 0]).year
        # Pass both bounds as "YYYY-MM-DD" strings so they are handled the same way.
        rs = bs.query_trade_dates(start_date=f"{start_year}-01-01", end_date=f"{end_year}-12-31")
        data_list = []
        # `and` short-circuits: do not advance the result set once an error is reported.
        while rs.error_code == "0" and rs.next():
            _row_data = rs.get_row_data()
            # presumably the row is [calendar_date, is_trading_day]; keep trading days only
            if int(_row_data[1]) == 1:
                data_list.append(_row_data[0])
        if rs.error_code != "0":
            # Do not write an empty or partial future calendar when the query failed.
            logger.error(f"query trade dates error: {rs.error_msg}")
            return
        date_list = generate_qlib_calendar(sorted(data_list), freq=freq)
        write_calendar_to_qlib(qlib_dir, date_list, freq=freq)
    finally:
        # Always release the baostock session, even if a step above raised.
        bs.logout()
    logger.info(f"get trading dates success: {start_year}-01-01 to {end_year}-12-31")


if __name__ == "__main__":
fire.Fire(future_calendar_collector)
5 changes: 5 additions & 0 deletions scripts/data_collector/contrib/requirements.txt
@@ -0,0 +1,5 @@
baostock
fire
numpy
pandas
loguru
37 changes: 37 additions & 0 deletions scripts/data_collector/utils.py
Expand Up @@ -10,7 +10,9 @@
import requests
import functools
from pathlib import Path
from typing import Iterable, Tuple

import numpy as np
import pandas as pd
from lxml import etree
from loguru import logger
Expand Down Expand Up @@ -418,5 +420,40 @@ def get_trading_date_by_shift(trading_list: list, trading_date: pd.Timestamp, sh
return res


def generate_minutes_calendar_from_daily(
    calendars: Iterable,
    freq: str = "1min",
    am_range: Tuple[str, str] = ("09:30:00", "11:29:00"),
    pm_range: Tuple[str, str] = ("13:00:00", "14:59:00"),
) -> pd.Index:
    """generate minutes calendar

    Parameters
    ----------
    calendars: Iterable
        daily calendar
    freq: str
        by default 1min
    am_range: Tuple[str, str]
        AM Time Range, by default China-Stock: ("09:30:00", "11:29:00")
    pm_range: Tuple[str, str]
        PM Time Range, by default China-Stock: ("13:00:00", "14:59:00")

    Returns
    -------
    pd.Index
        sorted, de-duplicated timestamps covering both ranges of every day;
        an empty DatetimeIndex when ``calendars`` is empty
    """
    daily_format: str = "%Y-%m-%d"
    res = []
    for _day in calendars:
        # Format the day once; it is shared by the AM and PM ranges.
        _day_str = pd.Timestamp(_day).strftime(daily_format)
        for _start, _end in (am_range, pm_range):
            res.append(pd.date_range(f"{_day_str} {_start}", f"{_day_str} {_end}", freq=freq))

    if not res:
        # np.hstack raises on an empty sequence, so handle "no days" explicitly.
        return pd.DatetimeIndex([])
    return pd.Index(sorted(set(np.hstack(res))))


if __name__ == "__main__":
assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM
25 changes: 9 additions & 16 deletions scripts/data_collector/yahoo/collector.py
Expand Up @@ -24,7 +24,12 @@
CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))
from data_collector.base import BaseCollector, BaseNormalize, BaseRun
from data_collector.utils import get_calendar_list, get_hs_stock_symbols, get_us_stock_symbols
from data_collector.utils import (
get_calendar_list,
get_hs_stock_symbols,
get_us_stock_symbols,
generate_minutes_calendar_from_daily,
)

INDEX_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.{index_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg={begin}&end={end}"

Expand Down Expand Up @@ -418,21 +423,9 @@ def calendar_list_1d(self):
return calendar_list_1d

def generate_1min_from_daily(self, calendars: Iterable) -> pd.Index:
res = []
daily_format = self.DAILY_FORMAT
am_range = self.AM_RANGE
pm_range = self.PM_RANGE
for _day in calendars:
for _range in [am_range, pm_range]:
res.append(
pd.date_range(
f"{_day.strftime(daily_format)} {_range[0]}",
f"{_day.strftime(daily_format)} {_range[1]}",
freq="1min",
)
)

return pd.Index(sorted(set(np.hstack(res))))
return generate_minutes_calendar_from_daily(
calendars, freq="1min", am_range=self.AM_RANGE, pm_range=self.PM_RANGE
)

def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
# TODO: using daily data factor
Expand Down