diff --git a/CHANGES.rst b/CHANGES.rst
index a340b19e14..505c9f8e4a 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -150,3 +150,11 @@ Version 0.4.6
 - Some bugs are fixed
     - The default config in `Version 0.4.5` is not friendly to daily frequency data.
     - Backtest error in TopkWeightStrategy when `WithInteract=True`.
+
+
+Version 0.5.0
+--------------------
+- First open-source version
+    - Refine the docs and code
+    - Add baselines
+    - Public data crawler
diff --git a/README.md b/README.md
index 4f63303753..bdfc276b34 100644
--- a/README.md
+++ b/README.md
@@ -12,13 +12,14 @@ With Qlib, you can easily try your ideas to create better Quant investment strat
 For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative Investment Platform"](https://arxiv.org/abs/2009.11189).
 
-
 - [Framework of Qlib](#framework-of-qlib)
 - [Quick Start](#quick-start)
   - [Installation](#installation)
   - [Data Preparation](#data-preparation)
   - [Auto Quant Research Workflow](#auto-quant-research-workflow)
   - [Building Customized Quant Research Workflow by Code](#building-customized-quant-research-workflow-by-code)
+- [Quant Model Zoo](#quant-model-zoo)
+- [Quant Dataset Zoo](#quant-dataset-zoo)
 - [More About Qlib](#more-about-qlib)
   - [Offline Mode and Online Mode](#offline-mode-and-online-mode)
   - [Performance of Qlib Data Server](#performance-of-qlib-data-server)
@@ -124,16 +125,17 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
 ```bash
                                                   risk
-excess_return_without_cost mean               0.000605
-                           std                0.005481
-                           annualized_return  0.152373
-                           information_ratio  1.751319
-                           max_drawdown      -0.059055
-excess_return_with_cost    mean               0.000410
-                           std                0.005478
-                           annualized_return  0.103265
-                           information_ratio  1.187411
-                           max_drawdown      -0.075024
+excess_return_without_cost mean               0.000675
+                           std                0.005456
+                           annualized_return  0.170077
+                           information_ratio  1.963824
+                           max_drawdown      -0.063646
+excess_return_with_cost    mean               0.000479
+                           std                0.005453
+                           annualized_return  0.120776
+                           information_ratio  1.395116
+                           max_drawdown      -0.071216
+
 ```
@@ -171,6 +173,20 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
 The automatic workflow may not suit the research workflow of all Quant researchers. To support a flexible Quant research workflow, Qlib also provides a modularized interface that allows researchers to build their own workflow by code. [Here](examples/train_backtest_analyze.ipynb) is a demo of a customized Quant research workflow built by code.
+
+# Quant Model Zoo
+
+Here is a list of models built on `Qlib`.
+- [GBDT based on LightGBM](qlib/contrib/model/gbdt.py)
+- [MLP based on PyTorch](qlib/contrib/model/pytorch_nn.py)
+
+Your PRs of new Quant models are highly welcome.
+
+# Quant Dataset Zoo
+Datasets play a very important role in Quant research. Here is a list of the datasets built on `Qlib`.
+- [Alpha360](./qlib/contrib/estimator/handler.py)
+- [QLibDataHandlerClose](./qlib/contrib/estimator/handler.py)
+
+Your PRs building new Quant datasets are highly welcome.
 
 # More About Qlib
 The detailed documents are organized in [docs](docs/).
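The new "Building Customized Quant Research Workflow by Code" and zoo sections above lean on Qlib's modularized data interface. As a quick orientation, here is a minimal sketch of that interface, reusing only the `qlib.init` / `D.instruments` / `D.features` calls that appear in the tests added later in this patch; the date range and the return expression are illustrative assumptions, not part of this PR.

```python
# Minimal sketch of Qlib's modularized data interface (assumes the cn_data
# bundle from scripts/get_data.py has already been downloaded).
import qlib
from qlib.config import REG_CN
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region=REG_CN)

# Pull raw close prices plus a derived 5-day-return expression for the
# CSI300 pool; the expression syntax mirrors the one used in test_dataset.py.
df = D.features(
    D.instruments("csi300"),
    ["$close", "Ref($close, 5)/$close - 1"],
    start_time="2019-01-01",
    end_time="2019-12-31",
)
print(df.head())
```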
diff --git a/docs/_static/img/analysis/analysis_model_IC.png b/docs/_static/img/analysis/analysis_model_IC.png
index 41cc318c96..0064fb8901 100644
Binary files a/docs/_static/img/analysis/analysis_model_IC.png and b/docs/_static/img/analysis/analysis_model_IC.png differ
diff --git a/docs/_static/img/analysis/analysis_model_NDQ.png b/docs/_static/img/analysis/analysis_model_NDQ.png
index 6ba0153027..c1824368ba 100644
Binary files a/docs/_static/img/analysis/analysis_model_NDQ.png and b/docs/_static/img/analysis/analysis_model_NDQ.png differ
diff --git a/docs/_static/img/analysis/analysis_model_auto_correlation.png b/docs/_static/img/analysis/analysis_model_auto_correlation.png
index 16eca717d7..3f213a79b9 100644
Binary files a/docs/_static/img/analysis/analysis_model_auto_correlation.png and b/docs/_static/img/analysis/analysis_model_auto_correlation.png differ
diff --git a/docs/_static/img/analysis/analysis_model_cumulative_return.png b/docs/_static/img/analysis/analysis_model_cumulative_return.png
index 75760a73bb..bcccf138a5 100644
Binary files a/docs/_static/img/analysis/analysis_model_cumulative_return.png and b/docs/_static/img/analysis/analysis_model_cumulative_return.png differ
diff --git a/docs/_static/img/analysis/analysis_model_long_short.png b/docs/_static/img/analysis/analysis_model_long_short.png
index 871e5324b1..2fcb08c4e1 100644
Binary files a/docs/_static/img/analysis/analysis_model_long_short.png and b/docs/_static/img/analysis/analysis_model_long_short.png differ
diff --git a/docs/_static/img/analysis/analysis_model_monthly_IC.png b/docs/_static/img/analysis/analysis_model_monthly_IC.png
index bc8d13361b..0056c6c9c4 100644
Binary files a/docs/_static/img/analysis/analysis_model_monthly_IC.png and b/docs/_static/img/analysis/analysis_model_monthly_IC.png differ
diff --git a/docs/_static/img/analysis/report.png b/docs/_static/img/analysis/report.png
index 28fefb1771..dfd227f5a6 100644
Binary files a/docs/_static/img/analysis/report.png and b/docs/_static/img/analysis/report.png differ
diff --git a/docs/_static/img/analysis/risk_analysis_annualized_return.png b/docs/_static/img/analysis/risk_analysis_annualized_return.png
index f15f315b39..1979ca19b3 100644
Binary files a/docs/_static/img/analysis/risk_analysis_annualized_return.png and b/docs/_static/img/analysis/risk_analysis_annualized_return.png differ
diff --git a/docs/_static/img/analysis/risk_analysis_bar.png b/docs/_static/img/analysis/risk_analysis_bar.png
index 6597317fd4..1cce1f340e 100644
Binary files a/docs/_static/img/analysis/risk_analysis_bar.png and b/docs/_static/img/analysis/risk_analysis_bar.png differ
diff --git a/docs/_static/img/analysis/risk_analysis_information_ratio.png b/docs/_static/img/analysis/risk_analysis_information_ratio.png
index 3bef1069d0..edc64b17d5 100644
Binary files a/docs/_static/img/analysis/risk_analysis_information_ratio.png and b/docs/_static/img/analysis/risk_analysis_information_ratio.png differ
diff --git a/docs/_static/img/analysis/risk_analysis_max_drawdown.png b/docs/_static/img/analysis/risk_analysis_max_drawdown.png
index c2e8b0818e..a688102225 100644
Binary files a/docs/_static/img/analysis/risk_analysis_max_drawdown.png and b/docs/_static/img/analysis/risk_analysis_max_drawdown.png differ
diff --git a/docs/_static/img/analysis/risk_analysis_std.png b/docs/_static/img/analysis/risk_analysis_std.png
index 49e7e287c7..73d782e206 100644
Binary files a/docs/_static/img/analysis/risk_analysis_std.png and b/docs/_static/img/analysis/risk_analysis_std.png differ
diff --git a/docs/_static/img/analysis/score_ic.png b/docs/_static/img/analysis/score_ic.png
index 4419987246..6e1d37d2a6 100644
Binary files a/docs/_static/img/analysis/score_ic.png and b/docs/_static/img/analysis/score_ic.png differ
diff --git a/docs/component/data.rst b/docs/component/data.rst
index 60fe6da446..507d32af6e 100644
--- a/docs/component/data.rst
+++ b/docs/component/data.rst
@@ -65,7 +65,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
 
 .. note::
 
-    The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` includes open,close,high,low,volume,factor.
+    The arguments of `--include_fields` should correspond with the column names of the CSV files. The column names of the dataset provided by ``Qlib`` should include at least open, close, high, low, volume and factor.
 
 - `open`
     The opening price
@@ -80,6 +80,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
 - `factor`
     The Restoration factor
 
+In the convention of `Qlib` data processing, `open`, `close`, `high`, `low`, `volume`, `money` and `factor` will be set to NaN if the stock is suspended.
 
 China-Stock Mode & US-Stock Mode
 --------------------------------
diff --git a/qlib/config.py b/qlib/config.py
index 687945c545..c599ced79e 100644
--- a/qlib/config.py
+++ b/qlib/config.py
@@ -120,7 +120,7 @@ _default_region_config = {
     REG_CN: {
         "trade_unit": 100,
-        "limit_threshold": 0.1,
+        "limit_threshold": 0.099,
         "deal_price": "vwap",
     },
     REG_US: {
diff --git a/qlib/contrib/backtest/exchange.py b/qlib/contrib/backtest/exchange.py
index 68a5067185..ae64dec507 100644
--- a/qlib/contrib/backtest/exchange.py
+++ b/qlib/contrib/backtest/exchange.py
@@ -149,7 +149,7 @@ def set_quote(self, codes, start_date, end_date):
         self.quote = quote_df.to_dict("index")
 
     def _update_limit(self, buy_limit, sell_limit):
-        self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit)
+        self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit, inclusive=False)
 
     def check_stock_limit(self, stock_id, trade_date):
         """Parameter
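Taken together, the two changes above tighten the price-limit check: `limit_threshold` drops from 0.1 to 0.099, and `between` becomes exclusive, so a stored daily change that rounds to just under 10% is still flagged as limit-hit and the order rejected. Below is a minimal, self-contained pandas sketch of the same test (made-up numbers, not Qlib code); the boolean `inclusive` flag matches the pandas API of the time, while recent pandas spells it `inclusive="neither"`.

```python
# Standalone sketch of the limit-hit test in Exchange._update_limit:
# a stock is marked "limit" when its daily change falls outside the
# OPEN interval (-threshold, threshold). Sample changes are made up.
import pandas as pd

threshold = 0.099  # the new REG_CN "limit_threshold"
change = pd.Series([0.10, 0.0999, 0.05, -0.0995, -0.02])

# inclusive=False makes the boundary values count as limit-hit too
# (on recent pandas, use change.between(..., inclusive="neither")).
limit_hit = ~change.between(-threshold, threshold, inclusive=False)
print(limit_hit.tolist())  # [True, True, False, True, False]
```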
diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index e3f949fd16..d70aaa8b4c 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
 import re
 
 import requests
diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md
index 958d748543..4f1f4c650d 100644
--- a/scripts/data_collector/yahoo/README.md
+++ b/scripts/data_collector/yahoo/README.md
@@ -1,5 +1,15 @@
 # Collect Data From Yahoo Finance
 
+> *Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format).*
+
+> **Examples of abnormal data**
+
+- [SZ000661](https://finance.yahoo.com/quote/000661.SZ/history?period1=1558310400&period2=1590796800&interval=1d&filter=history&frequency=1d)
+- [SZ300144](https://finance.yahoo.com/quote/300144.SZ/history?period1=1557446400&period2=1589932800&interval=1d&filter=history&frequency=1d)
+
+We have considered **STOCK PRICE ADJUSTMENT**, but some price series still seem very abnormal.
+
 ## Requirements
 
 ```bash
@@ -35,4 +45,4 @@ python collector.py manual_adj_data --normalize_dir ~/.qlib/stock_data/normalize
 ```bash
 python collector.py dump_data --normalize_dir ~/.qlib/stock_data/normalize_dir --qlib_dir ~/.qlib/stock_data/qlib_data
-```
\ No newline at end of file
+```
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index b652311a6b..96ea8d6327 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT License.
 
 import sys
+import time
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -19,17 +20,20 @@
 from data_collector.utils import get_hs_calendar_list as get_calendar_list, get_hs_stock_symbols
 
 CSI300_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.000300&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101"
+MIN_NUMBERS_TRADING = 252 / 4  # roughly one quarter of the trading days in a year
 
 
 class YahooCollector:
-    def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=True, max_collector_count=3):
+    def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=False, max_collector_count=5, delay=0):
         self.save_dir = Path(save_dir).expanduser().resolve()
         self.save_dir.mkdir(parents=True, exist_ok=True)
 
+        self._delay = delay
         self._stock_list = None
         self.max_workers = max_workers
         self._asynchronous = asynchronous
         self._max_collector_count = max_collector_count
+        self._mini_symbol_map = {}
 
     @property
     def stock_list(self):
@@ -37,6 +41,9 @@
             self._stock_list = get_hs_stock_symbols()
         return self._stock_list
 
+    def _sleep(self):
+        time.sleep(self._delay)
+
     def save_stock(self, symbol, df: pd.DataFrame):
         """save stock data to file
@@ -56,6 +63,15 @@
         df["symbol"] = symbol
         df.to_csv(stock_path, index=False)
 
+    def _temp_save_small_data(self, symbol, df):
+        if len(df) <= MIN_NUMBERS_TRADING:
+            logger.warning(f"the number of trading days of {symbol} is less than {MIN_NUMBERS_TRADING}!")
+            _temp = self._mini_symbol_map.setdefault(symbol, [])
+            _temp.append(df.copy())
+        else:
+            if symbol in self._mini_symbol_map:
+                self._mini_symbol_map.pop(symbol)
+
     def _collector(self, stock_list):
 
         error_symbol = []
@@ -63,12 +79,14 @@
             futures = {}
             p_bar = tqdm(total=len(stock_list))
             for symbols in [stock_list[i : i + self.max_workers] for i in range(0, len(stock_list), self.max_workers)]:
+                self._sleep()
                 resp = Ticker(symbols, asynchronous=self._asynchronous, max_workers=self.max_workers).history(
                     period="max"
                 )
                 if isinstance(resp, dict):
                     for symbol, df in resp.items():
                         if isinstance(df, pd.DataFrame):
+                            self._temp_save_small_data(symbol, df)
                             futures[
                                 worker.submit(
                                     self.save_stock, symbol, df.reset_index().rename(columns={"index": "date"})
@@ -78,6 +96,7 @@
                 else:
                     for symbol, df in resp.reset_index().groupby("symbol"):
+                        self._temp_save_small_data(symbol, df)
                         futures[worker.submit(self.save_stock, symbol, df)] = symbol
                 p_bar.update(self.max_workers)
             p_bar.close()
@@ -93,6 +112,7 @@
         print(error_symbol)
         logger.info(f"error symbol nums: {len(error_symbol)}")
         logger.info(f"current get symbol nums: {len(stock_list)}")
+        error_symbol.extend(self._mini_symbol_map.keys())
         return error_symbol
 
     def collector_data(self):
@@ -107,7 +127,14 @@
             logger.info(f"getting data: {i+1}")
             stock_list = self._collector(stock_list)
             logger.info(f"{i+1} finish.")
+        for _symbol, _df_list in self._mini_symbol_map.items():
+            self.save_stock(_symbol, max(_df_list, key=len))
+        logger.warning(f"symbols with fewer than {MIN_NUMBERS_TRADING} trading days: {list(self._mini_symbol_map.keys())}")
+
+        self.download_csi300_data()
+
+    def download_csi300_data(self):
         # TODO: from MSN
         logger.info(f"get bench data: csi300(SH000300)......")
         df = pd.DataFrame(map(lambda x: x.split(","), requests.get(CSI300_BENCH_URL).json()["data"]["klines"]))
@@ -164,6 +191,7 @@ def _normalize(file_path: Path):
         df = pd.read_csv(file_path)
         df.set_index("date", inplace=True)
         df.index = pd.to_datetime(df.index)
+        df = df[~df.index.duplicated(keep="first")]
         # using China stock market data calendar
         df = df.reindex(pd.Index(get_calendar_list()))
@@ -232,7 +260,7 @@
             include_fields="close,open,high,low,volume,change,factor"
         )
 
-    def download_data(self):
+    def download_data(self, asynchronous=False, max_collector_count=5, delay=0):
         """download data from Internet
 
         Examples
@@ -240,7 +268,21 @@
         ---------
         $ python collector.py download_data --source_dir ~/.qlib/stock_data/source
         """
-        YahooCollector(self.source_dir, max_workers=self.max_workers).collector_data()
+        YahooCollector(
+            self.source_dir,
+            max_workers=self.max_workers,
+            asynchronous=asynchronous,
+            max_collector_count=max_collector_count,
+            delay=delay,
+        ).collector_data()
+
+    def download_csi300_data(self):
+        YahooCollector(self.source_dir).download_csi300_data()
+
+    def download_bench_data(self):
+        """download bench stock data (SH000300)"""
+        # SH000300 is the CSI300 index, so delegate to the CSI300 downloader.
+        self.download_csi300_data()
 
     def collector_data(self):
         """download -> normalize -> dump data
diff --git a/scripts/get_data.py b/scripts/get_data.py
index d279536313..d20a251ed2 100644
--- a/scripts/get_data.py
+++ b/scripts/get_data.py
@@ -53,7 +53,7 @@ def _unzip(file_path: Path, target_dir: Path):
             for _file in tqdm(zp.namelist()):
                 zp.extract(_file, str(target_dir.resolve()))
 
-    def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="v1"):
+    def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="latest"):
         """download cn qlib data from remote
 
         Parameters
@@ -61,7 +61,7 @@
         target_dir: str
             data save directory
         version: str
-            data version, value from [v0, v1], by default v1
+            data version, value from [v0, v1, ..., latest], by default latest
 
         Examples
         ---------
diff --git a/tests/dataset_tests/README.md b/tests/dataset_tests/README.md
new file mode 100644
index 0000000000..0a0905ed4b
--- /dev/null
+++ b/tests/dataset_tests/README.md
@@ -0,0 +1,2 @@
+# About dataset tests
+Tests in this folder are for testing the dataset prepared from Yahoo Finance.
diff --git a/tests/dataset_tests/test_dataset.py b/tests/dataset_tests/test_dataset.py
new file mode 100644
index 0000000000..5393b84dfe
--- /dev/null
+++ b/tests/dataset_tests/test_dataset.py
@@ -0,0 +1,45 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import qlib
+from qlib.data import D
+from qlib.config import REG_CN
+import unittest
+import numpy as np
+
+
+class TestDataset(unittest.TestCase):
+
+    def setUp(self):
+        provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+        qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    def testCSI300(self):
+        close_p = D.features(D.instruments('csi300'), ['$close'])
+        size = close_p.groupby('datetime').size()
+        cnt = close_p.groupby('datetime').count()
+        size_desc = size.describe(percentiles=np.arange(0.1, 0.9, 0.1))
+        cnt_desc = cnt.describe(percentiles=np.arange(0.1, 0.9, 0.1))
+
+        print(size_desc)
+        print(cnt_desc)
+
+        # describe() on `size` (a Series) yields scalar stats, so no [0];
+        # describe() on `cnt` (a DataFrame) yields one row per stat, hence [0].
+        self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
+        self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")
+
+        self.assertLessEqual(cnt_desc.loc["max"][0], 305, "Excessive number of CSI300 constituent stocks")
+        self.assertEqual(cnt_desc.loc["80%"][0], 300, "Insufficient number of CSI300 constituent stocks")
+
+    def testClose(self):
+        close_p = D.features(D.instruments('csi300'), ['Ref($close, 1)/$close - 1'])
+        close_desc = close_p.describe(percentiles=np.arange(0.1, 0.9, 0.1))
+        print(close_desc)
+        self.assertLessEqual(abs(close_desc.loc["80%"][0]), 0.1, "Close value is abnormal")
+        self.assertLessEqual(abs(close_desc.loc["max"][0]), 0.2, "Close value is abnormal")
+        self.assertGreaterEqual(close_desc.loc["min"][0], -0.2, "Close value is abnormal")
+
+
+if __name__ == '__main__':
+    unittest.main()
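One more sketch for the normalize-step fix in `collector.py` above: Yahoo occasionally returns the same date twice, and pandas refuses to reindex a non-unique axis, so duplicates must be dropped before aligning to the trading calendar. The dates, prices and calendar below are made-up stand-ins for the real Qlib inputs.

```python
# Sketch of the _normalize fix: drop duplicated dates, then align to the
# exchange calendar; sessions absent from the source become NaN rows,
# matching the suspension convention documented in data.rst.
import pandas as pd

df = pd.DataFrame(
    {"close": [10.0, 10.1, 10.1, 10.4]},
    index=pd.to_datetime(["2020-01-02", "2020-01-03", "2020-01-03", "2020-01-06"]),
)

df = df[~df.index.duplicated(keep="first")]  # keep first occurrence, as in _normalize

calendar = pd.to_datetime(["2020-01-02", "2020-01-03", "2020-01-06", "2020-01-07"])
df = df.reindex(pd.Index(calendar))
print(df)  # 2020-01-07 appears as a NaN row
```

Once a dataset is in place, the new sanity checks can be run with, e.g., `python -m unittest discover tests/dataset_tests`.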