8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -150,3 +150,11 @@ Version 0.4.6
- Some bugs are fixed:
  - The default config in `Version 0.4.5` is not friendly to daily-frequency data.
  - Backtest error in TopkWeightStrategy when `WithInteract=True`.


Version 0.5.0
--------------------
- First open-source version
- Refine the docs and code
- Add baselines
- Add a public data crawler
38 changes: 27 additions & 11 deletions README.md
@@ -12,13 +12,14 @@ With Qlib, you can easily try your ideas to create better Quant investment strat

For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative Investment Platform"](https://arxiv.org/abs/2009.11189).


- [Framework of Qlib](#framework-of-qlib)
- [Quick Start](#quick-start)
- [Installation](#installation)
- [Data Preparation](#data-preparation)
- [Auto Quant Research Workflow](#auto-quant-research-workflow)
- [Building Customized Quant Research Workflow by Code](#building-customized-quant-research-workflow-by-code)
- [Quant Model Zoo](#quant-model-zoo)
- [Quant Dataset Zoo](#quant-dataset-zoo)
- [More About Qlib](#more-about-qlib)
- [Offline Mode and Online Mode](#offline-mode-and-online-mode)
- [Performance of Qlib Data Server](#performance-of-qlib-data-server)
@@ -124,16 +125,17 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
```bash

risk
excess_return_without_cost  mean                 0.000605
                            std                  0.005481
                            annualized_return    0.152373
                            information_ratio    1.751319
                            max_drawdown        -0.059055
excess_return_with_cost     mean                 0.000410
                            std                  0.005478
                            annualized_return    0.103265
                            information_ratio    1.187411
                            max_drawdown        -0.075024
excess_return_without_cost  mean                 0.000675
                            std                  0.005456
                            annualized_return    0.170077
                            information_ratio    1.963824
                            max_drawdown        -0.063646
excess_return_with_cost     mean                 0.000479
                            std                  0.005453
                            annualized_return    0.120776
                            information_ratio    1.395116
                            max_drawdown        -0.071216



```
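
For intuition, here is a minimal sketch of how metrics like these are typically derived from a daily excess-return series, assuming 252 trading days per year; this is our own illustration, not Qlib's exact implementation:

```python
import numpy as np
import pandas as pd

def risk_metrics(excess_return: pd.Series, periods_per_year: int = 252) -> dict:
    """Summarize a daily excess-return series (hypothetical helper)."""
    mean, std = excess_return.mean(), excess_return.std()
    cumulative = excess_return.cumsum()  # cumulative excess-return curve
    return {
        "mean": mean,
        "std": std,
        "annualized_return": mean * periods_per_year,
        "information_ratio": mean / std * np.sqrt(periods_per_year),
        "max_drawdown": (cumulative - cumulative.cummax()).min(),  # worst peak-to-trough drop
    }
```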
@@ -171,6 +173,20 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
The automatic workflow may not suit the research needs of all Quant researchers. To support flexible Quant research, Qlib also provides a modularized interface that allows researchers to build their own workflow in code; a minimal sketch follows. [Here](examples/train_backtest_analyze.ipynb) is a full demo of a customized Quant research workflow built by code.
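
For a flavor of that interface, here is a minimal, illustrative sketch (the dates and the feature expression are our own choices) that initializes Qlib and fetches features with its expression engine; the notebook above continues with model training, backtesting, and analysis:

```python
import qlib
from qlib.data import D
from qlib.config import REG_CN

# point Qlib at the locally prepared data (same path as in Data Preparation)
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region=REG_CN)

# fetch raw features for the CSI300 universe
df = D.features(
    D.instruments("csi300"),
    ["$close", "Ref($close, 1)/$close - 1"],  # close price, and yesterday's close over today's minus one
    start_time="2017-01-01",
    end_time="2019-12-31",
)
print(df.head())
```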


# Quant Model Zoo

Here is a list of models built on `Qlib`.
- [GBDT based on LightGBM](qlib/contrib/model/gbdt.py)
- [MLP based on PyTorch](qlib/contrib/model/pytorch_nn.py)

PRs contributing new Quant models are highly welcome.

# Quant Dataset Zoo
Datasets play a very important role in Quant research. Here is a list of the datasets built on `Qlib`.
- [Alpha360](./qlib/contrib/estimator/handler.py)
- [QLibDataHandlerClose](./qlib/contrib/estimator/handler.py)

PRs contributing new Quant datasets are highly welcome.

# More About Qlib
The detailed documents are organized in [docs](docs/).
Binary file modified docs/_static/img/analysis/analysis_model_IC.png
Binary file modified docs/_static/img/analysis/analysis_model_NDQ.png
Binary file modified docs/_static/img/analysis/analysis_model_auto_correlation.png
Binary file modified docs/_static/img/analysis/analysis_model_cumulative_return.png
Binary file modified docs/_static/img/analysis/analysis_model_long_short.png
Binary file modified docs/_static/img/analysis/analysis_model_monthly_IC.png
Binary file modified docs/_static/img/analysis/report.png
Binary file modified docs/_static/img/analysis/risk_analysis_annualized_return.png
Binary file modified docs/_static/img/analysis/risk_analysis_bar.png
Binary file modified docs/_static/img/analysis/risk_analysis_information_ratio.png
Binary file modified docs/_static/img/analysis/risk_analysis_max_drawdown.png
Binary file modified docs/_static/img/analysis/risk_analysis_std.png
Binary file modified docs/_static/img/analysis/score_ic.png
3 changes: 2 additions & 1 deletion docs/component/data.rst
@@ -65,7 +65,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli

.. note::

The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` includes open,close,high,low,volume,factor.
The arguments of `--include_fields` should correspond to the column names of the CSV files. The column names of the dataset provided by ``Qlib`` should include at least open, close, high, low, volume, and factor.

- `open`
The opening price
@@ -80,6 +80,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
- `factor`
The restoration factor

By the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` are set to `NaN` if the stock is suspended.
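
A minimal sketch (fields are illustrative) of dropping suspended days after fetching features, per the convention above:

.. code-block:: python

    import qlib
    from qlib.data import D

    qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")  # assumes the cn_data prepared earlier
    df = D.features(D.instruments("csi300"), ["$close", "$volume"])
    df = df.dropna(subset=["$close"])  # suspended days carry NaN in every field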

China-Stock Mode & US-Stock Mode
--------------------------------
2 changes: 1 addition & 1 deletion qlib/config.py
@@ -120,7 +120,7 @@
_default_region_config = {
REG_CN: {
"trade_unit": 100,
"limit_threshold": 0.1,
"limit_threshold": 0.099,
"deal_price": "vwap",
},
REG_US: {
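One likely motivation for `0.099` rather than `0.1`: A-share prices are quoted to 0.01 CNY, so a stock sitting exactly at its +10% limit usually shows a computed change slightly below 0.1. A small illustrative sketch (values hypothetical):

```python
prev_close = 9.13
limit_up_price = round(prev_close * 1.1, 2)  # the exchange rounds 10.043 to 10.04
change = limit_up_price / prev_close - 1     # ~0.09967, strictly below 0.1
assert 0.099 <= change < 0.1                 # a 0.099 threshold still flags the stock as limited
```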
2 changes: 1 addition & 1 deletion qlib/contrib/backtest/exchange.py
@@ -149,7 +149,7 @@ def set_quote(self, codes, start_date, end_date):
self.quote = quote_df.to_dict("index")

def _update_limit(self, buy_limit, sell_limit):
self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit)
self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit, inclusive=False)

def check_stock_limit(self, stock_id, trade_date):
"""Parameter
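With `inclusive=False`, the interval bounds themselves fall outside, so a change of exactly the limit is now marked as limited. A minimal sketch of the semantics (boolean `inclusive` as in the pandas of this era; newer pandas spells it `inclusive="neither"`):

```python
import pandas as pd

change = pd.Series([-0.11, -0.10, 0.0, 0.099, 0.10])
limited = ~change.between(-0.10, 0.10, inclusive=False)  # exclusive bounds
print(limited.tolist())  # [True, True, False, False, True]
```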
3 changes: 3 additions & 0 deletions scripts/data_collector/utils.py
@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import re
import requests

12 changes: 11 additions & 1 deletion scripts/data_collector/yahoo/README.md
@@ -1,5 +1,15 @@
# Collect Data From Yahoo Finance

> *Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format).*


> **Examples of abnormal data**

- [SH000661](https://finance.yahoo.com/quote/000661.SZ/history?period1=1558310400&period2=1590796800&interval=1d&filter=history&frequency=1d)
- [SZ300144](https://finance.yahoo.com/quote/300144.SZ/history?period1=1557446400&period2=1589932800&interval=1d&filter=history&frequency=1d)

We have taken **STOCK PRICE ADJUSTMENT** into account, but some price series still look very abnormal.

## Requirements

```bash
@@ -35,4 +45,4 @@ python collector.py manual_adj_data --normalize_dir ~/.qlib/stock_data/normalize

```bash
python collector.py dump_data --normalize_dir ~/.qlib/stock_data/normalize_dir --qlib_dir ~/.qlib/stock_data/qlib_data
```
```
47 changes: 44 additions & 3 deletions scripts/data_collector/yahoo/collector.py
@@ -2,6 +2,7 @@
# Licensed under the MIT License.

import sys
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

@@ -19,24 +20,30 @@
from data_collector.utils import get_hs_calendar_list as get_calendar_list, get_hs_stock_symbols

CSI300_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.000300&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101"
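# minimum number of trading days (about one quarter of a year); a shorter history marks the symbol as suspect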
MIN_NUMBERS_TRADING = 252 / 4


class YahooCollector:
def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=True, max_collector_count=3):
def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=False, max_collector_count=5, delay=0):

self.save_dir = Path(save_dir).expanduser().resolve()
self.save_dir.mkdir(parents=True, exist_ok=True)
self._delay = delay
self._stock_list = None
self.max_workers = max_workers
self._asynchronous = asynchronous
self._max_collector_count = max_collector_count
self._mini_symbol_map = {}

@property
def stock_list(self):
if self._stock_list is None:
self._stock_list = get_hs_stock_symbols()
return self._stock_list

def _sleep(self):
time.sleep(self._delay)

def save_stock(self, symbol, df: pd.DataFrame):
"""save stock data to file

@@ -56,19 +63,30 @@ def save_stock(self, symbol, df: pd.DataFrame):
df["symbol"] = symbol
df.to_csv(stock_path, index=False)

def _temp_save_small_data(self, symbol, df):
if len(df) <= MIN_NUMBERS_TRADING:
logger.warning(f"the number of trading days of {symbol} is less than {MIN_NUMBERS_TRADING}!")
_temp = self._mini_symbol_map.setdefault(symbol, [])
_temp.append(df.copy())
else:
if symbol in self._mini_symbol_map:
self._mini_symbol_map.pop(symbol)

def _collector(self, stock_list):

error_symbol = []
with ThreadPoolExecutor(max_workers=self.max_workers) as worker:
futures = {}
p_bar = tqdm(total=len(stock_list))
for symbols in [stock_list[i : i + self.max_workers] for i in range(0, len(stock_list), self.max_workers)]:
self._sleep()
resp = Ticker(symbols, asynchronous=self._asynchronous, max_workers=self.max_workers).history(
period="max"
)
if isinstance(resp, dict):
for symbol, df in resp.items():
if isinstance(df, pd.DataFrame):
self._temp_save_small_data(symbol, df)
futures[
worker.submit(
self.save_stock, symbol, df.reset_index().rename(columns={"index": "date"})
@@ -78,6 +96,7 @@ def _collector(self, stock_list):
error_symbol.append(symbol)
else:
for symbol, df in resp.reset_index().groupby("symbol"):
self._temp_save_small_data(symbol, df)
futures[worker.submit(self.save_stock, symbol, df)] = symbol
p_bar.update(self.max_workers)
p_bar.close()
@@ -93,6 +112,7 @@ def _collector(self, stock_list):
print(error_symbol)
logger.info(f"error symbol nums: {len(error_symbol)}")
logger.info(f"current get symbol nums: {len(stock_list)}")
error_symbol.extend(self._mini_symbol_map.keys())
return error_symbol

def collector_data(self):
@@ -107,7 +127,14 @@ def collector_data(self):
logger.info(f"getting data: {i+1}")
stock_list = self._collector(stock_list)
logger.info(f"{i+1} finish.")
for _symbol, _df_list in self._mini_symbol_map.items():
self.save_stock(_symbol, max(_df_list, key=len))

logger.warning(f"less than {MIN_NUMBERS_TRADING} stock list: {list(self._mini_symbol_map.keys())}")

self.download_csi300_data()

def download_csi300_data(self):
# TODO: from MSN
logger.info(f"get bench data: csi300(SH000300)......")
df = pd.DataFrame(map(lambda x: x.split(","), requests.get(CSI300_BENCH_URL).json()["data"]["klines"]))
@@ -164,6 +191,7 @@ def _normalize(file_path: Path):
df = pd.read_csv(file_path)
df.set_index("date", inplace=True)
df.index = pd.to_datetime(df.index)
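# drop duplicated dates in the source data, keeping the first occurrence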
df = df[~df.index.duplicated(keep="first")]

# using China stock market data calendar
df = df.reindex(pd.Index(get_calendar_list()))
@@ -232,15 +260,28 @@ def dump_data(self):
include_fields="close,open,high,low,volume,change,factor"
)

def download_data(self):
def download_data(self, asynchronous=False, max_collector_count=5, delay=0):
"""download data from Internet

Examples
---------
$ python collector.py download_data --source_dir ~/.qlib/stock_data/source

"""
YahooCollector(self.source_dir, max_workers=self.max_workers).collector_data()
YahooCollector(
self.source_dir,
max_workers=self.max_workers,
asynchronous=asynchronous,
max_collector_count=max_collector_count,
delay=delay,
).collector_data()

def download_csi300_data(self):
YahooCollector(self.source_dir).download_csi300_data()

def download_bench_data(self):
    """download bench stock data(SH000300)"""
    # body assumed for completeness: SH000300 is the CSI300 benchmark,
    # so delegate to download_csi300_data above
    self.download_csi300_data()

def collector_data(self):
"""download -> normalize -> dump data
4 changes: 2 additions & 2 deletions scripts/get_data.py
@@ -53,15 +53,15 @@ def _unzip(file_path: Path, target_dir: Path):
for _file in tqdm(zp.namelist()):
zp.extract(_file, str(target_dir.resolve()))

def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="v1"):
def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="latest"):
"""download cn qlib data from remote

Parameters
----------
target_dir: str
data save directory
version: str
data version, value from [v0, v1], by default v1
data version, value from [v0, v1, ..., latest], by default latest

Examples
---------
2 changes: 2 additions & 0 deletions tests/dataset_tests/README.md
@@ -0,0 +1,2 @@
# About dataset tests
Tests in this folder validate the dataset prepared from Yahoo Finance. Since the test module ends with `unittest.main()`, it can be run directly, e.g. `python tests/dataset_tests/test_dataset.py`.
42 changes: 42 additions & 0 deletions tests/dataset_tests/test_dataset.py
@@ -0,0 +1,42 @@

import qlib
from qlib.data import D
from qlib.config import REG_CN
import unittest
import numpy as np


class TestDataset(unittest.TestCase):
    def setUp(self):
        provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
        qlib.init(provider_uri=provider_uri, region=REG_CN)

    def testCSI300(self):
        close_p = D.features(D.instruments('csi300'), ['$close'])
        size = close_p.groupby('datetime').size()
        cnt = close_p.groupby('datetime').count()['$close']  # non-NaN closes per day
        size_desc = size.describe(percentiles=np.arange(0.1, 0.9, 0.1))
        cnt_desc = cnt.describe(percentiles=np.arange(0.1, 0.9, 0.1))

        print(size_desc)
        print(cnt_desc)

        # `size` and `cnt` are Series, so describe() yields label-indexed scalars
        self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
        self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")

        self.assertLessEqual(cnt_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
        self.assertGreaterEqual(cnt_desc.loc["80%"], 300, "Insufficient number of CSI300 constituent stocks")

    def testClose(self):
        close_p = D.features(D.instruments('csi300'), ['Ref($close, 1)/$close - 1'])
        close_desc = close_p.describe(percentiles=np.arange(0.1, 0.9, 0.1))
        print(close_desc)
        self.assertLessEqual(abs(close_desc.loc["80%"].iloc[0]), 0.1, "Close value is abnormal")
        self.assertLessEqual(abs(close_desc.loc["max"].iloc[0]), 0.2, "Close value is abnormal")
        self.assertGreaterEqual(close_desc.loc["min"].iloc[0], -0.2, "Close value is abnormal")


if __name__ == '__main__':
    unittest.main()