From 35e0fdd1c036ff03d029f3a49dfc0ccfa1bab593 Mon Sep 17 00:00:00 2001 From: Hao Zhao Date: Sat, 1 Jun 2024 08:07:34 +0800 Subject: [PATCH] fix the bug that the HS_SYMBOLS_URL is 404 (#1758) * fix the bug that the HS_SYMBOLS_URL is 404 * fix bug * format with black * fix pylint error * change error code * fix ci error * fix ci error * optimize code * optimize code * add comments --------- Co-authored-by: Linlang --- .github/workflows/test_qlib_from_pip.yml | 3 ++ scripts/data_collector/utils.py | 49 ++++++++++++++++++------ setup.py | 3 ++ 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index 4cc842b223..4b9fa7c34d 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -45,6 +45,9 @@ jobs: - name: Qlib installation test run: | + # 2024-05-30 scs has released a new version: 3.2.4.post2, + # This will cause the CI to fail, so we have limited the version of scs for now. 
+ python -m pip install "scs<=3.2.4" python -m pip install pyqlib - name: Install Lightgbm for MacOS diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 596eae60ef..feec170bb1 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd -from lxml import etree from loguru import logger from yahooquery import Ticker from tqdm import tqdm @@ -190,17 +189,43 @@ def get_hs_stock_symbols() -> list: global _HS_SYMBOLS # pylint: disable=W0603 def _get_symbol(): - _res = set() - for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")): - resp = requests.get(HS_SYMBOLS_URL.format(s_type=_k), timeout=None) - _res |= set( - map( - lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v), # pylint: disable=W0640 - etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"), # pylint: disable=I1101 - ) - ) - time.sleep(3) - return _res + """ + Get the stock pool from a web page and process it into the format required by yahooquery. + Format of data retrieved from the web page: 600519, 000001 + The data format required by yahooquery: 600519.ss, 000001.sz + + Returns + ------- + set: Returns the set of symbol codes. 
+ + Examples: + ------- + {600000.ss, 600001.ss, 600002.ss, 600003.ss, ...} + """ + url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12" + try: + resp = requests.get(url, timeout=None) + resp.raise_for_status() + except requests.exceptions.HTTPError as e: + raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e + + try: + _symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]] + except Exception as e: + logger.warning("An error occurred while extracting data from the response.") + raise + + if len(_symbols) < 3900: + raise ValueError("The complete list of stocks is not available.") + + # Append the exchange suffix to each stock code to conform to the yahooquery standard; otherwise the data will not be fetched. + _symbols = [ + _symbol + ".ss" if _symbol.startswith("6") else _symbol + ".sz" if _symbol.startswith(("0", "3")) else None + for _symbol in _symbols + ] + _symbols = [_symbol for _symbol in _symbols if _symbol is not None] + + return set(_symbols) if _HS_SYMBOLS is None: symbols = set() diff --git a/setup.py b/setup.py index adafefd614..1feabd30c1 100644 --- a/setup.py +++ b/setup.py @@ -166,6 +166,9 @@ def get_version(rel_path: str) -> str: "lxml", "baostock", "yahooquery", + # 2024-05-30 scs has released a new version: 3.2.4.post2. + # This version causes qlib installation to fail, so we've limited the scs version for now. + "scs<=3.2.4", "beautifulsoup4", # In version 0.4.11 of tianshou, the code: # logits, hidden = self.actor(batch.obs, state=state, info=batch.info)