Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change get_data url #1558

Merged
merged 38 commits into from Jun 25, 2023
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 15 additions & 6 deletions .github/workflows/test_qlib_from_source.yml
Expand Up @@ -20,18 +20,28 @@ jobs:

steps:
- name: Test qlib from source
uses: actions/checkout@v2
uses: actions/checkout@v3

# On macOS, requesting Python "3.7" in CI now installs version 3.7.17, and that version causes the "_bz not found error".
# So we pin a more specific Python 3.7 version for macOS.
# refs: https://github.com/actions/setup-python/issues/682
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7')
uses: actions/setup-python@v4
with:
python-version: "3.7.16"

- name: Set up Python ${{ matrix.python-version }}
if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7')
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Update pip to the latest version
# pip released version 23.1 on Apr. 15, 2023, and CI failed to run with it. Please refer to #1495 for detailed logs.
# The pip version has been temporarily fixed to 23.0.1
# The pip version has been temporarily fixed to 23.0
run: |
python -m pip install pip==23.0.1
python -m pip install pip==23.0

- name: Installing pytorch for macos
if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }}
Expand Down Expand Up @@ -129,8 +139,7 @@ jobs:
- name: Test data downloads
run: |
python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
azcopy copy https://qlibpublic.blob.core.windows.net/data/rl /tmp/qlibpublic/data --recursive
mv /tmp/qlibpublic/data tests/.data
python get_data.py download_data --file_name rl_data --target_dir tests/.data/rl

- name: Install Lightgbm for MacOS
if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }}
Expand Down
18 changes: 14 additions & 4 deletions .github/workflows/test_qlib_from_source_slow.yml
Expand Up @@ -20,18 +20,28 @@ jobs:

steps:
- name: Test qlib from source slow
uses: actions/checkout@v2
uses: actions/checkout@v3

# On macOS, requesting Python "3.7" in CI now installs version 3.7.17, and that version causes the "_bz not found error".
# So we pin a more specific Python 3.7 version for macOS.
# refs: https://github.com/actions/setup-python/issues/682
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7')
uses: actions/setup-python@v4
with:
python-version: "3.7.16"

- name: Set up Python ${{ matrix.python-version }}
if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7')
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Set up Python tools
# pip released version 23.1 on Apr. 15, 2023, and CI failed to run with it. Please refer to #1495 for detailed logs.
# The pip version has been temporarily fixed to 23.0.1
# The pip version has been temporarily fixed to 23.0
run: |
python -m pip install pip==23.0.1
python -m pip install pip==23.0
pip install --upgrade cython numpy
pip install -e .[dev]

Expand Down
2 changes: 1 addition & 1 deletion docs/component/data.rst
Expand Up @@ -119,7 +119,7 @@ Here are some example:
for daily data:
.. code-block:: bash

python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
python scripts/get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data

for 1min data:
.. code-block:: bash
Expand Down
93 changes: 55 additions & 38 deletions qlib/tests/data.py
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import os
import re
import sys
import qlib
Expand All @@ -11,13 +12,15 @@
from tqdm import tqdm
from pathlib import Path
from loguru import logger
from cryptography.fernet import Fernet
from qlib.utils import exists_qlib_data


class GetData:
DATASET_VERSION = "v2"
REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data"
QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
# "?" is not included in the token.
TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy"
KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA="

def __init__(self, delete_zip_file=False):
"""
Expand All @@ -34,19 +37,44 @@ def normalize_dataset_version(self, dataset_version: str = None):
dataset_version = self.DATASET_VERSION
return dataset_version

def merge_remote_url(self, file_name: str, dataset_version: str = None):
return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}"
def merge_remote_url(self, file_name: str):
fernet = Fernet(self.KEY)
token = fernet.decrypt(self.TOKEN).decode()
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
return f"{self.REMOTE_URL}/{file_name}?{token}"

def _download_data(
self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
):
def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True):
"""
Download the specified file to the target folder.

Parameters
----------
target_dir: str
data save directory
file_name: str
dataset name, needs to end with .zip, value from [rl_data.zip, csv_data_cn.zip, ...]
may contain folder names, for example: v2/qlib_data_simple_cn_1d_latest.zip
delete_old: bool
delete an existing directory, by default True

Examples
---------
# get rl data
python get_data.py download_data --file_name rl_data.zip --target_dir ~/.qlib/qlib_data/rl_data
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/rl_data.zip?{token}

# get cn csv data
python get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/csv_data_cn.zip?{token}
-------

"""
target_dir = Path(target_dir).expanduser()
target_dir.mkdir(exist_ok=True, parents=True)
# saved file name
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + os.path.basename(file_name)
target_path = target_dir.joinpath(_target_file_name)

url = self.merge_remote_url(file_name, dataset_version)
url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
resp.raise_for_status()
if resp.status_code != 200:
Expand All @@ -56,7 +84,7 @@ def _download_data(
logger.warning(
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
)
logger.info(f"{file_name} downloading......")
logger.info(f"{os.path.basename(file_name)} downloading......")
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
with target_path.open("wb") as fp:
for chunk in resp.iter_content(chunk_size=chunk_size):
Expand All @@ -67,8 +95,8 @@ def _download_data(
if self.delete_zip_file:
target_path.unlink()

def check_dataset(self, file_name: str, dataset_version: str = None):
url = self.merge_remote_url(file_name, dataset_version)
def check_dataset(self, file_name: str):
url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
status = True
if resp.status_code == 404:
Expand Down Expand Up @@ -140,9 +168,11 @@ def qlib_data(
---------
# get 1d data
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1d_latest.zip?{token}

# get 1min data
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1min_latest.zip?{token}
-------

"""
Expand All @@ -155,29 +185,16 @@ def qlib_data(

qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))

def _get_file_name(v):
return self.QLIB_DATA_NAME.format(
dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
)

file_name = _get_file_name(qlib_version)
if not self.check_dataset(file_name, version):
file_name = _get_file_name("latest")
self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)

def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
"""download cn csv data from remote

Parameters
----------
target_dir: str
data save directory

Examples
---------
python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
-------

"""
file_name = "csv_data_cn.zip"
self._download_data(file_name, target_dir)
def _get_file_name_with_version(qlib_version, dataset_version):
if dataset_version is None:
file_name_with_version = f"v2/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip"
else:
file_name_with_version = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

file_name_with_version = (
f"{'v2' if dataset_version is None else dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip"
)

f"{dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip"
)
return file_name_with_version

file_name = _get_file_name_with_version(qlib_version, dataset_version=version)
if not self.check_dataset(file_name):
file_name = _get_file_name_with_version("latest", dataset_version=version)
self.download_data(file_name.lower(), target_dir, delete_old)
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -80,6 +80,7 @@ def get_version(rel_path: str) -> str:
"gym",
# Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail.
"protobuf<=3.20.1;python_version<='3.8'",
"cryptography",
]

# Numpy include
Expand Down
2 changes: 1 addition & 1 deletion tests/test_dump_data.py
Expand Up @@ -35,7 +35,7 @@ class TestDumpData(unittest.TestCase):

@classmethod
def setUpClass(cls) -> None:
GetData().csv_data_cn(SOURCE_DIR)
GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR)
TestDumpData.DUMP_DATA = DumpDataAll(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS)
TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
provider_uri = str(QLIB_DIR.resolve())
Expand Down
2 changes: 1 addition & 1 deletion tests/test_get_data.py
Expand Up @@ -42,7 +42,7 @@ def test_0_qlib_data(self):
self.assertFalse(df.dropna().empty, "get qlib data failed")

def test_1_csv_data(self):
GetData().csv_data_cn(SOURCE_DIR)
GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR)
stock_name = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
self.assertEqual(len(stock_name), 85, "get csv data failed")

Expand Down