Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change get_data url #1558

Merged
merged 38 commits into from
Jun 25, 2023
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions .github/workflows/test_qlib_from_source.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,28 @@ jobs:

steps:
- name: Test qlib from source
uses: actions/checkout@v2
uses: actions/checkout@v3

# The Python 3.7 build that CI installs on macOS is 3.7.17, and this version causes the "_bz not found error".
# So we pin Python 3.7 on macOS to a more specific version number.
# refs: https://github.com/actions/setup-python/issues/682
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7')
uses: actions/setup-python@v4
with:
python-version: "3.7.16"

- name: Set up Python ${{ matrix.python-version }}
if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7')
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Update pip to the latest version
# pip released version 23.1 on Apr. 15, 2023, and CI then failed to run. Please refer to #1495 for detailed logs.
# The pip version has been temporarily fixed to 23.0.1
# The pip version has been temporarily fixed to 23.0
run: |
python -m pip install pip==23.0.1
python -m pip install pip==23.0

- name: Installing pytorch for macos
if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }}
Expand Down Expand Up @@ -129,8 +139,7 @@ jobs:
- name: Test data downloads
run: |
python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
azcopy copy https://qlibpublic.blob.core.windows.net/data/rl /tmp/qlibpublic/data --recursive
mv /tmp/qlibpublic/data tests/.data
python get_data.py download_data --file_name rl_data --target_dir tests/.data/rl

- name: Install Lightgbm for MacOS
if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }}
Expand Down
18 changes: 14 additions & 4 deletions .github/workflows/test_qlib_from_source_slow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,28 @@ jobs:

steps:
- name: Test qlib from source slow
uses: actions/checkout@v2
uses: actions/checkout@v3

# The Python 3.7 build that CI installs on macOS is 3.7.17, and this version causes the "_bz not found error".
# So we pin Python 3.7 on macOS to a more specific version number.
# refs: https://github.com/actions/setup-python/issues/682
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7')
uses: actions/setup-python@v4
with:
python-version: "3.7.16"

- name: Set up Python ${{ matrix.python-version }}
if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7')
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Set up Python tools
# pip released version 23.1 on Apr. 15, 2023, and CI then failed to run. Please refer to #1495 for detailed logs.
# The pip version has been temporarily fixed to 23.0.1
# The pip version has been temporarily fixed to 23.0
run: |
python -m pip install pip==23.0.1
python -m pip install pip==23.0
pip install --upgrade cython numpy
pip install -e .[dev]

Expand Down
76 changes: 46 additions & 30 deletions qlib/tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@
from tqdm import tqdm
from pathlib import Path
from loguru import logger
from cryptography.fernet import Fernet
from qlib.utils import exists_qlib_data


class GetData:
DATASET_VERSION = "v2"
REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data"
TOKEN = "gAAAAABkl9cedDk0lDdzxZgl70vAhIu3obr3wPqZiIXibQSYA5yCTr8FN1gZ_8XSthRAYucwEkq76ahFg10F_NFrCSroeGNczB1kouajJiEvlGlO389pZRXV4GDmDe3pbETXzEipSbXNGyw3oYF3t2TIulxLkTio7xI6-980EhIdy56oU_cuHqhSnaOlXfJwM0kGnTHNiTeWbLDlq1GWtbXuY5ZACvkVDBcmkO36CMv5qIgQ_iLmk3ZsFq96CYyJCbsDCgCKIX7R"
KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA="
QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, delete_zip_file=False):
Expand All @@ -34,19 +37,47 @@ def normalize_dataset_version(self, dataset_version: str = None):
dataset_version = self.DATASET_VERSION
return dataset_version

def merge_remote_url(self, file_name: str, dataset_version: str = None):
return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}"
def merge_remote_url(self, file_name: str):
fernet = Fernet(self.KEY)
token = fernet.decrypt(self.TOKEN).decode()
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
return f"{self.REMOTE_URL}/{file_name}{token}"

def _download_data(
self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
def download_data(
self, file_name: str, target_dir: [Path, str], delete_old: bool = True
):
"""
Download the specified file to the target folder.

Parameters
----------
target_dir: str
data save directory
file_name: str
dataset name, value from [rl_data, csv_data_cn, ...]
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
delete_old: bool
delete an existing directory, by default True

Examples
---------
# get rl data
python get_data.py download_data --file_name rl_data --target_dir ~/.qlib/qlib_data/rl_data
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved

# get cn csv data
python get_data.py download_data --file_name csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
-------

"""
target_dir = Path(target_dir).expanduser()
target_dir.mkdir(exist_ok=True, parents=True)
# saved file name
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
if "/" in file_name:
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name.split("/")[1]
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
else:
file_name += ".zip"
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
target_path = target_dir.joinpath(_target_file_name)

url = self.merge_remote_url(file_name, dataset_version)
url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
resp.raise_for_status()
if resp.status_code != 200:
Expand All @@ -68,7 +99,7 @@ def _download_data(
target_path.unlink()

def check_dataset(self, file_name: str, dataset_version: str = None):
url = self.merge_remote_url(file_name, dataset_version)
url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
status = True
if resp.status_code == 404:
Expand Down Expand Up @@ -155,29 +186,14 @@ def qlib_data(

qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))

def _get_file_name(v):
return self.QLIB_DATA_NAME.format(
def _get_file_name_with_version(v):
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
file_name = self.QLIB_DATA_NAME.format(
dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
)
file_name_with_version = f"{self.normalize_dataset_version(version)}/{file_name}"
return file_name_with_version

file_name = _get_file_name(qlib_version)
if not self.check_dataset(file_name, version):
file_name = _get_file_name("latest")
self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)

def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
SunsetWolf marked this conversation as resolved.
Show resolved Hide resolved
"""download cn csv data from remote

Parameters
----------
target_dir: str
data save directory

Examples
---------
python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
-------

"""
file_name = "csv_data_cn.zip"
self._download_data(file_name, target_dir)
file_name = _get_file_name_with_version(qlib_version)
if not self.check_dataset(file_name):
file_name = _get_file_name_with_version("latest")
self.download_data(file_name.lower(), target_dir, delete_old)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def get_version(rel_path: str) -> str:
"gym",
# Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail.
"protobuf<=3.20.1;python_version<='3.8'",
"cryptography",
]

# Numpy include
Expand Down