Dataset refactor #807

Merged
merged 34 commits on Mar 22, 2024
Commits
e34c638
update
wangxingjun778 Feb 22, 2024
91f3481
ok Merge branch 'master' of github.com:modelscope/modelscope into dev…
wangxingjun778 Mar 6, 2024
2f0d702
add main entry in ms_dataset
wangxingjun778 Mar 7, 2024
3c77cca
add main entry in ms_dataset
wangxingjun778 Mar 7, 2024
64e0aa4
update func get_data_patterns import
wangxingjun778 Mar 7, 2024
c10fda5
modify return_config_only
wangxingjun778 Mar 7, 2024
f57a2c9
update
wangxingjun778 Mar 7, 2024
76475c5
update
wangxingjun778 Mar 7, 2024
f90d214
update
wangxingjun778 Mar 7, 2024
32e21da
modify return_config_only to dataset_info_only
wangxingjun778 Mar 7, 2024
732984f
update version for test
wangxingjun778 Mar 7, 2024
b978553
del get_logger(__name__)
wangxingjun778 Mar 7, 2024
a130f0b
fix py script loading
wangxingjun778 Mar 10, 2024
b99985b
fix loading py and without py
wangxingjun778 Mar 11, 2024
d7038a3
add subset support
wangxingjun778 Mar 11, 2024
ecacc30
add hf_datasets_util; refine list_repo_tree_ms; fix private datasets …
wangxingjun778 Mar 12, 2024
948bed6
update version to rc5
wangxingjun778 Mar 12, 2024
7cb839b
fix and support preview for dataset_info_only mode
wangxingjun778 Mar 14, 2024
fc13737
fix urlencode
wangxingjun778 Mar 15, 2024
53b6382
update to rc7
wangxingjun778 Mar 15, 2024
b5f75ca
1. loading of dataset_infos.json is deprecated; 2. add some ut
wangxingjun778 Mar 18, 2024
1f8cda0
update version
wangxingjun778 Mar 18, 2024
014770a
add escapechar for read_csv and to_csv
wangxingjun778 Mar 18, 2024
c1f7dad
add params: Source=SDK
wangxingjun778 Mar 18, 2024
5b97201
add create_dataset func
wangxingjun778 Mar 18, 2024
ecddc5a
overwrite _get_paths_info
wangxingjun778 Mar 18, 2024
a4f4584
ok Merge branch 'master' of github.com:modelscope/modelscope into dev…
wangxingjun778 Mar 18, 2024
8830a07
update & version
wangxingjun778 Mar 18, 2024
fb3a025
update
wangxingjun778 Mar 18, 2024
02e48b1
update list_repo_tree name
wangxingjun778 Mar 18, 2024
1e9dc7b
add get_module_with_script, fix download imports
wangxingjun778 Mar 20, 2024
6bee42f
fix py script loading issue in dataset_module_factory
wangxingjun778 Mar 21, 2024
4e0216d
fix create dataset
wangxingjun778 Mar 22, 2024
09e1f50
update log info in api
wangxingjun778 Mar 22, 2024
120 changes: 111 additions & 9 deletions modelscope/hub/api.py
@@ -15,7 +15,9 @@
from http.cookiejar import CookieJar
from os.path import expanduser
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode

import json
import pandas as pd
import requests
from requests import Session
@@ -31,7 +33,8 @@
MODELSCOPE_CLOUD_ENVIRONMENT,
MODELSCOPE_CLOUD_USERNAME,
MODELSCOPE_REQUEST_ID, ONE_YEAR_SECONDS,
REQUESTS_API_HTTP_METHOD, Licenses,
REQUESTS_API_HTTP_METHOD,
DatasetVisibility, Licenses,
ModelVisibility)
from modelscope.hub.errors import (InvalidParameter, NotExistError,
NotLoginException, NoValidRevisionError,
@@ -647,6 +650,44 @@ def get_model_files(self,
files.append(file)
return files

def create_dataset(self,
dataset_name: str,
namespace: str,
chinese_name: Optional[str] = '',
license: Optional[str] = Licenses.APACHE_V2,
visibility: Optional[int] = DatasetVisibility.PUBLIC,
description: Optional[str] = '') -> str:

if dataset_name is None or namespace is None:
raise InvalidParameter('dataset_name and namespace are required!')

cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise ValueError('Token does not exist, please login first.')

path = f'{self.endpoint}/api/v1/datasets'
files = {
'Name': (None, dataset_name),
'ChineseName': (None, chinese_name),
'Owner': (None, namespace),
'License': (None, license),
'Visibility': (None, visibility),
'Description': (None, description)
}

r = self.session.post(
path,
files=files,
cookies=cookies,
headers=self.builder_headers(self.headers),
)

handle_http_post_error(r, path, files)
raise_on_error(r.json())
dataset_repo_url = f'{self.endpoint}/datasets/{namespace}/{dataset_name}'
logger.info(f'Create dataset success: {dataset_repo_url}')
return dataset_repo_url
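
Since the new endpoint authenticates with session cookies, a typical call needs a prior login. A minimal sketch; the token, namespace, and repo name below are placeholders:

```python
from modelscope.hub.api import HubApi
from modelscope.hub.constants import DatasetVisibility, Licenses

api = HubApi()
api.login('YOUR_SDK_TOKEN')  # caches the session cookies create_dataset needs

repo_url = api.create_dataset(
    dataset_name='my_new_dataset',   # hypothetical repo name
    namespace='my_namespace',        # hypothetical owner
    license=Licenses.APACHE_V2,
    visibility=DatasetVisibility.PRIVATE,
    description='Created via the SDK')
print(repo_url)  # -> {endpoint}/datasets/my_namespace/my_new_dataset
```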

def list_datasets(self):
path = f'{self.endpoint}/api/v1/datasets'
params = {}
@@ -667,6 +708,47 @@ def get_dataset_id_and_type(self, dataset_name: str, namespace: str):
dataset_type = resp['Data']['Type']
return dataset_id, dataset_type

def get_dataset_infos(self,
dataset_hub_id: str,
revision: str,
files_metadata: bool = False,
timeout: float = 100,
recursive: str = 'True'):
"""
Get dataset infos.
"""
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree'
params = {'Revision': revision, 'Root': None, 'Recursive': recursive}
cookies = ModelScopeConfig.get_cookies()
if files_metadata:
params['blobs'] = True
r = self.session.get(datahub_url, params=params, cookies=cookies, timeout=timeout)
resp = r.json()
datahub_raise_on_error(datahub_url, resp, r)

return resp
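
A sketch of calling it directly; the repo id comes from get_dataset_id_and_type, and the names are placeholders:

```python
from modelscope.hub.api import HubApi

api = HubApi()
dataset_hub_id, _ = api.get_dataset_id_and_type(
    dataset_name='my_dataset', namespace='my_namespace')
resp = api.get_dataset_infos(dataset_hub_id=dataset_hub_id, revision='master')
```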

def list_repo_tree(self,
dataset_name: str,
namespace: str,
revision: str,
root_path: str,
recursive: bool = True):

dataset_hub_id, dataset_type = self.get_dataset_id_and_type(
dataset_name=dataset_name, namespace=namespace)

recursive = 'True' if recursive else 'False'
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree'
params = {'Revision': revision, 'Root': root_path, 'Recursive': recursive}
cookies = ModelScopeConfig.get_cookies()

r = self.session.get(datahub_url, params=params, cookies=cookies)
resp = r.json()
datahub_raise_on_error(datahub_url, resp, r)

return resp
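
list_repo_tree wraps the same repo/tree endpoint but resolves the hub id from the dataset name itself. A sketch; the names are placeholders, and the 'Data' -> 'Files' shape of the payload is an assumption based on the conventions used elsewhere in this API:

```python
from modelscope.hub.api import HubApi

api = HubApi()
resp = api.list_repo_tree(
    dataset_name='my_dataset',
    namespace='my_namespace',
    revision='master',
    root_path='/',
    recursive=True)

# Assumed payload shape: {'Data': {'Files': [{'Path': ..., 'Type': ...}, ...]}}
for entry in resp.get('Data', {}).get('Files', []):
    print(entry.get('Path'), entry.get('Type'))
```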

def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_id: str, revision: str):
""" Get the meta file-list of the dataset. """
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
@@ -735,7 +817,6 @@ def fetch_meta_files_from_url(url, out_path, chunk_size=1024, mode=DownloadMode.
Fetch the meta-data files from the url, e.g. csv/jsonl files.
"""
import hashlib
import json
from tqdm import tqdm
out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest())
if mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(out_path):
@@ -774,7 +855,7 @@ def get_chunk(resp):
else:
with_header = False
chunk_df = pd.DataFrame(chunk)
chunk_df.to_csv(f, index=False, header=with_header)
chunk_df.to_csv(f, index=False, header=with_header, escapechar='\\')
iter_num += 1
else:
# csv or others
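
The escapechar matters when field values contain delimiters or quote characters; the same setting is applied to read_csv in modelscope/msdatasets/utils/dataset_utils.py below, keeping meta files round-trippable. A self-contained sketch of that round trip:

```python
import io

import pandas as pd

df = pd.DataFrame({'text': ['a "quoted", comma-laden value']})

buf = io.StringIO()
df.to_csv(buf, index=False, escapechar='\\')
buf.seek(0)

# Reading back with the same escapechar preserves the original value.
assert pd.read_csv(buf, escapechar='\\')['text'][0] == df['text'][0]
```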
@@ -789,11 +870,28 @@ def get_dataset_file_url(
file_name: str,
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
if file_name and os.path.splitext(file_name)[-1] in META_FILES_FORMAT:
file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
return file_name
revision: Optional[str] = DEFAULT_DATASET_REVISION,
extension_filter: Optional[bool] = True):

if not file_name or not dataset_name or not namespace:
raise ValueError('Args (file_name, dataset_name, namespace) cannot be empty!')

# Note: make sure the FilePath is the last parameter in the url
params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': file_name}
params: str = urlencode(params)
file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?{params}'

return file_url

# if extension_filter:
# if os.path.splitext(file_name)[-1] in META_FILES_FORMAT:
# file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'\
# f'Revision={revision}&FilePath={file_name}'
# else:
# file_url = file_name
# return file_url
# else:
# return file_url
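
A sketch of what the rewritten construction yields; the endpoint and names are placeholders. Because dicts preserve insertion order, urlencode emits FilePath as the last query parameter, as the comment above requires, and percent-encodes separators inside the file path:

```python
from urllib.parse import urlencode

params = {'Source': 'SDK', 'Revision': 'master', 'FilePath': 'data/train.csv'}
url = f'https://modelscope.cn/api/v1/datasets/ns/my_dataset/repo?{urlencode(params)}'
print(url)
# https://modelscope.cn/api/v1/datasets/ns/my_dataset/repo?Source=SDK&Revision=master&FilePath=data%2Ftrain.csv
```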

def get_dataset_access_config(
self,
@@ -931,7 +1029,7 @@ def datahub_remote_call(self, url):
datahub_raise_on_error(url, resp, r)
return resp['Data']

def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool) -> None:
def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool = False) -> None:
is_ci_test = os.getenv('CI_TEST') == 'True'
if dataset_name and namespace and not is_ci_test and not use_streaming:
try:
@@ -964,6 +1062,10 @@ def builder_headers(self, headers):
return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex),
**headers}

def get_file_base_path(self, namespace: str, dataset_name: str) -> str:
return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'
# return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='


class ModelScopeConfig:
path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
6 changes: 6 additions & 0 deletions modelscope/hub/constants.py
@@ -47,3 +47,9 @@ class ModelVisibility(object):
PRIVATE = 1
INTERNAL = 3
PUBLIC = 5


class DatasetVisibility(object):
PRIVATE = 1
INTERNAL = 3
PUBLIC = 5
4 changes: 4 additions & 0 deletions modelscope/msdatasets/meta/data_meta_manager.py
@@ -92,6 +92,10 @@ def fetch_meta_files(self) -> None:
data_meta_config.meta_cache_dir = meta_cache_dir
data_meta_config.dataset_scripts = dataset_scripts
data_meta_config.dataset_formation = dataset_formation
if '.py' in dataset_scripts:
tmp_py_scripts = dataset_scripts['.py']
if len(tmp_py_scripts) > 0:
data_meta_config.dataset_py_script = tmp_py_scripts[0]

# Set dataset_context_config
self.dataset_context_config.data_meta_config = data_meta_config
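
The new branch assumes dataset_scripts maps file extensions to lists of paths and picks the first Python file as the dataset's loading script. A sketch of the assumed shape; the paths are hypothetical:

```python
dataset_scripts = {
    '.py': ['my_dataset/my_dataset.py'],
    '.json': ['my_dataset/dataset_infos.json'],
}
# Equivalent to the logic above: take the first .py script, if any.
if dataset_scripts.get('.py'):
    dataset_py_script = dataset_scripts['.py'][0]
```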
77 changes: 62 additions & 15 deletions modelscope/msdatasets/ms_dataset.py
@@ -13,7 +13,6 @@
from modelscope.hub.repository import DatasetRepository
from modelscope.msdatasets.context.dataset_context_config import \
DatasetContextConfig
from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader
from modelscope.msdatasets.data_loader.data_loader_manager import (
LocalDataLoaderManager, LocalDataLoaderType, RemoteDataLoaderManager,
RemoteDataLoaderType)
@@ -22,14 +21,16 @@
from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \
build_custom_dataset
from modelscope.msdatasets.utils.delete_utils import DatasetDeleteManager
from modelscope.msdatasets.utils.hf_datasets_util import \
load_dataset as hf_load_dataset_wrapper
from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager
from modelscope.preprocessors import build_preprocessor
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.config_ds import MS_DATASETS_CACHE
from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
DEFAULT_DATASET_REVISION, ConfigFields,
DownloadMode, Hubs, ModeKeys, Tasks,
UploadMode, VirgoDatasetConfig)
DatasetFormations, DownloadMode, Hubs,
ModeKeys, Tasks, UploadMode)
from modelscope.utils.import_utils import is_tf_available, is_torch_available
from modelscope.utils.logger import get_logger

@@ -167,6 +168,7 @@ def load(
stream_batch_size: Optional[int] = 1,
custom_cfg: Optional[Config] = Config(),
token: Optional[str] = None,
dataset_info_only: Optional[bool] = False,
**config_kwargs,
) -> Union[dict, 'MsDataset', NativeIterableDataset]:
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
@@ -196,6 +198,7 @@
custom_cfg (str, Optional): Model configuration, this can be used for custom datasets.
see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
token (str, Optional): SDK token of ModelScope.
dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
**config_kwargs (additional keyword arguments): Keyword arguments to be passed

Returns:
@@ -279,19 +282,51 @@
return dataset_inst
# Load from the modelscope hub
elif hub == Hubs.modelscope:
remote_dataloader_manager = RemoteDataLoaderManager(
dataset_context_config)
dataset_inst = remote_dataloader_manager.load_dataset(
RemoteDataLoaderType.MS_DATA_LOADER)
dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target)
if isinstance(dataset_inst, MsDataset):
dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config
if custom_cfg:
dataset_inst.to_custom_dataset(
custom_cfg=custom_cfg, **config_kwargs)
dataset_inst.is_custom = True
return dataset_inst

# Get dataset type from ModelScope Hub; dataset_type->4: General Dataset
from modelscope.hub.api import HubApi
_api = HubApi()
dataset_id_on_hub, dataset_type = _api.get_dataset_id_and_type(
dataset_name=dataset_name, namespace=namespace)

logger.info(f'dataset_type: {dataset_type}')

# Load from the ModelScope Hub for type=4 (general)
if str(dataset_type) == str(DatasetFormations.general.value):
return hf_load_dataset_wrapper(
path=namespace + '/' + dataset_name,
name=subset_name,
data_dir=data_dir,
data_files=data_files,
split=split,
cache_dir=cache_dir,
features=None,
download_config=None,
download_mode=download_mode.value,
revision=version,
token=token,
streaming=use_streaming,
dataset_info_only=dataset_info_only,
**config_kwargs)
else:

remote_dataloader_manager = RemoteDataLoaderManager(
dataset_context_config)
dataset_inst = remote_dataloader_manager.load_dataset(
RemoteDataLoaderType.MS_DATA_LOADER)
dataset_inst = MsDataset.to_ms_dataset(
dataset_inst, target=target)
if isinstance(dataset_inst, MsDataset):
dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config
if custom_cfg:
dataset_inst.to_custom_dataset(
custom_cfg=custom_cfg, **config_kwargs)
dataset_inst.is_custom = True
return dataset_inst

elif hub == Hubs.virgo:
from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader
from modelscope.utils.constant import VirgoDatasetConfig
# Rewrite the namespace, version and cache_dir for virgo dataset.
if namespace == DEFAULT_DATASET_NAMESPACE:
dataset_context_config.namespace = VirgoDatasetConfig.default_virgo_namespace
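
Taken together, a general-type (type 4) dataset now routes through the datasets-compatible wrapper, and the new flag can fetch just the metadata. A minimal sketch; the dataset name and namespace are hypothetical:

```python
from modelscope.msdatasets import MsDataset

# Full load: for a general-type repo this goes through hf_load_dataset_wrapper.
ds = MsDataset.load('my_dataset', namespace='my_namespace', split='train')

# Metadata only: returns the dataset config/info dict without downloading data.
info = MsDataset.load('my_dataset', namespace='my_namespace',
                      dataset_info_only=True)
```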
@@ -323,6 +358,10 @@ def upload(
chunksize: Optional[int] = 1,
filter_hidden_files: Optional[bool] = True,
upload_mode: Optional[UploadMode] = UploadMode.OVERWRITE) -> None:
r"""
@deprecated
This method is deprecated and may be removed in future releases, please use git command line instead.
"""
"""Upload dataset file or directory to the ModelScope Hub. Please log in to the ModelScope Hub first.

Args:
@@ -346,6 +385,10 @@
None

"""
warnings.warn(
'upload is deprecated, please use git command line to upload the dataset.',
DeprecationWarning)

if not object_name:
raise ValueError('object_name cannot be empty!')

@@ -393,6 +436,10 @@ def clone_meta(dataset_work_dir: str,
None
"""

warnings.warn(
'clone_meta is deprecated, please use the git command line to clone the dataset repo.',
DeprecationWarning)

_repo = DatasetRepository(
repo_work_dir=dataset_work_dir,
dataset_id=dataset_id,
5 changes: 4 additions & 1 deletion modelscope/msdatasets/utils/dataset_utils.py
@@ -212,7 +212,10 @@ def get_dataset_files(subset_split_into: dict,

csv_delimiter = context_config.config_kwargs.get('delimiter', ',')
csv_df = pd.read_csv(
meta_csv_file_path, iterator=False, delimiter=csv_delimiter)
meta_csv_file_path,
iterator=False,
delimiter=csv_delimiter,
escapechar='\\')
target_col = csv_df.columns[csv_df.columns.str.contains(
':FILE')].to_list()
if len(target_col) == 0: