Merge commit '1a66f069c432e43865775affbb23f77ee5c52f53' into release/1.13

* commit '1a66f069c432e43865775affbb23f77ee5c52f53':
  Dataset refactor (#807)
  fix download file with spical name as 'Image+Title.png' (#805)
  Fix error "modelscope attributeerror: 'dict' object has no attribute 'task_name' " (#800)
  move doc to classroom (#802)
  add ViViT-demo (#796)
  chore: Formal LICENSE content (#799)
  upload marigold monocular depth estimation core files (#703)
tastelikefeet committed Mar 22, 2024
2 parents 73a8502 + 1a66f06 commit 9247ff4
Showing 17 changed files with 1,881 additions and 1,203 deletions.
3 changes: 1 addition & 2 deletions LICENSE
@@ -1,4 +1,3 @@
Copyright 2022-2023 Alibaba ModelScope. All rights reserved.

Apache License
Version 2.0, January 2004
@@ -188,7 +187,7 @@ Copyright 2022-2023 Alibaba ModelScope. All rights reserved.
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2020-2022 Alibaba ModelScope.
Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
289 changes: 0 additions & 289 deletions examples/pytorch/DiT_ImageNet_Demo.ipynb

This file was deleted.

3 changes: 3 additions & 0 deletions examples/pytorch/FILE_TRANSFER.md
@@ -0,0 +1,3 @@
# NOTE

`DiT_ImageNet_Demo.ipynb`, `SiT_ImageNet_Demo.ipynb`, `ViViT-demo.ipynb`, `UViT_ImageNet_demo.ipynb` are moved to the [modelscope-classroom repo](https://github.com/modelscope/modelscope-classroom)
316 changes: 0 additions & 316 deletions examples/pytorch/SiT_ImageNet_Demo.ipynb

This file was deleted.

569 changes: 0 additions & 569 deletions examples/pytorch/UViT_ImageNet_demo.ipynb

This file was deleted.

120 changes: 111 additions & 9 deletions modelscope/hub/api.py
@@ -15,7 +15,9 @@
from http.cookiejar import CookieJar
from os.path import expanduser
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode

import json
import pandas as pd
import requests
from requests import Session
@@ -31,7 +33,8 @@
MODELSCOPE_CLOUD_ENVIRONMENT,
MODELSCOPE_CLOUD_USERNAME,
MODELSCOPE_REQUEST_ID, ONE_YEAR_SECONDS,
REQUESTS_API_HTTP_METHOD, Licenses,
REQUESTS_API_HTTP_METHOD,
DatasetVisibility, Licenses,
ModelVisibility)
from modelscope.hub.errors import (InvalidParameter, NotExistError,
NotLoginException, NoValidRevisionError,
@@ -647,6 +650,44 @@ def get_model_files(self,
files.append(file)
return files

def create_dataset(self,
dataset_name: str,
namespace: str,
chinese_name: Optional[str] = '',
license: Optional[str] = Licenses.APACHE_V2,
visibility: Optional[int] = DatasetVisibility.PUBLIC,
description: Optional[str] = '') -> str:

if dataset_name is None or namespace is None:
raise InvalidParameter('dataset_name and namespace are required!')

cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise ValueError('Token does not exist, please login first.')

path = f'{self.endpoint}/api/v1/datasets'
files = {
'Name': (None, dataset_name),
'ChineseName': (None, chinese_name),
'Owner': (None, namespace),
'License': (None, license),
'Visibility': (None, visibility),
'Description': (None, description)
}

r = self.session.post(
path,
files=files,
cookies=cookies,
headers=self.builder_headers(self.headers),
)

handle_http_post_error(r, path, files)
raise_on_error(r.json())
dataset_repo_url = f'{self.endpoint}/datasets/{namespace}/{dataset_name}'
logger.info(f'Create dataset success: {dataset_repo_url}')
return dataset_repo_url
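
As a quick orientation for reviewers, here is a minimal usage sketch of the create_dataset method added above; the token, dataset name, and namespace are placeholders, and it assumes a valid ModelScope login so that the required cookies are cached:

```python
from modelscope.hub.api import HubApi
from modelscope.hub.constants import DatasetVisibility, Licenses

api = HubApi()
api.login('YOUR_SDK_TOKEN')  # hypothetical token; caches the cookies create_dataset needs

# All identifiers below are made up for illustration.
repo_url = api.create_dataset(
    dataset_name='demo_dataset',
    namespace='my_namespace',
    license=Licenses.APACHE_V2,
    visibility=DatasetVisibility.PRIVATE,
    description='Dataset created through the new HubApi.create_dataset call')
print(repo_url)  # <endpoint>/datasets/my_namespace/demo_dataset
```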

def list_datasets(self):
path = f'{self.endpoint}/api/v1/datasets'
params = {}
@@ -667,6 +708,47 @@ def get_dataset_id_and_type(self, dataset_name: str, namespace: str):
dataset_type = resp['Data']['Type']
return dataset_id, dataset_type

def get_dataset_infos(self,
dataset_hub_id: str,
revision: str,
files_metadata: bool = False,
timeout: float = 100,
recursive: str = 'True'):
"""
Get dataset infos.
"""
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree'
params = {'Revision': revision, 'Root': None, 'Recursive': recursive}
cookies = ModelScopeConfig.get_cookies()
if files_metadata:
params['blobs'] = True
r = self.session.get(datahub_url, params=params, cookies=cookies, timeout=timeout)
resp = r.json()
datahub_raise_on_error(datahub_url, resp, r)

return resp

def list_repo_tree(self,
dataset_name: str,
namespace: str,
revision: str,
root_path: str,
recursive: bool = True):

dataset_hub_id, dataset_type = self.get_dataset_id_and_type(
dataset_name=dataset_name, namespace=namespace)

recursive = 'True' if recursive else 'False'
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree'
params = {'Revision': revision, 'Root': root_path, 'Recursive': recursive}
cookies = ModelScopeConfig.get_cookies()

r = self.session.get(datahub_url, params=params, cookies=cookies)
resp = r.json()
datahub_raise_on_error(datahub_url, resp, r)

return resp
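
A hedged sketch of calling the new list_repo_tree helper; the dataset identifiers and revision are placeholders, and the response layout (Data/Files) is assumed to match the other datahub endpoints used in this module:

```python
from modelscope.hub.api import HubApi

api = HubApi()
# Hypothetical dataset; list everything under the repository root.
resp = api.list_repo_tree(
    dataset_name='demo_dataset',
    namespace='my_namespace',
    revision='master',
    root_path='/',
    recursive=True)
for entry in resp.get('Data', {}).get('Files', []):
    print(entry.get('Type'), entry.get('Path'))
```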

def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_id: str, revision: str):
""" Get the meta file-list of the dataset. """
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
@@ -735,7 +817,6 @@ def fetch_meta_files_from_url(url, out_path, chunk_size=1024, mode=DownloadMode.
Fetch the meta-data files from the url, e.g. csv/jsonl files.
"""
import hashlib
import json
from tqdm import tqdm
out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest())
if mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(out_path):
@@ -774,7 +855,7 @@ def get_chunk(resp):
else:
with_header = False
chunk_df = pd.DataFrame(chunk)
chunk_df.to_csv(f, index=False, header=with_header)
chunk_df.to_csv(f, index=False, header=with_header, escapechar='\\')
iter_num += 1
else:
# csv or others
@@ -789,11 +870,28 @@ def get_dataset_file_url(
file_name: str,
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
if file_name and os.path.splitext(file_name)[-1] in META_FILES_FORMAT:
file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
return file_name
revision: Optional[str] = DEFAULT_DATASET_REVISION,
extension_filter: Optional[bool] = True):

if not file_name or not dataset_name or not namespace:
raise ValueError('Args (file_name, dataset_name, namespace) cannot be empty!')

# Note: make sure the FilePath is the last parameter in the url
params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': file_name}
params: str = urlencode(params)
file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?{params}'

return file_url

# if extension_filter:
# if os.path.splitext(file_name)[-1] in META_FILES_FORMAT:
# file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'\
# f'Revision={revision}&FilePath={file_name}'
# else:
# file_url = file_name
# return file_url
# else:
# return file_url
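
For reference, a small illustration of what the urlencode-based construction above yields: Python dicts preserve insertion order, so FilePath remains the final query parameter as the note requires, and characters such as '/' and '+' are percent-encoded (the file path and revision below are placeholders):

```python
from urllib.parse import urlencode

params = {'Source': 'SDK', 'Revision': 'master', 'FilePath': 'train/Image+Title.png'}
print(urlencode(params))
# Source=SDK&Revision=master&FilePath=train%2FImage%2BTitle.png
```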

def get_dataset_access_config(
self,
@@ -931,7 +1029,7 @@ def datahub_remote_call(self, url):
datahub_raise_on_error(url, resp, r)
return resp['Data']

def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool) -> None:
def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool = False) -> None:
is_ci_test = os.getenv('CI_TEST') == 'True'
if dataset_name and namespace and not is_ci_test and not use_streaming:
try:
@@ -964,6 +1062,10 @@ def builder_headers(self, headers):
return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex),
**headers}

def get_file_base_path(self, namespace: str, dataset_name: str) -> str:
return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'
# return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='


class ModelScopeConfig:
path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
6 changes: 6 additions & 0 deletions modelscope/hub/constants.py
@@ -47,3 +47,9 @@ class ModelVisibility(object):
PRIVATE = 1
INTERNAL = 3
PUBLIC = 5


class DatasetVisibility(object):
PRIVATE = 1
INTERNAL = 3
PUBLIC = 5
4 changes: 3 additions & 1 deletion modelscope/hub/file_download.py
@@ -3,7 +3,7 @@
import copy
import os
import tempfile
import threading
import urllib
import uuid
from concurrent.futures import ThreadPoolExecutor
from functools import partial
@@ -179,6 +179,8 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
Returns:
str: The file url.
"""
file_path = urllib.parse.quote_plus(file_path)
revision = urllib.parse.quote_plus(revision)
download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
return download_url_template.format(
endpoint=get_endpoint(),
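
A brief, hedged illustration of the quote_plus calls added above, using the 'Image+Title.png' case mentioned in the commit list (paths and revision are placeholders):

```python
import urllib.parse

file_path = urllib.parse.quote_plus('images/Image+Title.png')
revision = urllib.parse.quote_plus('v1.0')
print(file_path)  # images%2FImage%2BTitle.png -- '+' is no longer misread as a space by the server
print(revision)   # v1.0
```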
4 changes: 4 additions & 0 deletions modelscope/msdatasets/meta/data_meta_manager.py
Expand Up @@ -92,6 +92,10 @@ def fetch_meta_files(self) -> None:
data_meta_config.meta_cache_dir = meta_cache_dir
data_meta_config.dataset_scripts = dataset_scripts
data_meta_config.dataset_formation = dataset_formation
if '.py' in dataset_scripts:
tmp_py_scripts = dataset_scripts['.py']
if len(tmp_py_scripts) > 0:
data_meta_config.dataset_py_script = tmp_py_scripts[0]

# Set dataset_context_config
self.dataset_context_config.data_meta_config = data_meta_config
77 changes: 62 additions & 15 deletions modelscope/msdatasets/ms_dataset.py
@@ -13,7 +13,6 @@
from modelscope.hub.repository import DatasetRepository
from modelscope.msdatasets.context.dataset_context_config import \
DatasetContextConfig
from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader
from modelscope.msdatasets.data_loader.data_loader_manager import (
LocalDataLoaderManager, LocalDataLoaderType, RemoteDataLoaderManager,
RemoteDataLoaderType)
@@ -22,14 +21,16 @@
from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \
build_custom_dataset
from modelscope.msdatasets.utils.delete_utils import DatasetDeleteManager
from modelscope.msdatasets.utils.hf_datasets_util import \
load_dataset as hf_load_dataset_wrapper
from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager
from modelscope.preprocessors import build_preprocessor
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.config_ds import MS_DATASETS_CACHE
from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
DEFAULT_DATASET_REVISION, ConfigFields,
DownloadMode, Hubs, ModeKeys, Tasks,
UploadMode, VirgoDatasetConfig)
DatasetFormations, DownloadMode, Hubs,
ModeKeys, Tasks, UploadMode)
from modelscope.utils.import_utils import is_tf_available, is_torch_available
from modelscope.utils.logger import get_logger

@@ -167,6 +168,7 @@ def load(
stream_batch_size: Optional[int] = 1,
custom_cfg: Optional[Config] = Config(),
token: Optional[str] = None,
dataset_info_only: Optional[bool] = False,
**config_kwargs,
) -> Union[dict, 'MsDataset', NativeIterableDataset]:
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
@@ -196,6 +198,7 @@
custom_cfg (str, Optional): Model configuration, this can be used for custom datasets.
see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
token (str, Optional): SDK token of ModelScope.
dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
**config_kwargs (additional keyword arguments): Keyword arguments to be passed
Returns:
@@ -279,19 +282,51 @@
return dataset_inst
# Load from the modelscope hub
elif hub == Hubs.modelscope:
remote_dataloader_manager = RemoteDataLoaderManager(
dataset_context_config)
dataset_inst = remote_dataloader_manager.load_dataset(
RemoteDataLoaderType.MS_DATA_LOADER)
dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target)
if isinstance(dataset_inst, MsDataset):
dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config
if custom_cfg:
dataset_inst.to_custom_dataset(
custom_cfg=custom_cfg, **config_kwargs)
dataset_inst.is_custom = True
return dataset_inst

# Get dataset type from ModelScope Hub; dataset_type->4: General Dataset
from modelscope.hub.api import HubApi
_api = HubApi()
dataset_id_on_hub, dataset_type = _api.get_dataset_id_and_type(
dataset_name=dataset_name, namespace=namespace)

logger.info(f'dataset_type: {dataset_type}')

# Load from the ModelScope Hub for type=4 (general)
if str(dataset_type) == str(DatasetFormations.general.value):
return hf_load_dataset_wrapper(
path=namespace + '/' + dataset_name,
name=subset_name,
data_dir=data_dir,
data_files=data_files,
split=split,
cache_dir=cache_dir,
features=None,
download_config=None,
download_mode=download_mode.value,
revision=version,
token=token,
streaming=use_streaming,
dataset_info_only=dataset_info_only,
**config_kwargs)
else:

remote_dataloader_manager = RemoteDataLoaderManager(
dataset_context_config)
dataset_inst = remote_dataloader_manager.load_dataset(
RemoteDataLoaderType.MS_DATA_LOADER)
dataset_inst = MsDataset.to_ms_dataset(
dataset_inst, target=target)
if isinstance(dataset_inst, MsDataset):
dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config
if custom_cfg:
dataset_inst.to_custom_dataset(
custom_cfg=custom_cfg, **config_kwargs)
dataset_inst.is_custom = True
return dataset_inst

elif hub == Hubs.virgo:
from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader
from modelscope.utils.constant import VirgoDatasetConfig
# Rewrite the namespace, version and cache_dir for virgo dataset.
if namespace == DEFAULT_DATASET_NAMESPACE:
dataset_context_config.namespace = VirgoDatasetConfig.default_virgo_namespace
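
As a rough usage sketch of the new loading path: when the hub reports a general-type dataset (DatasetFormations.general), load() now delegates to the Hugging Face-style wrapper, and dataset_info_only=True returns only the resolved config/info; all names below are hypothetical:

```python
from modelscope.msdatasets import MsDataset

# Hypothetical general-type dataset on the ModelScope hub.
ds = MsDataset.load('my_namespace/demo_dataset', split='train')

# Fetch only the dataset config/info dict without materializing the data.
info = MsDataset.load('my_namespace/demo_dataset', dataset_info_only=True)
```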
@@ -323,6 +358,10 @@ def upload(
chunksize: Optional[int] = 1,
filter_hidden_files: Optional[bool] = True,
upload_mode: Optional[UploadMode] = UploadMode.OVERWRITE) -> None:
r"""
@deprecated
This method is deprecated and may be removed in future releases, please use git command line instead.
"""
"""Upload dataset file or directory to the ModelScope Hub. Please log in to the ModelScope Hub first.
Args:
@@ -346,6 +385,10 @@
None
"""
warnings.warn(
'upload is deprecated, please use git command line to upload the dataset.',
DeprecationWarning)

if not object_name:
raise ValueError('object_name cannot be empty!')

@@ -393,6 +436,10 @@ def clone_meta(dataset_work_dir: str,
None
"""

warnings.warn(
'upload is deprecated, please use git command line to upload the dataset.',
DeprecationWarning)

_repo = DatasetRepository(
repo_work_dir=dataset_work_dir,
dataset_id=dataset_id,
5 changes: 4 additions & 1 deletion modelscope/msdatasets/utils/dataset_utils.py
@@ -212,7 +212,10 @@ def get_dataset_files(subset_split_into: dict,

csv_delimiter = context_config.config_kwargs.get('delimiter', ',')
csv_df = pd.read_csv(
meta_csv_file_path, iterator=False, delimiter=csv_delimiter)
meta_csv_file_path,
iterator=False,
delimiter=csv_delimiter,
escapechar='\\')
target_col = csv_df.columns[csv_df.columns.str.contains(
':FILE')].to_list()
if len(target_col) == 0:
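
A small round-trip sketch covering the two escapechar changes (the to_csv call in api.py and the read_csv call here); the frame contents are invented, and the behavior assumed is pandas' standard escapechar handling:

```python
import io
import pandas as pd

df = pd.DataFrame({'image:FILE': ['img/Image+Title.png'],
                   'caption': ['a caption with, commas and "quotes"']})

buf = io.StringIO()
df.to_csv(buf, index=False, escapechar='\\')   # write side, as in fetch_meta_files_from_url
buf.seek(0)
restored = pd.read_csv(buf, delimiter=',', escapechar='\\')  # read side, as in get_dataset_files
print(restored.equals(df))  # expected: True
```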
