diff --git a/docs/reference/api.rst b/docs/reference/api.rst index 57f61f18b1..5e6e50b0ba 100644 --- a/docs/reference/api.rst +++ b/docs/reference/api.rst @@ -53,6 +53,34 @@ Cache .. autoclass:: qlib.data.cache.DiskDatasetCache :members: + +Storage +------------- +.. autoclass:: qlib.data.storage.storage.BaseStorage + :members: + +.. autoclass:: qlib.data.storage.storage.CalendarStorage + :members: + +.. autoclass:: qlib.data.storage.storage.InstrumentStorage + :members: + +.. autoclass:: qlib.data.storage.storage.FeatureStorage + :members: + +.. autoclass:: qlib.data.storage.file_storage.FileStorageMixin + :members: + +.. autoclass:: qlib.data.storage.file_storage.FileCalendarStorage + :members: + +.. autoclass:: qlib.data.storage.file_storage.FileInstrumentStorage + :members: + +.. autoclass:: qlib.data.storage.file_storage.FileFeatureStorage + :members: + + Dataset --------------- diff --git a/qlib/data/data.py b/qlib/data/data.py index 3a74a20277..eb7fbe0ead 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -45,12 +45,12 @@ def backend_obj(self, **kwargs): # set default storage kwargs backend_kwargs = backend.setdefault("kwargs", {}) - # default uri map - if "uri" not in backend_kwargs: + # default provider_uri map + if "provider_uri" not in backend_kwargs: # if the user has no uri configured, use: uri = uri_map[freq] freq = kwargs.get("freq", "day") - uri_map = backend_kwargs.setdefault("uri_map", {freq: C.get_data_path()}) - backend_kwargs["uri"] = uri_map[freq] + provider_uri_map = backend_kwargs.setdefault("provider_uri_map", {freq: C.get_data_path()}) + backend_kwargs["provider_uri"] = provider_uri_map[freq] backend.setdefault("kwargs", {}).update(**kwargs) return init_instance_by_config(backend) @@ -556,17 +556,21 @@ def load_calendar(self, freq, future): list of timestamps """ - backend_obj = self.backend_obj(freq=freq, future=future) - if future and not backend_obj.check_exists(): - get_module_logger("data").warning( - f"load calendar error: freq={freq}, future={future}; return current calendar!" - ) - get_module_logger("data").warning( - "You can get future calendar by referring to the following document: https://github.com/microsoft/qlib/blob/main/scripts/data_collector/contrib/README.md" - ) - backend_obj = self.backend_obj(freq=freq, future=False) + try: + backend_obj = self.backend_obj(freq=freq, future=future).data + except ValueError: + if future: + get_module_logger("data").warning( + f"load calendar error: freq={freq}, future={future}; return current calendar!" + ) + get_module_logger("data").warning( + "You can get future calendar by referring to the following document: https://github.com/microsoft/qlib/blob/main/scripts/data_collector/contrib/README.md" + ) + backend_obj = self.backend_obj(freq=freq, future=False).data + else: + raise - return [pd.Timestamp(x) for x in backend_obj.data] + return [pd.Timestamp(x) for x in backend_obj] def calendar(self, start_time=None, end_time=None, freq="day", future=False): _calendar, _calendar_index = self._get_calendar(freq, future) @@ -659,14 +663,7 @@ def feature(self, instrument, field, start_index, end_index, freq): # validate field = str(field).lower()[1:] instrument = code_to_fname(instrument) - try: - data = self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1] - except Exception as e: - get_module_logger("data").warning( - f"WARN: data not found for {instrument}.{field}\n\tFeature exception info: {str(e)}" - ) - data = pd.Series(dtype=np.float32) - return data + return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1] class LocalExpressionProvider(ExpressionProvider): diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 90e4178ffe..a2b145c4df 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -14,19 +14,35 @@ logger = get_module_logger("file_storage") -class FileStorage: - def check_exists(self): - return self.uri.exists() +class FileStorageMixin: + @property + def uri(self) -> Path: + _provider_uri = self.kwargs.get("provider_uri", None) + if _provider_uri is None: + raise ValueError( + f"The `provider_uri` parameter is not found in {self.__class__.__name__}, " + f'please specify `provider_uri` in the "provider\'s backend"' + ) + return Path(_provider_uri).expanduser().joinpath(f"{self.storage_name}s", self.file_name) + + def check(self): + """check self.uri + + Raises + ------- + ValueError + """ + if not self.uri.exists(): + raise ValueError(f"{self.storage_name} not exists: {self.uri}") -class FileCalendarStorage(FileStorage, CalendarStorage): - def __init__(self, freq: str, future: bool, uri: str, **kwargs): - super(FileCalendarStorage, self).__init__(freq, future, uri, **kwargs) - _file_name = f"{freq}_future.txt" if future else f"{freq}.txt" - self.uri = Path(self.uri).expanduser().joinpath("calendars", _file_name.lower()) +class FileCalendarStorage(FileStorageMixin, CalendarStorage): + def __init__(self, freq: str, future: bool, **kwargs): + super(FileCalendarStorage, self).__init__(freq, future, **kwargs) + self.file_name = f"{freq}_future.txt" if future else f"{freq}.txt".lower() - def _read_calendar(self, skip_rows: int = 0, n_rows: int = None) -> Iterable[CalVT]: - if not self.check_exists(): + def _read_calendar(self, skip_rows: int = 0, n_rows: int = None) -> List[CalVT]: + if not self.uri.exists(): self._write_calendar(values=[]) with self.uri.open("rb") as fp: return [ @@ -39,7 +55,8 @@ def _write_calendar(self, values: Iterable[CalVT], mode: str = "wb"): np.savetxt(fp, values, fmt="%s", encoding="utf-8") @property - def data(self) -> Iterable[CalVT]: + def data(self) -> List[CalVT]: + self.check() return self._read_calendar() def extend(self, values: Iterable[CalVT]) -> None: @@ -49,6 +66,7 @@ def clear(self) -> None: self._write_calendar(values=[]) def index(self, value: CalVT) -> int: + self.check() calendar = self._read_calendar() return int(np.argwhere(calendar == value)[0]) @@ -58,6 +76,7 @@ def insert(self, index: int, value: CalVT): self._write_calendar(values=calendar) def remove(self, value: CalVT) -> None: + self.check() index = self.index(value) calendar = self._read_calendar() calendar = np.delete(calendar, index) @@ -69,24 +88,29 @@ def __setitem__(self, i: Union[int, slice], values: Union[CalVT, Iterable[CalVT] self._write_calendar(values=calendar) def __delitem__(self, i: Union[int, slice]) -> None: + self.check() calendar = self._read_calendar() calendar = np.delete(calendar, i) self._write_calendar(values=calendar) - def __getitem__(self, i: Union[int, slice]) -> Union[CalVT, Iterable[CalVT]]: + def __getitem__(self, i: Union[int, slice]) -> Union[CalVT, List[CalVT]]: + self.check() return self._read_calendar()[i] + def __len__(self) -> int: + return len(self.data) + -class FileInstrumentStorage(FileStorage, InstrumentStorage): +class FileInstrumentStorage(FileStorageMixin, InstrumentStorage): INSTRUMENT_SEP = "\t" INSTRUMENT_START_FIELD = "start_datetime" INSTRUMENT_END_FIELD = "end_datetime" SYMBOL_FIELD_NAME = "instrument" - def __init__(self, market: str, uri: str, **kwargs): - super(FileInstrumentStorage, self).__init__(market, uri, **kwargs) - self.uri = Path(self.uri).expanduser().joinpath("instruments", f"{market.lower()}.txt") + def __init__(self, market: str, **kwargs): + super(FileInstrumentStorage, self).__init__(market, **kwargs) + self.file_name = f"{market.lower()}.txt" def _read_instrument(self) -> Dict[InstKT, InstVT]: if not self.uri.exists(): @@ -128,6 +152,7 @@ def clear(self) -> None: @property def data(self) -> Dict[InstKT, InstVT]: + self.check() return self._read_instrument() def __setitem__(self, k: InstKT, v: InstVT) -> None: @@ -136,11 +161,13 @@ def __setitem__(self, k: InstKT, v: InstVT) -> None: self._write_instrument(inst) def __delitem__(self, k: InstKT) -> None: + self.check() inst = self._read_instrument() del inst[k] self._write_instrument(inst) def __getitem__(self, k: InstKT) -> InstVT: + self.check() return self._read_instrument()[k] def update(self, *args, **kwargs) -> None: @@ -164,13 +191,14 @@ def update(self, *args, **kwargs) -> None: self._write_instrument(inst) + def __len__(self) -> int: + return len(self.data) -class FileFeatureStorage(FileStorage, FeatureStorage): - def __init__(self, instrument: str, field: str, freq: str, uri: str, **kwargs): - super(FileFeatureStorage, self).__init__(instrument, field, freq, uri, **kwargs) - self.uri = ( - Path(self.uri).expanduser().joinpath("features", instrument.lower(), f"{field.lower()}.{freq.lower()}.bin") - ) + +class FileFeatureStorage(FileStorageMixin, FeatureStorage): + def __init__(self, instrument: str, field: str, freq: str, **kwargs): + super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs) + self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin" def clear(self): with self.uri.open("wb") as _: @@ -214,35 +242,44 @@ def write(self, data_array: Union[List, np.ndarray], index: int = None) -> None: @property def start_index(self) -> Union[int, None]: - if len(self) == 0: + if not self.uri.exists(): return None - with open(self.uri, "rb") as fp: + with self.uri.open("rb") as fp: index = int(np.frombuffer(fp.read(4), dtype=" Union[int, None]: + if not self.uri.exists(): + return None + # The next data appending index point will be `end_index + 1` + return self.start_index + len(self) - 1 + def __getitem__(self, i: Union[int, slice]) -> Union[Tuple[int, float], pd.Series]: if not self.uri.exists(): if isinstance(i, int): return None, None elif isinstance(i, slice): - return pd.Series() + return pd.Series(dtype=np.float32) else: raise TypeError(f"type(i) = {type(i)}") - with open(self.uri, "rb") as fp: - + storage_start_index = self.start_index + storage_end_index = self.end_index + with self.uri.open("rb") as fp: if isinstance(i, int): - if self.start_index > i: - raise IndexError(f"{i}: start index is {self.start_index}") - fp.seek(4 * (i - self.start_index) + 4) + + if storage_start_index > i: + raise IndexError(f"{i}: start index is {storage_start_index}") + fp.seek(4 * (i - storage_start_index) + 4) return i, struct.unpack("f", fp.read(4))[0] elif isinstance(i, slice): - start_index = self.start_index if i.start is None else i.start - end_index = self.end_index if i.stop is None else i.stop - 1 - si = max(self.start_index, start_index) + start_index = storage_start_index if i.start is None else i.start + end_index = storage_end_index if i.stop is None else i.stop - 1 + si = max(start_index, storage_start_index) if si > end_index: - return pd.Series() - fp.seek(4 * (si - self.start_index) + 4) + return pd.Series(dtype=np.float32) + fp.seek(4 * (si - storage_start_index) + 4) # read n bytes count = end_index - si + 1 data = np.frombuffer(fp.read(4 * count), dtype=" Union[Tuple[int, float], pd.Serie raise TypeError(f"type(i) = {type(i)}") def __len__(self) -> int: - return self.uri.stat().st_size // 4 - 1 if self.check_exists() else 0 + self.check() + return self.uri.stat().st_size // 4 - 1 diff --git a/qlib/data/storage/storage.py b/qlib/data/storage/storage.py index dcf6da9ed1..8426ebe66f 100644 --- a/qlib/data/storage/storage.py +++ b/qlib/data/storage/storage.py @@ -25,24 +25,28 @@ class UserCalendarStorage(CalendarStorage): @property def data(self) -> Iterable[CalVT]: - '''get all data''' - raise NotImplementedError("Subclass of CalendarStorage must implement `data` method") + '''get all data - def check_exists(self) -> bool: - '''check if storage(uri) exists, if not exists: return False''' - raise NotImplementedError("Subclass of BaseStorage must implement `check_exists` method") + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + ''' + raise NotImplementedError("Subclass of CalendarStorage must implement `data` method") class UserInstrumentStorage(InstrumentStorage): @property def data(self) -> Dict[InstKT, InstVT]: - '''get all data''' - raise NotImplementedError("Subclass of InstrumentStorage must implement `data` method") + '''get all data - def check_exists(self) -> bool: - '''check if storage(uri) exists, if not exists: return False''' - raise NotImplementedError("Subclass of BaseStorage must implement `check_exists` method") + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + ''' + raise NotImplementedError("Subclass of InstrumentStorage must implement `data` method") class UserFeatureStorage(FeatureStorage): @@ -53,103 +57,64 @@ def __getitem__(self, s: slice) -> pd.Series: Returns ------- pd.Series(values, index=pd.RangeIndex(start, len(values)) + + Notes + ------- + if data(storage) does not exist: + if isinstance(i, int): + return (None, None) + if isinstance(i, slice): + # return empty pd.Series + return pd.Series(dtype=np.float32) ''' raise NotImplementedError( "Subclass of FeatureStorage must implement `__getitem__(s: slice)` method" ) - def check_exists(self) -> bool: - '''check if storage(uri) exists, if not exists: return False''' - raise NotImplementedError("Subclass of BaseStorage must implement `check_exists` method") """ -class StorageMeta(type): - """unified management of raise when storage is not exists""" - - def __new__(cls, name, bases, dict): - class_obj = type.__new__(cls, name, bases, dict) - - # The calls to __iter__ and __getitem__ do not pass through __getattribute__. - # In order to throw an exception before calling __getitem__, use the metaclass - _getitem_func = getattr(class_obj, "__getitem__") - - def _getitem(obj, item): - getattr(obj, "_check")() - try: - res = _getitem_func(obj, item) - except Exception as e: - raise ValueError(f"{obj.raise_info}\n\tStorage exception info: {str(e)}") - return res - - setattr(class_obj, "__getitem__", _getitem) - return class_obj - - -class BaseStorage(metaclass=StorageMeta): +class BaseStorage: @property def storage_name(self) -> str: return re.findall("[A-Z][^A-Z]*", self.__class__.__name__)[-2].lower() - @property - def raise_info(self): - parameters_info = [ - f"{_k}={_v}" - for _k, _v in self.__dict__.items() - if not isinstance(_v, (dict,)) or (hasattr(_v, "__len__") and len(_v) < 3) - ] - return f"{self.storage_name.lower()} not exists, storage parameters: {parameters_info}" - - def check_exists(self) -> bool: - """check if storage(uri) exists, if not exists: return False""" - raise NotImplementedError("Subclass of BaseStorage must implement `check_exists` method") - - def clear(self) -> None: - """clear storage""" - raise NotImplementedError("Subclass of BaseStorage must implement `clear` method") - - def __len__(self) -> 0: - return len(self.data) if self.check_exists() else 0 - - def __getitem__(self, item: Union[slice, Union[int, InstKT]]): - raise NotImplementedError( - "Subclass of BaseStorage must implement `__getitem__(i: Union[int, InstKT])`/`__getitem__(s: slice)` method" - ) - - def _check(self): - if not self.check_exists(): - raise ValueError(self.raise_info) - - def __getattribute__(self, item): - if item == "data": - self._check() - try: - res = super(BaseStorage, self).__getattribute__(item) - except Exception as e: - raise ValueError(f"{self.raise_info}\n\tStorage exception info: {str(e)}") - return res - class CalendarStorage(BaseStorage): """ The behavior of CalendarStorage's methods and List's methods of the same name remain consistent """ - def __init__(self, freq: str, future: bool, uri: str, **kwargs): + def __init__(self, freq: str, future: bool, **kwargs): self.freq = freq self.future = future - self.uri = uri + self.kwargs = kwargs @property def data(self) -> Iterable[CalVT]: - """get all data""" + """get all data + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + """ raise NotImplementedError("Subclass of CalendarStorage must implement `data` method") + def clear(self) -> None: + raise NotImplementedError("Subclass of CalendarStorage must implement `clear` method") + def extend(self, iterable: Iterable[CalVT]) -> None: raise NotImplementedError("Subclass of CalendarStorage must implement `extend` method") def index(self, value: CalVT) -> int: + """ + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + """ raise NotImplementedError("Subclass of CalendarStorage must implement `index` method") def insert(self, index: int, value: CalVT) -> None: @@ -184,6 +149,12 @@ def __delitem__(self, i: slice) -> None: ... def __delitem__(self, i) -> None: + """ + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + """ raise NotImplementedError( "Subclass of CalendarStorage must implement `__delitem__(i: int)`/`__delitem__(s: slice)` method" ) @@ -199,26 +170,60 @@ def __getitem__(self, i: int) -> CalVT: ... def __getitem__(self, i) -> CalVT: + """ + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + + """ raise NotImplementedError( "Subclass of CalendarStorage must implement `__getitem__(i: int)`/`__getitem__(s: slice)` method" ) + def __len__(self) -> int: + """ + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + + """ + raise NotImplementedError("Subclass of CalendarStorage must implement `__len__` method") + class InstrumentStorage(BaseStorage): - def __init__(self, market: str, uri: str, **kwargs): + def __init__(self, market: str, **kwargs): self.market = market - self.uri = uri + self.kwargs = kwargs @property def data(self) -> Dict[InstKT, InstVT]: - """get all data""" + """get all data + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + """ raise NotImplementedError("Subclass of InstrumentStorage must implement `data` method") + def clear(self) -> None: + raise NotImplementedError("Subclass of InstrumentStorage must implement `clear` method") + def update(self, *args, **kwargs) -> None: """D.update([E, ]**F) -> None. Update D from mapping/iterable E and F. - If E present and has a .keys() method, does: for k in E: D[k] = E[k] - If E present and lacks .keys() method, does: for (k, v) in E: D[k] = v - In either case, this is followed by: for k, v in F.items(): D[k] = v + + Notes + ------ + If E present and has a .keys() method, does: for k in E: D[k] = E[k] + + If E present and lacks .keys() method, does: for (k, v) in E: D[k] = v + + In either case, this is followed by: for k, v in F.items(): D[k] = v + """ raise NotImplementedError("Subclass of InstrumentStorage must implement `update` method") @@ -227,53 +232,96 @@ def __setitem__(self, k: InstKT, v: InstVT) -> None: raise NotImplementedError("Subclass of InstrumentStorage must implement `__setitem__` method") def __delitem__(self, k: InstKT) -> None: - """Delete self[key].""" + """Delete self[key]. + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + """ raise NotImplementedError("Subclass of InstrumentStorage must implement `__delitem__` method") def __getitem__(self, k: InstKT) -> InstVT: """x.__getitem__(k) <==> x[k]""" raise NotImplementedError("Subclass of InstrumentStorage must implement `__getitem__` method") + def __len__(self) -> int: + """ + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + + """ + raise NotImplementedError("Subclass of InstrumentStorage must implement `__len__` method") + class FeatureStorage(BaseStorage): - def __init__(self, instrument: str, field: str, freq: str, uri: str, **kwargs): + def __init__(self, instrument: str, field: str, freq: str, **kwargs): self.instrument = instrument self.field = field self.freq = freq - self.uri = uri + self.kwargs = kwargs @property def data(self) -> pd.Series: - """get all data""" + """get all data + + Notes + ------ + if data(storage) does not exist, return empty pd.Series: `return pd.Series(dtype=np.float32)` + """ raise NotImplementedError("Subclass of FeatureStorage must implement `data` method") @property def start_index(self) -> Union[int, None]: """get FeatureStorage start index - If len(self) == 0; return None + + Notes + ----- + If the data(storage) does not exist, return None """ - raise NotImplementedError("Subclass of FeatureStorage must implement `data` method") + raise NotImplementedError("Subclass of FeatureStorage must implement `start_index` method") @property def end_index(self) -> Union[int, None]: - if len(self) == 0: - return None - return None if len(self) == 0 else self.start_index + len(self) - 1 + """get FeatureStorage end index + + Notes + ----- + The right index of the data range (both sides are closed) + + The next data appending point will be `end_index + 1` + + If the data(storage) does not exist, return None + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `end_index` method") + + def clear(self) -> None: + raise NotImplementedError("Subclass of FeatureStorage must implement `clear` method") def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): """Write data_array to FeatureStorage starting from index. - If index is None, append data_array to feature. - If len(data_array) == 0; return - If (index - self.end_index) >= 1, self[end_index+1: index] will be filled with np.nan + Notes + ------ + If index is None, append data_array to feature. - Examples: + If len(data_array) == 0; return + + If (index - self.end_index) >= 1, self[end_index+1: index] will be filled with np.nan + + Examples + --------- + .. code-block:: feature: 3 4 4 5 5 6 + >>> self.write([6, 7], index=6) feature: @@ -311,56 +359,70 @@ def write(self, data_array: Union[List, np.ndarray, Tuple], index: int = None): def rebase(self, start_index: int = None, end_index: int = None): """Rebase the start_index and end_index of the FeatureStorage. - Examples: + start_index and end_index are closed intervals: [start_index, end_index] - feature: - 3 4 - 4 5 - 5 6 + Examples + --------- - >>> self.rebase(start_index=4) + .. code-block:: - feature: - 4 5 - 5 6 + feature: + 3 4 + 4 5 + 5 6 - >>> self.rebase(start_index=3) - feature: - 3 np.nan - 4 5 - 5 6 + >>> self.rebase(start_index=4) - >>> self.write([3], index=3) + feature: + 4 5 + 5 6 - feature: - 3 3 - 4 5 - 5 6 + >>> self.rebase(start_index=3) - >>> self.rebase(end_index=4) + feature: + 3 np.nan + 4 5 + 5 6 - feature: - 3 3 - 4 5 + >>> self.write([3], index=3) - >>> self.write([6, 7, 8], index=4) + feature: + 3 3 + 4 5 + 5 6 - feature: - 3 3 - 4 6 - 5 7 - 6 8 + >>> self.rebase(end_index=4) - >>> self.rebase(start_index=4, end_index=5) + feature: + 3 3 + 4 5 - feature: - 4 6 - 5 7 + >>> self.write([6, 7, 8], index=4) + + feature: + 3 3 + 4 6 + 5 7 + 6 8 + + >>> self.rebase(start_index=4, end_index=5) + + feature: + 4 6 + 5 7 """ - if start_index is None and end_index is None: - logger.warning("both start_index and end_index are None, rebase is ignored") + storage_si = self.start_index + storage_ei = self.end_index + if storage_si is None or storage_ei is None: + raise ValueError("storage.start_index or storage.end_index is None, storage may not exist") + + start_index = storage_si if start_index is None else start_index + end_index = storage_ei if end_index is None else end_index + + if start_index is None or end_index is None: + logger.warning("both start_index and end_index are None, or storage does not exist; rebase is ignored") return if start_index < 0 or end_index < 0: @@ -373,17 +435,15 @@ def rebase(self, start_index: int = None, end_index: int = None): ) return - start_index = self.start_index if start_index is None else end_index - end_index = self.end_index if end_index is None else end_index - if start_index <= self.start_index: - self.write([np.nan] * (self.start_index - start_index), start_index) + if start_index <= storage_si: + self.write([np.nan] * (storage_si - start_index), start_index) else: self.rewrite(self[start_index:].values, start_index) if end_index >= self.end_index: self.write([np.nan] * (end_index - self.end_index)) else: - self.rewrite(self[: end_index + 1].values, self.start_index) + self.rewrite(self[: end_index + 1].values, start_index) def rewrite(self, data: Union[List, np.ndarray, Tuple], index: int): """overwrite all data in FeatureStorage with data @@ -414,7 +474,28 @@ def __getitem__(self, i: int) -> Tuple[int, float]: ... def __getitem__(self, i) -> Union[Tuple[int, float], pd.Series]: - """x.__getitem__(y) <==> x[y]""" + """x.__getitem__(y) <==> x[y] + + Notes + ------- + if data(storage) does not exist: + if isinstance(i, int): + return (None, None) + if isinstance(i, slice): + # return empty pd.Series + return pd.Series(dtype=np.float32) + """ raise NotImplementedError( "Subclass of FeatureStorage must implement `__getitem__(i: int)`/`__getitem__(s: slice)` method" ) + + def __len__(self) -> int: + """ + + Raises + ------ + ValueError + If the data(storage) does not exist, raise ValueError + + """ + raise NotImplementedError("Subclass of FeatureStorage must implement `__len__` method") diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 77857182d9..686f0fc00f 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -665,7 +665,10 @@ def exists_qlib_data(qlib_dir): return False # check calendar bin for _calendar in calendars_dir.iterdir(): - if not list(features_dir.rglob(f"*.{_calendar.name.split('.')[0]}.bin")): + + if ("_future" not in _calendar.name) and ( + not list(features_dir.rglob(f"*.{_calendar.name.split('.')[0]}.bin")) + ): return False # check instruments diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py index 0b063fddac..b3a18cc902 100644 --- a/scripts/dump_bin.py +++ b/scripts/dump_bin.py @@ -120,7 +120,7 @@ def _get_date( else: df = file_or_df if df.empty or self.date_field_name not in df.columns.tolist(): - _calendars = pd.Series() + _calendars = pd.Series(dtype=np.float32) else: _calendars = df[self.date_field_name] diff --git a/tests/storage_tests/test_storage.py b/tests/storage_tests/test_storage.py index e7bac658cb..aad8d11e48 100644 --- a/tests/storage_tests/test_storage.py +++ b/tests/storage_tests/test_storage.py @@ -24,7 +24,7 @@ class TestStorage(TestAutoData): def test_calendar_storage(self): - calendar = CalendarStorage(freq="day", future=False, uri=self.provider_uri) + calendar = CalendarStorage(freq="day", future=False, provider_uri=self.provider_uri) assert isinstance(calendar[:], Iterable), f"{calendar.__class__.__name__}.__getitem__(s: slice) is not Iterable" assert isinstance(calendar.data, Iterable), f"{calendar.__class__.__name__}.data is not Iterable" @@ -32,6 +32,16 @@ def test_calendar_storage(self): print(f"calendar[0]: {calendar[0]}") print(f"calendar[-1]: {calendar[-1]}") + calendar = CalendarStorage(freq="1min", future=False, provider_uri="not_found") + with pytest.raises(ValueError): + print(calendar.data) + + with pytest.raises(ValueError): + print(calendar[:]) + + with pytest.raises(ValueError): + print(calendar[0]) + def test_instrument_storage(self): """ The meaning of instrument, such as CSI500: @@ -66,7 +76,7 @@ def test_instrument_storage(self): """ - instrument = InstrumentStorage(market="csi300", uri=self.provider_uri) + instrument = InstrumentStorage(market="csi300", provider_uri=self.provider_uri) for inst, spans in instrument.data.items(): assert isinstance(inst, str) and isinstance( @@ -79,6 +89,13 @@ def test_instrument_storage(self): print(f"instrument['SH600000']: {instrument['SH600000']}") + instrument = InstrumentStorage(market="csi300", provider_uri="not_found") + with pytest.raises(ValueError): + print(instrument.data) + + with pytest.raises(ValueError): + print(instrument["sSH600000"]) + def test_feature_storage(self): """ Calendar: @@ -133,9 +150,9 @@ def test_feature_storage(self): """ - feature = FeatureStorage(instrument="SH600004", field="close", freq="day", uri=self.provider_uri) + feature = FeatureStorage(instrument="SH600004", field="close", freq="day", provider_uri=self.provider_uri) - with pytest.raises(ValueError): + with pytest.raises(IndexError): print(feature[0]) assert isinstance( feature[815][1], (float, np.float32) @@ -144,3 +161,11 @@ def test_feature_storage(self): print(f"feature[815: 818]: \n{feature[815: 818]}") print(f"feature[:].tail(): \n{feature[:].tail()}") + + feature = FeatureStorage(instrument="SH600004", field="close", freq="day", provider_uri="not_fount") + + assert feature[0] == (None, None), "FeatureStorage does not exist, feature[i] should return `(None, None)`" + assert feature[:].empty, "FeatureStorage does not exist, feature[:] should return `pd.Series(dtype=np.float32)`" + assert ( + feature.data.empty + ), "FeatureStorage does not exist, feature.data should return `pd.Series(dtype=np.float32)`"