Merge branch 'main' into main
stichbury committed May 20, 2024
2 parents d063a5a + 9fef3c0 commit 77e1ea5
Showing 6 changed files with 384 additions and 138 deletions.
@@ -0,0 +1,48 @@
<!--
A new scriv changelog fragment.
Uncomment the section that is right (remove the HTML comment wrapper).
-->

<!--
### Highlights ✨
- A bullet item for the Highlights ✨ category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX ([#1](https://github.com/mckinsey/vizro/pull/1))
-->
<!--
### Removed
- A bullet item for the Removed category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX ([#1](https://github.com/mckinsey/vizro/pull/1))
-->
<!--
### Added
- A bullet item for the Added category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX ([#1](https://github.com/mckinsey/vizro/pull/1))
-->
<!--
### Changed
- A bullet item for the Changed category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX ([#1](https://github.com/mckinsey/vizro/pull/1))
-->
<!--
### Deprecated
- A bullet item for the Deprecated category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX ([#1](https://github.com/mckinsey/vizro/pull/1))
-->
<!--
### Fixed
- A bullet item for the Fixed category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX ([#1](https://github.com/mckinsey/vizro/pull/1))
-->
<!--
### Security
- A bullet item for the Security category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX ([#1](https://github.com/mckinsey/vizro/pull/1))
-->
@@ -13,6 +13,7 @@ What follows is a set of lightweight guidelines rather than rules. There are alw
The names of our products are **Vizro** and **Vizro-AI**.

We refer to other products using their preferred capitalization. For example:

* Dash and Pydantic are always capitalized, except where given as Python package names `dash` and `pydantic`.
* pandas DataFrame has a lowercase "p" and camelcase "DataFrame".

8 changes: 4 additions & 4 deletions vizro-core/docs/pages/user-guides/dashboard.md
Expand Up @@ -5,9 +5,9 @@ pydantic models, Python dictionaries, YAML, or JSON.
To create a dashboard:

1. Choose one of the possible configuration syntaxes
2. Create your `pages`, see our guide on [Pages](pages.md)
3. (optional) Choose a `theme`, see our guide on [Themes](themes.md)
4. (optional) Customize your `navigation`, see our guide on [Navigation](navigation.md)
2. Create your `pages`, see our [guide on Pages](pages.md)
3. (optional) Choose a `theme`, see our [guide on Themes](themes.md)
4. (optional) Customize your `navigation`, see our [guide on Navigation](navigation.md)
5. (optional) Set a `title` for your dashboard
6. Add your `dashboard` to the `build` call of Vizro
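The steps above can be sketched with the plain-dictionary configuration syntax. This is a minimal illustrative sketch: the page title, card text, and theme value below are invented for the example and are not taken from the guide.

```python
# Minimal sketch of a dashboard configuration as a plain Python dictionary.
# Field names follow the Dashboard/Page models; the concrete values here
# (titles, card text) are illustrative assumptions.
dashboard_config = {
    "pages": [
        {
            "title": "My first page",  # step 2: create your pages
            "components": [
                {"type": "card", "text": "Hello, Vizro!"},
            ],
        },
    ],
    "theme": "vizro_dark",    # step 3 (optional): choose a theme
    "title": "My dashboard",  # step 5 (optional): set a dashboard title
}
```

A dictionary like this can then be validated into a `Dashboard` model and passed to Vizro's `build` call (step 6); check the exact field names against the model reference before relying on them.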

@@ -190,7 +190,7 @@ If supplied, the `title` of the [`Dashboard`][vizro.models.Dashboard] displays a

## Add a dashboard logo

Vizro will automatically incorporate the dashboard [logo](assets.md/#logo-image) in the top-left corner of each page if an image named `logo.<extension>` is present within the assets folder.
Vizro will [automatically incorporate the dashboard logo](assets.md/#logo-image) in the top-left corner of each page if an image named `logo.<extension>` is present within the assets folder.

![Dashboard with logo](../../assets/user_guides/dashboard/dashboard_with_logo.png)

2 changes: 1 addition & 1 deletion vizro-core/docs/pages/user-guides/layouts.md
Expand Up @@ -7,7 +7,7 @@ The [`Page`][vizro.models.Page] model accepts the `layout` argument, where you c

## Use the default layout
The `layout` argument of the [`Page`][vizro.models.Page] model is optional. If no layout is specified, all charts/components
will automatically be [**vertically stacked**](layouts.md#stacking-components) down the page in one column.
will automatically be [**vertically stacked**](layouts.md#stack-components) down the page in one column.
If that is your desired layout, you can create your charts/components without providing a [`Layout`][vizro.models.Layout].

!!! example "Default Layout"
168 changes: 70 additions & 98 deletions vizro-core/src/vizro/managers/_data_manager.py
Expand Up @@ -2,9 +2,12 @@

from __future__ import annotations

import functools
import logging
import os
import warnings
from typing import Any, Callable, Dict, Optional, Protocol, Union
from functools import partial
from typing import Callable, Dict, Optional, Union

import pandas as pd
import wrapt
@@ -17,60 +20,22 @@
# Really DataSourceName should be NewType and not just aliases but then for a user's code to type check
# correctly they would need to cast all strings to these types.
DataSourceName = str
pd_DataFrameCallable = Callable[[], pd.DataFrame]


# memoize currently requires wrapped to be a bound method rather than just a function. A bound method has a __func__
# attribute that gives the underlying function. In future memoize could be expanded to handle functions just by
# applying the same logic but without looking inside the __func__ attribute.
# The below is the best way to type-hint a bound method, based on:
# https://stackoverflow.com/questions/62658540/how-to-combine-a-custom-protocol-with-the-callable-protocol
# https://stackoverflow.com/questions/64225522/python-typing-how-to-type-hint-a-variable-as-a-bound-method
class BoundMethod(Protocol):
__self__: object
__func__: Callable[..., Any]

def __call__(self, *args, **kwargs): ...


# wrapt.decorator is the cleanest way to decorate a bound method when instance properties (here instance.timeout)
# are required as arguments. The enabled argument bypasses the entire decorator when data_manager._cache_has_app
# is False. This prevents Flask-Caching from trying to access a Flask app when one doesn't yet exist.
@wrapt.decorator(enabled=lambda: data_manager._cache_has_app)
def memoize(wrapped: BoundMethod, instance: _DynamicData, args, kwargs):
"""Caches the result of wrapped function, taking its arguments into account in the cache key.
This delegates to flask_caching.memoize functionality, but with an important modification. flask_caching.memoize
is designed to apply the same timeout to every call of the function or bound method, but we want to have a
data source-dependent timeout. This means rewriting the wrapped function's __qualname__ so that different instances
of _DynamicData have completely independent caches. Without this, flask_caching.utils.function_namespace gives the
same namespace for each, which means that the timeouts cannot be set independently.
The following things *do not* work to achieve the same thing - there will still be interference between
different caches:
* altering wrapped.cache_timeout
* using make_name argument (since this is only applied after function_namespace is called)
* including self or self._name in the load_data arguments
* whenever function_namespace recognizes the memoized function as a bound method (even when __caching_id__ is
defined so that Flask Caching does not fall back on __repr__)
Another option would be to use flask_caching.cached rather than memoize, but that makes it harder to include
arguments in the cache key and only works inside a Flask request context.
Args:
wrapped: bound method to memoize.
instance: _DynamicData object to which wrapped is bound.
args : positional arguments supplied to wrapped, taken into account for generating cache key.
kwargs : keyword arguments supplied to wrapped, taken into account for generating cache key.
Returns: Memoized call.
pd_DataFrameCallable = Callable[..., pd.DataFrame]

"""
# Before altering, wrapped.__func__.__qualname__ is "_DynamicData.__call__"
# After altering, it becomes _DynamicData.__call__._DynamicData("name", function_name)
# Note this doesn't work by using += since the qualname will just be appended to fresh every time the function is
wrapped.__func__.__qualname__ = ".".join([instance.__class__.__name__, wrapped.__func__.__name__, repr(instance)])
return data_manager.cache.memoize(timeout=instance.timeout)(wrapped)(*args, **kwargs)

# TODO: consider merging with model_utils _log_call. Using wrapt.decorator is probably better than functools here.
# Might need messages that run before/after the wrapped function call.
# Follows the pattern recommended in https://wrapt.readthedocs.io/en/latest/decorators.html#decorators-with-arguments
# for making a wrapt.decorator with arguments.
def _log_call(message: str):
@wrapt.decorator
def wrapper(wrapped, instance, args, kwargs):
logger.debug(message)
# Might be useful if merged with model_utils _log_call:
# logging.debug(message.format(wrapped=wrapped, instance=instance, args=args, kwargs=kwargs))
return wrapped(*args, **kwargs)

return wrapper
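The decorator-with-arguments pattern that `_log_call` follows can be sketched with just the standard library. This sketch swaps `wrapt.decorator` for `functools.wraps`, which loses wrapt's transparent handling of bound methods and instances but shows the same outer-function-takes-the-argument shape.

```python
import functools
import logging

logger = logging.getLogger(__name__)

def _log_call(message: str):
    # Outer function receives the decorator argument; the inner
    # decorator wraps the target and logs before delegating to it.
    def decorator(wrapped):
        @functools.wraps(wrapped)
        def wrapper(*args, **kwargs):
            logger.debug(message)
            return wrapped(*args, **kwargs)
        return wrapper
    return decorator

@_log_call("loading data")
def load_data():
    return [1, 2, 3]
```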


class _DynamicData:
@@ -87,47 +52,38 @@ class _DynamicData:
Possibly in future, this will become a public class so you could directly do:
>>> data_manager["dynamic_data"] = DynamicData(dynamic_data, timeout=5)
But we'd need to make sure that name is not an argument in __init__ then.
At this point we might like to disable the behavior so that data_manager setitem and getitem handle the same
object rather than doing an implicit conversion to _DynamicData.
"""

def __init__(self, name: str, load_data: pd_DataFrameCallable):
def __init__(self, load_data: pd_DataFrameCallable):
self.__load_data: pd_DataFrameCallable = load_data
# name is needed for the cache key and should not be modified by users.
self._name = name
self.timeout: Optional[int] = None
# We might also want a self.cache_arguments dictionary in future that allows user to customize more than just
# timeout, but no rush to do this since other arguments are unlikely to be useful.

@memoize
def load(self) -> pd.DataFrame:
"""Loads data.
In future this will probably take arguments that are passed through to __load_data in order to re-run the
loading function with different arguments. We might want to use CapturedCallable for self.__load_data then.
"""
# Note you get the same "cache missed" message if NullCache is running or if data_manager._cache_has_app is
# False.
logger.debug("Cache miss; reloading data")
return self.__load_data()

def __repr__(self):
"""Flask-caching uses repr to form the cache key, so this is very important to set correctly.
In particular, it must depend on something that uniquely labels the data source and is the same across all
workers: self._name. Using id(self), as in Python's default repr, only works in the case that gunicorn is
running with --preload: without preloading, the id of the same data source in different processes will be
different so the cache will not match up.
flask_caching make it possible to set a __cached_id__ attribute to handle this so that repr can be set
independently of cache key, but this doesn't seem to be well documented or work well, so it's better to rely
on __repr__.
"""
# Note that using repr(self.__load_data) is not good since it depends on the id of self.__load_data and so
# would not be consistent across processes.
return f"{self.__class__.__name__}({self._name!r}, {self.__load_data.__qualname__})"
def load(self, *args, **kwargs) -> pd.DataFrame:
"""Loads data."""
# Data source name can be extracted from the function's name since it was added there in DataManager.__setitem__
logger.debug(
"Looking in cache for data source %s on process %s",
self.__load_data.__name__.rpartition(".")[-1],
os.getpid(),
)
# We don't memoize the load method itself as this is tricky to get working fully when load is called with
# arguments, since we need the signature of the memoized function to match that of load_data. See
# https://github.com/GrahamDumpleton/wrapt/issues/263.
# It's also difficult to get memoize working correctly with bound methods anyway - see comment in
# DataManager.__setitem__. It's much easier to ensure that self.__load_data is always just a function.
if data_manager._cache_has_app:
# This includes the case of NullCache.
load_data = _log_call("Cache miss; reloading data")(self.__load_data)
load_data = data_manager.cache.memoize(timeout=self.timeout)(load_data)
else:
logger.debug("Cache not active; reloading data")
load_data = self.__load_data
return load_data(*args, **kwargs)
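The decorate-at-call-time pattern in `load` can be seen in isolation with a stand-in cache. `SimpleCache` below is a hypothetical, dictionary-backed substitute for a Flask-Caching backend (the real code delegates to `data_manager.cache.memoize`); the point is that no cache machinery is touched unless the cache is active.

```python
import functools

# Hypothetical stand-in for a Flask-Caching backend: caches results
# keyed on the function's qualname and its call arguments.
class SimpleCache:
    def __init__(self):
        self._store = {}

    def memoize(self, timeout=None):
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                key = (func.__qualname__, args, tuple(sorted(kwargs.items())))
                if key not in self._store:
                    self._store[key] = func(*args, **kwargs)
                return self._store[key]
            return wrapper
        return decorator

cache = SimpleCache()
cache_active = True  # plays the role of data_manager._cache_has_app

calls = []

def load_data(n):
    calls.append(n)  # record each real (non-cached) load
    return list(range(n))

def load(*args, **kwargs):
    # Memoize at call time, as _DynamicData.load does, so the cache
    # backend is only touched when it is actually active.
    func = cache.memoize(timeout=300)(load_data) if cache_active else load_data
    return func(*args, **kwargs)

load(3)
load(3)  # second call is served from the cache; load_data runs once
```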


class _StaticData:
@@ -144,7 +100,6 @@ class _StaticData:
"""

def __init__(self, data: pd.DataFrame):
# No need for _name here because static data doesn't need a cache key.
self.__data = data

def load(self) -> pd.DataFrame:
@@ -197,18 +152,35 @@ def __init__(self):
def __setitem__(self, name: DataSourceName, data: Union[pd.DataFrame, pd_DataFrameCallable]):
"""Adds `data` to the `DataManager` with key `name`."""
if callable(data):
if not hasattr(data, "__name__") or not hasattr(data, "__qualname__"):
# lambda function has __name__ and __qualname__ so works well. partial does not unless explicitly
# assigned them and so will not work with flask_caching. The partial object's func attribute is the
# full function but does not contain the pre-set arguments so also does not work. We could manually
# set __name__ and __qualname__ based on the func attribute here if using partial proves to be a good
# pattern. Similarly for CapturedCallable, or that could instead set its own __name__ and __qualname__.
raise TypeError(
f"Data source {name}'s function does not have a name. If you have provided a `partial` object then "
"either supply your partial function with `__name__` and `__qualname__` attributes or re-work your "
"code to use a full function, e.g. using `lambda`."
)
self.__data[name] = _DynamicData(name, data)
# __qualname__ is required by flask-caching (even if we specify our own make_name) but
# not defined for partial functions and just '<lambda>' for lambda functions. Defining __qualname__
# means it's possible to have non-interfering caches for lambda functions (similarly if we
# end up using CapturedCallable, or that could instead set its own __qualname__).
# We handle __name__ the same way even though it's not currently essential to functioning of flask-caching
# in case they change the underlying implementation to use it.
# We use partial to effectively make an independent copy of the underlying data function. This means that
# it's possible to set __qualname__ independently for each data source. This is not essential for
# functions other than lambda, but it is essential for bound methods, as flask-caching cannot easily
# independently timeout different instances with different bound methods but the same underlying function
# data.__func__. If we don't do this then the bound method case needs some uglier hacking to make work
# correctly - see https://github.com/mckinsey/vizro/blob/abb7eebb230ba7e6cfdf6150dc56b211a78b1cd5/
# vizro-core/src/vizro/managers/_data_manager.py.
# Once partial has been used, all dynamic data sources are on equal footing since they're all treated as
# functions rather than bound methods, e.g. by flask_caching.utils.function_namespace. This makes it much
# simpler to use flask-caching reliably.
# It's important the __qualname__ is the same across all workers, so use the data source name rather than
# e.g. the repr method that includes the id of the instance so would only work in the case that gunicorn is
# running with --preload.
# __module__ is also required in flask_caching.utils.function_namespace and not defined for partial
# functions in some versions of Python.
# update_wrapper ensures that __module__, __name__, __qualname__, __annotations__ and __doc__ are
# assigned to the new partial(data) the same as they were in data. This isn't strictly necessary but makes
# inspecting these functions easier.
data = functools.update_wrapper(partial(data), data)
data.__module__ = getattr(data, "__module__", "<nomodule>")
data.__name__ = ".".join([getattr(data, "__name__", "<unnamed>"), name])
data.__qualname__ = ".".join([getattr(data, "__qualname__", "<unnamed>"), name])
self.__data[name] = _DynamicData(data)
elif isinstance(data, pd.DataFrame):
self.__data[name] = _StaticData(data)
else:
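The `partial` copy trick in `__setitem__` can be sketched in isolation: each registration gets an independent function object whose `__qualname__` embeds the data source name, so the same underlying function registered twice yields two distinguishable, non-interfering entries. The function and data source names below are illustrative.

```python
import functools
from functools import partial

def load_iris():
    return "iris data"

# Register the same underlying function under two data source names.
# partial(...) creates an independent copy per registration, so each copy
# carries its own __qualname__ without affecting the other or the original.
copies = {}
for name in ["iris_a", "iris_b"]:
    data = functools.update_wrapper(partial(load_iris), load_iris)
    data.__qualname__ = ".".join([data.__qualname__, name])
    copies[name] = data
```

Because `functools.update_wrapper` copies `__module__`, `__name__`, `__qualname__`, and `__doc__` onto the `partial` object first, each copy still calls through to the original function while exposing a unique qualified name for the cache key.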