Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#6983: Add Pluggable Documentation Module Support #6986

Merged
merged 15 commits into from
Mar 5, 2024
3 changes: 3 additions & 0 deletions modin/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
CIAWSSecretAccessKey,
CpuCount,
DaskThreadsPerWorker,
DocModule,
DoUseCalcite,
Engine,
EnvironmentVariable,
Expand Down Expand Up @@ -107,4 +108,6 @@
"LogMode",
"LogMemoryInterval",
"LogFileSize",
# Plugin settings
"DocModule",
]
43 changes: 43 additions & 0 deletions modin/config/envvars.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

"""Module houses Modin configs originated from environment variables."""

import importlib
import os
import secrets
import sys
Expand Down Expand Up @@ -831,6 +832,48 @@ class LazyExecution(EnvironmentVariable, type=str):
default = "Auto"


class DocModule(EnvironmentVariable, type=ExactStr):
    """
    The module that will be used for docstrings.

    The value set here must be a valid, importable module. It should have
    a `DataFrame`, `Series`, and/or several APIs directly (e.g. `read_csv`).
    """

    varname = "MODIN_DOC_MODULE"
    default = "pandas"

    @classmethod
    def put(cls, value: str) -> None:
        """
        Assign a value to the DocModule config and re-apply docstrings.

        Parameters
        ----------
        value : str
            Config value to set.
        """
        super().put(value)
        # Docstrings may already have been generated from the previous doc module,
        # while callers expect the new docs to be in effect as soon as the config
        # is set. Reloading the `modin.pandas` submodules, and then the package
        # itself, regenerates them.
        import modin.pandas as pd

        # Order matters: submodules are reloaded in this exact sequence and the
        # package is reloaded last.
        submodule_names = (
            "accessor",
            "base",
            "dataframe",
            "general",
            "groupby",
            "io",
            "iterator",
            "series",
            "series_utils",
            "utils",
            "window",
        )
        for submodule_name in submodule_names:
            importlib.reload(getattr(pd, submodule_name))
        importlib.reload(pd)


class DaskThreadsPerWorker(EnvironmentVariable, type=int):
"""Number of threads per Dask worker."""

Expand Down
17 changes: 17 additions & 0 deletions modin/config/test/docs_module/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# Re-export the stand-in documentation objects at package level so they can be
# fetched from this package by attribute name (`DataFrame`, `Series`, `read_csv`).
from .classes import DataFrame, Series
from .functions import read_csv

# Names exported by this package.
__all__ = ["DataFrame", "Series", "read_csv"]
24 changes: 24 additions & 0 deletions modin/config/test/docs_module/classes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.


# Stand-in documentation class: only the docstring of each method matters.
class DataFrame:
    def apply(self):
        """This is a test of the documentation module for DataFrame."""
        # The docstring above is compared verbatim by the doc-module test; keep it as is.
        return None


# Stand-in documentation class: only the docstring of each method matters.
class Series:
    def isna(self):
        """This is a test of the documentation module for Series."""
        # The docstring above is compared verbatim by the doc-module test; keep it as is.
        return None
17 changes: 17 additions & 0 deletions modin/config/test/docs_module/functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.


# Stand-in documentation function: only its docstring matters.
def read_csv():
    """Test override for functions on the module."""
    # The docstring above is compared verbatim by the doc-module test; keep it as is.
    return None
30 changes: 30 additions & 0 deletions modin/config/test/test_envvars.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,36 @@ def test_custom_help(make_custom_envvar):
assert "custom var" in make_custom_envvar.get_help()


def test_doc_module():
    """
    Check that ``DocModule`` replaces Modin docstrings with those of a plugin module.

    Names defined by the plugin module must take their docstrings from it; names
    the plugin does not define must still carry the pandas docstrings.
    """
    import pandas

    import modin.pandas as pd
    from modin.config import DocModule

    # `DocModule.put` changes process-wide state (the config value and the
    # docstrings of the reloaded `modin.pandas` modules). Remember the current
    # value and restore it afterwards so the override does not leak into tests
    # that run later in the same process.
    previous_doc_module = DocModule.get()
    try:
        DocModule.put("modin.config.test.docs_module")

        # Test for override
        assert (
            pd.DataFrame.apply.__doc__
            == "This is a test of the documentation module for DataFrame."
        )
        # Test for pandas doc when method is not defined on the plugin module
        assert pandas.DataFrame.isna.__doc__ in pd.DataFrame.isna.__doc__
        assert pandas.DataFrame.isnull.__doc__ in pd.DataFrame.isnull.__doc__
        # Test for override
        assert (
            pd.Series.isna.__doc__
            == "This is a test of the documentation module for Series."
        )
        # Test for pandas doc when method is not defined on the plugin module
        assert pandas.Series.isnull.__doc__ in pd.Series.isnull.__doc__
        assert pandas.Series.apply.__doc__ in pd.Series.apply.__doc__
        # Test for override
        assert pd.read_csv.__doc__ == "Test override for functions on the module."
        # Test for pandas doc when function is not defined on module.
        assert pandas.read_table.__doc__ in pd.read_table.__doc__
    finally:
        # Re-applies the previous documentation module (this reloads `modin.pandas`).
        DocModule.put(previous_doc_module)


def test_hdk_envvar():
try:
import pyhdk
Expand Down
20 changes: 20 additions & 0 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1093,6 +1093,26 @@ def isin(self, values): # noqa: PR01, RT01, D200
"""
return super(DataFrame, self).isin(values)

def isna(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These had to be added to both Series and DataFrame because they share docs, but we may not want them to in the Documentation class.

In a separate PR, I think we should add the rest of these, but for now I have limited it to these two as a proof of concept for what will be needed in the future. The tests will not pass unless Series and DataFrame each have their own definitions.

"""
Detect missing values.

Returns
-------
The result of detecting missing values.
"""
return super(DataFrame, self).isna()

def isnull(self):
    """
    Detect missing values.

    Returns
    -------
    The result of detecting missing values.
    """
    # Pure delegation to the parent implementation. The method is defined on
    # ``DataFrame`` itself so that it carries its own docstring instead of
    # sharing the inherited one with ``Series``.
    return super(DataFrame, self).isnull()

def iterrows(self): # noqa: D200
"""
Iterate over ``DataFrame`` rows as (index, ``Series``) pairs.
Expand Down
20 changes: 20 additions & 0 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1183,6 +1183,26 @@ def isin(self, values): # noqa: PR01, RT01, D200
"""
return super(Series, self).isin(values, shape_hint="column")

def isna(self):
    """
    Detect missing values.

    Returns
    -------
    The result of detecting missing values.
    """
    # Pure delegation to the parent implementation. The method is defined on
    # ``Series`` itself so that it carries its own docstring instead of
    # sharing the inherited one with ``DataFrame``.
    return super(Series, self).isna()

def isnull(self):
    """
    Detect missing values.

    Returns
    -------
    The result of detecting missing values.
    """
    # Pure delegation to the parent implementation. The method is defined on
    # ``Series`` itself so that it carries its own docstring instead of
    # sharing the inherited one with ``DataFrame``.
    return super(Series, self).isnull()

def item(self): # noqa: RT01, D200
"""
Return the first element of the underlying data as a Python scalar.
Expand Down
25 changes: 23 additions & 2 deletions modin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
)

from modin._version import get_versions
from modin.config import Engine, StorageFormat
from modin.config import DocModule, Engine, StorageFormat

T = TypeVar("T")
"""Generic type parameter"""
Expand Down Expand Up @@ -399,6 +399,22 @@ def _inherit_docstrings(
are not defined in target class (but are defined in the ancestor class),
which means that ancestor class attribute docstrings could also change.
"""
# Import the docs module and get the class (e.g. `DataFrame`).
imported_doc_module = importlib.import_module(DocModule.get())
# Set the default parent so we can use it in case some docs are missing from
# parent module.
default_parent = parent
# Try to get the parent object from the doc module, and if it isn't there,
# get it from parent instead. We only do this if we are overriding pandas
# documentation. We don't touch other docs.
if DocModule.get() != DocModule.default and "pandas" in str(
getattr(parent, "__module__", "")
):
parent = getattr(imported_doc_module, getattr(parent, "__name__", ""), parent)
if parent != default_parent:
# Reset API link in case the docs are overridden.
apilink = None
overwrite_existing = True

def _documentable_obj(obj: object) -> bool:
"""Check if `obj` docstring could be patched."""
Expand All @@ -421,7 +437,12 @@ def decorator(cls_or_func: Fn) -> Fn:
if attr in seen:
continue
seen.add(attr)
parent_obj = getattr(parent, attr, None)
# Try to get the attribute from the docs class first, then
# from the default parent (pandas), and if it's not in either,
# set `parent_obj` to `None`.
parent_obj = getattr(
parent, attr, getattr(default_parent, attr, None)
)
if (
parent_obj in excluded
or not _documentable_obj(parent_obj)
Expand Down