
dev: Add simple pre-commit setup #445

Merged 6 commits on Jun 1, 2023

Changes from 5 commits
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,10 @@
repos:
  - repo: https://github.com/psf/black
    rev: 22.8.0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/flake8
    rev: 4.0.1
    hooks:
      - id: flake8
        types: [file, python]
20 changes: 20 additions & 0 deletions docs/mkdocs/docs/technical/contributing.md
@@ -184,6 +184,26 @@ We recommend using Visual Studio 2022 (or later) to install the compiler (MSVC v
The Python that comes with Visual Studio is sufficient for creating release builds, but for debug builds, you will have
to separately download from [Python.org](https://www.python.org/downloads/windows/).

Pre-commit hooks setup
----------------------

We use [pre-commit](https://pre-commit.com/) to run formatting (black) and lint (flake8) checks on the codebase before each commit.

To install the pre-commit hooks, run:

```bash
pip install pre-commit
pre-commit install
```

This installs the pre-commit hooks into your local git repository; they will then run automatically against the staged files every time you commit.

If you want to run the pre-commit hooks on all files, run:

```bash
pre-commit run --all-files
```
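
For day-to-day use, two other standard pre-commit commands are worth knowing; the hook id below is the one defined in `.pre-commit-config.yaml` above:

```bash
# Run a single hook (here: black) against every file in the repository
pre-commit run black --all-files

# Bump the pinned `rev` entries in .pre-commit-config.yaml to the latest tags
pre-commit autoupdate
```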

Running Python tests
--------------------

19 changes: 19 additions & 0 deletions pyproject.toml
@@ -24,3 +24,22 @@ manylinux-x86_64-image = "ghcr.io/man-group/cibuildwheel_manylinux:latest"
build = "cp*-win_amd64"
before-all = "bash {project}/build_tooling/prep_cpp_build.sh"
before-build = "set"

[tool.black]
line-length = 120
target_version = ['py36', 'py37', 'py38', 'py39', 'py310']
preview = true
# This must be kept in sync with the exclude list in setup.cfg.
exclude = '''
/(
| \.git
| \.github
| \.mypy_cache
| \.vscode
| \.idea
| build_tooling
| cpp
| docs
| static
)/
'''
2 changes: 1 addition & 1 deletion python/arcticdb/adapters/s3_library_adapter.py
@@ -109,7 +109,7 @@ def _parse_query(self, query: str) -> ParsedQuery:
raise ValueError(
"Invalid S3 URI. "
f"Invalid query parameter '{key}' passed in. "
f"Value query parameters: "
"Value query parameters: "
f"{list(field_dict.keys())}"
)

5 changes: 4 additions & 1 deletion python/arcticdb/options.py
@@ -110,4 +110,7 @@ def __init__(
self.columns_per_segment = columns_per_segment

def __repr__(self):
return f"LibraryOptions(dynamic_schema={self.dynamic_schema}, dedup={self.dedup}, rows_per_segment={self.rows_per_segment}, columns_per_segment={self.columns_per_segment})"
return (
f"LibraryOptions(dynamic_schema={self.dynamic_schema}, dedup={self.dedup},"
f" rows_per_segment={self.rows_per_segment}, columns_per_segment={self.columns_per_segment})"
)
8 changes: 4 additions & 4 deletions python/arcticdb/version_store/_common.py
@@ -47,9 +47,8 @@ def __new__(cls, times, columns_names, columns_values):
if not all(times.shape[0] == cv.shape[0] for cv in columns_values):
s = np.array([cv.shape[0] for cv in columns_values])
raise ValueError(
"Inconsistent size of column values. times.shape[0]={} must match cv.shape[0] for all column values. actual={}".format(
times.shape[0], s
)
"Inconsistent size of column values. times.shape[0]={} must match cv.shape[0] for all column values."
" actual={}".format(times.shape[0], s)
Contributor: It is not clear if the `.format` would extend to strings on a separate line; I would have preferred another level of brackets.

Collaborator (author): I have opened #448 to make it clearer.

)
return tuple.__new__(cls, (times, columns_names, columns_values))

@@ -68,7 +67,8 @@ def _iloc(self, item, resolve_slice=None):
if isinstance(item, tuple):
if len(item) != 2:
raise ValueError(
"Only support 2 dimensional indexing where dimension 0 is the row indexing and dimension 1 is column one"
"Only support 2 dimensional indexing where dimension 0 is the row indexing and dimension 1 is"
" column one"
)
col_filter = item[1]
item = item[0]
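For context on the review exchange above: Python concatenates adjacent string literals at parse time, before the trailing `.format(...)` call is applied, so the call does format the whole two-line message. A minimal standalone sketch (values made up for illustration, not code from this PR), including the extra level of brackets the reviewer prefers:

```python
# Adjacent string literals are joined by the parser, so .format() on the
# last literal actually receives the whole concatenated message.
msg = (
    "Inconsistent size of column values. times.shape[0]={} must match cv.shape[0]."
    " actual={}".format(3, 5)
)

# The extra brackets suggested in the review make the binding explicit
# without changing behaviour.
explicit = (
    "Inconsistent size of column values. times.shape[0]={} must match cv.shape[0]."
    " actual={}"
).format(3, 5)

assert msg == explicit == (
    "Inconsistent size of column values. times.shape[0]=3 must match cv.shape[0]. actual=5"
)
```
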
15 changes: 8 additions & 7 deletions python/arcticdb/version_store/_normalization.py
@@ -179,9 +179,8 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co
if len(arr) == 0:
if coerce_column_type is None:
raise ArcticNativeNotYetImplemented(
"coercing column type is required when empty column of object type, Column type={} for column={}".format(
arr.dtype, arr_name
)
"coercing column type is required when empty column of object type, Column type={} for column={}"
.format(arr.dtype, arr_name)
)
else:
return arr.astype(coerce_column_type)
@@ -1154,10 +1153,13 @@ def normalize(
log.debug("pickle_on_failure flag set, normalizing the item with MsgPackNormalizer", type(item), ex)
return self.fallback_normalizer.normalize(item)
# Could not normalize with the default handler, pickle_on_failure
log.error(
error_message = (
"Could not normalize item of type: {} with any normalizer."
"You can set pickle_on_failure param to force pickling of this object instead."
"(Note: Pickling has worse performance and stricter memory limitations)",
"(Note: Pickling has worse performance and stricter memory limitations)"
)
log.error(
error_message,
type(item),
ex,
)
@@ -1279,8 +1281,7 @@ def _to_utc_ts(v: "ExplicitlySupportedDates", bound_name: str) -> Timestamp:

if v.tzinfo is None:
log.debug(
"DateRange bounds do not have timestamps, will default to UTC for the query,"
f"DateRange.{bound_name}={v}"
f"DateRange bounds do not have timestamps, will default to UTC for the query,DateRange.{bound_name}={v}"
)
v = v.tz_localize("UTC")

5 changes: 3 additions & 2 deletions python/arcticdb/version_store/_store.py
@@ -157,8 +157,9 @@ def _handle_categorical_columns(symbol, data, throw=True):
if data.dtype.name == "category":
categorical_columns.append(data.name)
if len(categorical_columns) > 0:
message = "Symbol: {}\nDataFrame/Series contains categorical data, cannot append or update\nCategorical columns: {}".format(
symbol, categorical_columns
message = (
"Symbol: {}\nDataFrame/Series contains categorical data, cannot append or update\nCategorical"
" columns: {}".format(symbol, categorical_columns)
)
if throw:
raise ArcticNativeNotYetImplemented(message)
8 changes: 5 additions & 3 deletions python/arcticdb/version_store/library.py
@@ -367,7 +367,7 @@ def write(
"""
if not isinstance(data, NORMALIZABLE_TYPES):
raise ArcticUnsupportedDataTypeException(
f"data is of a type that cannot be normalized. Consider using "
"data is of a type that cannot be normalized. Consider using "
f"write_pickle instead. type(data)=[{type(data)}]"
)

@@ -932,7 +932,8 @@ def handle_symbol(s_):
handle_read_request(s)
else:
raise ArcticInvalidApiUsageException(
f"Unsupported item in the symbols argument s=[{s}] type(s)=[{type(s)}]. Only [str] and [ReadRequest] are supported."
f"Unsupported item in the symbols argument s=[{s}] type(s)=[{type(s)}]. Only [str] and"
" [ReadRequest] are supported."
)

return self._nvs._batch_read_to_versioned_items(
@@ -1389,7 +1390,8 @@ def handle_symbol(s):
handle_read_request(s)
else:
raise ArcticInvalidApiUsageException(
f"Unsupported item in the symbols argument s=[{s}] type(s)=[{type(s)}]. Only [str] and [ReadInfoRequest] are supported."
f"Unsupported item in the symbols argument s=[{s}] type(s)=[{type(s)}]. Only [str] and"
" [ReadInfoRequest] are supported."
)

infos = self._nvs.batch_get_info(symbol_strings, as_ofs)
@@ -54,8 +54,7 @@ class Version:

def __str__(self):
return (
f"{self.data and self.data.iloc[0, 0]}[{'M' if self.metadata else ''}{self.state.value}]"
f"{self.snaps or ''}"
f"{self.data and self.data.iloc[0, 0]}[{'M' if self.metadata else ''}{self.state.value}]{self.snaps or ''}"
)

def can_delete(self):
6 changes: 4 additions & 2 deletions python/tests/integration/arcticdb/test_arctic.py
@@ -488,7 +488,8 @@ def test_repr(moto_s3_uri_incl_bucket):
s3_endpoint += f":{port}"
bucket = moto_s3_uri_incl_bucket.split(":")[-1].split("?")[0]
assert (
repr(lib) == "Library("
repr(lib)
== "Library("
"Arctic("
"config=S3("
f"endpoint={s3_endpoint}, bucket={bucket})), path=pytest_test_lib, storage=s3_storage)"
Expand Down Expand Up @@ -542,7 +543,8 @@ def test_write_object_in_batch_without_pickle_mode(arctic_library):
lib.write_batch([WritePayload("test_1", A("id_1"))])
# omit the part with the full class path as that will change in arcticdb
assert e.value.args[0].startswith(
"payload contains some data of types that cannot be normalized. Consider using write_batch_pickle instead. symbols with bad datatypes"
"payload contains some data of types that cannot be normalized. Consider using write_batch_pickle instead."
" symbols with bad datatypes"
)


5 changes: 4 additions & 1 deletion python/tests/unit/arcticdb/test_column_stats.py
@@ -586,7 +586,10 @@ def test_prune_previous_api():


@pytest.mark.xfail(
reason="ArcticDB/issues/230 This test can be folded in with test_column_stats_object_deleted_with_index_key once the issue is resolved"
reason=(
"ArcticDB/issues/230 This test can be folded in with test_column_stats_object_deleted_with_index_key once the"
" issue is resolved"
)
)
def test_column_stats_object_deleted_with_index_key_batch_methods(lmdb_version_store):
def clear():
62 changes: 62 additions & 0 deletions setup.cfg
@@ -36,6 +36,68 @@ install_requires =
decorator
prometheus_client

[flake8]
# Keep in sync with black's line-length in pyproject.toml (E501 is ignored below anyway).
max-line-length = 120
# Default flake8 3.5 ignored flags
ignore=
    # check ignored by default in flake8. Meaning unclear.
    E24,
    # space before : (needed for how black formats slicing)
    E203,
    # do not assign a lambda expression, use a def
    E731,
    # do not use variables named 'l', 'O', or 'I'
    E741,
    # line break before binary operator
    W503,
    # line break after binary operator
    W504,
    # E501 (line too long) is handled by black
    E501,
    # TODO: adapt the code-base not to ignore the following checks
    # E402: move module-level imports to the top of the file
    E402,
    # E711: use 'is'/'is not' for comparisons against None
    E711,
    # E712: simplify comparisons to True/False
    E712,
    # E722: do not use bare 'except'
    E722,
    # F401: remove unused imports
    F401,
    # F402: remove shadowing of symbols
    F402,
    # F403: remove * imports and include all symbols explicitly
    F403,
    # F405: indirectly imported symbols are present due to * imports
    F405,
    # F523: inspect if the first format is needed
    F523,
    # F541: adapt f-strings without placeholders
    F541,
    # F811: remove redefinition of unused symbols
    F811,
    # F821: flake8 flags symbols in comments where it shouldn't
    F821,
    # F841: remove unused variables
    F841,
    # W391: remove blank lines at the end of the file
    W391

# This must be kept in sync with the black config in pyproject.toml.
exclude=
    .git,
    .github,
    .mypy_cache,
    .vscode,
    .idea,
    build_tooling,
    cpp,
    docs,
    static

[options.extras_require]
Testing =
pytest
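
Both exclude lists carry keep-in-sync comments, but nothing in the PR enforces them. A minimal sketch of a consistency check (illustrative only, not part of this PR; it assumes the exact file layouts shown above and parses black's regex naively):

```python
# Compare the flake8 exclude list in setup.cfg with black's exclude regex
# in pyproject.toml. Run from the repository root. Naive by design.
import configparser
import re

cfg = configparser.ConfigParser()
cfg.read("setup.cfg")
flake8_excludes = {
    item.strip() for item in cfg["flake8"]["exclude"].split(",") if item.strip()
}

with open("pyproject.toml") as f:
    body = re.search(r"exclude\s*=\s*'''(.*?)'''", f.read(), re.DOTALL).group(1)
# Entries look like "| \.git": drop the alternation bar and the regex escapes.
black_excludes = {
    line.strip(" |").replace("\\", "")
    for line in body.splitlines()
    if line.strip(" |") and line.strip() not in ("/(", ")/")
}

out_of_sync = flake8_excludes ^ black_excludes  # symmetric difference
if out_of_sync:
    raise SystemExit(f"exclude lists out of sync: {sorted(out_of_sync)}")
print("exclude lists are in sync")
```
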
4 changes: 3 additions & 1 deletion setup.py
@@ -17,10 +17,11 @@

# experimental flag to indicate that we want
# the dependencies from a conda environment
ARCTICDB_USING_CONDA = os.environ.get("ARCTICDB_USING_CONDA", "0")
ARCTICDB_USING_CONDA = os.environ.get("ARCTICDB_USING_CONDA", "0")
ARCTICDB_USING_CONDA = ARCTICDB_USING_CONDA != "0"
print(f"ARCTICDB_USING_CONDA={ARCTICDB_USING_CONDA}")


def _log_and_run(*cmd, **kwargs):
print("Running " + " ".join(cmd))
subprocess.check_call(cmd, **kwargs)
@@ -170,6 +171,7 @@ def build_extension(self, ext):

assert os.path.exists(dest), f"No output at {dest}, but we didn't get a bad return code from CMake?"


def readme():
github_emoji = re.compile(r":[a-z_]+:")
with open("README.md", encoding="utf-8") as f: