Merge branch 'master' into ehsan/avoid_tmp_vars

numba · Jan 14, 2021 · c53de53 · c53de53
2 parents 9b1ed7d + 1976c66
commit c53de53
Show file tree

Hide file tree

Showing 69 changed files with 2,825 additions and 581 deletions.
diff --git a/.flake8 b/.flake8
@@ -40,7 +40,6 @@ exclude =
     numba/core/itanium_mangler.py
     numba/core/generators.py
     numba/misc/appdirs.py
-    numba/core/interpreter.py
     numba/core/caching.py
     numba/core/debuginfo.py
     numba/core/annotations/pretty_annotate.py

diff --git a/README.rst b/README.rst
@@ -10,6 +10,10 @@ Numba
    :target: https://numba.discourse.group/
    :alt: Discourse
 
+.. image:: https://zenodo.org/badge/3659275.svg
+   :target: https://zenodo.org/badge/latestdoi/3659275
+   :alt: Zenodo DOI
+
 A Just-In-Time Compiler for Numerical Functions in Python
 #########################################################
 

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -3,7 +3,7 @@ trigger:
 
 variables:
   # Change the following along with adding new TEST_START_INDEX.
-  TEST_COUNT: 20
+  TEST_COUNT: 22
 
 jobs:
 # Mac and Linux use the same template with different matrixes
@@ -121,6 +121,16 @@ jobs:
         NUMPY: '1.19'
         CONDA_ENV: travisci
         TEST_START_INDEX: 17
+      py39_np115:
+        PYTHON: '3.9'
+        NUMPY: '1.16'
+        CONDA_ENV: travisci
+        TEST_START_INDEX: 18
+      py39_np119:
+        PYTHON: '3.9'
+        NUMPY: '1.19'
+        CONDA_ENV: travisci
+        TEST_START_INDEX: 19
 
 - template: buildscripts/azure/azure-windows.yml
   parameters:

diff --git a/buildscripts/azure/azure-windows.yml b/buildscripts/azure/azure-windows.yml
@@ -8,16 +8,16 @@ jobs:
     vmImage: ${{ parameters.vmImage }}
   strategy:
     matrix:
-      py38_np118:
-        PYTHON: '3.8'
-        NUMPY: '1.18'
+      py39_np119:
+        PYTHON: '3.9'
+        NUMPY: '1.19'
         CONDA_ENV: 'testenv'
-        TEST_START_INDEX: 18
+        TEST_START_INDEX: 20
       py37_np115:
         PYTHON: '3.7'
         NUMPY: '1.15'
         CONDA_ENV: 'testenv'
-        TEST_START_INDEX: 19
+        TEST_START_INDEX: 21
 
   steps:
     - task: CondaEnvironment@1

diff --git a/buildscripts/incremental/build.sh b/buildscripts/incremental/build.sh
@@ -16,5 +16,4 @@ python setup.py build_ext -q --inplace --debug $EXTRA_BUILD_EXT_FLAGS
 #  during distutils-dependent tests -- e.g. test_pycc)
 
 # Install numba locally for use in `numba -s` sys info tool at test time
-# `-iNOWHERE` make a fake index to avoid auto downloading dependency
-python -m pip install -iNOWHERE -e .
+python -m pip install --no-deps -e .
diff --git a/docs/source/developer/index.rst b/docs/source/developer/index.rst
@@ -24,5 +24,6 @@ Developer Manual
    caching.rst
    threading_implementation.rst
    literal.rst
+   llvm_timings.rst
    debugging.rst
    roadmap.rst
diff --git a/docs/source/developer/llvm_timings.rst b/docs/source/developer/llvm_timings.rst
@@ -0,0 +1,107 @@
+.. _developer-llvm-timings:
+
+====================
+Notes on timing LLVM
+====================
+
+
+Getting LLVM Pass Timings
+-------------------------
+
+The dispatcher stores LLVM pass timings in the dispatcher object metadata under
+the ``llvm_pass_timings`` key when :envvar:`NUMBA_LLVM_PASS_TIMINGS` is
+enabled or ``numba.config.LLVM_PASS_TIMINGS`` is set to a truthy value.
+The timings information contains details on how much time
+has been spent in each pass. The pass timings are also grouped by their purpose.
+For example, there will be pass timings for function-level pre-optimizations,
+module-level optimizations, and object code generation.
+
+
+Code Example
+~~~~~~~~~~~~
+
+.. literalinclude:: ../../../numba/tests/doc_examples/test_llvm_pass_timings.py
+   :language: python
+   :caption: from ``test_pass_timings`` of ``numba/tests/doc_examples/test_llvm_pass_timings.py``
+   :start-after: magictoken.ex_llvm_pass_timings.begin
+   :end-before: magictoken.ex_llvm_pass_timings.end
+   :dedent: 16
+   :linenos:
+
+Example output:
+
+.. code-block:: text
+
+  Printing pass timings for JITCodeLibrary('DocsLLVMPassTimings.test_pass_timings.<locals>.foo')
+  Total time: 0.0376
+  == #0 Function passes on '_ZN5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex'
+  Percent: 4.8%
+  Total 0.0018s
+  Top timings:
+    0.0015s ( 81.6%) SROA #3
+    0.0002s (  9.3%) Early CSE #2
+    0.0001s (  4.0%) Simplify the CFG #9
+    0.0000s (  1.5%) Prune NRT refops #4
+    0.0000s (  1.1%) Post-Dominator Tree Construction #5
+  == #1 Function passes on '_ZN7cpython5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex'
+  Percent: 0.8%
+  Total 0.0003s
+  Top timings:
+    0.0001s ( 30.4%) Simplify the CFG #10
+    0.0001s ( 24.1%) Early CSE #3
+    0.0001s ( 17.8%) SROA #4
+    0.0000s (  8.8%) Prune NRT refops #5
+    0.0000s (  5.6%) Post-Dominator Tree Construction #6
+  == #2 Function passes on 'cfunc._ZN5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex'
+  Percent: 0.5%
+  Total 0.0002s
+  Top timings:
+    0.0001s ( 27.7%) Early CSE #4
+    0.0001s ( 26.8%) Simplify the CFG #11
+    0.0000s ( 13.8%) Prune NRT refops #6
+    0.0000s (  7.4%) Post-Dominator Tree Construction #7
+    0.0000s (  6.7%) Dominator Tree Construction #29
+  == #3 Module passes (cheap optimization for refprune)
+  Percent: 3.7%
+  Total 0.0014s
+  Top timings:
+    0.0007s ( 52.0%) Combine redundant instructions
+    0.0001s (  5.4%) Function Integration/Inlining
+    0.0001s (  4.9%) Prune NRT refops #2
+    0.0001s (  4.8%) Natural Loop Information
+    0.0001s (  4.6%) Post-Dominator Tree Construction #2
+  == #4 Module passes (full optimization)
+  Percent: 43.9%
+  Total 0.0165s
+  Top timings:
+    0.0032s ( 19.5%) Combine redundant instructions #9
+    0.0022s ( 13.5%) Combine redundant instructions #7
+    0.0010s (  6.1%) Induction Variable Simplification
+    0.0008s (  4.8%) Unroll loops #2
+    0.0007s (  4.5%) Loop Vectorization
+  == #5 Finalize object
+  Percent: 46.3%
+  Total 0.0174s
+  Top timings:
+    0.0060s ( 34.6%) X86 DAG->DAG Instruction Selection #2
+    0.0019s ( 11.0%) Greedy Register Allocator #2
+    0.0013s (  7.4%) Machine Instruction Scheduler #2
+    0.0012s (  7.1%) Loop Strength Reduction
+    0.0004s (  2.3%) Induction Variable Users
+
+
+API for custom analysis
+~~~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to get more details than the summary text in the above example.
+The pass timings are stored in a
+:class:`numba.misc.llvm_pass_timings.PassTimingsCollection`, which contains
+methods for accessing the individual records for each pass.
+
+.. autoclass:: numba.misc.llvm_pass_timings.PassTimingsCollection
+    :members: get_total_time, list_longest_first, summary, __getitem__, __len__
+
+.. autoclass:: numba.misc.llvm_pass_timings.ProcessedPassTimings
+    :members: get_raw_data, get_total_time, list_records, list_top, summary
+
+.. autoclass:: numba.misc.llvm_pass_timings.PassTimingRecord
diff --git a/docs/source/developer/repomap.rst b/docs/source/developer/repomap.rst
@@ -255,7 +255,8 @@ Misc Support
 - :ghfile:`numba/core/caching.py` - Disk cache for compiled functions
 - :ghfile:`numba/np/npdatetime.py` - Helper functions for implementing NumPy
   datetime64 support
-
+- :ghfile:`numba/misc/llvm_pass_timings.py` - Helper to record timings of
+  LLVM passes.
 
 Core Python Data Types
 ''''''''''''''''''''''

diff --git a/docs/source/reference/envvars.rst b/docs/source/reference/envvars.rst
@@ -228,6 +228,14 @@ These variables influence what is printed out during compilation of
 
    Dump the native assembly code of compiled functions.
 
+.. envvar:: NUMBA_LLVM_PASS_TIMINGS
+
+    Set to ``1`` to enable recording of pass timings in LLVM;
+    e.g. ``NUMBA_LLVM_PASS_TIMINGS=1``.
+    See :ref:`developer-llvm-timings`.
+
+    *Default value*: ``0`` (Off)
+
 .. seealso::
    :ref:`numba-troubleshooting` and :ref:`architecture`.
 

diff --git a/docs/source/reference/numpysupported.rst b/docs/source/reference/numpysupported.rst
@@ -472,6 +472,7 @@ The following top-level functions are supported:
 * :func:`numpy.identity`
 * :func:`numpy.kaiser`
 * :func:`numpy.interp` (only the 3 first arguments; requires NumPy >= 1.10)
+* :func:`numpy.intersect1d` (only first 2 arguments, ar1 and ar2)
 * :func:`numpy.linspace` (only the 3-argument form)
 * :class:`numpy.ndenumerate`
 * :class:`numpy.ndindex`

diff --git a/docs/source/user/installing.rst b/docs/source/user/installing.rst
@@ -116,13 +116,8 @@ Raspberry Pi CPU is 64-bit, Raspbian runs it in 32-bit mode, so look at
 Conda-forge support for AArch64 is still quite experimental and packages are limited,
 but it does work enough for Numba to build and pass tests.  To set up the environment:
 
-* Install `conda4aarch64 <https://github.com/jjhelmus/conda4aarch64/releases>`_.
+* Install `miniforge <https://github.com/conda-forge/miniforge>`_.
   This will create a minimal conda environment.
-* Add the ``c4aarch64`` and ``conda-forge`` channels to your conda
-  configuration::
-
-    $ conda config --add channels c4aarch64
-    $ conda config --add channels conda-forge
 
 * Then you can install Numba from the ``numba`` channel::
 

diff --git a/docs/source/user/vectorize.rst b/docs/source/user/vectorize.rst
@@ -157,6 +157,11 @@ argument, which must be filled in by the function.  This is because the
 array is actually allocated by NumPy's dispatch mechanism, which calls into
 the Numba-generated code.
 
+Similar to the :func:`~numba.vectorize` decorator, :func:`~numba.guvectorize`
+also has two modes of operation: eager (decoration-time) compilation and
+lazy (call-time) compilation.
+
+
 Here is a very simple example::
 
    @guvectorize([(int64[:], int64, int64[:])], '(n),()->(n)')
@@ -347,3 +352,72 @@ floating-point values.  For example::
 If you require precise support for various type signatures, you should
 specify them in the :func:`~numba.vectorize` decorator, and not rely
 on dynamic compilation.
+
+Dynamic generalized universal functions
+=======================================
+
+Similar to a dynamic universal function, if you do not specify any types to
+the :func:`~numba.guvectorize` decorator, your Python function will be used
+to build a dynamic generalized universal function, or :class:`~numba.GUFunc`.
+For example::
+
+   from numba import guvectorize
+
+   @guvectorize('(n),()->(n)')
+   def g(x, y, res):
+       for i in range(x.shape[0]):
+           res[i] = x[i] + y
+
+We can verify the resulting function :func:`g` is a :class:`~numba.GUFunc`
+instance that starts with no supported input types. For instance::
+
+   >>> g
+   <numba._GUFunc 'g'>
+   >>> g.ufunc
+   <ufunc 'g'>
+   >>> g.ufunc.types
+   []
+
+Similar to a :class:`~numba.DUFunc`, as one makes calls to :func:`g()`,
+Numba generates new kernels for previously unsupported input types. The
+following set of interpreter interactions will illustrate how dynamic
+compilation works for a :class:`~numba.GUFunc`::
+
+   >>> x = np.arange(5, dtype=np.int64)
+   >>> y = 10
+   >>> res = np.zeros_like(x)
+   >>> g(x, y, res)
+   >>> res
+   array([5, 6, 7, 8, 9])
+   >>> g.types
+   ['ll->l']
+
+If this was a normal :func:`guvectorize` function, we would have seen an
+exception complaining that the ufunc could not handle the given input types.
+When we call :func:`g()` with the input arguments, numba creates a new loop
+for the input types.
+
+We can add additional loops by calling :func:`g` with new arguments::
+
+   >>> x = np.arange(5, dtype=np.double)
+   >>> y = 2.2
+   >>> res = np.zeros_like(x)
+   >>> g(x, y, res)
+
+We can now verify that Numba added a second loop, :code:`"dd->d"`, for
+dealing with floating-point inputs::
+
+   >>> g.types  # shorthand for g.ufunc.types
+   ['ll->l', 'dd->d']
+
+One can also verify that NumPy ufunc casting rules are working as expected::
+
+   >>> x = np.arange(5, dtype=np.int64)
+   >>> y = 2.2
+   >>> res = np.zeros_like(x)
+   >>> g(x, y, res)
+   >>> res
+
+If you need precise support for various type signatures, you should not rely
+on dynamic compilation and should instead specify the types as the first
+argument to the :func:`~numba.guvectorize` decorator.
diff --git a/numba/__init__.py b/numba/__init__.py
@@ -66,7 +66,6 @@
     njit
     stencil
     jit_module
-    jitclass
     typeof
     prange
     gdb

diff --git a/numba/_dispatcher.cpp b/numba/_dispatcher.cpp
@@ -740,10 +740,15 @@ static PyTypeObject DispatcherType = {
     0,                                           /* tp_del */
     0,                                           /* tp_version_tag */
     0,                                           /* tp_finalize */
-#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
+#if PY_MAJOR_VERSION == 3
+/* Python 3.8 has two slots, 3.9 has one. */
+#if PY_MINOR_VERSION > 7
     0,                                           /* tp_vectorcall */
+#if PY_MINOR_VERSION == 8
     0,                                           /* tp_print */
 #endif
+#endif
+#endif
 };
 
 

diff --git a/numba/_dynfunc.c b/numba/_dynfunc.c
@@ -135,10 +135,15 @@ static PyTypeObject EnvironmentType = {
     0,                         /* tp_del */
     0,                         /* tp_version_tag */
     0,                         /* tp_finalize */
-#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
+#if PY_MAJOR_VERSION == 3
+/* Python 3.8 has two slots, 3.9 has one. */
+#if PY_MINOR_VERSION > 7
     0,                         /* tp_vectorcall */
+#if PY_MINOR_VERSION == 8
     0,                         /* tp_print */
 #endif
+#endif
+#endif
 };
 
 /* A closure object is created for each call to make_function(), and stored
@@ -240,10 +245,15 @@ static PyTypeObject ClosureType = {
     0,                         /* tp_del */
     0,                         /* tp_version_tag */
     0,                         /* tp_finalize */
-#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
+#if PY_MAJOR_VERSION == 3
+/* Python 3.8 has two slots, 3.9 has one. */
+#if PY_MINOR_VERSION > 7
     0,                         /* tp_vectorcall */
+#if PY_MINOR_VERSION == 8
     0,                         /* tp_print */
 #endif
+#endif
+#endif
 };
 
 
@@ -452,9 +462,14 @@ static PyTypeObject GeneratorType = {
     0,                                        /* tp_del */
     0,                                        /* tp_version_tag */
     0,                                        /* tp_finalize */
-#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
-    0,                                        /* tp_vectorcall */
-    0,                                        /* tp_print */
+#if PY_MAJOR_VERSION == 3
+/* Python 3.8 has two slots, 3.9 has one. */
+#if PY_MINOR_VERSION > 7
+    0,                         /* tp_vectorcall */
+#if PY_MINOR_VERSION == 8
+    0,                         /* tp_print */
+#endif
+#endif
 #endif
 };