Skip to content

Commit

Permalink
Merge branch 'master' into ehsan/avoid_tmp_vars
Browse files Browse the repository at this point in the history
  • Loading branch information
ehsantn committed Jan 14, 2021
2 parents 9b1ed7d + 1976c66 commit c53de53
Show file tree
Hide file tree
Showing 69 changed files with 2,825 additions and 581 deletions.
1 change: 0 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ exclude =
numba/core/itanium_mangler.py
numba/core/generators.py
numba/misc/appdirs.py
numba/core/interpreter.py
numba/core/caching.py
numba/core/debuginfo.py
numba/core/annotations/pretty_annotate.py
Expand Down
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ Numba
:target: https://numba.discourse.group/
:alt: Discourse

.. image:: https://zenodo.org/badge/3659275.svg
:target: https://zenodo.org/badge/latestdoi/3659275
:alt: Zenodo DOI

A Just-In-Time Compiler for Numerical Functions in Python
#########################################################

Expand Down
12 changes: 11 additions & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ trigger:

variables:
# Change the following along with adding new TEST_START_INDEX.
TEST_COUNT: 20
TEST_COUNT: 22

jobs:
# Mac and Linux use the same template with different matrixes
Expand Down Expand Up @@ -121,6 +121,16 @@ jobs:
NUMPY: '1.19'
CONDA_ENV: travisci
TEST_START_INDEX: 17
py39_np115:
PYTHON: '3.9'
NUMPY: '1.16'
CONDA_ENV: travisci
TEST_START_INDEX: 18
py39_np119:
PYTHON: '3.9'
NUMPY: '1.19'
CONDA_ENV: travisci
TEST_START_INDEX: 19

- template: buildscripts/azure/azure-windows.yml
parameters:
Expand Down
10 changes: 5 additions & 5 deletions buildscripts/azure/azure-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ jobs:
vmImage: ${{ parameters.vmImage }}
strategy:
matrix:
py38_np118:
PYTHON: '3.8'
NUMPY: '1.18'
py39_np119:
PYTHON: '3.9'
NUMPY: '1.19'
CONDA_ENV: 'testenv'
TEST_START_INDEX: 18
TEST_START_INDEX: 20
py37_np115:
PYTHON: '3.7'
NUMPY: '1.15'
CONDA_ENV: 'testenv'
TEST_START_INDEX: 19
TEST_START_INDEX: 21

steps:
- task: CondaEnvironment@1
Expand Down
3 changes: 1 addition & 2 deletions buildscripts/incremental/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,4 @@ python setup.py build_ext -q --inplace --debug $EXTRA_BUILD_EXT_FLAGS
# during distutils-dependent tests -- e.g. test_pycc)

# Install numba locally for use in `numba -s` sys info tool at test time
# `-iNOWHERE` make a fake index to avoid auto downloading dependency
python -m pip install -iNOWHERE -e .
python -m pip install --no-deps -e .
1 change: 1 addition & 0 deletions docs/source/developer/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ Developer Manual
caching.rst
threading_implementation.rst
literal.rst
llvm_timings.rst
debugging.rst
roadmap.rst
107 changes: 107 additions & 0 deletions docs/source/developer/llvm_timings.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
.. _developer-llvm-timings:

====================
Notes on timing LLVM
====================


Getting LLVM Pass Timings
-------------------------

The dispatcher stores LLVM pass timings in the dispatcher object metadata under
the ``llvm_pass_timings`` key when :envvar:`NUMBA_LLVM_PASS_TIMINGS` is
enabled or ``numba.config.LLVM_PASS_TIMINGS`` is set to a truthy value.
The timings information contains details on how much time
has been spent in each pass. The pass timings are also grouped by their purpose.
For example, there will be pass timings for function-level pre-optimizations,
module-level optimizations, and object code generation.


Code Example
~~~~~~~~~~~~

.. literalinclude:: ../../../numba/tests/doc_examples/test_llvm_pass_timings.py
:language: python
:caption: from ``test_pass_timings`` of ``numba/tests/doc_examples/test_llvm_pass_timings.py``
:start-after: magictoken.ex_llvm_pass_timings.begin
:end-before: magictoken.ex_llvm_pass_timings.end
:dedent: 16
:linenos:

Example output:

.. code-block:: text
Printing pass timings for JITCodeLibrary('DocsLLVMPassTimings.test_pass_timings.<locals>.foo')
Total time: 0.0376
== #0 Function passes on '_ZN5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex'
Percent: 4.8%
Total 0.0018s
Top timings:
0.0015s ( 81.6%) SROA #3
0.0002s ( 9.3%) Early CSE #2
0.0001s ( 4.0%) Simplify the CFG #9
0.0000s ( 1.5%) Prune NRT refops #4
0.0000s ( 1.1%) Post-Dominator Tree Construction #5
== #1 Function passes on '_ZN7cpython5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex'
Percent: 0.8%
Total 0.0003s
Top timings:
0.0001s ( 30.4%) Simplify the CFG #10
0.0001s ( 24.1%) Early CSE #3
0.0001s ( 17.8%) SROA #4
0.0000s ( 8.8%) Prune NRT refops #5
0.0000s ( 5.6%) Post-Dominator Tree Construction #6
== #2 Function passes on 'cfunc._ZN5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex'
Percent: 0.5%
Total 0.0002s
Top timings:
0.0001s ( 27.7%) Early CSE #4
0.0001s ( 26.8%) Simplify the CFG #11
0.0000s ( 13.8%) Prune NRT refops #6
0.0000s ( 7.4%) Post-Dominator Tree Construction #7
0.0000s ( 6.7%) Dominator Tree Construction #29
== #3 Module passes (cheap optimization for refprune)
Percent: 3.7%
Total 0.0014s
Top timings:
0.0007s ( 52.0%) Combine redundant instructions
0.0001s ( 5.4%) Function Integration/Inlining
0.0001s ( 4.9%) Prune NRT refops #2
0.0001s ( 4.8%) Natural Loop Information
0.0001s ( 4.6%) Post-Dominator Tree Construction #2
== #4 Module passes (full optimization)
Percent: 43.9%
Total 0.0165s
Top timings:
0.0032s ( 19.5%) Combine redundant instructions #9
0.0022s ( 13.5%) Combine redundant instructions #7
0.0010s ( 6.1%) Induction Variable Simplification
0.0008s ( 4.8%) Unroll loops #2
0.0007s ( 4.5%) Loop Vectorization
== #5 Finalize object
Percent: 46.3%
Total 0.0174s
Top timings:
0.0060s ( 34.6%) X86 DAG->DAG Instruction Selection #2
0.0019s ( 11.0%) Greedy Register Allocator #2
0.0013s ( 7.4%) Machine Instruction Scheduler #2
0.0012s ( 7.1%) Loop Strength Reduction
0.0004s ( 2.3%) Induction Variable Users
API for custom analysis
~~~~~~~~~~~~~~~~~~~~~~~

It is possible to get more details than the summary text in the above example.
The pass timings are stored in a
:class:`numba.misc.llvm_pass_timings.PassTimingsCollection`, which contains
methods for accessing the individual records for each pass.

.. autoclass:: numba.misc.llvm_pass_timings.PassTimingsCollection
:members: get_total_time, list_longest_first, summary, __getitem__, __len__

.. autoclass:: numba.misc.llvm_pass_timings.ProcessedPassTimings
:members: get_raw_data, get_total_time, list_records, list_top, summary

.. autoclass:: numba.misc.llvm_pass_timings.PassTimingRecord
3 changes: 2 additions & 1 deletion docs/source/developer/repomap.rst
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,8 @@ Misc Support
- :ghfile:`numba/core/caching.py` - Disk cache for compiled functions
- :ghfile:`numba/np/npdatetime.py` - Helper functions for implementing NumPy
datetime64 support

- :ghfile:`numba/misc/llvm_pass_timings.py` - Helper to record timings of
LLVM passes.

Core Python Data Types
''''''''''''''''''''''
Expand Down
8 changes: 8 additions & 0 deletions docs/source/reference/envvars.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,14 @@ These variables influence what is printed out during compilation of

Dump the native assembly code of compiled functions.

.. envvar:: NUMBA_LLVM_PASS_TIMINGS

Set to ``1`` to enable recording of pass timings in LLVM;
e.g. ``NUMBA_LLVM_PASS_TIMINGS=1``.
See :ref:`developer-llvm-timings`.

*Default value*: ``0`` (Off)

.. seealso::
:ref:`numba-troubleshooting` and :ref:`architecture`.

Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/numpysupported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ The following top-level functions are supported:
* :func:`numpy.identity`
* :func:`numpy.kaiser`
* :func:`numpy.interp` (only the 3 first arguments; requires NumPy >= 1.10)
* :func:`numpy.intersect1d` (only the first 2 arguments, ``ar1`` and ``ar2``)
* :func:`numpy.linspace` (only the 3-argument form)
* :class:`numpy.ndenumerate`
* :class:`numpy.ndindex`
Expand Down
7 changes: 1 addition & 6 deletions docs/source/user/installing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,8 @@ Raspberry Pi CPU is 64-bit, Raspbian runs it in 32-bit mode, so look at
Conda-forge support for AArch64 is still quite experimental and packages are limited,
but it does work enough for Numba to build and pass tests. To set up the environment:

* Install `conda4aarch64 <https://github.com/jjhelmus/conda4aarch64/releases>`_.
* Install `miniforge <https://github.com/conda-forge/miniforge>`_.
This will create a minimal conda environment.
* Add the ``c4aarch64`` and ``conda-forge`` channels to your conda
configuration::

$ conda config --add channels c4aarch64
$ conda config --add channels conda-forge

* Then you can install Numba from the ``numba`` channel::

Expand Down
74 changes: 74 additions & 0 deletions docs/source/user/vectorize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ argument, which must be filled in by the function. This is because the
array is actually allocated by NumPy's dispatch mechanism, which calls into
the Numba-generated code.

Similar to the :func:`~numba.vectorize` decorator, :func:`~numba.guvectorize`
also has two modes of operation: eager (decoration-time) compilation and
lazy (call-time) compilation.


Here is a very simple example::

@guvectorize([(int64[:], int64, int64[:])], '(n),()->(n)')
Expand Down Expand Up @@ -347,3 +352,72 @@ floating-point values. For example::
If you require precise support for various type signatures, you should
specify them in the :func:`~numba.vectorize` decorator, and not rely
on dynamic compilation.

Dynamic generalized universal functions
=======================================

Similar to a dynamic universal function, if you do not specify any types to
the :func:`~numba.guvectorize` decorator, your Python function will be used
to build a dynamic generalized universal function, or :class:`~numba.GUFunc`.
For example::

from numba import guvectorize

@guvectorize('(n),()->(n)')
def g(x, y, res):
for i in range(x.shape[0]):
res[i] = x[i] + y

We can verify the resulting function :func:`g` is a :class:`~numba.GUFunc`
instance that starts with no supported input types. For instance::

>>> g
<numba._GUFunc 'g'>
>>> g.ufunc
<ufunc 'g'>
>>> g.ufunc.types
[]

Similar to a :class:`~numba.DUFunc`, as one makes calls to :func:`g()`,
Numba generates new kernels for previously unsupported input types. The
following set of interpreter interactions will illustrate how dynamic
compilation works for a :class:`~numba.GUFunc`::

>>> x = np.arange(5, dtype=np.int64)
>>> y = 10
>>> res = np.zeros_like(x)
>>> g(x, y, res)
>>> res
array([10, 11, 12, 13, 14])
>>> g.types
['ll->l']

If this were a normal :func:`guvectorize` function, we would have seen an
exception complaining that the ufunc could not handle the given input types.
When we call :func:`g()` with the input arguments, Numba creates a new loop
for the input types.

We can add additional loops by calling :func:`g` with new arguments::

>>> x = np.arange(5, dtype=np.double)
>>> y = 2.2
>>> res = np.zeros_like(x)
>>> g(x, y, res)

We can now verify that Numba added a second loop for dealing with
floating-point inputs, :code:`"dd->d"`.

>>> g.types # shorthand for g.ufunc.types
['ll->l', 'dd->d']

One can also verify that NumPy ufunc casting rules are working as expected::

>>> x = np.arange(5, dtype=np.int64)
>>> y = 2.2
>>> res = np.zeros_like(x)
>>> g(x, y, res)
>>> res

If you need precise support for various type signatures, you should not rely on dynamic
compilation and should instead specify the types as the first
argument in the :func:`~numba.guvectorize` decorator.
1 change: 0 additions & 1 deletion numba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@
njit
stencil
jit_module
jitclass
typeof
prange
gdb
Expand Down
7 changes: 6 additions & 1 deletion numba/_dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -740,10 +740,15 @@ static PyTypeObject DispatcherType = {
0, /* tp_del */
0, /* tp_version_tag */
0, /* tp_finalize */
#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
#if PY_MAJOR_VERSION == 3
/* Python 3.8 has two slots, 3.9 has one. */
#if PY_MINOR_VERSION > 7
0, /* tp_vectorcall */
#if PY_MINOR_VERSION == 8
0, /* tp_print */
#endif
#endif
#endif
};


Expand Down
25 changes: 20 additions & 5 deletions numba/_dynfunc.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,15 @@ static PyTypeObject EnvironmentType = {
0, /* tp_del */
0, /* tp_version_tag */
0, /* tp_finalize */
#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
#if PY_MAJOR_VERSION == 3
/* Python 3.8 has two slots, 3.9 has one. */
#if PY_MINOR_VERSION > 7
0, /* tp_vectorcall */
#if PY_MINOR_VERSION == 8
0, /* tp_print */
#endif
#endif
#endif
};

/* A closure object is created for each call to make_function(), and stored
Expand Down Expand Up @@ -240,10 +245,15 @@ static PyTypeObject ClosureType = {
0, /* tp_del */
0, /* tp_version_tag */
0, /* tp_finalize */
#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
#if PY_MAJOR_VERSION == 3
/* Python 3.8 has two slots, 3.9 has one. */
#if PY_MINOR_VERSION > 7
0, /* tp_vectorcall */
#if PY_MINOR_VERSION == 8
0, /* tp_print */
#endif
#endif
#endif
};


Expand Down Expand Up @@ -452,9 +462,14 @@ static PyTypeObject GeneratorType = {
0, /* tp_del */
0, /* tp_version_tag */
0, /* tp_finalize */
#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 7
0, /* tp_vectorcall */
0, /* tp_print */
#if PY_MAJOR_VERSION == 3
/* Python 3.8 has two slots, 3.9 has one. */
#if PY_MINOR_VERSION > 7
0, /* tp_vectorcall */
#if PY_MINOR_VERSION == 8
0, /* tp_print */
#endif
#endif
#endif
};

Expand Down

0 comments on commit c53de53

Please sign in to comment.