From b528205463b6c532c361980f5cb3cd4a5d402a4d Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 25 Sep 2019 17:04:43 -0600 Subject: [PATCH 001/136] Add get_num_threads and set_num_threads functions to tbbpool.cpp These will be used to allow masking threads. --- numba/npyufunc/tbbpool.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index d976c8f8974..b763e228a8f 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -37,6 +37,7 @@ Implement parallel vectorize workqueue on top of Intel TBB. static tbb::task_group *tg = NULL; static tbb::task_scheduler_init *tsi = NULL; static int tsi_count = 0; +int num_threads = 0; static void add_task(void *fn, void *args, void *dims, void *steps, void *data) @@ -206,6 +207,15 @@ static void ready(void) { } +static void set_num_threads(int count) +{ + num_threads = count; +} + +static int get_num_threads(void) +{ + return num_threads; +} MOD_INIT(tbbpool) { @@ -235,7 +245,10 @@ MOD_INIT(tbbpool) PyLong_FromVoidPtr((void*)&do_scheduling_signed)); PyObject_SetAttrString(m, "do_scheduling_unsigned", PyLong_FromVoidPtr((void*)&do_scheduling_unsigned)); - + PyObject_SetAttrString(m, "set_num_threads", + PyLong_FromVoidPtr((void*)&set_num_threads)); + PyObject_SetAttrString(m, "get_num_threads", + PyLong_FromVoidPtr((void*)&get_num_threads)); return MOD_SUCCESS_VAL(m); } From 066c61b97f137ff374ee7170e034d356b264442f Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 25 Sep 2019 17:29:41 -0600 Subject: [PATCH 002/136] Add get_num_threads and set_num_threads to omppool.cpp and workqueue.c --- numba/npyufunc/omppool.cpp | 17 +++++++++++++++++ numba/npyufunc/workqueue.c | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index 32293fd94df..c8dd0d8d31c 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -40,6 +40,8 @@ 
Threading layer on top of OpenMP. static pid_t parent_pid = 0; // 0 is not set, users can't own this anyway #endif +int num_threads = 0; + static void add_task(void *fn, void *args, void *dims, void *steps, void *data) { @@ -182,6 +184,16 @@ static void ready(void) { } +static void set_num_threads(int count) +{ + num_threads = count; +} + +static int get_num_threads(void) +{ + return num_threads; +} + MOD_INIT(omppool) { PyObject *m; @@ -205,5 +217,10 @@ MOD_INIT(omppool) PyLong_FromVoidPtr((void*)&do_scheduling_unsigned)); PyObject_SetAttrString(m, "openmp_vendor", PyString_FromString(_OMP_VENDOR)); + PyObject_SetAttrString(m, "set_num_threads", + PyLong_FromVoidPtr((void*)&set_num_threads)); + PyObject_SetAttrString(m, "get_num_threads", + PyLong_FromVoidPtr((void*)&get_num_threads)); + return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 17c29d22f87..f1366bbe48a 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -51,6 +51,8 @@ typedef struct pthread_mutex_t mutex; } queue_condition_t; +int num_threads = 0; + static int queue_condition_init(queue_condition_t *qc) { @@ -450,6 +452,16 @@ static void reset_after_fork(void) NUM_THREADS = -1; } +static void set_num_threads(int count) +{ + num_threads = count; +} + +static int get_num_threads(void) +{ + return num_threads; +} + MOD_INIT(workqueue) { PyObject *m; @@ -471,6 +483,10 @@ MOD_INIT(workqueue) PyLong_FromVoidPtr(&do_scheduling_signed)); PyObject_SetAttrString(m, "do_scheduling_unsigned", PyLong_FromVoidPtr(&do_scheduling_unsigned)); + PyObject_SetAttrString(m, "set_num_threads", + PyLong_FromVoidPtr((void*)&set_num_threads)); + PyObject_SetAttrString(m, "get_num_threads", + PyLong_FromVoidPtr((void*)&get_num_threads)); return MOD_SUCCESS_VAL(m); } From 33c4c8b72adcb5f5fea1d4152997713ea63c1593 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 25 Sep 2019 17:30:32 -0600 Subject: [PATCH 003/136] Add a basic API to mask the number of 
threads used by parfor This is still a work in progress. Still todo are: - Fix it for workqueue. For some reason if you set the number of threads to a lower number than the max, it only uses 1 thread. - Do the masking in the reduction phase as well. - Clean up the API (better function names?). - Tests. --- numba/npyufunc/parallel.py | 10 ++++++++++ numba/npyufunc/parfor.py | 12 +++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index d2e52e81088..67324d7f45f 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -442,9 +442,14 @@ def raise_with_hint(required): ll.add_symbol('numba_parallel_for', lib.parallel_for) ll.add_symbol('do_scheduling_signed', lib.do_scheduling_signed) ll.add_symbol('do_scheduling_unsigned', lib.do_scheduling_unsigned) + ll.add_symbol('get_num_threads', lib.get_num_threads) + ll.add_symbol('set_num_threads', lib.set_num_threads) launch_threads = CFUNCTYPE(None, c_int)(lib.launch_threads) launch_threads(NUM_THREADS) + global _set_num_threads + _set_num_threads = CFUNCTYPE(None, c_int)(lib.set_num_threads) + _set_num_threads(NUM_THREADS) # set library name so it can be queried global _threading_layer @@ -452,6 +457,11 @@ def raise_with_hint(required): _is_initialized = True +def set_num_threads(n): + if n > NUM_THREADS or n < 0: + raise ValueError("The number of threads must be between 0 and %s" % NUM_THREADS) + _set_num_threads(n) + _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 1096f32873d..9386794a9dc 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1291,11 +1291,17 @@ def load_range(v): do_scheduling = builder.module.get_or_insert_function(scheduling_fnty, name="do_scheduling_unsigned") + get_num_threads = builder.module.get_or_insert_function( + 
lc.Type.function(lc.Type.int(types.intp.bitwidth), []), + name="get_num_threads") + + num_threads = builder.call(get_num_threads, []) + builder.call( do_scheduling, [ context.get_constant( - types.uintp, num_dim), dim_starts, dim_stops, context.get_constant( - types.uintp, get_thread_count()), sched, context.get_constant( + types.uintp, num_dim), dim_starts, dim_stops, num_threads, + sched, context.get_constant( types.intp, debug_flag)]) # Get the LLVM vars for the Numba IR reduction array vars. @@ -1422,7 +1428,7 @@ def load_range(v): nshapes = len(sig_dim_dict) + 1 shapes = cgutils.alloca_once(builder, intp_t, size=nshapes, name="pshape") # For now, outer loop size is the same as number of threads - builder.store(context.get_constant(types.intp, get_thread_count()), shapes) + builder.store(num_threads, shapes) # Individual shape variables go next i = 1 for dim_sym in occurances: From 96cc3d354f73be3e2f7452aa2c5792ab2fe01726 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 27 Sep 2019 16:24:11 -0600 Subject: [PATCH 004/136] Some work in progress on masking threads in the reduction phase --- numba/npyufunc/parfor.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 9386794a9dc..3a864a62597 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -6,6 +6,8 @@ import linecache import os import sys +import operator + import numpy as np import llvmlite.llvmpy.core as lc @@ -19,7 +21,7 @@ get_definition, guard, find_callname, get_call_table, is_pure, get_np_ufunc_typ, get_unused_var_name, find_potential_aliases, - is_const_call) + is_const_call, next_label) from numba.analysis import (compute_use_defs, compute_live_map, compute_dead_maps, compute_cfg_from_blocks) from ..typing import signature @@ -58,6 +60,7 @@ def _lower_parfor_parallel(lowerer, parfor): lowerer.fndesc.typemap = copy.copy(orig_typemap) typemap = lowerer.fndesc.typemap varmap 
= lowerer.varmap + builder = lowerer.builder if config.DEBUG_ARRAY_OPT: print("_lower_parfor_parallel") @@ -97,6 +100,23 @@ def _lower_parfor_parallel(lowerer, parfor): parfor_redvars, parfor_reddict = numba.parfor.get_parfor_reductions( parfor, parfor.params, lowerer.fndesc.calltypes) + + + builder.module.get_or_insert_function( + lc.Type.function(lc.Type.int(types.intp.bitwidth), []), + name="get_num_threads") + + get_num_threads_sig = signature(types.intp) + get_num_threads_intrinsic = ir.Intrinsic('get_num_threads', get_num_threads_sig, []) + + num_threads = ir.Expr.call(get_num_threads_intrinsic, (), (), loc) + lowerer.fndesc.calltypes[num_threads] = get_num_threads_sig + + get_num_threads_var = ir.Var(scope, mk_unique_var('get_num_threads'), loc) + get_num_threads_assign = ir.Assign(num_threads, get_num_threads_var, loc) + typemap[get_num_threads_var.name] = types.intp + lowerer.lower_inst(get_num_threads_assign) + # init reduction array allocation here. nredvars = len(parfor_redvars) redarrs = {} @@ -221,6 +241,17 @@ def _lower_parfor_parallel(lowerer, parfor): typemap[index_var.name] = types.uintp lowerer.lower_inst(index_var_assign) + cond = ir.Expr.binop(operator.gt, index_var, get_num_threads_var, loc) + cond_var = ir.Var(scope, mk_unique_var('cond'), loc) + cond_assign = ir.Assign(cond, cond_var, loc) + typemap[cond_var.name] = types.intp + ir.lower_inst(cond_assign) + + truebr = next_label() + falsebr = next_label() + branch = ir.Branch(cond, truebr, falsebr, loc) + lowerer.lower_inst(branch) + redsetitem = ir.SetItem(redarr_var, index_var, redtoset, loc) lowerer.fndesc.calltypes[redsetitem] = signature(types.none, typemap[redarr_var.name], typemap[index_var.name], redvar_typ) From 2ea2cbb1cd323f15445eb6802316b6acd983246e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 14 Oct 2019 16:25:20 -0600 Subject: [PATCH 005/136] Correct the code for calling get_num_threads in the parfor reduction phase --- numba/npyufunc/parfor.py | 16 ++++++++-------- 1 
file changed, 8 insertions(+), 8 deletions(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 3a864a62597..1285fb05ff0 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -100,20 +100,20 @@ def _lower_parfor_parallel(lowerer, parfor): parfor_redvars, parfor_reddict = numba.parfor.get_parfor_reductions( parfor, parfor.params, lowerer.fndesc.calltypes) - - - builder.module.get_or_insert_function( + get_num_threads = builder.module.get_or_insert_function( lc.Type.function(lc.Type.int(types.intp.bitwidth), []), name="get_num_threads") - get_num_threads_sig = signature(types.intp) - get_num_threads_intrinsic = ir.Intrinsic('get_num_threads', get_num_threads_sig, []) + num_threads = builder.call(get_num_threads, []) - num_threads = ir.Expr.call(get_num_threads_intrinsic, (), (), loc) - lowerer.fndesc.calltypes[num_threads] = get_num_threads_sig + # get_num_threads_sig = signature(types.intp) + # get_num_threads_intrinsic = ir.Intrinsic('get_num_threads', get_num_threads_sig, []) + # + # num_threads = ir.Expr.call(get_num_threads_intrinsic, (), (), loc) + # lowerer.fndesc.calltypes[num_threads] = get_num_threads_sig get_num_threads_var = ir.Var(scope, mk_unique_var('get_num_threads'), loc) - get_num_threads_assign = ir.Assign(num_threads, get_num_threads_var, loc) + get_num_threads_assign = ir.Assign(ir.Const(num_threads, loc), get_num_threads_var, loc) typemap[get_num_threads_var.name] = types.intp lowerer.lower_inst(get_num_threads_assign) From 305c97b76aeb386d9bee3d8a579dfe7f632ffd5c Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 14 Oct 2019 18:45:54 -0600 Subject: [PATCH 006/136] Some more work on using get_num_threads in the reduction phase It still doesn't work. I'm not sure exactly how to generate this logic with the numba IR. 
--- numba/npyufunc/parfor.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 1285fb05ff0..baf96c914fc 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -235,27 +235,41 @@ def _lower_parfor_parallel(lowerer, parfor): redtoset = redvar # For each thread, initialize the per-worker reduction array to the current reduction array value. + redblocks = [] for j in range(get_thread_count()): + block = ir.Block(scope, loc) + redblocks.append(block) index_var = ir.Var(scope, mk_unique_var("index_var"), loc) index_var_assign = ir.Assign(ir.Const(j, loc), index_var, loc) typemap[index_var.name] = types.uintp + block.body.append(index_var_assign) lowerer.lower_inst(index_var_assign) cond = ir.Expr.binop(operator.gt, index_var, get_num_threads_var, loc) cond_var = ir.Var(scope, mk_unique_var('cond'), loc) cond_assign = ir.Assign(cond, cond_var, loc) typemap[cond_var.name] = types.intp - ir.lower_inst(cond_assign) + lowerer.fndesc.calltypes[cond] = signature(types.i1, types.intp, types.intp) + lowerer.lower_inst(cond_assign) truebr = next_label() + true_block = ir.Block(scope, loc) + lowerer.blkmap[truebr] = true_block + # lowerer.blocks[truebr] = true_block falsebr = next_label() - branch = ir.Branch(cond, truebr, falsebr, loc) - lowerer.lower_inst(branch) + false_block = ir.Block(scope, loc) + lowerer.blkmap[falsebr] = false_block + # lowerer.blocks[falsebr] = false_block redsetitem = ir.SetItem(redarr_var, index_var, redtoset, loc) lowerer.fndesc.calltypes[redsetitem] = signature(types.none, typemap[redarr_var.name], typemap[index_var.name], redvar_typ) - lowerer.lower_inst(redsetitem) + false_block.body.append(redsetitem) + + branch = ir.Branch(cond_var, truebr, falsebr, loc) + block.body.append(branch) + # lowerer.lower_inst(branch) + parfor.init_block.body.append(ir.Jump(redblocks[0], loc)) # compile parfor body as a separate function to be used with 
GUFuncWrapper flags = copy.copy(parfor.flags) From 6082c0ac0afd4380a2c318e8d2d3e125dbc4da7c Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 23 Oct 2019 15:13:55 -0600 Subject: [PATCH 007/136] Remove WIP get_thread_count stuff from the parfor reduction phase --- numba/npyufunc/parfor.py | 44 +--------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index baf96c914fc..da4c119e6aa 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -100,23 +100,6 @@ def _lower_parfor_parallel(lowerer, parfor): parfor_redvars, parfor_reddict = numba.parfor.get_parfor_reductions( parfor, parfor.params, lowerer.fndesc.calltypes) - get_num_threads = builder.module.get_or_insert_function( - lc.Type.function(lc.Type.int(types.intp.bitwidth), []), - name="get_num_threads") - - num_threads = builder.call(get_num_threads, []) - - # get_num_threads_sig = signature(types.intp) - # get_num_threads_intrinsic = ir.Intrinsic('get_num_threads', get_num_threads_sig, []) - # - # num_threads = ir.Expr.call(get_num_threads_intrinsic, (), (), loc) - # lowerer.fndesc.calltypes[num_threads] = get_num_threads_sig - - get_num_threads_var = ir.Var(scope, mk_unique_var('get_num_threads'), loc) - get_num_threads_assign = ir.Assign(ir.Const(num_threads, loc), get_num_threads_var, loc) - typemap[get_num_threads_var.name] = types.intp - lowerer.lower_inst(get_num_threads_assign) - # init reduction array allocation here. nredvars = len(parfor_redvars) redarrs = {} @@ -235,41 +218,16 @@ def _lower_parfor_parallel(lowerer, parfor): redtoset = redvar # For each thread, initialize the per-worker reduction array to the current reduction array value. 
- redblocks = [] for j in range(get_thread_count()): - block = ir.Block(scope, loc) - redblocks.append(block) index_var = ir.Var(scope, mk_unique_var("index_var"), loc) index_var_assign = ir.Assign(ir.Const(j, loc), index_var, loc) typemap[index_var.name] = types.uintp - block.body.append(index_var_assign) lowerer.lower_inst(index_var_assign) - cond = ir.Expr.binop(operator.gt, index_var, get_num_threads_var, loc) - cond_var = ir.Var(scope, mk_unique_var('cond'), loc) - cond_assign = ir.Assign(cond, cond_var, loc) - typemap[cond_var.name] = types.intp - lowerer.fndesc.calltypes[cond] = signature(types.i1, types.intp, types.intp) - lowerer.lower_inst(cond_assign) - - truebr = next_label() - true_block = ir.Block(scope, loc) - lowerer.blkmap[truebr] = true_block - # lowerer.blocks[truebr] = true_block - falsebr = next_label() - false_block = ir.Block(scope, loc) - lowerer.blkmap[falsebr] = false_block - # lowerer.blocks[falsebr] = false_block - redsetitem = ir.SetItem(redarr_var, index_var, redtoset, loc) lowerer.fndesc.calltypes[redsetitem] = signature(types.none, typemap[redarr_var.name], typemap[index_var.name], redvar_typ) - false_block.body.append(redsetitem) - - branch = ir.Branch(cond_var, truebr, falsebr, loc) - block.body.append(branch) - # lowerer.lower_inst(branch) - parfor.init_block.body.append(ir.Jump(redblocks[0], loc)) + lowerer.lower_inst(redsetitem) # compile parfor body as a separate function to be used with GUFuncWrapper flags = copy.copy(parfor.flags) From 2651ce7f9700846183e82329fbff5203ff080fa5 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 23 Oct 2019 15:15:08 -0600 Subject: [PATCH 008/136] Remove unused variable --- numba/npyufunc/parfor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index da4c119e6aa..0404b4084ed 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -60,7 +60,6 @@ def _lower_parfor_parallel(lowerer, parfor): lowerer.fndesc.typemap = 
copy.copy(orig_typemap) typemap = lowerer.fndesc.typemap varmap = lowerer.varmap - builder = lowerer.builder if config.DEBUG_ARRAY_OPT: print("_lower_parfor_parallel") From 63189ca9f567dc6ba35d59e860e2295959dc02a2 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 23 Oct 2019 16:23:47 -0600 Subject: [PATCH 009/136] Make num_threads thread local in the tbb backend --- numba/npyufunc/tbbpool.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index b763e228a8f..db1ca3d8f38 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -37,7 +37,15 @@ Implement parallel vectorize workqueue on top of Intel TBB. static tbb::task_group *tg = NULL; static tbb::task_scheduler_init *tsi = NULL; static int tsi_count = 0; -int num_threads = 0; + +#ifdef _MSC_VER +#define THREAD_LOCAL(ty) __declspec(thread) ty +#else +/* Non-standard C99 extension that's understood by gcc and clang */ +#define THREAD_LOCAL(ty) __thread ty +#endif + +static THREAD_LOCAL(int) num_threads = 0; static void add_task(void *fn, void *args, void *dims, void *steps, void *data) From 8ee5652dddb66b2eb6e766c39ce1e7aa73266478 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 28 Oct 2019 12:50:55 -0600 Subject: [PATCH 010/136] Fix flake8 issues --- numba/npyufunc/parallel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 67324d7f45f..a7485f2f502 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -459,9 +459,11 @@ def raise_with_hint(required): def set_num_threads(n): if n > NUM_THREADS or n < 0: - raise ValueError("The number of threads must be between 0 and %s" % NUM_THREADS) + raise ValueError("The number of threads must be between 0 and %s" % + NUM_THREADS) _set_num_threads(n) + _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = 
int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) From 4eac9cf439c712245e7254047ad05efe803ef791 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 29 Oct 2019 13:46:08 -0600 Subject: [PATCH 011/136] Make sure threads are launched when calling set_num_threads() --- numba/npyufunc/parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index a7485f2f502..aa48a595468 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -458,6 +458,8 @@ def raise_with_hint(required): def set_num_threads(n): + _launch_threads() + if n > NUM_THREADS or n < 0: raise ValueError("The number of threads must be between 0 and %s" % NUM_THREADS) From 5287cf8279c1c6bc6998c49d53acf818595f17e2 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 29 Oct 2019 16:04:58 -0600 Subject: [PATCH 012/136] Add a jitted version of set_num_threads for testing purposes --- numba/npyufunc/parallel.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index aa48a595468..0e679c365c6 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -26,7 +26,7 @@ from numba.npyufunc import ufuncbuilder from numba.numpy_support import as_dtype -from numba import types, config, utils +from numba import types, config, utils, njit from numba.npyufunc.wrappers import _wrapper_info @@ -466,6 +466,16 @@ def set_num_threads(n): _set_num_threads(n) +@njit +def _set_num_threads_jit(n): + """ + Jitted version of set_num_threads for testing + + It does not check that n is in the right range and it will fail if + _launch_threads() has not already been called. 
+ """ + _set_num_threads(n) + _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) From 464784b15bf4318c3b7d646e5efb6eb689fe44de Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 30 Oct 2019 15:28:57 -0600 Subject: [PATCH 013/136] Don't allow set_num_threads(0) --- numba/npyufunc/parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 0e679c365c6..c00e03301e2 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -460,8 +460,8 @@ def raise_with_hint(required): def set_num_threads(n): _launch_threads() - if n > NUM_THREADS or n < 0: - raise ValueError("The number of threads must be between 0 and %s" % + if n > NUM_THREADS or n < 1: + raise ValueError("The number of threads must be between 1 and %s" % NUM_THREADS) _set_num_threads(n) From fcd6cd136ee0887b3db36d04019d94a74686096e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 30 Oct 2019 15:31:16 -0600 Subject: [PATCH 014/136] Make set_num_threads() return the old number of threads --- numba/npyufunc/parallel.py | 6 +++--- numba/npyufunc/tbbpool.cpp | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index c00e03301e2..f99929b3b84 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -448,7 +448,7 @@ def raise_with_hint(required): launch_threads = CFUNCTYPE(None, c_int)(lib.launch_threads) launch_threads(NUM_THREADS) global _set_num_threads - _set_num_threads = CFUNCTYPE(None, c_int)(lib.set_num_threads) + _set_num_threads = CFUNCTYPE(c_int, c_int)(lib.set_num_threads) _set_num_threads(NUM_THREADS) # set library name so it can be queried @@ -463,7 +463,7 @@ def set_num_threads(n): if n > NUM_THREADS or n < 1: raise ValueError("The number of threads must be between 1 and %s" % NUM_THREADS) - _set_num_threads(n) + return 
_set_num_threads(n) @njit @@ -474,7 +474,7 @@ def _set_num_threads_jit(n): It does not check that n is in the right range and it will fail if _launch_threads() has not already been called. """ - _set_num_threads(n) + return _set_num_threads(n) _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index db1ca3d8f38..1eb77571f0a 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -215,9 +215,11 @@ static void ready(void) { } -static void set_num_threads(int count) +static int set_num_threads(int count) { + int old_num_threads = num_threads; num_threads = count; + return old_num_threads; } static int get_num_threads(void) From 9f0561faa2c81d2ef3d3780af6f44e9e2a946ae8 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 30 Oct 2019 16:11:45 -0600 Subject: [PATCH 015/136] Add some documentation for thread masking --- docs/source/user/threading-layer.rst | 18 ++++++++++++++++++ numba/npyufunc/parallel.py | 28 +++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/docs/source/user/threading-layer.rst b/docs/source/user/threading-layer.rst index 576979e9592..dc00b7e5706 100644 --- a/docs/source/user/threading-layer.rst +++ b/docs/source/user/threading-layer.rst @@ -167,3 +167,21 @@ system level libraries, some additional things to note: threading layer. * For Windows users running Python 2.7, the ``tbb`` threading layer is not available. + +Masking Threads +--------------- + +The number of threads used by numba is based on the number of CPU cores +available (``multiprocessing.cpu_count()``), but it can be overridden with the +:envvar:`NUMBA_NUM_THREADS` environment variable. + +The total number of threads is in the variable +:obj:`numba.npyufunc.parallel.NUM_THREADS`: + +.. 
autodata:: numba.npyufunc.parallel.NUM_THREADS + :annotation: + +This total can be masked to a smaller number using +:func:`numba.npyufunc.parallel.set_num_threads`: + +.. autofunction:: numba.npyufunc.parallel.set_num_threads diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index f99929b3b84..daa424858f3 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -39,7 +39,10 @@ def get_thread_count(): raise ValueError("Number of threads specified must be > 0.") return t - +#: The total (maximum) number of threads used by numba parallel. +#: +#: Defaults to the number of cores but can be overridden with the +#: envvar:`NUMBA_NUM_THREADS` environment variable. NUM_THREADS = get_thread_count() @@ -458,6 +461,29 @@ def raise_with_hint(required): def set_num_threads(n): + """ + Set the number of threads to use for parallel execution + + This functionality works by masking out threads that are not used. + Therefore, the number of threads *n* must be less than or equal to the + total number of threads that are launched, which is set to the number of + cores by default but can be configured with the + :envvar:`NUMBA_NUM_THREADS` environment variable. See the + :func:`get_thread_count` function. + + Parameters + ---------- + n: The number of threads. Must be between 1 and :obj:`numba.npyufunc.parallel.NUM_THREADS`. + + Returns + ------- + The old number of threads. 
+ + See Also + -------- + get_num_threads, NUM_THREADS + + """ _launch_threads() if n > NUM_THREADS or n < 1: From 87793a779c9c4cc7a954ab77efd497927bba8e3e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 30 Oct 2019 16:28:46 -0600 Subject: [PATCH 016/136] Add get_num_threads function to Python --- numba/npyufunc/parallel.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index daa424858f3..93998859ec8 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -450,10 +450,14 @@ def raise_with_hint(required): launch_threads = CFUNCTYPE(None, c_int)(lib.launch_threads) launch_threads(NUM_THREADS) + global _set_num_threads _set_num_threads = CFUNCTYPE(c_int, c_int)(lib.set_num_threads) _set_num_threads(NUM_THREADS) + global _get_num_threads + _get_num_threads = CFUNCTYPE(c_int)(lib.set_num_threads) + # set library name so it can be queried global _threading_layer _threading_layer = libname @@ -492,6 +496,29 @@ def set_num_threads(n): return _set_num_threads(n) +def get_num_threads(): + """ + Get the number of threads used for parallel execution. + + By default (if :func:`~.set_num_threads` is never called), all + :obj:`numba.npyufunc.parallel.NUM_THREADS` threads are used. + + This number is less than or equal to the total number of threads that are + launched, :obj:`numba.npyufunc.parallel.NUM_THREADS`. + + Returns + ------- + The number of threads. 
+ + See Also + -------- + set_num_threads, NUM_THREADS + + """ + _launch_threads() + return _get_num_threads() + + @njit def _set_num_threads_jit(n): """ From 1ebef50f893692fcd443c2e8684e1f7663b7ec64 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 30 Oct 2019 16:29:06 -0600 Subject: [PATCH 017/136] Improved the documentation for thread masking --- docs/source/user/threading-layer.rst | 25 +++++++++++++++++++------ numba/npyufunc/parallel.py | 15 ++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/docs/source/user/threading-layer.rst b/docs/source/user/threading-layer.rst index dc00b7e5706..81f950fbd2b 100644 --- a/docs/source/user/threading-layer.rst +++ b/docs/source/user/threading-layer.rst @@ -168,20 +168,33 @@ system level libraries, some additional things to note: * For Windows users running Python 2.7, the ``tbb`` threading layer is not available. -Masking Threads ---------------- +Setting the Number of Threads +----------------------------- The number of threads used by numba is based on the number of CPU cores available (``multiprocessing.cpu_count()``), but it can be overridden with the :envvar:`NUMBA_NUM_THREADS` environment variable. The total number of threads is in the variable -:obj:`numba.npyufunc.parallel.NUM_THREADS`: +:obj:`numba.npyufunc.parallel.NUM_THREADS`. + +For some use cases, it may be desirable to set the number of threads to a +lower value, so that numba can be used with higher level parallelism. + +The number of threads can be set dynamically at runtime using +:func:`numba.npyufunc.parallel.set_num_threads`. Note that +:func:`~.set_num_threads` only allows setting the number +of threads to a smaller value than :obj:`~.NUM_THREADS`. + +The number of threads can be accessed with +:func:`numba.npyufunc.parallel.get_num_threads`: + +API Reference +~~~~~~~~~~~~~ .. 
autodata:: numba.npyufunc.parallel.NUM_THREADS :annotation: -This total can be masked to a smaller number using -:func:`numba.npyufunc.parallel.set_num_threads`: - .. autofunction:: numba.npyufunc.parallel.set_num_threads + +.. autofunction:: numba.npyufunc.parallel.get_num_threads diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 93998859ec8..94e9505953e 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -466,18 +466,19 @@ def raise_with_hint(required): def set_num_threads(n): """ - Set the number of threads to use for parallel execution + Set the number of threads to use for parallel execution. + + By default, all :obj:`numba.npyufunc.parallel.NUM_THREADS` threads are + used. This functionality works by masking out threads that are not used. - Therefore, the number of threads *n* must be less than or equal to the - total number of threads that are launched, which is set to the number of - cores by default but can be configured with the - :envvar:`NUMBA_NUM_THREADS` environment variable. See the - :func:`get_thread_count` function. + Therefore, the number of threads *n* must be less than or equal to + :obj:`~.NUM_THREADS`, the total number of threads that are launched. See + its documentation for more details. Parameters ---------- - n: The number of threads. Must be between 1 and :obj:`numba.npyufunc.parallel.NUM_THREADS`. + n: The number of threads. Must be between 1 and NUM_THREADS. 
Returns ------- From 81292d1cc4bd99451c7baca3329e8f89e751e1c7 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 31 Oct 2019 12:36:44 -0600 Subject: [PATCH 018/136] Fix flake8 errors --- numba/npyufunc/parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 94e9505953e..d1e95494f31 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -39,6 +39,7 @@ def get_thread_count(): raise ValueError("Number of threads specified must be > 0.") return t + #: The total (maximum) number of threads used by numba parallel. #: #: Defaults to the number of cores but can be overridden with the @@ -530,6 +531,7 @@ def _set_num_threads_jit(n): """ return _set_num_threads(n) + _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) From 86e70f9b215e41d644ccc66550e10f432b3d17f2 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 12:22:15 -0700 Subject: [PATCH 019/136] Move get_num_threads and set_num_threads to their own C module --- numba/npyufunc/_num_threads.c | 39 +++++++++++++++++++++++++++++++++++ numba/npyufunc/omppool.cpp | 17 --------------- numba/npyufunc/parallel.py | 27 ++++++++++++++---------- numba/npyufunc/tbbpool.cpp | 25 +--------------------- numba/npyufunc/workqueue.c | 16 -------------- setup.py | 8 +++++-- 6 files changed, 62 insertions(+), 70 deletions(-) create mode 100644 numba/npyufunc/_num_threads.c diff --git a/numba/npyufunc/_num_threads.c b/numba/npyufunc/_num_threads.c new file mode 100644 index 00000000000..0ba43910474 --- /dev/null +++ b/numba/npyufunc/_num_threads.c @@ -0,0 +1,39 @@ +// Thread local num_threads variable for masking out the total number of +// launched threads. 
+ +#include "../_pymodule.h" + +#ifdef _MSC_VER +#define THREAD_LOCAL(ty) __declspec(thread) ty +#else +/* Non-standard C99 extension that's understood by gcc and clang */ +#define THREAD_LOCAL(ty) __thread ty +#endif + +static THREAD_LOCAL(int) num_threads = 0; + +static int set_num_threads(int count) +{ + int old_num_threads = num_threads; + num_threads = count; + return old_num_threads; +} + +static int get_num_threads(void) +{ + return num_threads; +} + +MOD_INIT(_num_threads) +{ + PyObject *m; + MOD_DEF(m, "_num_threads", "No docs", NULL) + if (m == NULL) + return MOD_ERROR_VAL; + PyObject_SetAttrString(m, "set_num_threads", + PyLong_FromVoidPtr((void*)&set_num_threads)); + PyObject_SetAttrString(m, "get_num_threads", + PyLong_FromVoidPtr((void*)&get_num_threads)); + + return MOD_SUCCESS_VAL(m); +} diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index c8dd0d8d31c..32293fd94df 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -40,8 +40,6 @@ Threading layer on top of OpenMP. 
static pid_t parent_pid = 0; // 0 is not set, users can't own this anyway #endif -int num_threads = 0; - static void add_task(void *fn, void *args, void *dims, void *steps, void *data) { @@ -184,16 +182,6 @@ static void ready(void) { } -static void set_num_threads(int count) -{ - num_threads = count; -} - -static int get_num_threads(void) -{ - return num_threads; -} - MOD_INIT(omppool) { PyObject *m; @@ -217,10 +205,5 @@ MOD_INIT(omppool) PyLong_FromVoidPtr((void*)&do_scheduling_unsigned)); PyObject_SetAttrString(m, "openmp_vendor", PyString_FromString(_OMP_VENDOR)); - PyObject_SetAttrString(m, "set_num_threads", - PyLong_FromVoidPtr((void*)&set_num_threads)); - PyObject_SetAttrString(m, "get_num_threads", - PyLong_FromVoidPtr((void*)&get_num_threads)); - return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index d1e95494f31..9c1643114fa 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -18,6 +18,7 @@ from threading import RLock as threadRLock import multiprocessing from contextlib import contextmanager +from ctypes import CFUNCTYPE, c_int import numpy as np @@ -324,8 +325,6 @@ def _launch_threads(): if _is_initialized: return - from ctypes import CFUNCTYPE, c_int - def select_known_backend(backend): """ Loads a specific threading layer backend based on string @@ -446,25 +445,31 @@ def raise_with_hint(required): ll.add_symbol('numba_parallel_for', lib.parallel_for) ll.add_symbol('do_scheduling_signed', lib.do_scheduling_signed) ll.add_symbol('do_scheduling_unsigned', lib.do_scheduling_unsigned) - ll.add_symbol('get_num_threads', lib.get_num_threads) - ll.add_symbol('set_num_threads', lib.set_num_threads) launch_threads = CFUNCTYPE(None, c_int)(lib.launch_threads) launch_threads(NUM_THREADS) - global _set_num_threads - _set_num_threads = CFUNCTYPE(c_int, c_int)(lib.set_num_threads) - _set_num_threads(NUM_THREADS) - - global _get_num_threads - _get_num_threads = 
CFUNCTYPE(c_int)(lib.set_num_threads) - # set library name so it can be queried global _threading_layer _threading_layer = libname _is_initialized = True +def _load_num_threads_funcs(): + from . import _num_threads as lib + + ll.add_symbol('get_num_threads', lib.get_num_threads) + ll.add_symbol('set_num_threads', lib.set_num_threads) + + global _set_num_threads + _set_num_threads = CFUNCTYPE(c_int, c_int)(lib.set_num_threads) + _set_num_threads(NUM_THREADS) + + global _get_num_threads + _get_num_threads = CFUNCTYPE(c_int)(lib.set_num_threads) + +_load_num_threads_funcs() + def set_num_threads(n): """ Set the number of threads to use for parallel execution. diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index 1eb77571f0a..d976c8f8974 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -38,15 +38,6 @@ static tbb::task_group *tg = NULL; static tbb::task_scheduler_init *tsi = NULL; static int tsi_count = 0; -#ifdef _MSC_VER -#define THREAD_LOCAL(ty) __declspec(thread) ty -#else -/* Non-standard C99 extension that's understood by gcc and clang */ -#define THREAD_LOCAL(ty) __thread ty -#endif - -static THREAD_LOCAL(int) num_threads = 0; - static void add_task(void *fn, void *args, void *dims, void *steps, void *data) { @@ -215,17 +206,6 @@ static void ready(void) { } -static int set_num_threads(int count) -{ - int old_num_threads = num_threads; - num_threads = count; - return old_num_threads; -} - -static int get_num_threads(void) -{ - return num_threads; -} MOD_INIT(tbbpool) { @@ -255,10 +235,7 @@ MOD_INIT(tbbpool) PyLong_FromVoidPtr((void*)&do_scheduling_signed)); PyObject_SetAttrString(m, "do_scheduling_unsigned", PyLong_FromVoidPtr((void*)&do_scheduling_unsigned)); - PyObject_SetAttrString(m, "set_num_threads", - PyLong_FromVoidPtr((void*)&set_num_threads)); - PyObject_SetAttrString(m, "get_num_threads", - PyLong_FromVoidPtr((void*)&get_num_threads)); + return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/workqueue.c 
b/numba/npyufunc/workqueue.c index f1366bbe48a..17c29d22f87 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -51,8 +51,6 @@ typedef struct pthread_mutex_t mutex; } queue_condition_t; -int num_threads = 0; - static int queue_condition_init(queue_condition_t *qc) { @@ -452,16 +450,6 @@ static void reset_after_fork(void) NUM_THREADS = -1; } -static void set_num_threads(int count) -{ - num_threads = count; -} - -static int get_num_threads(void) -{ - return num_threads; -} - MOD_INIT(workqueue) { PyObject *m; @@ -483,10 +471,6 @@ MOD_INIT(workqueue) PyLong_FromVoidPtr(&do_scheduling_signed)); PyObject_SetAttrString(m, "do_scheduling_unsigned", PyLong_FromVoidPtr(&do_scheduling_unsigned)); - PyObject_SetAttrString(m, "set_num_threads", - PyLong_FromVoidPtr((void*)&set_num_threads)); - PyObject_SetAttrString(m, "get_num_threads", - PyLong_FromVoidPtr((void*)&get_num_threads)); return MOD_SUCCESS_VAL(m); } diff --git a/setup.py b/setup.py index b36604f6d9f..b86330a80eb 100644 --- a/setup.py +++ b/setup.py @@ -148,6 +148,10 @@ def get_ext_modules(): "numba/_pymodule.h"], **np_compile_args) + ext_npyufunc_num_threads = Extension(name="numba.npyufunc._num_threads", + sources=["numba/npyufunc/_num_threads.c"], + depends=["numba/_pymodule.h"], + ) ext_npyufunc_workqueue_impls = [] @@ -279,8 +283,8 @@ def check_file_at_path(path2file): include_dirs=["numba"]) ext_modules = [ext_dynfunc, ext_dispatcher, ext_helperlib, ext_typeconv, - ext_npyufunc_ufunc, ext_mviewbuf, ext_nrt_python, - ext_jitclass_box, ext_cuda_extras] + ext_npyufunc_ufunc, ext_npyufunc_num_threads, ext_mviewbuf, + ext_nrt_python, ext_jitclass_box, ext_cuda_extras] ext_modules += ext_npyufunc_workqueue_impls From 424202cb48e5f1c231a29ca9d0cb90f8cc61fc7e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 17:00:45 -0700 Subject: [PATCH 020/136] Remove the return value from set_num_threads --- numba/npyufunc/_num_threads.c | 4 +--- numba/npyufunc/parallel.py | 2 +- 2 files 
changed, 2 insertions(+), 4 deletions(-) diff --git a/numba/npyufunc/_num_threads.c b/numba/npyufunc/_num_threads.c index 0ba43910474..78156d78a77 100644 --- a/numba/npyufunc/_num_threads.c +++ b/numba/npyufunc/_num_threads.c @@ -12,11 +12,9 @@ static THREAD_LOCAL(int) num_threads = 0; -static int set_num_threads(int count) +static void set_num_threads(int count) { - int old_num_threads = num_threads; num_threads = count; - return old_num_threads; } static int get_num_threads(void) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 9c1643114fa..b332f394b8c 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -462,7 +462,7 @@ def _load_num_threads_funcs(): ll.add_symbol('set_num_threads', lib.set_num_threads) global _set_num_threads - _set_num_threads = CFUNCTYPE(c_int, c_int)(lib.set_num_threads) + _set_num_threads = CFUNCTYPE(None, c_int)(lib.set_num_threads) _set_num_threads(NUM_THREADS) global _get_num_threads From f353a6f9eab219c32539816ee02b312725cfb207 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 17:01:02 -0700 Subject: [PATCH 021/136] Fix a typo --- numba/npyufunc/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index b332f394b8c..69e60cb1989 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -466,7 +466,7 @@ def _load_num_threads_funcs(): _set_num_threads(NUM_THREADS) global _get_num_threads - _get_num_threads = CFUNCTYPE(c_int)(lib.set_num_threads) + _get_num_threads = CFUNCTYPE(c_int)(lib.get_num_threads) _load_num_threads_funcs() From 1965c975040e247654438d44870e59a243c11146 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 17:01:10 -0700 Subject: [PATCH 022/136] Make get_num_threads and set_num_threads work in a jitted function --- numba/npyufunc/parallel.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git 
a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 69e60cb1989..ede6d631c2a 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -29,6 +29,7 @@ from numba.numpy_support import as_dtype from numba import types, config, utils, njit from numba.npyufunc.wrappers import _wrapper_info +from numba.extending import overload def get_thread_count(): @@ -470,6 +471,23 @@ def _load_num_threads_funcs(): _load_num_threads_funcs() +# Some helpers to make set_num_threads jittable + +def snt_check(n): + from numba.config import NUMBA_NUM_THREADS + msg = "The number of threads must be between 1 and %s" % NUMBA_NUM_THREADS + if n > NUMBA_NUM_THREADS or n < 1: + raise ValueError(msg) + +@overload(snt_check) +def ol_snt_check(n): + from numba.config import NUMBA_NUM_THREADS + msg = "The number of threads must be between 1 and %s" % NUMBA_NUM_THREADS + def impl(n): + if n > NUMBA_NUM_THREADS or n < 1: + raise ValueError(msg) + return impl + def set_num_threads(n): """ Set the number of threads to use for parallel execution. 
@@ -497,11 +515,15 @@ def set_num_threads(n): """ _launch_threads() - if n > NUM_THREADS or n < 1: - raise ValueError("The number of threads must be between 1 and %s" % - NUM_THREADS) - return _set_num_threads(n) + snt_check(n) + _set_num_threads(n) +@overload(set_num_threads) +def ol_set_num_threads(n): + def impl(n): + snt_check(n) + _set_num_threads(n) + return impl def get_num_threads(): """ @@ -522,9 +544,13 @@ def get_num_threads(): set_num_threads, NUM_THREADS """ - _launch_threads() return _get_num_threads() +@overload(get_num_threads) +def ol_get_num_threads(): + def impl(): + return _get_num_threads() + return impl @njit def _set_num_threads_jit(n): From 8f06b5e910df9e6423fad99a3244ac503078d83a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 17:01:35 -0700 Subject: [PATCH 023/136] Remove _set_num_threads_jit --- numba/npyufunc/parallel.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index ede6d631c2a..4652b1da55b 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -552,17 +552,6 @@ def impl(): return _get_num_threads() return impl -@njit -def _set_num_threads_jit(n): - """ - Jitted version of set_num_threads for testing - - It does not check that n is in the right range and it will fail if - _launch_threads() has not already been called. 
- """ - return _set_num_threads(n) - - _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) From 538cc40bb30661075e2bbf76831041af44b39fd7 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 17:04:30 -0700 Subject: [PATCH 024/136] Add get_num_threads and set_num_threads to __init__.py --- numba/__init__.py | 3 ++- numba/npyufunc/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/numba/__init__.py b/numba/__init__.py index aabe97fc99a..5887cb33718 100644 --- a/numba/__init__.py +++ b/numba/__init__.py @@ -31,7 +31,8 @@ jit_module) # Re-export vectorize decorators and the thread layer querying function -from .npyufunc import vectorize, guvectorize, threading_layer +from .npyufunc import (vectorize, guvectorize, threading_layer, + get_num_threads, set_num_threads) # Re-export Numpy helpers from .numpy_support import carray, farray, from_dtype diff --git a/numba/npyufunc/__init__.py b/numba/npyufunc/__init__.py index b1736268832..2f44fc80f02 100644 --- a/numba/npyufunc/__init__.py +++ b/numba/npyufunc/__init__.py @@ -4,7 +4,7 @@ from .decorators import Vectorize, GUVectorize, vectorize, guvectorize from ._internal import PyUFunc_None, PyUFunc_Zero, PyUFunc_One from . 
import _internal, array_exprs, parfor -from .parallel import threading_layer +from .parallel import threading_layer, get_num_threads, set_num_threads if hasattr(_internal, 'PyUFunc_ReorderableNone'): PyUFunc_ReorderableNone = _internal.PyUFunc_ReorderableNone del _internal, array_exprs From 74abf6d3260ce449e5e74b7e580d7a96f7adb999 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 17:09:48 -0700 Subject: [PATCH 025/136] Remove an unused import --- numba/npyufunc/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 5185cbd8fb3..e903c9df4df 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -27,7 +27,7 @@ from numba.npyufunc import ufuncbuilder from numba.numpy_support import as_dtype -from numba import types, config, utils, njit +from numba import types, config, utils from numba.npyufunc.wrappers import _wrapper_info from numba.extending import overload From bf40093cf99e0fb4709fe355fdd16c7cc6a448aa Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 5 Nov 2019 17:41:43 -0700 Subject: [PATCH 026/136] Update the documentation on set_num_threads and friends --- docs/source/reference/envvars.rst | 6 ++-- docs/source/user/threading-layer.rst | 42 +++++++++++++++++++--------- numba/npyufunc/parallel.py | 31 ++++++++++---------- 3 files changed, 47 insertions(+), 32 deletions(-) diff --git a/docs/source/reference/envvars.rst b/docs/source/reference/envvars.rst index 8b0645ea3c0..fd4f3ba4ea4 100644 --- a/docs/source/reference/envvars.rst +++ b/docs/source/reference/envvars.rst @@ -342,7 +342,10 @@ Threading Control of ``OMP_NUM_THREADS`` and ``MKL_NUM_THREADS``. *Default value:* The number of CPU cores on the system as determined at run - time, this can be accessed via ``numba.config.NUMBA_DEFAULT_NUM_THREADS``. + time. This can be accessed via :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`. 
+ + See also the section on :ref:`setting_the_number_of_threads` for + information on how to set the number of threads at runtime. .. envvar:: NUMBA_THREADING_LAYER @@ -363,4 +366,3 @@ Threading Control * ``tbb`` - A threading layer backed by Intel TBB. * ``omp`` - A threading layer backed by OpenMP. * ``workqueue`` - A simple built-in work-sharing task scheduler. - diff --git a/docs/source/user/threading-layer.rst b/docs/source/user/threading-layer.rst index 81f950fbd2b..9e8fcf2b9a2 100644 --- a/docs/source/user/threading-layer.rst +++ b/docs/source/user/threading-layer.rst @@ -168,33 +168,49 @@ system level libraries, some additional things to note: * For Windows users running Python 2.7, the ``tbb`` threading layer is not available. +.. _setting_the_number_of_threads: + Setting the Number of Threads ----------------------------- The number of threads used by numba is based on the number of CPU cores -available (``multiprocessing.cpu_count()``), but it can be overridden with the -:envvar:`NUMBA_NUM_THREADS` environment variable. +available (see :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`), but it can be +overridden with the :envvar:`NUMBA_NUM_THREADS` environment variable. -The total number of threads is in the variable -:obj:`numba.npyufunc.parallel.NUM_THREADS`. +The total number of threads that numba launches is in the variable +:obj:`numba.config.NUMBA_NUM_THREADS`. For some use cases, it may be desirable to set the number of threads to a lower value, so that numba can be used with higher level parallelism. The number of threads can be set dynamically at runtime using -:func:`numba.npyufunc.parallel.set_num_threads`. Note that -:func:`~.set_num_threads` only allows setting the number -of threads to a smaller value than :obj:`~.NUM_THREADS`. +:func:`numba.set_num_threads`. Note that :func:`~.set_num_threads` only allows +setting the number of threads to a smaller value than +:obj:`~.NUMBA_NUM_THREADS`. 
Numba always launches +:obj:`numba.config.NUMBA_NUM_THREADS` threads, but :func:`~.set_num_threads` +causes it to mask out unused threads so they aren't used in computations. -The number of threads can be accessed with -:func:`numba.npyufunc.parallel.get_num_threads`: +The current number of threads used by numba can be accessed with +:func:`numba.get_num_threads`. Both functions work inside of a jitted +function. API Reference ~~~~~~~~~~~~~ -.. autodata:: numba.npyufunc.parallel.NUM_THREADS - :annotation: +.. py:data:: numba.config.NUMBA_NUM_THREADS + + The total (maximum) number of threads launched by numba. + + Defaults :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`, but can be + overridden with the :envvar:`NUMBA_NUM_THREADS` environment variable. + +.. py:data:: numba.config.NUMBA_DEFAULT_NUM_THREADS + + The number of CPU cores on the system (as determined by + ``multiprocessing.cpu_count()``). This is the default value for + :obj:`numba.config.NUMBA_NUM_THREADS` unless the + :envvar:`NUMBA_NUM_THREADS` environment variable is set. -.. autofunction:: numba.npyufunc.parallel.set_num_threads +.. autofunction:: numba.set_num_threads -.. autofunction:: numba.npyufunc.parallel.get_num_threads +.. autofunction:: numba.get_num_threads diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index e903c9df4df..8ea3363fe55 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -42,10 +42,6 @@ def get_thread_count(): return t -#: The total (maximum) number of threads used by numba parallel. -#: -#: Defaults to the number of cores but can be overridden with the -#: envvar:`NUMBA_NUM_THREADS` environment variable. NUM_THREADS = get_thread_count() @@ -492,25 +488,23 @@ def set_num_threads(n): """ Set the number of threads to use for parallel execution. - By default, all :obj:`numba.npyufunc.parallel.NUM_THREADS` threads are - used. + By default, all :obj:`numba.config.NUMBA_NUM_THREADS` threads are used. 
This functionality works by masking out threads that are not used. Therefore, the number of threads *n* must be less than or equal to - :obj:`~.NUM_THREADS`, the total number of threads that are launched. See - its documentation for more details. + :obj:`~.NUMBA_NUM_THREADS`, the total number of threads that are launched. + See its documentation for more details. + + This function can be used inside of a jitted function. Parameters ---------- - n: The number of threads. Must be between 1 and NUM_THREADS. - - Returns - ------- - The old number of threads. + n: The number of threads. Must be between 1 and NUMBA_NUM_THREADS. See Also -------- - get_num_threads, NUM_THREADS + get_num_threads, numba.config.NUMBA_NUM_THREADS, + numba.config.NUMBA_DEFAULT_NUM_THREADS, :envvar:`NUMBA_NUM_THREADS` """ _launch_threads() @@ -530,10 +524,12 @@ def get_num_threads(): Get the number of threads used for parallel execution. By default (if :func:`~.set_num_threads` is never called), all - :obj:`numba.npyufunc.parallel.NUM_THREADS` threads are used. + :obj:`numba.config.NUMBA_NUM_THREADS` threads are used. This number is less than or equal to the total number of threads that are - launched, :obj:`numba.npyufunc.parallel.NUM_THREADS`. + launched, :obj:`numba.config.NUMBA_NUM_THREADS`. + + This function can be used inside of a jitted function. 
Returns ------- @@ -541,7 +537,8 @@ def get_num_threads(): See Also -------- - set_num_threads, NUM_THREADS + set_num_threads, numba.config.NUMBA_NUM_THREADS, + numba.config.NUMBA_DEFAULT_NUM_THREADS, :envvar:`NUMBA_NUM_THREADS` """ return _get_num_threads() From a3f416003a67d9798a9a294177b0a3a077ba3c8e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 6 Nov 2019 15:10:23 -0700 Subject: [PATCH 027/136] Fix flake8 issues --- numba/npyufunc/parallel.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 8ea3363fe55..075c4d95321 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -465,25 +465,30 @@ def _load_num_threads_funcs(): global _get_num_threads _get_num_threads = CFUNCTYPE(c_int)(lib.get_num_threads) + _load_num_threads_funcs() # Some helpers to make set_num_threads jittable + def snt_check(n): from numba.config import NUMBA_NUM_THREADS msg = "The number of threads must be between 1 and %s" % NUMBA_NUM_THREADS if n > NUMBA_NUM_THREADS or n < 1: raise ValueError(msg) + @overload(snt_check) def ol_snt_check(n): from numba.config import NUMBA_NUM_THREADS msg = "The number of threads must be between 1 and %s" % NUMBA_NUM_THREADS + def impl(n): if n > NUMBA_NUM_THREADS or n < 1: raise ValueError(msg) return impl + def set_num_threads(n): """ Set the number of threads to use for parallel execution. @@ -512,6 +517,7 @@ def set_num_threads(n): snt_check(n) _set_num_threads(n) + @overload(set_num_threads) def ol_set_num_threads(n): def impl(n): @@ -519,6 +525,7 @@ def impl(n): _set_num_threads(n) return impl + def get_num_threads(): """ Get the number of threads used for parallel execution. 
@@ -543,12 +550,14 @@ def get_num_threads(): """ return _get_num_threads() + @overload(get_num_threads) def ol_get_num_threads(): def impl(): return _get_num_threads() return impl + _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) From cb5d7cef0acb09a582e93612915ab3ffe9f53b23 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 8 Nov 2019 16:14:41 -0700 Subject: [PATCH 028/136] Start writing tests for get_num_threads and set_num_threads --- numba/tests/test_num_threads.py | 107 ++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 numba/tests/test_num_threads.py diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py new file mode 100644 index 00000000000..fb7d7b6e161 --- /dev/null +++ b/numba/tests/test_num_threads.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function, absolute_import, division + +import numpy as np + +from numba import njit, set_num_threads, get_num_threads, prange, config +from numba import unittest_support as unittest +from .support import TestCase, skip_parfors_unsupported + +class TestNumThreads(TestCase): + _numba_parallel_test_ = False + + def setUp(self): + # Make sure the num_threads is set to the max. This also makes sure + # the threads are launched. 
+ set_num_threads(config.NUMBA_NUM_THREADS) + + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_set_num_threads_basic(self): + max_threads = config.NUMBA_NUM_THREADS + + self.assertEqual(get_num_threads(), max_threads) + set_num_threads(2) + self.assertEqual(get_num_threads(), 2) + set_num_threads(max_threads) + self.assertEqual(get_num_threads(), max_threads) + + with self.assertRaises(ValueError): + set_num_threads(0) + + with self.assertRaises(ValueError): + set_num_threads(max_threads + 1) + + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_set_num_threads_basic_jit(self): + max_threads = config.NUMBA_NUM_THREADS + + @njit + def get_n(): + return get_num_threads() + + @njit + def set_n(n): + set_num_threads(n) + + self.assertEqual(get_n(), max_threads) + set_n(2) + self.assertEqual(get_n(), 2) + set_n(max_threads) + self.assertEqual(get_n(), max_threads) + + @njit + def set_get_n(n): + set_num_threads(n) + return get_num_threads() + + self.assertEqual(set_get_n(2), 2) + self.assertEqual(set_get_n(max_threads), max_threads) + + with self.assertRaises(ValueError): + set_n(0) + + with self.assertRaises(ValueError): + set_n(max_threads + 1) + + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_set_num_threads_outside_jit(self): + + # Test set_num_threads outside a jitted function + set_num_threads(2) + + @njit(parallel=True) + def test_func(): + x = 5 + buf = np.empty((x,)) + for i in prange(x): + buf[i] = get_num_threads() + return buf + + out = test_func() + self.assertTrue(np.all(out == 2)) + + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_set_num_threads_inside_jit(self): + # Test set_num_threads inside a jitted function + @njit(parallel=True) + def test_func(nthreads): + x = 5 + buf = np.empty((x,)) + 
set_num_threads(nthreads) + for i in prange(x): + buf[i] = get_num_threads() + return buf + + mask = 2 + out = test_func(mask) + self.assertTrue(np.all(out == mask)) + + def tearDown(self): + set_num_threads(config.NUMBA_NUM_THREADS) + +if __name__ == '__main__': + unittest.main() From ff17bf8555ef14b280abbd6a42caac3085cf3332 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 15 Nov 2019 13:44:07 -0700 Subject: [PATCH 029/136] Use numpy.testing.assert_equal instead of self.assertEqual(np.all(...)) --- numba/tests/test_num_threads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index fb7d7b6e161..461ba90bc53 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -81,7 +81,7 @@ def test_func(): return buf out = test_func() - self.assertTrue(np.all(out == 2)) + np.testing.assert_equal(out, 2) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") @@ -98,7 +98,7 @@ def test_func(nthreads): mask = 2 out = test_func(mask) - self.assertTrue(np.all(out == mask)) + np.testing.assert_equal(out, mask) def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) From 8502b0354ef2756a00a56cef65e64870b0529ebf Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 18 Nov 2019 15:33:42 -0700 Subject: [PATCH 030/136] Make set_num_threads() affect guvectorized functions This requires setting the number of threads in the backends themselves. This is not yet implemented for workqueue. 
--- numba/npyufunc/omppool.cpp | 4 +- numba/npyufunc/parallel.py | 10 ++++- numba/npyufunc/tbbpool.cpp | 89 +++++++++++++++++++++----------------- numba/npyufunc/workqueue.c | 2 +- numba/npyufunc/workqueue.h | 2 +- 5 files changed, 62 insertions(+), 45 deletions(-) diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index 32293fd94df..3d1c6a4d66a 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -51,7 +51,7 @@ add_task(void *fn, void *args, void *dims, void *steps, void *data) static void parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *data, - size_t inner_ndim, size_t array_count) + size_t inner_ndim, size_t array_count, int num_threads) { typedef void (*func_ptr_t)(char **args, size_t *dims, size_t *steps, void *data); func_ptr_t func = reinterpret_cast(fn); @@ -107,6 +107,8 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat printf("\n"); } + omp_set_num_threads(num_threads); + #pragma omp parallel { size_t * count_space = (size_t *)alloca(sizeof(size_t) * arg_len); diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 075c4d95321..aa6c435f292 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -132,7 +132,7 @@ def as_void_ptr(arg): array_count = len(sig.args) + 1 parallel_for_ty = lc.Type.function(lc.Type.void(), - [byte_ptr_t] * 5 + [intp_t, ] * 2) + [byte_ptr_t] * 5 + [intp_t, ] * 3) parallel_for = mod.get_or_insert_function(parallel_for_ty, name='numba_parallel_for') @@ -146,12 +146,18 @@ def as_void_ptr(arg): ) wrapperlib.add_linking_library(info.library) + get_num_threads = builder.module.get_or_insert_function( + lc.Type.function(lc.Type.int(types.intp.bitwidth), []), + name="get_num_threads") + + num_threads = builder.call(get_num_threads, []) + # Prepare call fnptr = builder.bitcast(tmp_voidptr, byte_ptr_t) innerargs = [as_void_ptr(x) for x in [args, dimensions, steps, data]] builder.call(parallel_for, 
[fnptr] + innerargs + - [intp_t(x) for x in (inner_ndim, array_count)]) + [intp_t(x) for x in (inner_ndim, array_count)] + [num_threads]) # Release the GIL pyapi.restore_thread(thread_state) diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index d976c8f8974..5bcc1029ae0 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -50,7 +50,7 @@ add_task(void *fn, void *args, void *dims, void *steps, void *data) static void parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *data, - size_t inner_ndim, size_t array_count) + size_t inner_ndim, size_t array_count, int num_threads) { static bool printed = false; if(!printed && _DEBUG) @@ -83,49 +83,58 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat printf("\n"); } - using range_t = tbb::blocked_range; - tbb::parallel_for(range_t(0, dimensions[0]), [=](const range_t &range) - { - size_t * count_space = (size_t *)alloca(sizeof(size_t) * arg_len); - char ** array_arg_space = (char**)alloca(sizeof(char*) * array_count); - memcpy(count_space, dimensions, arg_len * sizeof(size_t)); - count_space[0] = range.size(); + tbb::task_arena limited(num_threads); - if(_DEBUG && _TRACE_SPLIT > 1) - { - printf("THREAD %p:", count_space); - printf("count_space: "); - for(size_t j = 0; j < arg_len; j++) - printf("%lu, ", count_space[j]); - printf("\n"); - } - for(size_t j = 0; j < array_count; j++) + limited.execute([&] + { + tg->run([=] { - char * base = args[j]; - size_t step = steps[j]; - ptrdiff_t offset = step * range.begin(); - array_arg_space[j] = base + offset; - - if(_DEBUG && _TRACE_SPLIT > 2) + using range_t = tbb::blocked_range; + tbb::parallel_for(range_t(0, dimensions[0]), [=](const range_t &range) { - printf("Index %ld\n", j); - printf("-->Got base %p\n", (void *)base); - printf("-->Got step %lu\n", step); - printf("-->Got offset %ld\n", offset); - printf("-->Got addr %p\n", (void *)array_arg_space[j]); - } - } - - if(_DEBUG && 
_TRACE_SPLIT > 2) - { - printf("array_arg_space: "); - for(size_t j = 0; j < array_count; j++) - printf("%p, ", (void *)array_arg_space[j]); - printf("\n"); - } - auto func = reinterpret_cast(fn); - func(array_arg_space, count_space, steps, data); + size_t * count_space = (size_t *)alloca(sizeof(size_t) * arg_len); + char ** array_arg_space = (char**)alloca(sizeof(char*) * array_count); + memcpy(count_space, dimensions, arg_len * sizeof(size_t)); + count_space[0] = range.size(); + + if(_DEBUG && _TRACE_SPLIT > 1) + { + printf("THREAD %p:", count_space); + printf("count_space: "); + for(size_t j = 0; j < arg_len; j++) + printf("%lu, ", count_space[j]); + printf("\n"); + } + for(size_t j = 0; j < array_count; j++) + { + char * base = args[j]; + size_t step = steps[j]; + ptrdiff_t offset = step * range.begin(); + array_arg_space[j] = base + offset; + + if(_DEBUG && _TRACE_SPLIT > 2) + { + printf("Index %ld\n", j); + printf("-->Got base %p\n", (void *)base); + printf("-->Got step %lu\n", step); + printf("-->Got offset %ld\n", offset); + printf("-->Got addr %p\n", (void *)array_arg_space[j]); + } + } + + if(_DEBUG && _TRACE_SPLIT > 2) + { + printf("array_arg_space: "); + for(size_t j = 0; j < array_count; j++) + printf("%p, ", (void *)array_arg_space[j]); + printf("\n"); + } + auto func = reinterpret_cast(fn); + func(array_arg_space, count_space, steps, data); + }); + }); }); + limited.execute([&]{ tg->wait(); }); } void ignore_blocking_terminate_assertion( const char*, int, const char*, const char * ) diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 17c29d22f87..428b21845b6 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -246,7 +246,7 @@ void nopfn(void *args, void *dims, void *steps, void *data) {}; static void parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *data, - size_t inner_ndim, size_t array_count) + size_t inner_ndim, size_t array_count, int num_threads) { // args = , diff --git 
a/numba/npyufunc/workqueue.h b/numba/npyufunc/workqueue.h index cfb805c55fd..32f78480bd6 100644 --- a/numba/npyufunc/workqueue.h +++ b/numba/npyufunc/workqueue.h @@ -53,4 +53,4 @@ void ready(void); */ static void parallel_for(void *fn, char **args, size_t *dims, size_t *steps, void *data,\ - size_t inner_ndim, size_t array_count); + size_t inner_ndim, size_t array_count, int num_threads); From 179ce8daff2eac6d15af9343e9f88bdabd768d9a Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Mon, 25 Nov 2019 13:01:50 +0000 Subject: [PATCH 031/136] Fix issues in #4615 (thread masking). This does the following: * Adds the function `get_thread_num()` to the API, this returns a unique id for the calling thread in the current execution context. This is not to be part of the public API but is useful in testing. * Fixes the TBB backend: * removes potential nested parallelism deadlock * adds TLS synchronisation in tbb::task_arena usage * Fixes the OpenMP backend: * Permits nested parallelism * Adds TLS sychronisation in the OMP PARALLEL region * Prevents potential race condition in parallel for launch * Remove global OMP DSO state mutation * Fixes the workqueue backend: * Limits the active queue size based on num_threads * Adds TLS sychronisation as a prequistite task set prior to submitting the distributed kernel work. * Adds some tests (more work needed). 
--- numba/__init__.py | 2 +- numba/npyufunc/__init__.py | 3 +- numba/npyufunc/omppool.cpp | 50 ++++++++- numba/npyufunc/parallel.py | 24 +++- numba/npyufunc/tbbpool.cpp | 141 ++++++++++++++++-------- numba/npyufunc/workqueue.c | 78 +++++++++++-- numba/npyufunc/workqueue.h | 9 ++ numba/tests/test_num_threads.py | 189 +++++++++++++++++++++++++++++++- 8 files changed, 436 insertions(+), 60 deletions(-) diff --git a/numba/__init__.py b/numba/__init__.py index 5887cb33718..30d75fb15f5 100644 --- a/numba/__init__.py +++ b/numba/__init__.py @@ -32,7 +32,7 @@ # Re-export vectorize decorators and the thread layer querying function from .npyufunc import (vectorize, guvectorize, threading_layer, - get_num_threads, set_num_threads) + get_num_threads, set_num_threads, get_thread_num) # Re-export Numpy helpers from .numpy_support import carray, farray, from_dtype diff --git a/numba/npyufunc/__init__.py b/numba/npyufunc/__init__.py index 2f44fc80f02..97448d43597 100644 --- a/numba/npyufunc/__init__.py +++ b/numba/npyufunc/__init__.py @@ -4,7 +4,8 @@ from .decorators import Vectorize, GUVectorize, vectorize, guvectorize from ._internal import PyUFunc_None, PyUFunc_Zero, PyUFunc_One from . import _internal, array_exprs, parfor -from .parallel import threading_layer, get_num_threads, set_num_threads +from .parallel import (threading_layer, get_num_threads, set_num_threads, + get_thread_num) if hasattr(_internal, 'PyUFunc_ReorderableNone'): PyUFunc_ReorderableNone = _internal.PyUFunc_ReorderableNone del _internal, array_exprs diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index 3d1c6a4d66a..79b61b80576 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -40,6 +40,34 @@ Threading layer on top of OpenMP. 
static pid_t parent_pid = 0; // 0 is not set, users can't own this anyway #endif + +#ifdef _MSC_VER +#define THREAD_LOCAL(ty) __declspec(thread) ty +#else +/* Non-standard C99 extension that's understood by gcc and clang */ +#define THREAD_LOCAL(ty) __thread ty +#endif + +static THREAD_LOCAL(int) num_threads = 0; + +static void +set_num_threads(int count) +{ + num_threads = count; +} + +static int +get_num_threads(void) +{ + return num_threads; +} + +static int +get_thread_num(void) +{ + return omp_get_thread_num(); +} + static void add_task(void *fn, void *args, void *dims, void *steps, void *data) { @@ -90,6 +118,10 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat // index variable in OpenMP 'for' statement must have signed integral type for MSVC const ptrdiff_t size = (ptrdiff_t)dimensions[0]; + // holds the shared variable for `num_threads`, this is a bit superfluous + // but present to force thinking about the scope of validity + int agreed_nthreads = num_threads; + if(_DEBUG) { printf("inner_ndim: %lu\n",inner_ndim); @@ -107,12 +139,17 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat printf("\n"); } - omp_set_num_threads(num_threads); - - #pragma omp parallel + // Set the thread mask on the pragma such that the state is scope limited + // and passed via a register on the OMP region call site, this limiting + // global state and racing + #pragma omp parallel num_threads(num_threads), shared(agreed_nthreads) { size_t * count_space = (size_t *)alloca(sizeof(size_t) * arg_len); char ** array_arg_space = (char**)alloca(sizeof(char*) * array_count); + + // tell the active thread team about the number of threads + set_num_threads(agreed_nthreads); + #pragma omp for for(ptrdiff_t r = 0; r < size; r++) { @@ -174,6 +211,7 @@ static void launch_threads(int count) if(count < 1) return; omp_set_num_threads(count); + omp_set_nested(0x1); // enable nesting, control depth with OMP env var } static void 
synchronize(void) @@ -207,5 +245,11 @@ MOD_INIT(omppool) PyLong_FromVoidPtr((void*)&do_scheduling_unsigned)); PyObject_SetAttrString(m, "openmp_vendor", PyString_FromString(_OMP_VENDOR)); + PyObject_SetAttrString(m, "set_num_threads", + PyLong_FromVoidPtr((void*)&set_num_threads)); + PyObject_SetAttrString(m, "get_num_threads", + PyLong_FromVoidPtr((void*)&get_num_threads)); + PyObject_SetAttrString(m, "get_thread_num", + PyLong_FromVoidPtr((void*)&get_thread_num)); return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index aa6c435f292..0a3dfc1e599 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -452,17 +452,19 @@ def raise_with_hint(required): launch_threads = CFUNCTYPE(None, c_int)(lib.launch_threads) launch_threads(NUM_THREADS) + _load_num_threads_funcs(lib) # load late + # set library name so it can be queried global _threading_layer _threading_layer = libname _is_initialized = True -def _load_num_threads_funcs(): - from . 
import _num_threads as lib +def _load_num_threads_funcs(lib): ll.add_symbol('get_num_threads', lib.get_num_threads) ll.add_symbol('set_num_threads', lib.set_num_threads) + ll.add_symbol('get_thread_num', lib.get_thread_num) global _set_num_threads _set_num_threads = CFUNCTYPE(None, c_int)(lib.set_num_threads) @@ -471,8 +473,9 @@ def _load_num_threads_funcs(): global _get_num_threads _get_num_threads = CFUNCTYPE(c_int)(lib.get_num_threads) + global _get_thread_num + _get_thread_num = CFUNCTYPE(c_int)(lib.get_thread_num) -_load_num_threads_funcs() # Some helpers to make set_num_threads jittable @@ -519,13 +522,13 @@ def set_num_threads(n): """ _launch_threads() - snt_check(n) _set_num_threads(n) @overload(set_num_threads) def ol_set_num_threads(n): + _launch_threads() def impl(n): snt_check(n) _set_num_threads(n) @@ -554,15 +557,28 @@ def get_num_threads(): numba.config.NUMBA_DEFAULT_NUM_THREADS, :envvar:`NUMBA_NUM_THREADS` """ + _launch_threads() return _get_num_threads() @overload(get_num_threads) def ol_get_num_threads(): + _launch_threads() def impl(): return _get_num_threads() return impl +def get_thread_num(): + """ + docs + """ + return _get_thread_num() + +@overload(get_thread_num) +def ol_get_thread_num(): + def impl(): + return _get_thread_num() + return impl _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index 5bcc1029ae0..157b069d253 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -38,6 +38,49 @@ static tbb::task_group *tg = NULL; static tbb::task_scheduler_init *tsi = NULL; static int tsi_count = 0; +#ifdef _MSC_VER +#define THREAD_LOCAL(ty) __declspec(thread) ty +#else +/* Non-standard C99 extension that's understood by gcc and clang */ +#define THREAD_LOCAL(ty) __thread ty +#endif + +static THREAD_LOCAL(int) num_threads = 0; + +static void +set_num_threads(int count) +{ + 
num_threads = count; +} + +static int +get_num_threads(void) +{ + return num_threads; +} + +static int +get_thread_num(void) +{ + return tbb::task_arena::current_thread_index(); +} + +// watch the arena, if it decides to create more threads/add threads into the +// arena then make sure they get the right thread count +class fix_tls_observer: public tbb::task_scheduler_observer { + int mask_val; + void on_scheduler_entry( bool is_worker ) override; +public: + fix_tls_observer(tbb::task_arena &arena, int mask) : tbb::task_scheduler_observer(arena), mask_val(mask) + { + observe(true); + } +}; + +void fix_tls_observer::on_scheduler_entry(bool worker) { + set_num_threads(mask_val); +} + static void add_task(void *fn, void *args, void *dims, void *steps, void *data) { @@ -83,58 +126,65 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat printf("\n"); } + // This is making the assumption that the calling thread knows the truth + // about num_threads, which should be correct via the following: + // program starts/reinits and the threadpool launches, num_threads TLS is + // set as default. Any thread spawned on init making a call to this function + // will have a valid num_threads TLS slot and so the task_arena is sized + // appropriately and it's value is used in the observer that fixes the TLS + // slots of any subsequent threads joining the task_arena. This leads to + // all threads in a task_arena having valid num_threads TLS slots prior to + // doing any work. Any further call to query the TLS slot value made by any + // thread in the arena is then safe and were any thread to create a nested + // parallel region the same logic applies as per program start/reinit. 
tbb::task_arena limited(num_threads); + fix_tls_observer observer(limited, num_threads); - limited.execute([&] - { - tg->run([=] + limited.execute([&]{ + using range_t = tbb::blocked_range; + tbb::parallel_for(range_t(0, dimensions[0]), [=](const range_t &range) { - using range_t = tbb::blocked_range; - tbb::parallel_for(range_t(0, dimensions[0]), [=](const range_t &range) - { - size_t * count_space = (size_t *)alloca(sizeof(size_t) * arg_len); - char ** array_arg_space = (char**)alloca(sizeof(char*) * array_count); - memcpy(count_space, dimensions, arg_len * sizeof(size_t)); - count_space[0] = range.size(); + size_t * count_space = (size_t *)alloca(sizeof(size_t) * arg_len); + char ** array_arg_space = (char**)alloca(sizeof(char*) * array_count); + memcpy(count_space, dimensions, arg_len * sizeof(size_t)); + count_space[0] = range.size(); - if(_DEBUG && _TRACE_SPLIT > 1) - { - printf("THREAD %p:", count_space); - printf("count_space: "); - for(size_t j = 0; j < arg_len; j++) - printf("%lu, ", count_space[j]); - printf("\n"); - } - for(size_t j = 0; j < array_count; j++) - { - char * base = args[j]; - size_t step = steps[j]; - ptrdiff_t offset = step * range.begin(); - array_arg_space[j] = base + offset; - - if(_DEBUG && _TRACE_SPLIT > 2) - { - printf("Index %ld\n", j); - printf("-->Got base %p\n", (void *)base); - printf("-->Got step %lu\n", step); - printf("-->Got offset %ld\n", offset); - printf("-->Got addr %p\n", (void *)array_arg_space[j]); - } - } + if(_DEBUG && _TRACE_SPLIT > 1) + { + printf("THREAD %p:", count_space); + printf("count_space: "); + for(size_t j = 0; j < arg_len; j++) + printf("%lu, ", count_space[j]); + printf("\n"); + } + for(size_t j = 0; j < array_count; j++) + { + char * base = args[j]; + size_t step = steps[j]; + ptrdiff_t offset = step * range.begin(); + array_arg_space[j] = base + offset; if(_DEBUG && _TRACE_SPLIT > 2) { - printf("array_arg_space: "); - for(size_t j = 0; j < array_count; j++) - printf("%p, ", (void 
*)array_arg_space[j]); - printf("\n"); + printf("Index %ld\n", j); + printf("-->Got base %p\n", (void *)base); + printf("-->Got step %lu\n", step); + printf("-->Got offset %ld\n", offset); + printf("-->Got addr %p\n", (void *)array_arg_space[j]); } - auto func = reinterpret_cast(fn); - func(array_arg_space, count_space, steps, data); - }); + } + + if(_DEBUG && _TRACE_SPLIT > 2) + { + printf("array_arg_space: "); + for(size_t j = 0; j < array_count; j++) + printf("%p, ", (void *)array_arg_space[j]); + printf("\n"); + } + auto func = reinterpret_cast(fn); + func(array_arg_space, count_space, steps, data); }); }); - limited.execute([&]{ tg->wait(); }); } void ignore_blocking_terminate_assertion( const char*, int, const char*, const char * ) @@ -244,7 +294,12 @@ MOD_INIT(tbbpool) PyLong_FromVoidPtr((void*)&do_scheduling_signed)); PyObject_SetAttrString(m, "do_scheduling_unsigned", PyLong_FromVoidPtr((void*)&do_scheduling_unsigned)); - + PyObject_SetAttrString(m, "set_num_threads", + PyLong_FromVoidPtr((void*)&set_num_threads)); + PyObject_SetAttrString(m, "get_num_threads", + PyLong_FromVoidPtr((void*)&get_num_threads)); + PyObject_SetAttrString(m, "get_thread_num", + PyLong_FromVoidPtr((void*)&get_thread_num)); return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 428b21845b6..8267e6621cf 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -114,6 +114,12 @@ numba_new_thread(void *worker, void *arg) return (thread_pointer)th; } +static int +get_thread_num(void) +{ + return pthread_self(); +} + #endif /* Win Thread */ @@ -199,6 +205,12 @@ numba_new_thread(void *worker, void *arg) return (thread_pointer)handle; } +static int +get_thread_num(void) +{ + return GetCurrentThreadId(); +} + #endif typedef struct Task @@ -239,10 +251,41 @@ queue_state_wait(Queue *queue, int old, int repl) void debug_marker(void); void debug_marker() {}; + +#ifdef _MSC_VER +#define THREAD_LOCAL(ty) __declspec(thread) ty 
+#else +/* Non-standard C99 extension that's understood by gcc and clang */ +#define THREAD_LOCAL(ty) __thread ty +#endif + +static THREAD_LOCAL(int) num_threads = 0; + +static void +set_num_threads(int count) +{ + num_threads = count; +} + +static int +get_num_threads(void) +{ + return num_threads; +} + + // this complies to a launchable function from `add_task` like: // add_task(nopfn, NULL, NULL, NULL, NULL) // useful if you want to limit the number of threads locally -void nopfn(void *args, void *dims, void *steps, void *data) {}; +// static void nopfn(void *args, void *dims, void *steps, void *data) {}; + + +// synchronize the TLS num_threads slot to value args[0] +static void sync_tls(void *args, void *dims, void *steps, void *data) { + int nthreads = *((int *)(args)); + num_threads = nthreads; +}; + static void parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *data, @@ -257,17 +300,19 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat size_t * count_space = NULL; char ** array_arg_space = NULL; const size_t arg_len = (inner_ndim + 1); - size_t i, j, count, remain, total; + int i; // induction var for chunking, thread count unlikely to overflow int + size_t j, count, remain, total; ptrdiff_t offset; char * base; + int old_queue_count = -1; size_t step; debug_marker(); total = *((size_t *)dimensions); - count = total / NUM_THREADS; + count = total / num_threads; remain = total; if(_DEBUG) @@ -298,12 +343,24 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat } } - + // sync the thread pool TLS slots, sync all slots, we don't know which + // threads will end up running. 
for (i = 0; i < NUM_THREADS; i++) + { + add_task(sync_tls, (void *)(&num_threads), NULL, NULL, NULL); + } + ready(); + synchronize(); + + // This backend isn't threadsafe so just mutate the global + old_queue_count = queue_count; + queue_count = num_threads; + + for (i = 0; i < num_threads; i++) { count_space = (size_t *)alloca(sizeof(size_t) * arg_len); memcpy(count_space, dimensions, arg_len * sizeof(size_t)); - if(i == NUM_THREADS - 1) + if(i == num_threads - 1) { // Last thread takes all leftover count_space[0] = remain; @@ -316,7 +373,7 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat if(_DEBUG) { - printf("\n=================== THREAD %ld ===================\n", i); + printf("\n=================== THREAD %d ===================\n", i); printf("\ncount_space: "); for(j = 0; j < arg_len; j++) { @@ -357,6 +414,8 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat ready(); synchronize(); + + queue_count = old_queue_count; } static void @@ -471,6 +530,11 @@ MOD_INIT(workqueue) PyLong_FromVoidPtr(&do_scheduling_signed)); PyObject_SetAttrString(m, "do_scheduling_unsigned", PyLong_FromVoidPtr(&do_scheduling_unsigned)); - + PyObject_SetAttrString(m, "set_num_threads", + PyLong_FromVoidPtr((void*)&set_num_threads)); + PyObject_SetAttrString(m, "get_num_threads", + PyLong_FromVoidPtr((void*)&get_num_threads)); + PyObject_SetAttrString(m, "get_thread_num", + PyLong_FromVoidPtr((void*)&get_thread_num)); return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/workqueue.h b/numba/npyufunc/workqueue.h index 32f78480bd6..865a082116a 100644 --- a/numba/npyufunc/workqueue.h +++ b/numba/npyufunc/workqueue.h @@ -54,3 +54,12 @@ void ready(void); static void parallel_for(void *fn, char **args, size_t *dims, size_t *steps, void *data,\ size_t inner_ndim, size_t array_count, int num_threads); + + +/* Masking API cf. 
OpenMP */ +static void +set_num_threads(int count); +static int +get_num_threads(void); +static int +get_thread_num(void); diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 461ba90bc53..bc4bbd229c3 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -3,7 +3,8 @@ import numpy as np -from numba import njit, set_num_threads, get_num_threads, prange, config +from numba import (njit, set_num_threads, get_num_threads, get_thread_num, + prange, config) from numba import unittest_support as unittest from .support import TestCase, skip_parfors_unsupported @@ -100,6 +101,192 @@ def test_func(nthreads): out = test_func(mask) np.testing.assert_equal(out, mask) + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_get_num_threads_truth_outside_jit(self): + + for mask in range(2, min(6, config.NUMBA_NUM_THREADS + 1)): + set_num_threads(mask) + + # a lot of work, hopefully will trigger "mask" count of threads to + # join the parallel region (for those backends with dynamic threads) + @njit(parallel=True) + def test_func(): + x = 5000000 + buf = np.empty((x,)) + for i in prange(x): + buf[i] = get_thread_num() + return len(np.unique(buf)), get_num_threads() + + out = test_func() + self.assertEqual(out, (mask, mask)) + + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_get_num_threads_truth_inside_jit(self): + + for mask in range(2, min(6, config.NUMBA_NUM_THREADS + 1)): + + # a lot of work, hopefully will trigger "mask" count of threads to + # join the parallel region (for those backends with dynamic threads) + @njit(parallel=True) + def test_func(): + set_num_threads(mask) + x = 5000000 + buf = np.empty((x,)) + for i in prange(x): + buf[i] = get_thread_num() + return len(np.unique(buf)), get_num_threads() + + out = test_func() + self.assertEqual(out, (mask, mask)) + + # this test can only run on 
OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not + # set or >= 2) and TBB backends + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_nested_parallelism_1(self): + # check that get_thread_num is ok in nesting + mask = config.NUMBA_NUM_THREADS - 1 + + N = 4 + M = 8 + + def gen(fid): + @njit(parallel=True) + def child_func(buf): + M, N = buf.shape + for i in prange(N): + buf[fid, i] = get_num_threads() + return child_func + + child1 = gen(1) + child2 = gen(2) + child3 = gen(3) + + @njit(parallel=True) + def test_func(nthreads): + acc = 0 + buf = np.zeros((M, N)) + set_num_threads(nthreads) + for i in prange(M): + local_mask = 1 + i % mask + set_num_threads(local_mask) # set threads in parent function + if local_mask == 1: + child1(buf) + elif local_mask == 2: + child2(buf) + elif local_mask == 3: + child3(buf) + acc += get_num_threads() + return acc, buf + + got_acc, got_arr = test_func(mask) + exp_acc, exp_arr = test_func.py_func(mask) + self.assertEqual(exp_acc, got_acc) + np.testing.assert_equal(exp_arr, got_arr) + + # check the maths reconciles + math_acc = np.sum(1 + np.arange(M) % mask) + self.assertEqual(math_acc, got_acc) + math_arr = np.zeros((M, N)) + for i in range(1, 4): # there's branches on 1, 2, 3 + math_arr[i, :] = i + np.testing.assert_equal(math_arr, got_arr) + + + # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not + # set or >= 2) and TBB backends + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_nested_parallelism_2(self): + # check that get_thread_num is ok in nesting + + N = 4 + M = 8 + def gen(fid): + @njit(parallel=True) + def child_func(buf): + M, N = buf.shape + set_num_threads(fid) # set threads in child function + for i in prange(N): + buf[fid, i] = get_num_threads() + return child_func + + child1 = gen(1) + child2 = gen(2) + child3 = gen(3) + + @njit(parallel=True) + def test_func(nthreads): + 
acc = 0 + buf = np.zeros((M, N)) + set_num_threads(nthreads) + for i in prange(M): + local_mask = 1 + i % mask + if local_mask == 1: + child1(buf) + elif local_mask == 2: + child2(buf) + elif local_mask == 3: + child3(buf) + acc += get_num_threads() + return acc, buf + + mask = config.NUMBA_NUM_THREADS - 1 + got_acc, got_arr = test_func(mask) + exp_acc, exp_arr = test_func.py_func(mask) + self.assertEqual(exp_acc, got_acc) + np.testing.assert_equal(exp_arr, got_arr) + + # check the maths reconciles + math_acc = np.sum(1 + np.arange(M) % mask) + self.assertEqual(math_acc, got_acc) + math_arr = np.zeros((M, N)) + for i in range(1, 4): # there's branches on 1, 2, 3 + math_arr[i, :] = i + np.testing.assert_equal(math_arr, got_arr) + + + # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not + # set or >= 2) and TBB backends + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_nested_parallelism_3(self): + # check that the right number of threads are present in nesting + # this relies on there being a load of cores present + BIG = 1000000 + + @njit(parallel=True) + def work(local_nt): + tid = np.zeros(BIG) + acc = 0 + set_num_threads(local_nt) + for i in prange(BIG): + acc += 1 + tid[i] = get_thread_num() + return acc, np.unique(tid) + + @njit(parallel=True) + def test_func(nthreads): + acc = 0 + set_num_threads(nthreads) + lens = np.zeros(nthreads) + total = 0 + for i in prange(nthreads): + my_acc, tids = work(nthreads + 1) + lens[i] = len(tids) + total += my_acc + return total, np.unique(lens) + + NT = 2 + expected_acc = BIG * NT + expected_thread_count = NT + 1 + + got_acc, got_tc = test_func(NT) + self.assertEqual(expected_acc, got_acc) + np.testing.assert_equal(expected_thread_count, got_tc) + def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) From 74bf5f84f03dc488d9f4e7044b547e9679f2e8ee Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Mon, 9 Dec 2019 11:03:05 +0000 
Subject: [PATCH 032/136] Fix problematic test design. As title. --- numba/tests/test_num_threads.py | 102 +++++++++++++++++++------------- 1 file changed, 61 insertions(+), 41 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index bc4bbd229c3..e23e8f60625 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -8,6 +8,7 @@ from numba import unittest_support as unittest from .support import TestCase, skip_parfors_unsupported + class TestNumThreads(TestCase): _numba_parallel_test_ = False @@ -171,7 +172,7 @@ def test_func(nthreads): set_num_threads(nthreads) for i in prange(M): local_mask = 1 + i % mask - set_num_threads(local_mask) # set threads in parent function + set_num_threads(local_mask) # set threads in parent function if local_mask == 1: child1(buf) elif local_mask == 2: @@ -190,11 +191,10 @@ def test_func(nthreads): math_acc = np.sum(1 + np.arange(M) % mask) self.assertEqual(math_acc, got_acc) math_arr = np.zeros((M, N)) - for i in range(1, 4): # there's branches on 1, 2, 3 + for i in range(1, 4): # there's branches on 1, 2, 3 math_arr[i, :] = i np.testing.assert_equal(math_arr, got_arr) - # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported @@ -202,51 +202,70 @@ def test_func(nthreads): def test_nested_parallelism_2(self): # check that get_thread_num is ok in nesting - N = 4 - M = 8 - def gen(fid): - @njit(parallel=True) - def child_func(buf): - M, N = buf.shape - set_num_threads(fid) # set threads in child function - for i in prange(N): - buf[fid, i] = get_num_threads() - return child_func - - child1 = gen(1) - child2 = gen(2) - child3 = gen(3) - - @njit(parallel=True) - def test_func(nthreads): - acc = 0 - buf = np.zeros((M, N)) - set_num_threads(nthreads) - for i in prange(M): - local_mask = 1 + i % mask - if local_mask == 1: - child1(buf) - elif local_mask == 2: - child2(buf) - elif local_mask == 3: - 
child3(buf) - acc += get_num_threads() - return acc, buf + N = 5 + M = 17 + def get_impl(flag): + + if flag == True: + dec = njit(parallel=True) + elif flag == False: + dec = njit(parallel=False) + else: + def dec(x): return x + + def gen(fid): + @dec + def child_func(buf): + M, N = buf.shape + set_num_threads(fid) # set threads in child function + for i in prange(N): + buf[fid, i] = get_num_threads() + return child_func + + child1 = gen(1) + child2 = gen(2) + child3 = gen(3) + + @dec + def test_func(nthreads): + acc = 0 + buf = np.zeros((M, N)) + set_num_threads(nthreads) + for i in prange(M): + local_mask = 1 + i % mask + # when the threads exit the child functions they should have + # a TLS slot value of the local mask as it was set in + # child + if local_mask == 1: + child1(buf) + assert get_num_threads() == local_mask + elif local_mask == 2: + child2(buf) + assert get_num_threads() == local_mask + elif local_mask == 3: + child3(buf) + assert get_num_threads() == local_mask + return buf + return test_func mask = config.NUMBA_NUM_THREADS - 1 - got_acc, got_arr = test_func(mask) - exp_acc, exp_arr = test_func.py_func(mask) - self.assertEqual(exp_acc, got_acc) - np.testing.assert_equal(exp_arr, got_arr) + set_num_threads(mask) + pf_arr = get_impl(True)(mask) + set_num_threads(mask) + nj_arr = get_impl(False)(mask) + set_num_threads(mask) + py_arr = get_impl(None)(mask) + + np.testing.assert_equal(pf_arr, py_arr) + np.testing.assert_equal(nj_arr, py_arr) # check the maths reconciles - math_acc = np.sum(1 + np.arange(M) % mask) - self.assertEqual(math_acc, got_acc) math_arr = np.zeros((M, N)) - for i in range(1, 4): # there's branches on 1, 2, 3 + for i in range( + 1, 4): # there's branches on modulo mask but only 3 funcs math_arr[i, :] = i - np.testing.assert_equal(math_arr, got_arr) + np.testing.assert_equal(math_arr, pf_arr) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @@ -290,5 +309,6 @@ def 
test_func(nthreads): def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) + if __name__ == '__main__': unittest.main() From 86897dbbf39bd8796f5147c4796ae6d18b00f4b0 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 10 Dec 2019 11:10:44 -0700 Subject: [PATCH 033/136] Skip tests that deadlock in workqueue test_nested_parallelism_2 doesn't deadlock for me, so I haven't skipped it, although I'm not sure if that's just a coincidence. --- numba/tests/test_num_threads.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index e23e8f60625..3e623690d4d 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -4,11 +4,10 @@ import numpy as np from numba import (njit, set_num_threads, get_num_threads, get_thread_num, - prange, config) + prange, config, threading_layer) from numba import unittest_support as unittest from .support import TestCase, skip_parfors_unsupported - class TestNumThreads(TestCase): _numba_parallel_test_ = False @@ -147,6 +146,9 @@ def test_func(): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_nested_parallelism_1(self): + if threading_layer() == 'workqueue': + return unittest.skip("workqueue is not threadsafe") + # check that get_thread_num is ok in nesting mask = config.NUMBA_NUM_THREADS - 1 @@ -272,6 +274,9 @@ def test_func(nthreads): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_nested_parallelism_3(self): + if threading_layer() == 'workqueue': + return unittest.skip("workqueue is not threadsafe") + # check that the right number of threads are present in nesting # this relies on there being a load of cores present BIG = 1000000 From 63589f937f35dc3c97e2fb636313bb61c55f4ea5 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 10 Dec 2019 11:13:40 -0700 Subject: [PATCH 034/136] Use the correct 
mechanism for skipping a test in unittest --- numba/tests/test_num_threads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 3e623690d4d..470d342f4ca 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -147,7 +147,7 @@ def test_func(): @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_nested_parallelism_1(self): if threading_layer() == 'workqueue': - return unittest.skip("workqueue is not threadsafe") + self.skipTest("workqueue is not threadsafe") # check that get_thread_num is ok in nesting mask = config.NUMBA_NUM_THREADS - 1 @@ -275,7 +275,7 @@ def test_func(nthreads): @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_nested_parallelism_3(self): if threading_layer() == 'workqueue': - return unittest.skip("workqueue is not threadsafe") + self.skipTest("workqueue is not threadsafe") # check that the right number of threads are present in nesting # this relies on there being a load of cores present From a604ee5c625aa5a6d27fc5bb60fb842a7b73e341 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 10 Dec 2019 11:16:24 -0700 Subject: [PATCH 035/136] Also skip test_nested_parallelism_2 with the workqueue backend --- numba/tests/test_num_threads.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 470d342f4ca..b365ae8ed77 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -202,6 +202,9 @@ def test_func(nthreads): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_nested_parallelism_2(self): + if threading_layer() == 'workqueue': + self.skipTest("workqueue is not threadsafe") + # check that get_thread_num is ok in nesting N = 5 From d6a8262906d9805965056182efbbc25afa907085 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: 
Tue, 10 Dec 2019 13:32:52 -0700 Subject: [PATCH 036/136] Fix thread requirement for test_nested_parallelism_3 The nested function uses nthreads + 1, where nthreads is set to 2, so at least 3 cores are required. --- numba/tests/test_num_threads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index b365ae8ed77..16dcc8b7b92 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -275,7 +275,7 @@ def test_func(nthreads): # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported - @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + @unittest.skipIf(config.NUMBA_NUM_THREADS < 3, "Not enough CPU cores") def test_nested_parallelism_3(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") From a45fce4b30555145db5c2e81826efadb242a40ea Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 10 Dec 2019 13:35:02 -0700 Subject: [PATCH 037/136] Remove unused variables --- numba/tests/test_num_threads.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 16dcc8b7b92..e7ed1fe1ee0 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -233,7 +233,6 @@ def child_func(buf): @dec def test_func(nthreads): - acc = 0 buf = np.zeros((M, N)) set_num_threads(nthreads) for i in prange(M): @@ -296,7 +295,6 @@ def work(local_nt): @njit(parallel=True) def test_func(nthreads): - acc = 0 set_num_threads(nthreads) lens = np.zeros(nthreads) total = 0 From 03c28707c0dbf4d305c2ace9aad22b93a6efbb6e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 10 Dec 2019 15:25:16 -0700 Subject: [PATCH 038/136] Skip nested parallelism tests if there are fewer than 4 cores --- numba/tests/test_num_threads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index e7ed1fe1ee0..4574eff09eb 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -144,7 +144,7 @@ def test_func(): # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported - @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + @unittest.skipIf(config.NUMBA_NUM_THREADS < 4, "Not enough CPU cores") def test_nested_parallelism_1(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") @@ -200,7 +200,7 @@ def test_func(nthreads): # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported - @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + @unittest.skipIf(config.NUMBA_NUM_THREADS < 4, "Not enough CPU cores") def test_nested_parallelism_2(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") From fcf7c7caf6ee16ab75a6c94e28905129f1556fa3 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 10 Dec 2019 16:33:13 -0700 Subject: [PATCH 039/136] Add some test_num_threads for guvectorize Note that some of these tests don't actually work yet - The test of calling set_num_threads inside of a guvectorized function does not work in workqueue - The test to check how many threads are used by reading get_thread_num() does not work, presumably because the function is only called once. I'm unsure how to test this properly. 
--- numba/tests/test_num_threads.py | 115 +++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 4574eff09eb..3165a879196 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -4,7 +4,7 @@ import numpy as np from numba import (njit, set_num_threads, get_num_threads, get_thread_num, - prange, config, threading_layer) + prange, config, threading_layer, guvectorize) from numba import unittest_support as unittest from .support import TestCase, skip_parfors_unsupported @@ -66,6 +66,60 @@ def set_get_n(n): with self.assertRaises(ValueError): set_n(max_threads + 1) + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_set_num_threads_basic_guvectorize(self): + max_threads = config.NUMBA_NUM_THREADS + + @guvectorize(['void(int64[:])'], + '(n)', + nopython=True, + target='parallel') + def get_n(x): + x[:] = get_num_threads() + + @guvectorize(['void(int64[:])'], + '(n)', + nopython=True, + target='parallel') + def set_n(n): + set_num_threads(n[0]) + + x = np.zeros((5000000,), dtype=np.int64) + get_n(x) + np.testing.assert_equal(x, max_threads) + set_n(np.array([2])) + x = np.zeros((5000000,), dtype=np.int64) + get_n(x) + np.testing.assert_equal(x, 2) + set_n(np.array([max_threads])) + x = np.zeros((5000000,), dtype=np.int64) + get_n(x) + np.testing.assert_equal(x, max_threads) + + @guvectorize(['void(int64[:])'], + '(n)', + nopython=True, + target='parallel') + def set_get_n(n): + set_num_threads(n[0]) + n[:] = get_num_threads() + + x = np.zeros((5000000,), dtype=np.int64) + x[0] = 2 + set_get_n(x) + np.testing.assert_equal(x, 2) + x = np.zeros((5000000,), dtype=np.int64) + x[0] = max_threads + set_get_n(x) + np.testing.assert_equal(x, max_threads) + + with self.assertRaises(ValueError): + set_n(np.array([0])) + + with self.assertRaises(ValueError): + 
set_n(np.array([max_threads + 1])) + @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_set_num_threads_outside_jit(self): @@ -81,9 +135,21 @@ def test_func(): buf[i] = get_num_threads() return buf + @guvectorize(['void(int64[:])'], + '(n)', + nopython=True, + target='parallel') + def test_gufunc(x): + x[:] = get_num_threads() + + out = test_func() np.testing.assert_equal(out, 2) + x = np.zeros((5000000,), dtype=np.int64) + test_gufunc(x) + np.testing.assert_equal(x, 2) + @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_set_num_threads_inside_jit(self): @@ -101,6 +167,24 @@ def test_func(nthreads): out = test_func(mask) np.testing.assert_equal(out, mask) + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + def test_set_num_threads_inside_guvectorize(self): + # Test set_num_threads inside a jitted guvectorize function + @guvectorize(['void(int64[:])'], + '(n)', + nopython=True, + target='parallel') + def test_func(x): + set_num_threads(x[0]) + x[:] = get_num_threads() + + x = np.zeros((5000000,), dtype=np.int64) + mask = 2 + x[0] = mask + test_func(x) + np.testing.assert_equal(x, mask) + @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_get_num_threads_truth_outside_jit(self): @@ -121,6 +205,20 @@ def test_func(): out = test_func() self.assertEqual(out, (mask, mask)) + @guvectorize(['void(int64[:], int64[:])'], + '(n), (m)', + nopython=True, + target='parallel') + def test_gufunc(x, out): + x[:] = get_thread_num() # XXX: Doesn't actually work + out[0] = len(np.unique(x)) + out[1] = get_num_threads() + + x = np.full((5000000,), -1, dtype=np.int64) + out = np.zeros((mask,), dtype=np.int64) + test_gufunc(x, out) + np.testing.assert_equal(out, np.array([mask, mask]), str(x[0])) + @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not 
enough CPU cores") def test_get_num_threads_truth_inside_jit(self): @@ -141,6 +239,21 @@ def test_func(): out = test_func() self.assertEqual(out, (mask, mask)) + @guvectorize(['void(int64[:], int64[:])'], + '(n), (m)', + nopython=True, + target='parallel') + def test_gufunc(x, out): + set_num_threads(mask) + x[:] = get_thread_num() # XXX: Doesn't actually work + out[0] = len(np.unique(x)) + out[1] = get_num_threads() + + x = np.full((5000000,), -1, dtype=np.int64) + out = np.zeros((mask,), dtype=np.int64) + test_gufunc(x, out) + np.testing.assert_equal(out, np.array([mask, mask]), str(x[0])) + # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported From 4925df38ea28423d54677807a66ae789b4e83c2b Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 12 Dec 2019 15:35:46 -0700 Subject: [PATCH 040/136] Make sure threads are launched when calling get_thread_num() Otherwise it will reference the C implementation from the threading library which hasn't been loaded yet. --- numba/npyufunc/parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 3a3664a4b29..043527e3bf6 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -576,10 +576,12 @@ def get_thread_num(): """ docs """ + _launch_threads() return _get_thread_num() @overload(get_thread_num) def ol_get_thread_num(): + _launch_threads() def impl(): return _get_thread_num() return impl From 28ade30a48a2008939509a96d8a714325aea0a0a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 12 Dec 2019 15:36:59 -0700 Subject: [PATCH 041/136] Fix the guvectorize tests to properly test parallelism vectorized kernels aren't parallelized unless they have to do broadcasting over a higher number of dimensions than they were compiled for. 
--- numba/tests/test_num_threads.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 3165a879196..610b2dbae95 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -210,14 +210,15 @@ def test_func(): nopython=True, target='parallel') def test_gufunc(x, out): - x[:] = get_thread_num() # XXX: Doesn't actually work - out[0] = len(np.unique(x)) - out[1] = get_num_threads() + x[:] = get_thread_num() + out[0] = get_num_threads() - x = np.full((5000000,), -1, dtype=np.int64) - out = np.zeros((mask,), dtype=np.int64) + # Reshape to force parallelism + x = np.full((5000000,), -1, dtype=np.int64).reshape((100, 50000)) + out = np.zeros((1,), dtype=np.int64) test_gufunc(x, out) - np.testing.assert_equal(out, np.array([mask, mask]), str(x[0])) + np.testing.assert_equal(out, np.array([mask])) + self.assertEqual(len(np.unique(x)), mask) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") @@ -239,20 +240,22 @@ def test_func(): out = test_func() self.assertEqual(out, (mask, mask)) + @guvectorize(['void(int64[:], int64[:])'], '(n), (m)', nopython=True, target='parallel') def test_gufunc(x, out): set_num_threads(mask) - x[:] = get_thread_num() # XXX: Doesn't actually work - out[0] = len(np.unique(x)) - out[1] = get_num_threads() + x[:] = get_thread_num() + out[0] = get_num_threads() - x = np.full((5000000,), -1, dtype=np.int64) - out = np.zeros((mask,), dtype=np.int64) + # Reshape to force parallelism + x = np.full((5000000,), -1, dtype=np.int64).reshape((100, 50000)) + out = np.zeros((1,), dtype=np.int64) test_gufunc(x, out) - np.testing.assert_equal(out, np.array([mask, mask]), str(x[0])) + np.testing.assert_equal(out, np.array([mask])) + self.assertEqual(len(np.unique(x)), mask) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends From 
2e7636443bf8e44f9e1ca4c43f226ff63971e58f Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 12 Dec 2019 16:38:43 -0700 Subject: [PATCH 042/136] Rename get_thread_num() to get_thread_id() --- numba/__init__.py | 2 +- numba/npyufunc/__init__.py | 2 +- numba/npyufunc/omppool.cpp | 6 +++--- numba/npyufunc/parallel.py | 16 ++++++++-------- numba/npyufunc/tbbpool.cpp | 6 +++--- numba/npyufunc/workqueue.c | 8 ++++---- numba/npyufunc/workqueue.h | 2 +- numba/tests/test_num_threads.py | 16 ++++++++-------- 8 files changed, 29 insertions(+), 29 deletions(-) diff --git a/numba/__init__.py b/numba/__init__.py index 3ae810fa994..643354bee0d 100644 --- a/numba/__init__.py +++ b/numba/__init__.py @@ -32,7 +32,7 @@ # Re-export vectorize decorators and the thread layer querying function from .npyufunc import (vectorize, guvectorize, threading_layer, - get_num_threads, set_num_threads, get_thread_num) + get_num_threads, set_num_threads, get_thread_id) # Re-export Numpy helpers from .numpy_support import carray, farray, from_dtype diff --git a/numba/npyufunc/__init__.py b/numba/npyufunc/__init__.py index 97448d43597..48217324f8a 100644 --- a/numba/npyufunc/__init__.py +++ b/numba/npyufunc/__init__.py @@ -5,7 +5,7 @@ from ._internal import PyUFunc_None, PyUFunc_Zero, PyUFunc_One from . 
import _internal, array_exprs, parfor from .parallel import (threading_layer, get_num_threads, set_num_threads, - get_thread_num) + get_thread_id) if hasattr(_internal, 'PyUFunc_ReorderableNone'): PyUFunc_ReorderableNone = _internal.PyUFunc_ReorderableNone del _internal, array_exprs diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index 79b61b80576..12e0ee88f70 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -63,7 +63,7 @@ get_num_threads(void) } static int -get_thread_num(void) +get_thread_id(void) { return omp_get_thread_num(); } @@ -249,7 +249,7 @@ MOD_INIT(omppool) PyLong_FromVoidPtr((void*)&set_num_threads)); PyObject_SetAttrString(m, "get_num_threads", PyLong_FromVoidPtr((void*)&get_num_threads)); - PyObject_SetAttrString(m, "get_thread_num", - PyLong_FromVoidPtr((void*)&get_thread_num)); + PyObject_SetAttrString(m, "get_thread_id", + PyLong_FromVoidPtr((void*)&get_thread_id)); return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 043527e3bf6..108b0f7d5d5 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -468,7 +468,7 @@ def _load_num_threads_funcs(lib): ll.add_symbol('get_num_threads', lib.get_num_threads) ll.add_symbol('set_num_threads', lib.set_num_threads) - ll.add_symbol('get_thread_num', lib.get_thread_num) + ll.add_symbol('get_thread_id', lib.get_thread_id) global _set_num_threads _set_num_threads = CFUNCTYPE(None, c_int)(lib.set_num_threads) @@ -477,8 +477,8 @@ def _load_num_threads_funcs(lib): global _get_num_threads _get_num_threads = CFUNCTYPE(c_int)(lib.get_num_threads) - global _get_thread_num - _get_thread_num = CFUNCTYPE(c_int)(lib.get_thread_num) + global _get_thread_id + _get_thread_id = CFUNCTYPE(c_int)(lib.get_thread_id) # Some helpers to make set_num_threads jittable @@ -572,18 +572,18 @@ def impl(): return _get_num_threads() return impl -def get_thread_num(): +def get_thread_id(): """ docs """ _launch_threads() - 
return _get_thread_num() + return _get_thread_id() -@overload(get_thread_num) -def ol_get_thread_num(): +@overload(get_thread_id) +def ol_get_thread_id(): _launch_threads() def impl(): - return _get_thread_num() + return _get_thread_id() return impl _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index 157b069d253..49c2ff0b2f7 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -60,7 +60,7 @@ get_num_threads(void) } static int -get_thread_num(void) +get_thread_id(void) { return tbb::task_arena::current_thread_index(); } @@ -298,8 +298,8 @@ MOD_INIT(tbbpool) PyLong_FromVoidPtr((void*)&set_num_threads)); PyObject_SetAttrString(m, "get_num_threads", PyLong_FromVoidPtr((void*)&get_num_threads)); - PyObject_SetAttrString(m, "get_thread_num", - PyLong_FromVoidPtr((void*)&get_thread_num)); + PyObject_SetAttrString(m, "get_thread_id", + PyLong_FromVoidPtr((void*)&get_thread_id)); return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 8267e6621cf..ba7786ff3df 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -115,7 +115,7 @@ numba_new_thread(void *worker, void *arg) } static int -get_thread_num(void) +get_thread_id(void) { return pthread_self(); } @@ -206,7 +206,7 @@ numba_new_thread(void *worker, void *arg) } static int -get_thread_num(void) +get_thread_id(void) { return GetCurrentThreadId(); } @@ -534,7 +534,7 @@ MOD_INIT(workqueue) PyLong_FromVoidPtr((void*)&set_num_threads)); PyObject_SetAttrString(m, "get_num_threads", PyLong_FromVoidPtr((void*)&get_num_threads)); - PyObject_SetAttrString(m, "get_thread_num", - PyLong_FromVoidPtr((void*)&get_thread_num)); + PyObject_SetAttrString(m, "get_thread_id", + PyLong_FromVoidPtr((void*)&get_thread_id)); return MOD_SUCCESS_VAL(m); } diff --git a/numba/npyufunc/workqueue.h b/numba/npyufunc/workqueue.h index 865a082116a..80558a0e534 100644 --- 
a/numba/npyufunc/workqueue.h +++ b/numba/npyufunc/workqueue.h @@ -62,4 +62,4 @@ set_num_threads(int count); static int get_num_threads(void); static int -get_thread_num(void); +get_thread_id(void); diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 610b2dbae95..5fc7f6b5a07 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -3,7 +3,7 @@ import numpy as np -from numba import (njit, set_num_threads, get_num_threads, get_thread_num, +from numba import (njit, set_num_threads, get_num_threads, get_thread_id, prange, config, threading_layer, guvectorize) from numba import unittest_support as unittest from .support import TestCase, skip_parfors_unsupported @@ -199,7 +199,7 @@ def test_func(): x = 5000000 buf = np.empty((x,)) for i in prange(x): - buf[i] = get_thread_num() + buf[i] = get_thread_id() return len(np.unique(buf)), get_num_threads() out = test_func() @@ -210,7 +210,7 @@ def test_func(): nopython=True, target='parallel') def test_gufunc(x, out): - x[:] = get_thread_num() + x[:] = get_thread_id() out[0] = get_num_threads() # Reshape to force parallelism @@ -234,7 +234,7 @@ def test_func(): x = 5000000 buf = np.empty((x,)) for i in prange(x): - buf[i] = get_thread_num() + buf[i] = get_thread_id() return len(np.unique(buf)), get_num_threads() out = test_func() @@ -247,7 +247,7 @@ def test_func(): target='parallel') def test_gufunc(x, out): set_num_threads(mask) - x[:] = get_thread_num() + x[:] = get_thread_id() out[0] = get_num_threads() # Reshape to force parallelism @@ -265,7 +265,7 @@ def test_nested_parallelism_1(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") - # check that get_thread_num is ok in nesting + # check that get_thread_id is ok in nesting mask = config.NUMBA_NUM_THREADS - 1 N = 4 @@ -321,7 +321,7 @@ def test_nested_parallelism_2(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") - # check that 
get_thread_num is ok in nesting + # check that get_thread_id is ok in nesting N = 5 M = 17 @@ -406,7 +406,7 @@ def work(local_nt): set_num_threads(local_nt) for i in prange(BIG): acc += 1 - tid[i] = get_thread_num() + tid[i] = get_thread_id() return acc, np.unique(tid) @njit(parallel=True) From 0af2d58022c7aac3d9dc5d44e8f2d3e3a3f47252 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 12 Dec 2019 16:41:03 -0700 Subject: [PATCH 043/136] Rename get_thread_id() to _get_thread_id() This also removes it from the main numba.__all__. --- numba/__init__.py | 2 +- numba/npyufunc/__init__.py | 2 +- numba/npyufunc/parallel.py | 4 ++-- numba/tests/test_num_threads.py | 19 ++++++++++--------- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/numba/__init__.py b/numba/__init__.py index 643354bee0d..b77773749ec 100644 --- a/numba/__init__.py +++ b/numba/__init__.py @@ -32,7 +32,7 @@ # Re-export vectorize decorators and the thread layer querying function from .npyufunc import (vectorize, guvectorize, threading_layer, - get_num_threads, set_num_threads, get_thread_id) + get_num_threads, set_num_threads) # Re-export Numpy helpers from .numpy_support import carray, farray, from_dtype diff --git a/numba/npyufunc/__init__.py b/numba/npyufunc/__init__.py index 48217324f8a..680141aa9ea 100644 --- a/numba/npyufunc/__init__.py +++ b/numba/npyufunc/__init__.py @@ -5,7 +5,7 @@ from ._internal import PyUFunc_None, PyUFunc_Zero, PyUFunc_One from . 
import _internal, array_exprs, parfor from .parallel import (threading_layer, get_num_threads, set_num_threads, - get_thread_id) + _get_thread_id) if hasattr(_internal, 'PyUFunc_ReorderableNone'): PyUFunc_ReorderableNone = _internal.PyUFunc_ReorderableNone del _internal, array_exprs diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 108b0f7d5d5..333684d1bdf 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -572,14 +572,14 @@ def impl(): return _get_num_threads() return impl -def get_thread_id(): +def _get_thread_id(): """ docs """ _launch_threads() return _get_thread_id() -@overload(get_thread_id) +@overload(_get_thread_id) def ol_get_thread_id(): _launch_threads() def impl(): diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 5fc7f6b5a07..2ef76336c6d 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -3,8 +3,9 @@ import numpy as np -from numba import (njit, set_num_threads, get_num_threads, get_thread_id, - prange, config, threading_layer, guvectorize) +from numba import (njit, set_num_threads, get_num_threads, prange, config, + threading_layer, guvectorize) +from numba.npyufunc.parallel import _get_thread_id from numba import unittest_support as unittest from .support import TestCase, skip_parfors_unsupported @@ -199,7 +200,7 @@ def test_func(): x = 5000000 buf = np.empty((x,)) for i in prange(x): - buf[i] = get_thread_id() + buf[i] = _get_thread_id() return len(np.unique(buf)), get_num_threads() out = test_func() @@ -210,7 +211,7 @@ def test_func(): nopython=True, target='parallel') def test_gufunc(x, out): - x[:] = get_thread_id() + x[:] = _get_thread_id() out[0] = get_num_threads() # Reshape to force parallelism @@ -234,7 +235,7 @@ def test_func(): x = 5000000 buf = np.empty((x,)) for i in prange(x): - buf[i] = get_thread_id() + buf[i] = _get_thread_id() return len(np.unique(buf)), get_num_threads() out = test_func() @@ -247,7 +248,7 @@ def 
test_func(): target='parallel') def test_gufunc(x, out): set_num_threads(mask) - x[:] = get_thread_id() + x[:] = _get_thread_id() out[0] = get_num_threads() # Reshape to force parallelism @@ -265,7 +266,7 @@ def test_nested_parallelism_1(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") - # check that get_thread_id is ok in nesting + # check that _get_thread_id is ok in nesting mask = config.NUMBA_NUM_THREADS - 1 N = 4 @@ -321,7 +322,7 @@ def test_nested_parallelism_2(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") - # check that get_thread_id is ok in nesting + # check that _get_thread_id is ok in nesting N = 5 M = 17 @@ -406,7 +407,7 @@ def work(local_nt): set_num_threads(local_nt) for i in prange(BIG): acc += 1 - tid[i] = get_thread_id() + tid[i] = _get_thread_id() return acc, np.unique(tid) @njit(parallel=True) From deb7ff37118b10d43e1b0bda48ce080555a0b526 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 12 Dec 2019 16:58:05 -0700 Subject: [PATCH 044/136] Don't assume that set_num_threads inside a jitted function will propogate outside of it See https://github.com/numba/numba/pull/4615#issuecomment-564503993 --- numba/tests/test_num_threads.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 2ef76336c6d..4fbae485d7b 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -43,14 +43,10 @@ def test_set_num_threads_basic_jit(self): def get_n(): return get_num_threads() - @njit - def set_n(n): - set_num_threads(n) - self.assertEqual(get_n(), max_threads) - set_n(2) + set_num_threads(2) self.assertEqual(get_n(), 2) - set_n(max_threads) + set_num_threads(max_threads) self.assertEqual(get_n(), max_threads) @njit @@ -62,10 +58,10 @@ def set_get_n(n): self.assertEqual(set_get_n(max_threads), max_threads) with self.assertRaises(ValueError): 
- set_n(0) + set_get_n(0) with self.assertRaises(ValueError): - set_n(max_threads + 1) + set_get_n(max_threads + 1) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") @@ -79,21 +75,14 @@ def test_set_num_threads_basic_guvectorize(self): def get_n(x): x[:] = get_num_threads() - @guvectorize(['void(int64[:])'], - '(n)', - nopython=True, - target='parallel') - def set_n(n): - set_num_threads(n[0]) - x = np.zeros((5000000,), dtype=np.int64) get_n(x) np.testing.assert_equal(x, max_threads) - set_n(np.array([2])) + set_num_threads(2) x = np.zeros((5000000,), dtype=np.int64) get_n(x) np.testing.assert_equal(x, 2) - set_n(np.array([max_threads])) + set_num_threads(max_threads) x = np.zeros((5000000,), dtype=np.int64) get_n(x) np.testing.assert_equal(x, max_threads) @@ -116,10 +105,10 @@ def set_get_n(n): np.testing.assert_equal(x, max_threads) with self.assertRaises(ValueError): - set_n(np.array([0])) + set_get_n(np.array([0])) with self.assertRaises(ValueError): - set_n(np.array([max_threads + 1])) + set_get_n(np.array([max_threads + 1])) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") From 94020e9ebee234cd7850b901f3eaf67b32c9e347 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 13 Dec 2019 14:05:38 -0700 Subject: [PATCH 045/136] Generalize the nested parallelism tests to work with any number of cores --- numba/tests/test_num_threads.py | 74 ++++++++++++--------------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 4fbae485d7b..b971a972ac9 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -250,28 +250,22 @@ def test_gufunc(x, out): # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported - @unittest.skipIf(config.NUMBA_NUM_THREADS < 4, "Not enough CPU cores") + 
@unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_nested_parallelism_1(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") - # check that _get_thread_id is ok in nesting + # check that get_num_threads is ok in nesting mask = config.NUMBA_NUM_THREADS - 1 - N = 4 - M = 8 - - def gen(fid): - @njit(parallel=True) - def child_func(buf): - M, N = buf.shape - for i in prange(N): - buf[fid, i] = get_num_threads() - return child_func + N = config.NUMBA_NUM_THREADS + M = 2*config.NUMBA_NUM_THREADS - child1 = gen(1) - child2 = gen(2) - child3 = gen(3) + @njit(parallel=True) + def child_func(buf, fid): + M, N = buf.shape + for i in prange(N): + buf[fid, i] = get_num_threads() @njit(parallel=True) def test_func(nthreads): @@ -281,12 +275,8 @@ def test_func(nthreads): for i in prange(M): local_mask = 1 + i % mask set_num_threads(local_mask) # set threads in parent function - if local_mask == 1: - child1(buf) - elif local_mask == 2: - child2(buf) - elif local_mask == 3: - child3(buf) + if local_mask < N: + child_func(buf, local_mask) acc += get_num_threads() return acc, buf @@ -299,22 +289,23 @@ def test_func(nthreads): math_acc = np.sum(1 + np.arange(M) % mask) self.assertEqual(math_acc, got_acc) math_arr = np.zeros((M, N)) - for i in range(1, 4): # there's branches on 1, 2, 3 + for i in range(1, N): # there's branches on 1, ..., num_threads - 1 math_arr[i, :] = i np.testing.assert_equal(math_arr, got_arr) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported - @unittest.skipIf(config.NUMBA_NUM_THREADS < 4, "Not enough CPU cores") + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_nested_parallelism_2(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") - # check that _get_thread_id is ok in nesting + # check that get_num_threads is ok in nesting + + N = 
config.NUMBA_NUM_THREADS + 1 + M = 4*config.NUMBA_NUM_THREADS + 1 - N = 5 - M = 17 def get_impl(flag): if flag == True: @@ -324,18 +315,12 @@ def get_impl(flag): else: def dec(x): return x - def gen(fid): - @dec - def child_func(buf): - M, N = buf.shape - set_num_threads(fid) # set threads in child function - for i in prange(N): - buf[fid, i] = get_num_threads() - return child_func - - child1 = gen(1) - child2 = gen(2) - child3 = gen(3) + @dec + def child(buf, fid): + M, N = buf.shape + set_num_threads(fid) # set threads in child function + for i in prange(N): + buf[fid, i] = get_num_threads() @dec def test_func(nthreads): @@ -346,14 +331,8 @@ def test_func(nthreads): # when the threads exit the child functions they should have # a TLS slot value of the local mask as it was set in # child - if local_mask == 1: - child1(buf) - assert get_num_threads() == local_mask - elif local_mask == 2: - child2(buf) - assert get_num_threads() == local_mask - elif local_mask == 3: - child3(buf) + if local_mask < config.NUMBA_NUM_THREADS: + child(buf, local_mask) assert get_num_threads() == local_mask return buf return test_func @@ -371,8 +350,7 @@ def test_func(nthreads): # check the maths reconciles math_arr = np.zeros((M, N)) - for i in range( - 1, 4): # there's branches on modulo mask but only 3 funcs + for i in range(1, config.NUMBA_NUM_THREADS): # there's branches on modulo mask but only NUMBA_NUM_THREADS funcs math_arr[i, :] = i np.testing.assert_equal(math_arr, pf_arr) From 39815fb1ccb406340c3ace4c515ef27b68499c43 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 13 Dec 2019 15:51:57 -0700 Subject: [PATCH 046/136] Make test_nested_parallelism_1 test nesting from a guvectorized function as well --- numba/tests/test_num_threads.py | 82 +++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index b971a972ac9..9a4aa0955bf 100644 --- a/numba/tests/test_num_threads.py 
+++ b/numba/tests/test_num_threads.py @@ -267,31 +267,65 @@ def child_func(buf, fid): for i in prange(N): buf[fid, i] = get_num_threads() - @njit(parallel=True) - def test_func(nthreads): - acc = 0 - buf = np.zeros((M, N)) - set_num_threads(nthreads) - for i in prange(M): - local_mask = 1 + i % mask - set_num_threads(local_mask) # set threads in parent function - if local_mask < N: - child_func(buf, local_mask) - acc += get_num_threads() - return acc, buf - - got_acc, got_arr = test_func(mask) - exp_acc, exp_arr = test_func.py_func(mask) - self.assertEqual(exp_acc, got_acc) - np.testing.assert_equal(exp_arr, got_arr) + def get_test(test_type): + if test_type == 'njit': + def test_func(nthreads, py_func=False): + @njit(parallel=True) + def _test_func(nthreads): + acc = 0 + buf = np.zeros((M, N)) + set_num_threads(nthreads) + for i in prange(M): + local_mask = 1 + i % mask + set_num_threads(local_mask) # set threads in parent function + if local_mask < N: + child_func(buf, local_mask) + acc += get_num_threads() + return acc, buf + if py_func: + return _test_func.py_func(nthreads) + else: + return _test_func(nthreads) + + elif test_type == 'guvectorize': + def test_func(nthreads, py_func=False): + def _test_func(acc, buf, local_mask): + set_num_threads(nthreads) + set_num_threads(local_mask[0]) # set threads in parent function + if local_mask[0] < N: + child_func(buf, local_mask[0]) + acc[0] += get_num_threads() + + buf = np.zeros((M, N), dtype=np.int64) + acc = np.array([0]) + local_mask = (1 + np.arange(M) % mask).reshape((M, 1)) + if not py_func: + _test_func = guvectorize(['void(int64[:], int64[:, :], int64[:])'], + '(k), (n, m), (p)', nopython=True, + target='parallel')(_test_func) + else: + _test_func = guvectorize(['void(int64[:], int64[:, :], int64[:])'], + '(k), (n, m), (p)', forceobj=True)(_test_func) + _test_func(acc, buf, local_mask) + return acc, buf - # check the maths reconciles - math_acc = np.sum(1 + np.arange(M) % mask) - self.assertEqual(math_acc, 
got_acc) - math_arr = np.zeros((M, N)) - for i in range(1, N): # there's branches on 1, ..., num_threads - 1 - math_arr[i, :] = i - np.testing.assert_equal(math_arr, got_arr) + return test_func + + for test_type in ['njit', 'guvectorize']: + test_func = get_test(test_type) + + got_acc, got_arr = test_func(mask) + exp_acc, exp_arr = test_func(mask, py_func=True) + self.assertEqual(exp_acc, got_acc, test_type) + np.testing.assert_equal(exp_arr, got_arr) + + # check the maths reconciles + math_acc = np.sum(1 + np.arange(M) % mask) + self.assertEqual(math_acc, got_acc) + math_arr = np.zeros((M, N)) + for i in range(1, N): # there's branches on 1, ..., num_threads - 1 + math_arr[i, :] = i + np.testing.assert_equal(math_arr, got_arr) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends From fe0af898ce6c8aaedaa699fccadbb77907572dc0 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 13 Dec 2019 16:09:34 -0700 Subject: [PATCH 047/136] Test guvectorize combinations in test_nested_parallelism_2 --- numba/tests/test_num_threads.py | 97 +++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 9a4aa0955bf..ef4fea87cb0 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -340,54 +340,91 @@ def test_nested_parallelism_2(self): N = config.NUMBA_NUM_THREADS + 1 M = 4*config.NUMBA_NUM_THREADS + 1 - def get_impl(flag): + def get_impl(child_type, test_type): - if flag == True: - dec = njit(parallel=True) - elif flag == False: - dec = njit(parallel=False) - else: - def dec(x): return x + if child_type == 'parallel': + child_dec = njit(parallel=True) + elif child_type == 'njit': + child_dec = njit(parallel=False) + elif child_type == 'none': + def child_dec(x): return x - @dec + @child_dec def child(buf, fid): M, N = buf.shape set_num_threads(fid) # set threads in child function for i in 
prange(N): buf[fid, i] = get_num_threads() - @dec - def test_func(nthreads): - buf = np.zeros((M, N)) - set_num_threads(nthreads) - for i in prange(M): - local_mask = 1 + i % mask - # when the threads exit the child functions they should have - # a TLS slot value of the local mask as it was set in - # child - if local_mask < config.NUMBA_NUM_THREADS: - child(buf, local_mask) - assert get_num_threads() == local_mask - return buf + + if test_type in ['parallel', 'njit', 'none']: + if test_type == 'parallel': + test_dec = njit(parallel=True) + elif test_type == 'njit': + test_dec = njit(parallel=False) + elif test_type == 'none': + def test_dec(x): return x + + @test_dec + def test_func(nthreads): + buf = np.zeros((M, N)) + set_num_threads(nthreads) + for i in prange(M): + local_mask = 1 + i % mask + # when the threads exit the child functions they should have + # a TLS slot value of the local mask as it was set in + # child + if local_mask < config.NUMBA_NUM_THREADS: + child(buf, local_mask) + assert get_num_threads() == local_mask + return buf + else: + if test_type == 'guvectorize': + test_dec = guvectorize(['int64[:,:], int64[:]'], + '(n, m), (k)', nopython=True, + target='parallel') + elif test_type == 'guvectorize-obj': + test_dec = guvectorize(['int64[:,:], int64[:]'], + '(n, m), (k)', forceobj=True) + + def test_func(nthreads): + @test_dec + def _test_func(buf, local_mask): + set_num_threads(nthreads) + # when the threads exit the child functions they should have + # a TLS slot value of the local mask as it was set in + # child + if local_mask[0] < config.NUMBA_NUM_THREADS: + child(buf, local_mask[0]) + assert get_num_threads() == local_mask[0] + + buf = np.zeros((M, N), dtype=np.int64) + local_mask = (1 + np.arange(M) % mask).reshape((M, 1)) + _test_func(buf, local_mask) + return buf + return test_func mask = config.NUMBA_NUM_THREADS - 1 - set_num_threads(mask) - pf_arr = get_impl(True)(mask) - set_num_threads(mask) - nj_arr = get_impl(False)(mask) - 
set_num_threads(mask) - py_arr = get_impl(None)(mask) - np.testing.assert_equal(pf_arr, py_arr) - np.testing.assert_equal(nj_arr, py_arr) + res_arrays = {} + for test_type in ['parallel', 'njit', 'none', 'guvectorize', 'guvectorize-obj']: + for child_type in ['parallel', 'njit', 'none']: + if child_type == 'none' and test_type != 'none': + continue + set_num_threads(mask) + res_arrays[test_type, child_type] = get_impl(child_type, test_type)(mask) + + py_arr = res_arrays['none', 'none'] + for arr in res_arrays.values(): + np.testing.assert_equal(arr, py_arr) # check the maths reconciles math_arr = np.zeros((M, N)) for i in range(1, config.NUMBA_NUM_THREADS): # there's branches on modulo mask but only NUMBA_NUM_THREADS funcs math_arr[i, :] = i - np.testing.assert_equal(math_arr, pf_arr) + np.testing.assert_equal(math_arr, py_arr) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends From 35b69f72f6dbc3b74f463664ae4f15781e1c498f Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 13 Dec 2019 16:20:22 -0700 Subject: [PATCH 048/136] Add guvectorize to test_nested_parallelism_3 --- numba/tests/test_num_threads.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index ef4fea87cb0..d436713611c 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -449,7 +449,7 @@ def work(local_nt): return acc, np.unique(tid) @njit(parallel=True) - def test_func(nthreads): + def test_func_jit(nthreads): set_num_threads(nthreads) lens = np.zeros(nthreads) total = 0 @@ -463,10 +463,31 @@ def test_func(nthreads): expected_acc = BIG * NT expected_thread_count = NT + 1 - got_acc, got_tc = test_func(NT) + got_acc, got_tc = test_func_jit(NT) + self.assertEqual(expected_acc, got_acc) + np.testing.assert_equal(expected_thread_count, got_tc) + + def test_guvectorize(nthreads): + @guvectorize(['int64[:], 
int64[:]'], + '(n), (m)', + nopython=True, + target='parallel') + def test_func_guvectorize(total, lens): + my_acc, tids = work(nthreads + 1) + lens[:] = len(tids) + total += my_acc + + total = np.array([0]) + lens = np.zeros(nthreads, dtype=np.int64).reshape((nthreads, 1)) + + test_func_guvectorize(total, lens) + return total, np.unique(lens) + + got_acc, got_tc = test_guvectorize(NT) self.assertEqual(expected_acc, got_acc) np.testing.assert_equal(expected_thread_count, got_tc) + def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) From 5f2bd38ee9031dedaa5e302befde611b86cf3893 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 13 Dec 2019 16:25:42 -0700 Subject: [PATCH 049/136] Remove raises tests for set_num_threads inside of a jitted function The exception is only raised if the jitted function happens to be called on the main thread, which is not the case for the workqueue backend, and is an implementation detail that it happens on openmp and tbb. --- numba/tests/test_num_threads.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index d436713611c..380ac41f34e 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -57,12 +57,6 @@ def set_get_n(n): self.assertEqual(set_get_n(2), 2) self.assertEqual(set_get_n(max_threads), max_threads) - with self.assertRaises(ValueError): - set_get_n(0) - - with self.assertRaises(ValueError): - set_get_n(max_threads + 1) - @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def test_set_num_threads_basic_guvectorize(self): @@ -104,12 +98,6 @@ def set_get_n(n): set_get_n(x) np.testing.assert_equal(x, max_threads) - with self.assertRaises(ValueError): - set_get_n(np.array([0])) - - with self.assertRaises(ValueError): - set_get_n(np.array([max_threads + 1])) - @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def 
test_set_num_threads_outside_jit(self): From 78b2dd1f4f2da6bc73f742b574579da76b567654 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 13 Dec 2019 16:28:12 -0700 Subject: [PATCH 050/136] Add get_num_threads and set_num_threads to numba.__all__ --- numba/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/__init__.py b/numba/__init__.py index b77773749ec..6ec2aded9f0 100644 --- a/numba/__init__.py +++ b/numba/__init__.py @@ -72,6 +72,8 @@ vectorize objmode literal_unroll + get_num_threads + set_num_threads """.split() + types.__all__ + errors.__all__ From e489e1018451a975906821d0bb8b2102ffb3c918 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 16 Dec 2019 17:13:33 -0700 Subject: [PATCH 051/136] Run the num_threads tests in all backends This reuses the machinery from test_parallel_backend.py, which has been refactored slightly to make it more reusable. --- numba/tests/test_num_threads.py | 56 ++++++++++++++++++++++------ numba/tests/test_parallel_backend.py | 23 ++++++------ 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 380ac41f34e..da3186664d9 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import print_function, absolute_import, division +import sys + import numpy as np from numba import (njit, set_num_threads, get_num_threads, prange, config, @@ -8,6 +10,7 @@ from numba.npyufunc.parallel import _get_thread_id from numba import unittest_support as unittest from .support import TestCase, skip_parfors_unsupported +from .test_parallel_backend import TestInSubprocess class TestNumThreads(TestCase): _numba_parallel_test_ = False @@ -19,7 +22,7 @@ def setUp(self): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_set_num_threads_basic(self): + def _test_set_num_threads_basic(self): max_threads = 
config.NUMBA_NUM_THREADS self.assertEqual(get_num_threads(), max_threads) @@ -36,7 +39,7 @@ def test_set_num_threads_basic(self): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_set_num_threads_basic_jit(self): + def _test_set_num_threads_basic_jit(self): max_threads = config.NUMBA_NUM_THREADS @njit @@ -59,7 +62,7 @@ def set_get_n(n): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_set_num_threads_basic_guvectorize(self): + def _test_set_num_threads_basic_guvectorize(self): max_threads = config.NUMBA_NUM_THREADS @guvectorize(['void(int64[:])'], @@ -100,7 +103,7 @@ def set_get_n(n): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_set_num_threads_outside_jit(self): + def _test_set_num_threads_outside_jit(self): # Test set_num_threads outside a jitted function set_num_threads(2) @@ -130,7 +133,7 @@ def test_gufunc(x): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_set_num_threads_inside_jit(self): + def _test_set_num_threads_inside_jit(self): # Test set_num_threads inside a jitted function @njit(parallel=True) def test_func(nthreads): @@ -147,7 +150,7 @@ def test_func(nthreads): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_set_num_threads_inside_guvectorize(self): + def _test_set_num_threads_inside_guvectorize(self): # Test set_num_threads inside a jitted guvectorize function @guvectorize(['void(int64[:])'], '(n)', @@ -165,7 +168,7 @@ def test_func(x): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_get_num_threads_truth_outside_jit(self): + def _test_get_num_threads_truth_outside_jit(self): for mask in range(2, min(6, config.NUMBA_NUM_THREADS + 1)): set_num_threads(mask) @@ -200,7 +203,7 @@ def test_gufunc(x, 
out): @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_get_num_threads_truth_inside_jit(self): + def _test_get_num_threads_truth_inside_jit(self): for mask in range(2, min(6, config.NUMBA_NUM_THREADS + 1)): @@ -239,7 +242,7 @@ def test_gufunc(x, out): # set or >= 2) and TBB backends @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_nested_parallelism_1(self): + def _test_nested_parallelism_1(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") @@ -319,7 +322,7 @@ def _test_func(acc, buf, local_mask): # set or >= 2) and TBB backends @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") - def test_nested_parallelism_2(self): + def _test_nested_parallelism_2(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") @@ -418,7 +421,7 @@ def _test_func(buf, local_mask): # set or >= 2) and TBB backends @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 3, "Not enough CPU cores") - def test_nested_parallelism_3(self): + def _test_nested_parallelism_3(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") @@ -480,5 +483,36 @@ def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) +class TestNumThreadsBackends(TestInSubprocess, TestCase): + _class = TestNumThreads + _DEBUG = False + + @classmethod + def _inject(cls, name, backend, backend_guard): + themod = cls.__module__ + thecls = cls._class.__name__ + injected_method = '%s.%s.%s' % (themod, thecls, name) + + def test_template(self): + o, e = self.run_test_in_separate_process(injected_method, backend) + if self._DEBUG: + print('stdout:\n "%s"\n stderr:\n "%s"' % (o, e)) + self.assertIn('OK', e) + self.assertTrue('FAIL' not in e) + self.assertTrue('ERROR' not in e) + injected_test = "%s_%s" % (name[1:], backend) + setattr(cls, injected_test, 
+ backend_guard(test_template)) + + @classmethod + def generate(cls): + for name in cls._class.__dict__.copy(): + for backend, backend_guard in cls.backends.items(): + if not name.startswith('_test_'): + continue + cls._inject(name, backend, backend_guard) + +TestNumThreadsBackends.generate() + if __name__ == '__main__': unittest.main() diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 1204a7c5879..6f82de7e6ba 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -337,17 +337,7 @@ def test_method(self): TestParallelBackend.generate() -class TestSpecificBackend(TestParallelBackendBase): - """ - This is quite contrived, for each test in the TestParallelBackend tests it - generates a test that will run the TestParallelBackend test in a new python - process with an environment modified to ensure a specific threadsafe backend - is used. This is with view of testing the backends independently and in an - isolated manner such that if they hang/crash/have issues, it doesn't kill - the test suite. - """ - _DEBUG = False - +class TestInSubprocess(object): backends = {'tbb': skip_no_tbb, 'omp': skip_no_omp, 'workqueue': unittest.skipIf(False, '')} @@ -377,6 +367,17 @@ def run_test_in_separate_process(self, test, threading_layer): cmdline = [sys.executable, "-m", "numba.runtests", test] return self.run_cmd(cmdline, env_copy) +class TestSpecificBackend(TestInSubprocess, TestParallelBackendBase): + """ + This is quite contrived, for each test in the TestParallelBackend tests it + generates a test that will run the TestParallelBackend test in a new python + process with an environment modified to ensure a specific threadsafe backend + is used. This is with view of testing the backends independently and in an + isolated manner such that if they hang/crash/have issues, it doesn't kill + the test suite. 
+ """ + _DEBUG = False + @classmethod def _inject(cls, p, name, backend, backend_guard): themod = cls.__module__ From 8fba01b1d29aed680b4e52055f6e5bb59fb41afc Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 16 Dec 2019 17:24:18 -0700 Subject: [PATCH 052/136] Parameterize the num_threads tests over NUMBA_NUM_THREADS --- numba/tests/test_num_threads.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index da3186664d9..093c12097c6 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -2,6 +2,7 @@ from __future__ import print_function, absolute_import, division import sys +import os import numpy as np @@ -487,20 +488,31 @@ class TestNumThreadsBackends(TestInSubprocess, TestCase): _class = TestNumThreads _DEBUG = False + # 1 is mainly here to ensure tests skip correctly + num_threads = [i for i in [1, 2, 4, 8, 16] if i <= config.NUMBA_NUM_THREADS] + + def run_test_in_separate_process(self, test, threading_layer, num_threads): + env_copy = os.environ.copy() + env_copy['NUMBA_THREADING_LAYER'] = str(threading_layer) + env_copy['NUMBA_NUM_THREADS'] = str(num_threads) + cmdline = [sys.executable, "-m", "numba.runtests", test] + return self.run_cmd(cmdline, env_copy) + @classmethod - def _inject(cls, name, backend, backend_guard): + def _inject(cls, name, backend, backend_guard, num_threads): themod = cls.__module__ thecls = cls._class.__name__ injected_method = '%s.%s.%s' % (themod, thecls, name) def test_template(self): - o, e = self.run_test_in_separate_process(injected_method, backend) + o, e = self.run_test_in_separate_process(injected_method, backend, + num_threads) if self._DEBUG: print('stdout:\n "%s"\n stderr:\n "%s"' % (o, e)) self.assertIn('OK', e) self.assertTrue('FAIL' not in e) self.assertTrue('ERROR' not in e) - injected_test = "%s_%s" % (name[1:], backend) + injected_test = "%s_%s_%s_threads" % (name[1:], 
backend, num_threads) setattr(cls, injected_test, backend_guard(test_template)) @@ -508,9 +520,10 @@ def test_template(self): def generate(cls): for name in cls._class.__dict__.copy(): for backend, backend_guard in cls.backends.items(): - if not name.startswith('_test_'): - continue - cls._inject(name, backend, backend_guard) + for num_threads in cls.num_threads: + if not name.startswith('_test_'): + continue + cls._inject(name, backend, backend_guard, num_threads) TestNumThreadsBackends.generate() From 980b550e160d96b3965d63162778a3b93266da0a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 16 Dec 2019 17:36:41 -0700 Subject: [PATCH 053/136] Propagate the skip message from the subprocess in the num_threads tests --- numba/tests/test_num_threads.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 093c12097c6..abb2e07910d 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -3,6 +3,7 @@ import sys import os +import re import numpy as np @@ -495,7 +496,7 @@ def run_test_in_separate_process(self, test, threading_layer, num_threads): env_copy = os.environ.copy() env_copy['NUMBA_THREADING_LAYER'] = str(threading_layer) env_copy['NUMBA_NUM_THREADS'] = str(num_threads) - cmdline = [sys.executable, "-m", "numba.runtests", test] + cmdline = [sys.executable, "-m", "numba.runtests", "-v", test] return self.run_cmd(cmdline, env_copy) @classmethod @@ -512,6 +513,10 @@ def test_template(self): self.assertIn('OK', e) self.assertTrue('FAIL' not in e) self.assertTrue('ERROR' not in e) + m = re.search(r"\.\.\. 
skipped '(.*?)'", e) + if m: + self.skipTest(m.group(1)) + injected_test = "%s_%s_%s_threads" % (name[1:], backend, num_threads) setattr(cls, injected_test, backend_guard(test_template)) From 35c92ba373b81c20fc03423834f15304cd07a87a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 16 Dec 2019 17:37:11 -0700 Subject: [PATCH 054/136] Tag the num_threads tests as long running --- numba/tests/test_num_threads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index abb2e07910d..1df85ea2943 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -11,7 +11,7 @@ threading_layer, guvectorize) from numba.npyufunc.parallel import _get_thread_id from numba import unittest_support as unittest -from .support import TestCase, skip_parfors_unsupported +from .support import TestCase, skip_parfors_unsupported, tag from .test_parallel_backend import TestInSubprocess class TestNumThreads(TestCase): @@ -519,7 +519,7 @@ def test_template(self): injected_test = "%s_%s_%s_threads" % (name[1:], backend, num_threads) setattr(cls, injected_test, - backend_guard(test_template)) + tag('long_running')(backend_guard(test_template))) @classmethod def generate(cls): From 413fd3dd3df45242c5e710fe04b5639e95ca335b Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 6 Jan 2020 16:21:13 -0700 Subject: [PATCH 055/136] Raise an exception if get_num_threads() returns 0 This means it is called before set_num_threads, which shouldn't happen. This will make identifying bugs with it easier, such as the one with mkl. This should probably be modified to be raised inside of the C implementation rather than in every place where it is called. 
--- numba/npyufunc/parallel.py | 14 ++++++++++---- numba/npyufunc/parfor.py | 8 ++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 333684d1bdf..6aed51d5ae5 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -538,7 +538,6 @@ def impl(n): _set_num_threads(n) return impl - def get_num_threads(): """ Get the number of threads used for parallel execution. @@ -562,14 +561,21 @@ def get_num_threads(): """ _launch_threads() - return _get_num_threads() - + num_threads = _get_num_threads() + if num_threads == 0: + raise RuntimeError("Invalid number of threads. " + "This likely indicates a bug in numba.") + return num_threads @overload(get_num_threads) def ol_get_num_threads(): _launch_threads() def impl(): - return _get_num_threads() + num_threads = _get_num_threads() + if num_threads == 0: + raise RuntimeError("Invalid number of threads. " + "This likely indicates a bug in numba.") + return num_threads return impl def _get_thread_id(): diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index bf9dcbce284..cd24b6dc3a9 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1343,6 +1343,14 @@ def load_range(v): num_threads = builder.call(get_num_threads, []) + with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, + num_threads.type(0))): + context.call_conv.return_user_exc(builder, RuntimeError, + ("Invalid number of threads. 
" + "This likely indicates a bug in numba.",)) + + cgutils.printf(builder, "num_threads: %d\n", num_threads) + builder.call( do_scheduling, [ context.get_constant( From 0435f2d547af870e85212f9497f68f52f517d012 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 6 Jan 2020 16:22:54 -0700 Subject: [PATCH 056/136] Remove debug printf --- numba/npyufunc/parfor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index cd24b6dc3a9..3b03ea5ac82 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1349,8 +1349,6 @@ def load_range(v): ("Invalid number of threads. " "This likely indicates a bug in numba.",)) - cgutils.printf(builder, "num_threads: %d\n", num_threads) - builder.call( do_scheduling, [ context.get_constant( From d6922c6472a513f0bd37c1249ba73c368c435816 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 7 Jan 2020 15:51:35 +0000 Subject: [PATCH 057/136] Rename TLS var --- numba/npyufunc/omppool.cpp | 6 +++--- numba/npyufunc/tbbpool.cpp | 6 +++--- numba/npyufunc/workqueue.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index 12e0ee88f70..91f9352aad2 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -48,18 +48,18 @@ static pid_t parent_pid = 0; // 0 is not set, users can't own this anyway #define THREAD_LOCAL(ty) __thread ty #endif -static THREAD_LOCAL(int) num_threads = 0; +static THREAD_LOCAL(int) _TLS_num_threads = 0; static void set_num_threads(int count) { - num_threads = count; + _TLS_num_threads = count; } static int get_num_threads(void) { - return num_threads; + return _TLS_num_threads; } static int diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index 49c2ff0b2f7..81b6bfd9e98 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -45,18 +45,18 @@ static int tsi_count = 0; #define THREAD_LOCAL(ty) __thread ty #endif 
-static THREAD_LOCAL(int) num_threads = 0; +static THREAD_LOCAL(int) _TLS_num_threads = 0; static void set_num_threads(int count) { - num_threads = count; + _TLS_num_threads = count; } static int get_num_threads(void) { - return num_threads; + return _TLS_num_threads; } static int diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index ba7786ff3df..57d0c442854 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -259,18 +259,18 @@ void debug_marker() {}; #define THREAD_LOCAL(ty) __thread ty #endif -static THREAD_LOCAL(int) num_threads = 0; +static THREAD_LOCAL(int) _TLS_num_threads = 0; static void set_num_threads(int count) { - num_threads = count; + _TLS_num_threads = count; } static int get_num_threads(void) { - return num_threads; + return _TLS_num_threads; } @@ -283,7 +283,7 @@ get_num_threads(void) // synchronize the TLS num_threads slot to value args[0] static void sync_tls(void *args, void *dims, void *steps, void *data) { int nthreads = *((int *)(args)); - num_threads = nthreads; + _TLS_num_threads = nthreads; }; From 851437184d0de1dbdd425aee3896bbe01da7670e Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 7 Jan 2020 18:09:53 +0000 Subject: [PATCH 058/136] Sync TLS slot on main thread with launch default --- numba/npyufunc/omppool.cpp | 13 +++++++++++++ numba/npyufunc/parallel.py | 5 ++++- numba/npyufunc/parfor.py | 2 +- numba/npyufunc/tbbpool.cpp | 15 +++++++++++++++ numba/npyufunc/workqueue.c | 18 ++++++++++++++++++ 5 files changed, 51 insertions(+), 2 deletions(-) diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index 91f9352aad2..b1830a9e94f 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -48,6 +48,11 @@ static pid_t parent_pid = 0; // 0 is not set, users can't own this anyway #define THREAD_LOCAL(ty) __thread ty #endif +// This is the number of threads that is default, it is set on initialisation of +// the threading backend via the launch_threads() 
call +static int _INIT_NUM_THREADS = -1; + +// This is the per-thread thread mask, each thread can carry its own mask. static THREAD_LOCAL(int) _TLS_num_threads = 0; static void @@ -59,6 +64,13 @@ set_num_threads(int count) static int get_num_threads(void) { + if (_TLS_num_threads == 0) + { + // This is a thread that did not call launch_threads() but is still a + // "main" thread, probably from e.g. threading.Thread() use, it still + // has a TLS slot which is 0 from the lack of launch_threads() call + _TLS_num_threads = _INIT_NUM_THREADS; + } return _TLS_num_threads; } @@ -212,6 +224,7 @@ static void launch_threads(int count) return; omp_set_num_threads(count); omp_set_nested(0x1); // enable nesting, control depth with OMP env var + _INIT_NUM_THREADS = count; } static void synchronize(void) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 6aed51d5ae5..9c351a5469f 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -464,6 +464,7 @@ def raise_with_hint(required): _is_initialized = True + def _load_num_threads_funcs(lib): ll.add_symbol('get_num_threads', lib.get_num_threads) @@ -563,8 +564,9 @@ def get_num_threads(): _launch_threads() num_threads = _get_num_threads() if num_threads == 0: + print("Broken: ", _get_thread_id()) raise RuntimeError("Invalid number of threads. " - "This likely indicates a bug in numba.") + "This likely indicates a bug in numba.", _get_thread_id()) return num_threads @overload(get_num_threads) @@ -573,6 +575,7 @@ def ol_get_num_threads(): def impl(): num_threads = _get_num_threads() if num_threads == 0: + print("Broken: ", _get_thread_id()) raise RuntimeError("Invalid number of threads. 
" "This likely indicates a bug in numba.") return num_threads diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 3b03ea5ac82..7ea65fdb32f 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1346,7 +1346,7 @@ def load_range(v): with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, num_threads.type(0))): context.call_conv.return_user_exc(builder, RuntimeError, - ("Invalid number of threads. " + ("3 Invalid number of threads. " "This likely indicates a bug in numba.",)) builder.call( diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index 81b6bfd9e98..b7fc8a9f0b1 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -45,8 +45,14 @@ static int tsi_count = 0; #define THREAD_LOCAL(ty) __thread ty #endif +// This is the number of threads that is default, it is set on initialisation of +// the threading backend via the launch_threads() call +static int _INIT_NUM_THREADS = -1; + +// This is the per-thread thread mask, each thread can carry its own mask. static THREAD_LOCAL(int) _TLS_num_threads = 0; + static void set_num_threads(int count) { @@ -56,6 +62,13 @@ set_num_threads(int count) static int get_num_threads(void) { + if (_TLS_num_threads == 0) + { + // This is a thread that did not call launch_threads() but is still a + // "main" thread, probably from e.g. 
threading.Thread() use, it still + // has a TLS slot which is 0 from the lack of launch_threads() call + _TLS_num_threads = _INIT_NUM_THREADS; + } return _TLS_num_threads; } @@ -251,6 +264,8 @@ static void launch_threads(int count) tg = new tbb::task_group; tg->run([] {}); // start creating threads asynchronously + _INIT_NUM_THREADS = count; + #ifndef _MSC_VER pthread_atfork(prepare_fork, reset_after_fork, reset_after_fork); #endif diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 57d0c442854..b2981db48b5 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -259,6 +259,11 @@ void debug_marker() {}; #define THREAD_LOCAL(ty) __thread ty #endif +// This is the number of threads that is default, it is set on initialisation of +// the threading backend via the launch_threads() call +static int _INIT_NUM_THREADS = -1; + +// This is the per-thread thread mask, each thread can carry its own mask. static THREAD_LOCAL(int) _TLS_num_threads = 0; static void @@ -270,6 +275,16 @@ set_num_threads(int count) static int get_num_threads(void) { + // This is purely to permit the implementation to survive to the point + // where it can exit cleanly as multiple threads cannot be used with this + // backend + if (_TLS_num_threads == 0) + { + // This is a thread that did not call launch_threads() but is still a + // "main" thread, probably from e.g. 
threading.Thread() use, it still + // has a TLS slot which is 0 from the lack of launch_threads() call + _TLS_num_threads = _INIT_NUM_THREADS; + } return _TLS_num_threads; } @@ -481,6 +496,8 @@ static void launch_threads(int count) queue_condition_init(&queues[i].cond); numba_new_thread(thread_worker, &queues[i]); } + + _INIT_NUM_THREADS = count; } } @@ -507,6 +524,7 @@ static void reset_after_fork(void) free(queues); queues = NULL; NUM_THREADS = -1; + _INIT_NUM_THREADS = -1; } MOD_INIT(workqueue) From 01d76bc7eec99646af90ca65463f01d5cdd68f0d Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 11:45:21 +0000 Subject: [PATCH 059/136] Fix up flake8 As title --- numba/npyufunc/parallel.py | 15 ++++++-- numba/npyufunc/tbbpool.cpp | 2 ++ numba/tests/test_num_threads.py | 52 ++++++++++++++++------------ numba/tests/test_parallel_backend.py | 1 + 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 9c351a5469f..a85169464ad 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -276,6 +276,7 @@ def build_gufunc_wrapper(py_func, cres, sin, sout, cache, is_parfors): class _nop(object): """A no-op contextmanager """ + def __enter__(self): pass @@ -456,7 +457,7 @@ def raise_with_hint(required): launch_threads = CFUNCTYPE(None, c_int)(lib.launch_threads) launch_threads(NUM_THREADS) - _load_num_threads_funcs(lib) # load late + _load_num_threads_funcs(lib) # load late # set library name so it can be queried global _threading_layer @@ -464,7 +465,6 @@ def raise_with_hint(required): _is_initialized = True - def _load_num_threads_funcs(lib): ll.add_symbol('get_num_threads', lib.get_num_threads) @@ -534,11 +534,13 @@ def set_num_threads(n): @overload(set_num_threads) def ol_set_num_threads(n): _launch_threads() + def impl(n): snt_check(n) _set_num_threads(n) return impl + def get_num_threads(): """ Get the number of threads used for parallel execution. 
@@ -566,12 +568,15 @@ def get_num_threads(): if num_threads == 0: print("Broken: ", _get_thread_id()) raise RuntimeError("Invalid number of threads. " - "This likely indicates a bug in numba.", _get_thread_id()) + "This likely indicates a bug in numba.", + _get_thread_id()) return num_threads + @overload(get_num_threads) def ol_get_num_threads(): _launch_threads() + def impl(): num_threads = _get_num_threads() if num_threads == 0: @@ -581,6 +586,7 @@ def impl(): return num_threads return impl + def _get_thread_id(): """ docs @@ -588,13 +594,16 @@ def _get_thread_id(): _launch_threads() return _get_thread_id() + @overload(_get_thread_id) def ol_get_thread_id(): _launch_threads() + def impl(): return _get_thread_id() return impl + _DYLD_WORKAROUND_SET = 'NUMBA_DYLD_WORKAROUND' in os.environ _DYLD_WORKAROUND_VAL = int(os.environ.get('NUMBA_DYLD_WORKAROUND', 0)) diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index b7fc8a9f0b1..08e1e48a48b 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -2,6 +2,8 @@ Implement parallel vectorize workqueue on top of Intel TBB. 
*/ +#define TBB_USE_DEBUG 1 +#define __TBB_EXTRA_DEBUG 1 #define TBB_PREVIEW_WAITING_FOR_WORKERS 1 /* tbb.h redefines these */ #include "../_pymodule.h" diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 1df85ea2943..9158132b0b5 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -14,6 +14,7 @@ from .support import TestCase, skip_parfors_unsupported, tag from .test_parallel_backend import TestInSubprocess + class TestNumThreads(TestCase): _numba_parallel_test_ = False @@ -125,7 +126,6 @@ def test_func(): def test_gufunc(x): x[:] = get_num_threads() - out = test_func() np.testing.assert_equal(out, 2) @@ -223,7 +223,6 @@ def test_func(): out = test_func() self.assertEqual(out, (mask, mask)) - @guvectorize(['void(int64[:], int64[:])'], '(n), (m)', nopython=True, @@ -252,7 +251,7 @@ def _test_nested_parallelism_1(self): mask = config.NUMBA_NUM_THREADS - 1 N = config.NUMBA_NUM_THREADS - M = 2*config.NUMBA_NUM_THREADS + M = 2 * config.NUMBA_NUM_THREADS @njit(parallel=True) def child_func(buf, fid): @@ -270,7 +269,8 @@ def _test_func(nthreads): set_num_threads(nthreads) for i in prange(M): local_mask = 1 + i % mask - set_num_threads(local_mask) # set threads in parent function + # set threads in parent function + set_num_threads(local_mask) if local_mask < N: child_func(buf, local_mask) acc += get_num_threads() @@ -284,7 +284,8 @@ def _test_func(nthreads): def test_func(nthreads, py_func=False): def _test_func(acc, buf, local_mask): set_num_threads(nthreads) - set_num_threads(local_mask[0]) # set threads in parent function + # set threads in parent function + set_num_threads(local_mask[0]) if local_mask[0] < N: child_func(buf, local_mask[0]) acc[0] += get_num_threads() @@ -292,13 +293,14 @@ def _test_func(acc, buf, local_mask): buf = np.zeros((M, N), dtype=np.int64) acc = np.array([0]) local_mask = (1 + np.arange(M) % mask).reshape((M, 1)) + sig = ['void(int64[:], int64[:, :], int64[:])'] if not py_func: 
- _test_func = guvectorize(['void(int64[:], int64[:, :], int64[:])'], - '(k), (n, m), (p)', nopython=True, + _test_func = guvectorize(sig, '(k), (n, m), (p)', + nopython=True, target='parallel')(_test_func) else: - _test_func = guvectorize(['void(int64[:], int64[:, :], int64[:])'], - '(k), (n, m), (p)', forceobj=True)(_test_func) + _test_func = guvectorize(sig, '(k), (n, m), (p)', + forceobj=True)(_test_func) _test_func(acc, buf, local_mask) return acc, buf @@ -331,7 +333,7 @@ def _test_nested_parallelism_2(self): # check that get_num_threads is ok in nesting N = config.NUMBA_NUM_THREADS + 1 - M = 4*config.NUMBA_NUM_THREADS + 1 + M = 4 * config.NUMBA_NUM_THREADS + 1 def get_impl(child_type, test_type): @@ -340,7 +342,8 @@ def get_impl(child_type, test_type): elif child_type == 'njit': child_dec = njit(parallel=False) elif child_type == 'none': - def child_dec(x): return x + def child_dec(x): + return x @child_dec def child(buf, fid): @@ -349,14 +352,14 @@ def child(buf, fid): for i in prange(N): buf[fid, i] = get_num_threads() - if test_type in ['parallel', 'njit', 'none']: if test_type == 'parallel': test_dec = njit(parallel=True) elif test_type == 'njit': test_dec = njit(parallel=False) elif test_type == 'none': - def test_dec(x): return x + def test_dec(x): + return x @test_dec def test_func(nthreads): @@ -364,9 +367,9 @@ def test_func(nthreads): set_num_threads(nthreads) for i in prange(M): local_mask = 1 + i % mask - # when the threads exit the child functions they should have - # a TLS slot value of the local mask as it was set in - # child + # when the threads exit the child functions they should + # have a TLS slot value of the local mask as it was set + # in child if local_mask < config.NUMBA_NUM_THREADS: child(buf, local_mask) assert get_num_threads() == local_mask @@ -384,9 +387,9 @@ def test_func(nthreads): @test_dec def _test_func(buf, local_mask): set_num_threads(nthreads) - # when the threads exit the child functions they should have - # a TLS slot 
value of the local mask as it was set in - # child + # when the threads exit the child functions they should + # have a TLS slot value of the local mask as it was set + # in child if local_mask[0] < config.NUMBA_NUM_THREADS: child(buf, local_mask[0]) assert get_num_threads() == local_mask[0] @@ -401,12 +404,14 @@ def _test_func(buf, local_mask): mask = config.NUMBA_NUM_THREADS - 1 res_arrays = {} - for test_type in ['parallel', 'njit', 'none', 'guvectorize', 'guvectorize-obj']: + for test_type in ['parallel', 'njit', 'none', + 'guvectorize', 'guvectorize-obj']: for child_type in ['parallel', 'njit', 'none']: if child_type == 'none' and test_type != 'none': continue set_num_threads(mask) - res_arrays[test_type, child_type] = get_impl(child_type, test_type)(mask) + res_arrays[test_type, child_type] = get_impl( + child_type, test_type)(mask) py_arr = res_arrays['none', 'none'] for arr in res_arrays.values(): @@ -414,7 +419,8 @@ def _test_func(buf, local_mask): # check the maths reconciles math_arr = np.zeros((M, N)) - for i in range(1, config.NUMBA_NUM_THREADS): # there's branches on modulo mask but only NUMBA_NUM_THREADS funcs + # there's branches on modulo mask but only NUMBA_NUM_THREADS funcs + for i in range(1, config.NUMBA_NUM_THREADS): math_arr[i, :] = i np.testing.assert_equal(math_arr, py_arr) @@ -480,7 +486,6 @@ def test_func_guvectorize(total, lens): self.assertEqual(expected_acc, got_acc) np.testing.assert_equal(expected_thread_count, got_tc) - def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) @@ -530,6 +535,7 @@ def generate(cls): continue cls._inject(name, backend, backend_guard, num_threads) + TestNumThreadsBackends.generate() if __name__ == '__main__': diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 6f82de7e6ba..1ec3379211f 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -367,6 +367,7 @@ def run_test_in_separate_process(self, test, 
threading_layer): cmdline = [sys.executable, "-m", "numba.runtests", test] return self.run_cmd(cmdline, env_copy) + class TestSpecificBackend(TestInSubprocess, TestParallelBackendBase): """ This is quite contrived, for each test in the TestParallelBackend tests it From 9a99a119d82a79c90389514ab52c3c83e92a8bca Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 12:09:45 +0000 Subject: [PATCH 060/136] Work around TBB scheduler not guaranteeing full mask use. As title --- numba/tests/test_num_threads.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 9158132b0b5..5d4bb43a401 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -433,6 +433,16 @@ def _test_nested_parallelism_3(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") + def check_mask(expected, result): + # There's no guarantee that TBB will use a full mask worth of + # threads if it deems it inefficient to do so + if threading_layer() == 'tbb': + self.assertTrue(np.all(result <= expected)) + elif threading_layer() == 'omp': + np.testing.assert_equal(expected, result) + else: + assert 0, 'unreachable' + # check that the right number of threads are present in nesting # this relies on there being a load of cores present BIG = 1000000 @@ -464,7 +474,7 @@ def test_func_jit(nthreads): got_acc, got_tc = test_func_jit(NT) self.assertEqual(expected_acc, got_acc) - np.testing.assert_equal(expected_thread_count, got_tc) + check_mask(expected_thread_count, got_tc) def test_guvectorize(nthreads): @guvectorize(['int64[:], int64[:]'], @@ -484,7 +494,7 @@ def test_func_guvectorize(total, lens): got_acc, got_tc = test_guvectorize(NT) self.assertEqual(expected_acc, got_acc) - np.testing.assert_equal(expected_thread_count, got_tc) + check_mask(expected_thread_count, got_tc) def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) 
From 48ffe548f3007cdf2eddff3b6795d146df052f72 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 12:19:51 +0000 Subject: [PATCH 061/136] Add OpenMP vendor string to numba -s As title --- numba/numba_entry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba/numba_entry.py b/numba/numba_entry.py index 3747b234f45..5264dcd1fe0 100644 --- a/numba/numba_entry.py +++ b/numba/numba_entry.py @@ -280,6 +280,7 @@ def parse_error(e, backend): try: from numba.npyufunc import omppool print(fmt % ("OpenMP Threading layer available", True)) + print(fmt % ("+--> Vendor: ", omppool.openmp_vendor)) except ImportError as e: print(fmt % ("OpenMP Threading layer available", False)) print(fmt % ("+--> Disabled due to", From 5b323b615c960a84813b99929fe43349601296e9 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 13:30:01 +0000 Subject: [PATCH 062/136] Fix up OMP vendor ordering and test. As title. --- numba/npyufunc/omppool.cpp | 4 ++-- numba/tests/test_parallel_backend.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index b1830a9e94f..1002d2c9741 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -30,10 +30,10 @@ Threading layer on top of OpenMP. 
// OpenMP vendor strings #if defined(_MSC_VER) #define _OMP_VENDOR "MS" -#elif defined(__GNUC__) -#define _OMP_VENDOR "GNU" #elif defined(__clang__) #define _OMP_VENDOR "Intel" +#elif defined(__GNUC__) // NOTE: clang also defines this, but it's checked above +#define _OMP_VENDOR "GNU" #endif #if defined(__GNUC__) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 1ec3379211f..ecbb2a64017 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -858,5 +858,25 @@ def run_cmd(cmdline): print("ERR:", err) +@parfors_skip_unsupported +@skip_no_omp +class TestOpenMPVendors(TestCase): + + def test_vendors(self): + """ + Checks the OpenMP vendor strings are correct + """ + expected = dict() + expected['win32'] = "MS" + expected['darwin'] = "Intel" + expected['linux'] = "GNU" + + # only check OS that are supported, custom toolchains may well work as + # may other OS + for k in expected.keys(): + if sys.platform.startswith(k): + self.assertEqual(expected[k], omppool.openmp_vendor) + + if __name__ == '__main__': unittest.main() From 6b8338e1cf05f7245a03529907ac3e95d3d97696 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 13:49:36 +0000 Subject: [PATCH 063/136] Update forksafe definition site As title --- numba/npyufunc/omppool.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/numba/npyufunc/omppool.cpp b/numba/npyufunc/omppool.cpp index 1002d2c9741..ade5d93aeb8 100644 --- a/numba/npyufunc/omppool.cpp +++ b/numba/npyufunc/omppool.cpp @@ -33,10 +33,11 @@ Threading layer on top of OpenMP. 
#elif defined(__clang__) #define _OMP_VENDOR "Intel" #elif defined(__GNUC__) // NOTE: clang also defines this, but it's checked above +#define _NOT_FORKSAFE 1 // GNU OpenMP Not forksafe #define _OMP_VENDOR "GNU" #endif -#if defined(__GNUC__) +#if defined(_NOT_FORKSAFE) static pid_t parent_pid = 0; // 0 is not set, users can't own this anyway #endif @@ -102,7 +103,7 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat printed = true; } -#if defined(__GNUC__) +#if defined(_NOT_FORKSAFE) // Handle GNU OpenMP not being forksafe... // This checks if the pid set by the process that initialized this library // matches the parent of this pid. If they do match this is a fork() from @@ -209,7 +210,7 @@ static void launch_threads(int count) { // this must be called in a fork+thread safe region from Python static bool initialized = false; -#ifdef __GNUC__ +#ifdef _NOT_FORKSAFE parent_pid = getpid(); // record the parent PID for use later if(_DEBUG_FORK) { From ce196b3bd0b184e4b157db3f3f64aeee2763af4b Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 14:32:27 +0000 Subject: [PATCH 064/136] TMP: Add conda-forge TBB for windows --- buildscripts/azure/azure-windows.yml | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/buildscripts/azure/azure-windows.yml b/buildscripts/azure/azure-windows.yml index f3fe8cc49ed..acf639ba307 100644 --- a/buildscripts/azure/azure-windows.yml +++ b/buildscripts/azure/azure-windows.yml @@ -43,22 +43,18 @@ jobs: call activate %CONDA_ENV% conda remove -y tbb tbb-devel displayName: 'Remove TBB' - condition: eq(variables['PYTHON'], '2.7') - script: | - buildscripts\\incremental\\build.cmd - displayName: 'Build' + # temporarily patch this in to get a recent TBB + call activate %CONDA_ENV% + conda install -c conda-forge -y tbb tbb-devel + displayName: 'Add in conda-forge TBB' - script: | - # One of the tbb tests is failing on Azure. 
Removing tbb before - # testing until we can figure out why. Only do this for Python 3 - # because we already removed TBB before build on Python 2. - call activate %CONDA_ENV% - conda remove -y tbb tbb-devel - displayName: 'Remove TBB' - condition: ne(variables['PYTHON'], '2.7') + buildscripts\\incremental\\build.cmd + displayName: 'Build' - # not working on windows? + # not working on windows? #- script: | # call activate %CONDA_ENV% # numba.exe -s From 234a88e4c8f95d0911a978de676bc368cd5def0e Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 15:21:35 +0000 Subject: [PATCH 065/136] Get test type in feedback message --- numba/tests/test_num_threads.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 5d4bb43a401..cd12d8c9146 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -311,16 +311,21 @@ def _test_func(acc, buf, local_mask): got_acc, got_arr = test_func(mask) exp_acc, exp_arr = test_func(mask, py_func=True) - self.assertEqual(exp_acc, got_acc, test_type) - np.testing.assert_equal(exp_arr, got_arr) - - # check the maths reconciles - math_acc = np.sum(1 + np.arange(M) % mask) - self.assertEqual(math_acc, got_acc) - math_arr = np.zeros((M, N)) - for i in range(1, N): # there's branches on 1, ..., num_threads - 1 - math_arr[i, :] = i - np.testing.assert_equal(math_arr, got_arr) + try: + self.assertEqual(exp_acc, got_acc, test_type) + np.testing.assert_equal(exp_arr, got_arr) + + # check the maths reconciles + math_acc = np.sum(1 + np.arange(M) % mask) + self.assertEqual(math_acc, got_acc) + math_arr = np.zeros((M, N)) + for i in range(1, N): + # there's branches on 1, ..., num_threads - 1 + math_arr[i, :] = i + np.testing.assert_equal(math_arr, got_arr) + except Exception as e: + msg = "TYPE: %s, error: %s" % (test_type, e.args[0]) + raise type(e)(msg, *e.args[1:]) # this test can only run on 
OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends From f23a38521b481279c1968aa81aeeec0a8deb29a2 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 15:33:52 +0000 Subject: [PATCH 066/136] Extend TBB flexible testing to other tests --- numba/tests/test_num_threads.py | 36 ++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index cd12d8c9146..5b3abdb8c8e 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -23,6 +23,16 @@ def setUp(self): # the threads are launched. set_num_threads(config.NUMBA_NUM_THREADS) + def check_mask(self, expected, result): + # There's no guarantee that TBB will use a full mask worth of + # threads if it deems it inefficient to do so + if threading_layer() == 'tbb': + self.assertTrue(np.all(result <= expected)) + elif threading_layer() in ('omp', 'workqueue'): + np.testing.assert_equal(expected, result) + else: + assert 0, 'unreachable' + @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_basic(self): @@ -186,7 +196,7 @@ def test_func(): return len(np.unique(buf)), get_num_threads() out = test_func() - self.assertEqual(out, (mask, mask)) + self.check_mask((mask, mask), out) @guvectorize(['void(int64[:], int64[:])'], '(n), (m)', @@ -200,8 +210,8 @@ def test_gufunc(x, out): x = np.full((5000000,), -1, dtype=np.int64).reshape((100, 50000)) out = np.zeros((1,), dtype=np.int64) test_gufunc(x, out) - np.testing.assert_equal(out, np.array([mask])) - self.assertEqual(len(np.unique(x)), mask) + self.check_mask(mask, out) + self.check_mask(mask, len(np.unique(x))) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") @@ -221,7 +231,7 @@ def test_func(): return len(np.unique(buf)), get_num_threads() out = test_func() - self.assertEqual(out, (mask, 
mask)) + self.check_mask((mask, mask), out) @guvectorize(['void(int64[:], int64[:])'], '(n), (m)', @@ -236,8 +246,8 @@ def test_gufunc(x, out): x = np.full((5000000,), -1, dtype=np.int64).reshape((100, 50000)) out = np.zeros((1,), dtype=np.int64) test_gufunc(x, out) - np.testing.assert_equal(out, np.array([mask])) - self.assertEqual(len(np.unique(x)), mask) + self.check_mask(mask, out) + self.check_mask(mask, len(np.unique(x))) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @@ -438,16 +448,6 @@ def _test_nested_parallelism_3(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") - def check_mask(expected, result): - # There's no guarantee that TBB will use a full mask worth of - # threads if it deems it inefficient to do so - if threading_layer() == 'tbb': - self.assertTrue(np.all(result <= expected)) - elif threading_layer() == 'omp': - np.testing.assert_equal(expected, result) - else: - assert 0, 'unreachable' - # check that the right number of threads are present in nesting # this relies on there being a load of cores present BIG = 1000000 @@ -479,7 +479,7 @@ def test_func_jit(nthreads): got_acc, got_tc = test_func_jit(NT) self.assertEqual(expected_acc, got_acc) - check_mask(expected_thread_count, got_tc) + self.check_mask(expected_thread_count, got_tc) def test_guvectorize(nthreads): @guvectorize(['int64[:], int64[:]'], @@ -499,7 +499,7 @@ def test_func_guvectorize(total, lens): got_acc, got_tc = test_guvectorize(NT) self.assertEqual(expected_acc, got_acc) - check_mask(expected_thread_count, got_tc) + self.check_mask(expected_thread_count, got_tc) def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) From 05536255ce52c70c16ad8388311c8c24155c5ac2 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 22:11:10 +0000 Subject: [PATCH 067/136] Add numba -s to azure --- buildscripts/azure/azure-windows.yml | 9 ++++----- 1 file changed, 4 insertions(+), 
5 deletions(-) diff --git a/buildscripts/azure/azure-windows.yml b/buildscripts/azure/azure-windows.yml index acf639ba307..050ec64379a 100644 --- a/buildscripts/azure/azure-windows.yml +++ b/buildscripts/azure/azure-windows.yml @@ -54,11 +54,10 @@ jobs: buildscripts\\incremental\\build.cmd displayName: 'Build' - # not working on windows? - #- script: | - # call activate %CONDA_ENV% - # numba.exe -s - # displayName: 'Display numba system information' + - script: | + call activate %CONDA_ENV% + python -m numba -s + displayName: 'Display numba system information' - script: | call activate %CONDA_ENV% From 5dd3bbc7ca141b7c2ad8d9bca9421291ce4a6df2 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 8 Jan 2020 22:20:22 +0000 Subject: [PATCH 068/136] Debug output --- numba/tests/test_num_threads.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 5b3abdb8c8e..df0b8c2a013 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -318,9 +318,11 @@ def _test_func(acc, buf, local_mask): for test_type in ['njit', 'guvectorize']: test_func = get_test(test_type) - got_acc, got_arr = test_func(mask) exp_acc, exp_arr = test_func(mask, py_func=True) + print(test_type.center(80, '-')) + print(got_acc, '\n', got_arr) + print(exp_acc, '\n', exp_arr) try: self.assertEqual(exp_acc, got_acc, test_type) np.testing.assert_equal(exp_arr, got_arr) @@ -328,6 +330,7 @@ def _test_func(acc, buf, local_mask): # check the maths reconciles math_acc = np.sum(1 + np.arange(M) % mask) self.assertEqual(math_acc, got_acc) + print(math_acc, '\n', got_acc) math_arr = np.zeros((M, N)) for i in range(1, N): # there's branches on 1, ..., num_threads - 1 From 74b81d3ff9f1f7e4ab717fcb6d707173a3532cfb Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 11:39:13 +0000 Subject: [PATCH 069/136] Fix up guvectorize race condition. As title. 
--- numba/tests/test_num_threads.py | 44 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index df0b8c2a013..376d82bc68a 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -301,15 +301,15 @@ def _test_func(acc, buf, local_mask): acc[0] += get_num_threads() buf = np.zeros((M, N), dtype=np.int64) - acc = np.array([0]) + acc = np.zeros((M, 1), dtype=np.int64) local_mask = (1 + np.arange(M) % mask).reshape((M, 1)) sig = ['void(int64[:], int64[:, :], int64[:])'] + layout = '(p), (n, m), (p)' if not py_func: - _test_func = guvectorize(sig, '(k), (n, m), (p)', - nopython=True, + _test_func = guvectorize(sig, layout, nopython=True, target='parallel')(_test_func) else: - _test_func = guvectorize(sig, '(k), (n, m), (p)', + _test_func = guvectorize(sig, layout, forceobj=True)(_test_func) _test_func(acc, buf, local_mask) return acc, buf @@ -320,25 +320,23 @@ def _test_func(acc, buf, local_mask): test_func = get_test(test_type) got_acc, got_arr = test_func(mask) exp_acc, exp_arr = test_func(mask, py_func=True) - print(test_type.center(80, '-')) - print(got_acc, '\n', got_arr) - print(exp_acc, '\n', exp_arr) - try: - self.assertEqual(exp_acc, got_acc, test_type) - np.testing.assert_equal(exp_arr, got_arr) - - # check the maths reconciles - math_acc = np.sum(1 + np.arange(M) % mask) - self.assertEqual(math_acc, got_acc) - print(math_acc, '\n', got_acc) - math_arr = np.zeros((M, N)) - for i in range(1, N): - # there's branches on 1, ..., num_threads - 1 - math_arr[i, :] = i - np.testing.assert_equal(math_arr, got_arr) - except Exception as e: - msg = "TYPE: %s, error: %s" % (test_type, e.args[0]) - raise type(e)(msg, *e.args[1:]) + np.testing.assert_equal(exp_acc, got_acc) + np.testing.assert_equal(exp_arr, got_arr) + + # check the maths reconciles, guvectorize does not reduce, njit does + math_acc_exp = 1 + np.arange(M) % mask + if 
test_type == 'guvectorize': + math_acc = math_acc_exp.reshape((M, 1)) + else: + math_acc = np.sum(math_acc_exp) + + np.testing.assert_equal(math_acc, got_acc) + + math_arr = np.zeros((M, N)) + for i in range(1, N): + # there's branches on 1, ..., num_threads - 1 + math_arr[i, :] = i + np.testing.assert_equal(math_arr, got_arr) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends From 8eb2a09aa12884425afadb8f81da970c972005dd Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 11:58:20 +0000 Subject: [PATCH 070/136] Fix another race condition --- numba/tests/test_num_threads.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 376d82bc68a..a734c0a1a46 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -443,6 +443,8 @@ def _test_func(buf, local_mask): # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends + # This test needs at least 3 threads to run, N>=2 for the launch, M>=N+1 for + # the nested function @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 3, "Not enough CPU cores") def _test_nested_parallelism_3(self): @@ -454,10 +456,10 @@ def _test_nested_parallelism_3(self): BIG = 1000000 @njit(parallel=True) - def work(local_nt): + def work(local_nt): # arg is value 3 tid = np.zeros(BIG) acc = 0 - set_num_threads(local_nt) + set_num_threads(local_nt) # set to 3 threads for i in prange(BIG): acc += 1 tid[i] = _get_thread_id() @@ -465,11 +467,11 @@ def work(local_nt): @njit(parallel=True) def test_func_jit(nthreads): - set_num_threads(nthreads) + set_num_threads(nthreads) # set to 2 threads lens = np.zeros(nthreads) total = 0 for i in prange(nthreads): - my_acc, tids = work(nthreads + 1) + my_acc, tids = work(nthreads + 1) # call with value 3 lens[i] = len(tids) total += my_acc 
return total, np.unique(lens) @@ -484,21 +486,23 @@ def test_func_jit(nthreads): def test_guvectorize(nthreads): @guvectorize(['int64[:], int64[:]'], - '(n), (m)', + '(n), (n)', nopython=True, target='parallel') def test_func_guvectorize(total, lens): my_acc, tids = work(nthreads + 1) - lens[:] = len(tids) - total += my_acc + lens[0] = len(tids) + total[0] += my_acc - total = np.array([0]) + total = np.zeros((nthreads, 1), dtype=np.int64) lens = np.zeros(nthreads, dtype=np.int64).reshape((nthreads, 1)) test_func_guvectorize(total, lens) - return total, np.unique(lens) + # vectorize does not reduce, so total is summed + return total.sum(), np.unique(lens) got_acc, got_tc = test_guvectorize(NT) + self.assertEqual(expected_acc, got_acc) self.check_mask(expected_thread_count, got_tc) From 1fa4da7c1599bd4e7cb7bb9a7bb9ac851dfacbc4 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 13:14:50 +0000 Subject: [PATCH 071/136] Trying out nested parallelism prevention in workqueue --- numba/npyufunc/tbbpool.cpp | 2 -- numba/npyufunc/workqueue.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index 08e1e48a48b..b7fc8a9f0b1 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -2,8 +2,6 @@ Implement parallel vectorize workqueue on top of Intel TBB. */ -#define TBB_USE_DEBUG 1 -#define __TBB_EXTRA_DEBUG 1 #define TBB_PREVIEW_WAITING_FOR_WORKERS 1 /* tbb.h redefines these */ #include "../_pymodule.h" diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index b2981db48b5..1cf4798d8fd 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -21,12 +21,16 @@ race condition. #include #include #include +#include #define NUMBA_WINTHREAD #else /* PThread */ #include #include #include +#include +#include +#include #define NUMBA_PTHREAD #endif @@ -38,6 +42,16 @@ race condition. 
#define _DEBUG 0 +/* workqueue is not threadsafe, so we use DSO globals to flag and update various + * states. + */ +/* This variable is the nesting level, it's incremented at the start of each + * parallel region and decremented at the end, if parallel regions are nested + * on entry the value == 1 and workqueue will abort (this in preference to just + * hanging or segfaulting). + */ +static int _nesting_level = 0; + /* As the thread-pool isn't inherited by children, free the task-queue, too. */ static void reset_after_fork(void); @@ -312,6 +326,20 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat // steps = // data = + // check the nesting level, if it's already 1, abort, workqueue cannot + // handle nesting. + if (_nesting_level >= 1){ + fprintf(stderr, "%s", "Terminating: Nested parallel kernel launch " + "detected, the workqueue threading layer does " + "not supported nested parallelism. Try the TBB " + "threading layer.\n"); + raise(SIGTERM); + return; + } + + // increment the nest level + _nesting_level += 1; + size_t * count_space = NULL; char ** array_arg_space = NULL; const size_t arg_len = (inner_ndim + 1); @@ -431,6 +459,8 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat synchronize(); queue_count = old_queue_count; + // decrement the nest level + _nesting_level -= 1; } static void @@ -525,6 +555,7 @@ static void reset_after_fork(void) queues = NULL; NUM_THREADS = -1; _INIT_NUM_THREADS = -1; + _nesting_level = 0; } MOD_INIT(workqueue) From 3c33a0b8c765ff65b5cd70d2f998e3908dd58a5d Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 13:41:58 +0000 Subject: [PATCH 072/136] Add test for sigterm on nested workqueue use --- numba/npyufunc/workqueue.c | 2 +- numba/tests/test_parallel_backend.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 1cf4798d8fd..92940a46139 
100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -333,7 +333,7 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat "detected, the workqueue threading layer does " "not supported nested parallelism. Try the TBB " "threading layer.\n"); - raise(SIGTERM); + raise(SIGABRT); return; } diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index ecbb2a64017..c01af21ca11 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -8,6 +8,7 @@ import multiprocessing import random import os +import signal import sys import subprocess @@ -579,6 +580,45 @@ def foo(n): print(out, err) self.assertIn("@tbb@", out) + def test_workqueue_aborts_on_nested_parallelism(self): + """ + Tests workqueue raises sigabrt if a nested parallel call is performed + """ + runme = """if 1: + from numba import njit, prange + import numpy as np + + @njit(parallel=True) + def nested(x): + for i in prange(len(x)): + x[i] += 1 + + + @njit(parallel=True) + def main(): + Z = np.zeros((5, 10)) + for i in prange(Z.shape[0]): + nested(Z[i]) + return Z + + main() + """ + cmdline = [sys.executable, '-c', runme] + env = os.environ.copy() + env['NUMBA_THREADING_LAYER'] = "workqueue" + env['NUMBA_NUM_THREADS'] = "4" + + try: + out, err = self.run_cmd(cmdline, env=env) + except AssertionError as e: + if self._DEBUG: + print(out, err) + e_msg = str(e) + self.assertIn("failed with code", e_msg) + self.assertIn(str(signal.SIGTERM.value), e_msg) + self.assertIn("Terminating: Nested parallel kernel launch detected", + e_msg) + # 32bit or windows py27 (not that this runs on windows) @parfors_skip_unsupported From 3f080fbaa5b01111aadb2c6fb451719b7441b6ce Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 14:20:07 +0000 Subject: [PATCH 073/136] Update signal --- numba/npyufunc/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 92940a46139..1cf4798d8fd 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -333,7 +333,7 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat "detected, the workqueue threading layer does " "not supported nested parallelism. Try the TBB " "threading layer.\n"); - raise(SIGABRT); + raise(SIGTERM); return; } From 2bcf552685ed9edcbf571097b0004e64bfedcadb Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 15:35:41 +0000 Subject: [PATCH 074/136] Update signal for windows --- numba/tests/test_parallel_backend.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index c01af21ca11..00fde72a57d 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -615,7 +615,13 @@ def main(): print(out, err) e_msg = str(e) self.assertIn("failed with code", e_msg) - self.assertIn(str(signal.SIGTERM.value), e_msg) + # raised a SIGTERM, on windows in practise this seems to + # materialise as a SIGQUIT + # https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/signal?view=vs-2019 + if _windows: + self.assertIn(str(signal.SIGQUIT.value), e_msg) + else: + self.assertIn(str(signal.SIGTERM.value), e_msg) self.assertIn("Terminating: Nested parallel kernel launch detected", e_msg) From 5f5179739ae7228f979a3151eae8302992b9ee0d Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 15:43:12 +0000 Subject: [PATCH 075/136] Fix flake8 --- numba/tests/test_parallel_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 00fde72a57d..7767738acfe 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -615,9 +615,9 @@ def main(): print(out, err) e_msg = str(e) 
self.assertIn("failed with code", e_msg) - # raised a SIGTERM, on windows in practise this seems to + # raised a SIGTERM, on windows in practise this seems to # materialise as a SIGQUIT - # https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/signal?view=vs-2019 + # https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/signal?view=vs-2019 # noqa: E501 if _windows: self.assertIn(str(signal.SIGQUIT.value), e_msg) else: From b95da892283367614560b417411563c92ca8f113 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 16:07:14 +0000 Subject: [PATCH 076/136] Move to SIGABRT as all OS have that?! --- numba/npyufunc/workqueue.c | 2 +- numba/tests/test_parallel_backend.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 1cf4798d8fd..92940a46139 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -333,7 +333,7 @@ parallel_for(void *fn, char **args, size_t *dimensions, size_t *steps, void *dat "detected, the workqueue threading layer does " "not supported nested parallelism. 
Try the TBB " "threading layer.\n"); - raise(SIGTERM); + raise(SIGABRT); return; } diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 7767738acfe..7c11dc67559 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -615,13 +615,9 @@ def main(): print(out, err) e_msg = str(e) self.assertIn("failed with code", e_msg) - # raised a SIGTERM, on windows in practise this seems to - # materialise as a SIGQUIT + # raised a SIGABRT, this seems to exist everywhere # https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/signal?view=vs-2019 # noqa: E501 - if _windows: - self.assertIn(str(signal.SIGQUIT.value), e_msg) - else: - self.assertIn(str(signal.SIGTERM.value), e_msg) + self.assertIn(str(signal.SIGABRT.value), e_msg) self.assertIn("Terminating: Nested parallel kernel launch detected", e_msg) From 370b73279a7e0703d553185c150d1e071ef5bb13 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 17:32:00 +0000 Subject: [PATCH 077/136] Stop checking signal value --- numba/tests/test_parallel_backend.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 7c11dc67559..b23807c4777 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -8,7 +8,6 @@ import multiprocessing import random import os -import signal import sys import subprocess @@ -615,9 +614,8 @@ def main(): print(out, err) e_msg = str(e) self.assertIn("failed with code", e_msg) - # raised a SIGABRT, this seems to exist everywhere - # https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/signal?view=vs-2019 # noqa: E501 - self.assertIn(str(signal.SIGABRT.value), e_msg) + # raised a SIGABRT, but the value is platform specific so just check + # the error message self.assertIn("Terminating: Nested parallel kernel launch detected", e_msg) From 
0b2e22220a0f8dddce7c776aed88d6f73cc58e54 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 9 Jan 2020 20:56:20 +0000 Subject: [PATCH 078/136] Cast pthread_self() to int. This is not ideal, but is likely sufficient given the use case (testing and debug). --- numba/npyufunc/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/workqueue.c b/numba/npyufunc/workqueue.c index 92940a46139..e732a3c368c 100644 --- a/numba/npyufunc/workqueue.c +++ b/numba/npyufunc/workqueue.c @@ -131,7 +131,7 @@ numba_new_thread(void *worker, void *arg) static int get_thread_id(void) { - return pthread_self(); + return (int)pthread_self(); } #endif From d67b21fb65a9f1455b9eba7e40b113283209459c Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 9 Jan 2020 15:26:11 -0700 Subject: [PATCH 079/136] Add thread mask tests to test_parallel_backend --- numba/tests/test_parallel_backend.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index b23807c4777..4c64a8e2d9d 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -16,7 +16,7 @@ from numba import config, utils from numba import unittest_support as unittest -from numba import jit, vectorize, guvectorize +from numba import jit, vectorize, guvectorize, set_num_threads from .support import temp_directory, override_config, TestCase, tag @@ -110,6 +110,17 @@ def __call__(self): got = cfunc(a, b) np.testing.assert_allclose(expected, got) +class mask_runner(object): + def __init__(self, runner, mask, **options): + self.runner = runner + self.mask = mask + + def __call__(self): + if self.mask: + # Tests are all run in isolated subprocesses, so we + # don't have to worry about this affecting other tests + set_num_threads(self.mask) + self.runner() class linalg_runner(runnable): @@ -243,6 +254,13 @@ class TestParallelBackendBase(TestCase): ] 
all_impls.extend(parfor_impls) + masks = [i for i in [1, 2, 4, 8, 16] if i <= config.NUMBA_NUM_THREADS] + + mask_impls = [] + for impl in all_impls: + for mask in masks: + mask_impls.append(mask_runner(impl, mask)) + parallelism = ['threading', 'random'] if utils.PYVERSION > (3, 0): parallelism.append('multiprocessing_spawn') @@ -263,6 +281,7 @@ class TestParallelBackendBase(TestCase): guvectorize_runner(nopython=True, target='parallel'), ], 'concurrent_mix_use': all_impls, + 'concurrent_mix_use_masks': mask_impls, } safe_backends = {'omp', 'tbb'} From 20167f395e117fca39522f267fad8c42263a4bc0 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 9 Jan 2020 16:20:13 -0700 Subject: [PATCH 080/136] Somewhat improved the error messages if get_num_threads() gets an invalid value --- numba/npyufunc/parallel.py | 13 +++++++------ numba/npyufunc/parfor.py | 3 ++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index a85169464ad..5f368cf9ec4 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -565,11 +565,11 @@ def get_num_threads(): """ _launch_threads() num_threads = _get_num_threads() - if num_threads == 0: - print("Broken: ", _get_thread_id()) + if num_threads <= 0: raise RuntimeError("Invalid number of threads. " - "This likely indicates a bug in numba.", - _get_thread_id()) + "This likely indicates a bug in numba. " + "(thread_id=%s, num_threads=%s)" % + (_get_thread_id(), num_threads)) return num_threads @@ -579,8 +579,9 @@ def ol_get_num_threads(): def impl(): num_threads = _get_num_threads() - if num_threads == 0: - print("Broken: ", _get_thread_id()) + if num_threads <= 0: + print("Broken thread_id: ", _get_thread_id()) + print("num_threads: ", num_threads) raise RuntimeError("Invalid number of threads. 
" "This likely indicates a bug in numba.") return num_threads diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 7ea65fdb32f..096c4eb595e 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1345,8 +1345,9 @@ def load_range(v): with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, num_threads.type(0))): + cgutils.printf(builder, "num_threads: %d\n", num_threads) context.call_conv.return_user_exc(builder, RuntimeError, - ("3 Invalid number of threads. " + ("Invalid number of threads. " "This likely indicates a bug in numba.",)) builder.call( From 3135b9078740baff968f7e70a5a475d07bb01e8b Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 9 Jan 2020 18:59:17 -0700 Subject: [PATCH 081/136] Use fewer masks in the parallel_backend tests --- numba/tests/test_parallel_backend.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 4c64a8e2d9d..f0e59ddf7e4 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -254,7 +254,10 @@ class TestParallelBackendBase(TestCase): ] all_impls.extend(parfor_impls) - masks = [i for i in [1, 2, 4, 8, 16] if i <= config.NUMBA_NUM_THREADS] + if config.NUMBA_NUM_THREADS < 2: + # Not enough cores + masks = [] + masks = [1, 2] mask_impls = [] for impl in all_impls: From e3e9bad9c2c7402bc88b8e8f375644c83ba01d5a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 9 Jan 2020 15:26:11 -0700 Subject: [PATCH 082/136] Add thread mask tests to test_parallel_backend --- numba/tests/test_parallel_backend.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index b23807c4777..4c64a8e2d9d 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -16,7 +16,7 @@ from numba import config, utils from numba 
import unittest_support as unittest -from numba import jit, vectorize, guvectorize +from numba import jit, vectorize, guvectorize, set_num_threads from .support import temp_directory, override_config, TestCase, tag @@ -110,6 +110,17 @@ def __call__(self): got = cfunc(a, b) np.testing.assert_allclose(expected, got) +class mask_runner(object): + def __init__(self, runner, mask, **options): + self.runner = runner + self.mask = mask + + def __call__(self): + if self.mask: + # Tests are all run in isolated subprocesses, so we + # don't have to worry about this affecting other tests + set_num_threads(self.mask) + self.runner() class linalg_runner(runnable): @@ -243,6 +254,13 @@ class TestParallelBackendBase(TestCase): ] all_impls.extend(parfor_impls) + masks = [i for i in [1, 2, 4, 8, 16] if i <= config.NUMBA_NUM_THREADS] + + mask_impls = [] + for impl in all_impls: + for mask in masks: + mask_impls.append(mask_runner(impl, mask)) + parallelism = ['threading', 'random'] if utils.PYVERSION > (3, 0): parallelism.append('multiprocessing_spawn') @@ -263,6 +281,7 @@ class TestParallelBackendBase(TestCase): guvectorize_runner(nopython=True, target='parallel'), ], 'concurrent_mix_use': all_impls, + 'concurrent_mix_use_masks': mask_impls, } safe_backends = {'omp', 'tbb'} From 81ea4fb3ad1b041e0bdfe434162aeb97f73be619 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 9 Jan 2020 16:20:13 -0700 Subject: [PATCH 083/136] Somewhat improved the error messages if get_num_threads() gets an invalid value --- numba/npyufunc/parallel.py | 13 +++++++------ numba/npyufunc/parfor.py | 3 ++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index a85169464ad..5f368cf9ec4 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -565,11 +565,11 @@ def get_num_threads(): """ _launch_threads() num_threads = _get_num_threads() - if num_threads == 0: - print("Broken: ", _get_thread_id()) + if num_threads <= 0: 
raise RuntimeError("Invalid number of threads. " - "This likely indicates a bug in numba.", - _get_thread_id()) + "This likely indicates a bug in numba. " + "(thread_id=%s, num_threads=%s)" % + (_get_thread_id(), num_threads)) return num_threads @@ -579,8 +579,9 @@ def ol_get_num_threads(): def impl(): num_threads = _get_num_threads() - if num_threads == 0: - print("Broken: ", _get_thread_id()) + if num_threads <= 0: + print("Broken thread_id: ", _get_thread_id()) + print("num_threads: ", num_threads) raise RuntimeError("Invalid number of threads. " "This likely indicates a bug in numba.") return num_threads diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 7ea65fdb32f..096c4eb595e 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1345,8 +1345,9 @@ def load_range(v): with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, num_threads.type(0))): + cgutils.printf(builder, "num_threads: %d\n", num_threads) context.call_conv.return_user_exc(builder, RuntimeError, - ("3 Invalid number of threads. " + ("Invalid number of threads. 
" "This likely indicates a bug in numba.",)) builder.call( From eb44a96085a4a16fbd7335889547c5c997e22740 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Thu, 9 Jan 2020 18:59:17 -0700 Subject: [PATCH 084/136] Use fewer masks in the parallel_backend tests --- numba/tests/test_parallel_backend.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 4c64a8e2d9d..f0e59ddf7e4 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -254,7 +254,10 @@ class TestParallelBackendBase(TestCase): ] all_impls.extend(parfor_impls) - masks = [i for i in [1, 2, 4, 8, 16] if i <= config.NUMBA_NUM_THREADS] + if config.NUMBA_NUM_THREADS < 2: + # Not enough cores + masks = [] + masks = [1, 2] mask_impls = [] for impl in all_impls: From 547fd1a3b3f7c599010e14b567cf38d66471a0ab Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 10 Jan 2020 12:51:06 +0000 Subject: [PATCH 085/136] fix flake8 --- numba/tests/test_parallel_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index f0e59ddf7e4..4dcf1ef5a8a 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -110,6 +110,7 @@ def __call__(self): got = cfunc(a, b) np.testing.assert_allclose(expected, got) + class mask_runner(object): def __init__(self, runner, mask, **options): self.runner = runner @@ -122,6 +123,7 @@ def __call__(self): set_num_threads(self.mask) self.runner() + class linalg_runner(runnable): def __call__(self): From d787cfb09613ddf0b11ef636d03ae0dcc3db86d1 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 10 Jan 2020 13:12:43 +0000 Subject: [PATCH 086/136] Fix issue in masked testing when cores=1 --- numba/tests/test_parallel_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_parallel_backend.py 
b/numba/tests/test_parallel_backend.py index 4dcf1ef5a8a..d6bb016adb8 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -259,7 +259,8 @@ class TestParallelBackendBase(TestCase): if config.NUMBA_NUM_THREADS < 2: # Not enough cores masks = [] - masks = [1, 2] + else: + masks = [1, 2] mask_impls = [] for impl in all_impls: From cbd1c45548140bb68e98297f9077cf4ff300a949 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 10 Jan 2020 13:46:37 +0000 Subject: [PATCH 087/136] refactor snt checker --- numba/npyufunc/parallel.py | 25 ++++++++++++------------- numba/npyufunc/parfor.py | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 5f368cf9ec4..9be186af8fa 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -484,23 +484,22 @@ def _load_num_threads_funcs(lib): # Some helpers to make set_num_threads jittable - -def snt_check(n): +def gen_snt_check(): from numba.config import NUMBA_NUM_THREADS msg = "The number of threads must be between 1 and %s" % NUMBA_NUM_THREADS - if n > NUMBA_NUM_THREADS or n < 1: - raise ValueError(msg) + + def snt_check(n): + if n > NUMBA_NUM_THREADS or n < 1: + raise ValueError(msg) + return snt_check + + +snt_check = gen_snt_check() @overload(snt_check) def ol_snt_check(n): - from numba.config import NUMBA_NUM_THREADS - msg = "The number of threads must be between 1 and %s" % NUMBA_NUM_THREADS - - def impl(n): - if n > NUMBA_NUM_THREADS or n < 1: - raise ValueError(msg) - return impl + return snt_check def set_num_threads(n): @@ -567,7 +566,7 @@ def get_num_threads(): num_threads = _get_num_threads() if num_threads <= 0: raise RuntimeError("Invalid number of threads. " - "This likely indicates a bug in numba. " + "This likely indicates a bug in Numba. 
" "(thread_id=%s, num_threads=%s)" % (_get_thread_id(), num_threads)) return num_threads @@ -583,7 +582,7 @@ def impl(): print("Broken thread_id: ", _get_thread_id()) print("num_threads: ", num_threads) raise RuntimeError("Invalid number of threads. " - "This likely indicates a bug in numba.") + "This likely indicates a bug in Numba.") return num_threads return impl diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 096c4eb595e..68925d306c8 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1348,7 +1348,7 @@ def load_range(v): cgutils.printf(builder, "num_threads: %d\n", num_threads) context.call_conv.return_user_exc(builder, RuntimeError, ("Invalid number of threads. " - "This likely indicates a bug in numba.",)) + "This likely indicates a bug in Numba.",)) builder.call( do_scheduling, [ From b9e11a3e2bc5143f2682f5c17f50a01e7a41ab49 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 10 Jan 2020 16:17:46 +0000 Subject: [PATCH 088/136] Assert int type in set_num_threads --- numba/npyufunc/parallel.py | 7 ++++++- numba/tests/test_num_threads.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 9be186af8fa..5534dcbf7ac 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -26,7 +26,7 @@ from numba.npyufunc import ufuncbuilder from numba.numpy_support import as_dtype -from numba import types, config, utils +from numba import types, config, utils, errors from numba.npyufunc.wrappers import _wrapper_info from numba.extending import overload @@ -526,6 +526,8 @@ def set_num_threads(n): """ _launch_threads() + if not isinstance(n, int): + raise TypeError("The number of threads specified must be an integer") snt_check(n) _set_num_threads(n) @@ -533,6 +535,9 @@ def set_num_threads(n): @overload(set_num_threads) def ol_set_num_threads(n): _launch_threads() + if not isinstance(n, types.Integer): + msg = "The number 
of threads specified must be an integer" + raise errors.TypingError(msg) def impl(n): snt_check(n) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index a734c0a1a46..56f2e970b19 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -11,6 +11,7 @@ threading_layer, guvectorize) from numba.npyufunc.parallel import _get_thread_id from numba import unittest_support as unittest +from numba.errors import TypingError from .support import TestCase, skip_parfors_unsupported, tag from .test_parallel_backend import TestInSubprocess @@ -33,6 +34,19 @@ def check_mask(self, expected, result): else: assert 0, 'unreachable' + @skip_parfors_unsupported + def test_set_num_threads_type(self): + + @njit + def foo(): + set_num_threads('wrong_type') + + expected = "The number of threads specified must be an integer" + for fn, errty in ((foo, TypingError), (foo.py_func, TypeError)): + with self.assertRaises(errty) as raises: + fn() + self.assertIn(expected, str(raises.exception)) + @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_basic(self): From fd55a90875e908ae14bce71a893f239ba9a02660 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 10 Jan 2020 10:46:55 -0700 Subject: [PATCH 089/136] Remove duplicated class --- numba/tests/test_parallel_backend.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 477e918ab19..9ec8fb1ec7e 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -122,19 +122,6 @@ def __call__(self): set_num_threads(self.mask) self.runner() -class mask_runner(object): - def __init__(self, runner, mask, **options): - self.runner = runner - self.mask = mask - - def __call__(self): - if self.mask: - # Tests are all run in isolated subprocesses, so we - # don't have to worry about this affecting other 
tests - set_num_threads(self.mask) - self.runner() - - class linalg_runner(runnable): def __call__(self): From 36295d959b985ebc3f8f811ec949063d1a04a72e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 10 Jan 2020 10:55:11 -0700 Subject: [PATCH 090/136] Allow numpy integers in set_num_threads() --- numba/npyufunc/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 5534dcbf7ac..14c82d2e970 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -526,7 +526,7 @@ def set_num_threads(n): """ _launch_threads() - if not isinstance(n, int): + if not isinstance(n, (int, np.integer)): raise TypeError("The number of threads specified must be an integer") snt_check(n) _set_num_threads(n) From 9c5e0c43c5d3ea10cb77360b3fd498f8c238518c Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 10 Jan 2020 14:19:45 -0700 Subject: [PATCH 091/136] Fix flake8 --- numba/tests/test_parallel_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 9ec8fb1ec7e..d6bb016adb8 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -110,6 +110,7 @@ def __call__(self): got = cfunc(a, b) np.testing.assert_allclose(expected, got) + class mask_runner(object): def __init__(self, runner, mask, **options): self.runner = runner @@ -122,6 +123,7 @@ def __call__(self): set_num_threads(self.mask) self.runner() + class linalg_runner(runnable): def __call__(self): From 1ee0a49440ab8c34e169c3f18ef0afd1c0a9578f Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 10 Jan 2020 17:22:50 -0700 Subject: [PATCH 092/136] Add an example of thread masking to the documentation --- docs/source/user/threading-layer.rst | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/docs/source/user/threading-layer.rst b/docs/source/user/threading-layer.rst index 
9e8fcf2b9a2..0c887256377 100644 --- a/docs/source/user/threading-layer.rst +++ b/docs/source/user/threading-layer.rst @@ -194,6 +194,61 @@ The current number of threads used by numba can be accessed with :func:`numba.get_num_threads`. Both functions work inside of a jitted function. +Example of Limiting the Number of Threads +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this example, suppose the machine we are running on has 8 cores (so +:obj:`numba.config.NUMBA_NUM_THREADS` would be ``8``). Suppose we want to run +some code with ``@njit(parallel=True)``, but we also want to run our code +concurrently in 4 different processes. With the default number of threads, +each Python process would run 8 threads, for a total in 4*8 = 32 threads, +which is oversubscription for our 8 cores. We should rather limit each process +to 2 threads, so that the total will be 4*2 = 8, which matches our number of +physical cores. + +There are two ways to do this. One is to set the :envvar:`NUMBA_NUM_THREADS` +environment variable to ``2``. + +.. code:: bash + + $ NUMBA_NUM_THREADS=2 python ourcode.py + +However, there are two downsides to this approach: + +1. :envvar:`NUMBA_NUM_THREADS` must be set before Numba is imported, and + ideally before Python is launched. As soon as Numba is imported the + environment variable is read and that number of threads is locked in as the + number of threads Numba launches. + +2. If we want to later increase the number of threads used by the process, we + cannot. :envvar:`NUMBA_NUM_THREADS` sets the *maximum* number of threads + that are launched for a process. Calling :func:`~.set_num_threads()` with a + value greater than :obj:`numba.config.NUMBA_NUM_THREADS` results in an + error. + +The advantage of this approach is that we can do it from outside of the +process without changing the code. + +Another approach is to use the :func:`numba.set_num_threads` function in our code + +.. 
code:: python + + from numba import njit, set_num_threads + + @njit(parallel=True) + def func(): + ... + + set_num_threads(2) + func() + +If we call ``set_num_threads(2)`` before executing our parallel code, it has +the same effect as calling the process with ``NUMBA_NUM_THREADS=2``, in that +the parallel code will only execute on 2 threads. However, we can later call +``set_num_threads(8)`` to increase the number of threads back to the default +size. And we do not have to worry about setting it before Numba gets imported. +It only needs to be called before the parallel function is run. + API Reference ~~~~~~~~~~~~~ From 5ed37c52fd3b766530d3afabf3442397d0cd629e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 10 Jan 2020 17:22:50 -0700 Subject: [PATCH 093/136] Add an example of thread masking to the documentation --- docs/source/user/threading-layer.rst | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/docs/source/user/threading-layer.rst b/docs/source/user/threading-layer.rst index 9e8fcf2b9a2..0c887256377 100644 --- a/docs/source/user/threading-layer.rst +++ b/docs/source/user/threading-layer.rst @@ -194,6 +194,61 @@ The current number of threads used by numba can be accessed with :func:`numba.get_num_threads`. Both functions work inside of a jitted function. +Example of Limiting the Number of Threads +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this example, suppose the machine we are running on has 8 cores (so +:obj:`numba.config.NUMBA_NUM_THREADS` would be ``8``). Suppose we want to run +some code with ``@njit(parallel=True)``, but we also want to run our code +concurrently in 4 different processes. With the default number of threads, +each Python process would run 8 threads, for a total in 4*8 = 32 threads, +which is oversubscription for our 8 cores. We should rather limit each process +to 2 threads, so that the total will be 4*2 = 8, which matches our number of +physical cores. + +There are two ways to do this. 
One is to set the :envvar:`NUMBA_NUM_THREADS` +environment variable to ``2``. + +.. code:: bash + + $ NUMBA_NUM_THREADS=2 python ourcode.py + +However, there are two downsides to this approach: + +1. :envvar:`NUMBA_NUM_THREADS` must be set before Numba is imported, and + ideally before Python is launched. As soon as Numba is imported the + environment variable is read and that number of threads is locked in as the + number of threads Numba launches. + +2. If we want to later increase the number of threads used by the process, we + cannot. :envvar:`NUMBA_NUM_THREADS` sets the *maximum* number of threads + that are launched for a process. Calling :func:`~.set_num_threads()` with a + value greater than :obj:`numba.config.NUMBA_NUM_THREADS` results in an + error. + +The advantage of this approach is that we can do it from outside of the +process without changing the code. + +Another approach is to use the :func:`numba.set_num_threads` function in our code + +.. code:: python + + from numba import njit, set_num_threads + + @njit(parallel=True) + def func(): + ... + + set_num_threads(2) + func() + +If we call ``set_num_threads(2)`` before executing our parallel code, it has +the same effect as calling the process with ``NUMBA_NUM_THREADS=2``, in that +the parallel code will only execute on 2 threads. However, we can later call +``set_num_threads(8)`` to increase the number of threads back to the default +size. And we do not have to worry about setting it before Numba gets imported. +It only needs to be called before the parallel function is run. 
+ API Reference ~~~~~~~~~~~~~ From 4b06a0faca076f354a68c7c60be3f5933b579afc Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 10 Jan 2020 10:55:11 -0700 Subject: [PATCH 094/136] Allow numpy integers in set_num_threads() --- numba/npyufunc/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 5534dcbf7ac..14c82d2e970 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -526,7 +526,7 @@ def set_num_threads(n): """ _launch_threads() - if not isinstance(n, int): + if not isinstance(n, (int, np.integer)): raise TypeError("The number of threads specified must be an integer") snt_check(n) _set_num_threads(n) From a65067544b37bffe08cfac985094859a8db7b430 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 14 Jan 2020 14:09:39 +0000 Subject: [PATCH 095/136] Initial draft of developer docs for the thread masking impl. As title. --- docs/source/developer/index.rst | 1 + .../developer/threading_implementation.rst | 114 ++++++++++++++++++ docs/source/user/threading-layer.rst | 2 + 3 files changed, 117 insertions(+) create mode 100644 docs/source/developer/threading_implementation.rst diff --git a/docs/source/developer/index.rst b/docs/source/developer/index.rst index 1a01e1e0067..b1efc6defa1 100644 --- a/docs/source/developer/index.rst +++ b/docs/source/developer/index.rst @@ -22,5 +22,6 @@ Developer Manual environment.rst hashing.rst caching.rst + threading_implementation.rst literal.rst roadmap.rst diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst new file mode 100644 index 00000000000..48eaeb879d4 --- /dev/null +++ b/docs/source/developer/threading_implementation.rst @@ -0,0 +1,114 @@ + +========================================= +Notes on Numba's threading implementation +========================================= + +The execution of the work presented by the Numba ``parallel`` targets is 
+undertaken by the Numba threading layer. Practically, the "threading layer" +is a Numba built-in library that can perform the required concurrent execution. +At the time of writing there are three threading layers available, each +implemented via a different lower level native threading library. More +information on the threading layers and appropriate selection of a threading +layer for a given application/system can be found in the +:ref:`threading layer documentation `. + +The pertinent information to note for the following sections is that the +function in the threading library that performs the parallel execution is the +``parallel_for`` function. The job of this function is to both orchestrate and +execute the parallel tasks. + +Thread masking +-------------- +:ref:`Thread masking ` was added to make +it possible for a user to programmatically alter the number of threads +performing work in the threading layer. Thread masking proved challenging to +implement as it required the development of a programming model that is suitable +for users, easy to reason about, and could be implemented safely, with +consistent behaviour across the various threading layers. + +Programming model +~~~~~~~~~~~~~~~~~ +The programming model chosen is similar to that found in OpenMP. The reasons for +this choice were that it is familiar to a lot of users, restricted in scope and +also simple. The number of threads in use is specified by calling +``set_num_threads`` and the number of threads in use can be queried by calling +``get_num_threads``.These two functions are synonymous with their OpenMP +counterparts. The execution semantic is also similar to OpenmP in that once a +parallel region is launched altering the thread mask has no impact on the +currently executing region but will have an impact on parallel regions executed +subsequently. 
+ + +The Implementation +~~~~~~~~~~~~~~~~~~ + +So as to place no further restrictions on user code other than those that +already existed in the threading layer libraries, careful consideration of the +design of thread masking was required. The "thread mask" cannot be stored in a +global value as concurrent use of the threading layer may result in classic +forms of race conditions on the value itself. Numerous designs were discussed +involving various types of mutex on such a global value, all of which were +eventually broken through thought experiment alone. It eventually transpired +that, following some OpenMP implementations, the "thread mask" is best +implemented as a ``thread local``. This means each thread that executes a Numba +parallel function will have a thread local storage (TLS) slot that contains the +value of the thread mask to use when scheduling threads in the ``parallel_for`` +function. + +The above notion of TLS use for a thread mask is relatively easy to implement, +``get_num_threads`` and ``set_num_threads`` simply need to address the TLS slot +in a given threading layer. This also means that the execution schedule for a +parallel region can be derived from a run time call to ``get_num_threads``. This +is achieved via a well known and relatively easy to implement pattern of a ``C`` +library function registration and wrapping it in the internal Numba +implementation. + +In addition to satisfying the original upfront thread masking requirements, a +few more complicated scenarios needed consideration as follows. + +Nested parallelism +****************** + +In all threading layers a "main thread" will invoke the ``parallel_for`` +function and then in the parallel region, depending on the threading layer, +some number of additional threads will assist in doing the actual work. +If the work contains a call to another parallel function (i.e. 
nested +parallelism) it is necessary for the thread making the call to know what the +"thread mask" of the main thread is so that it can propagate it into the +``parallel_for`` call it makes when executing the nested parallel function. +The implementation of this behaviour is threading layer specific but the general +principle is for the "main thread" to always "send" the value of the thread mask +from its TLS slot to all threads in the threading layer that are active in the +parallel region. These active threads then update their TLS slots with this +value prior to performing any work. The net result of this implementation detail +is that: + +* thread masks correctly propagate into nested functions +* it's still possible for each thread in a parallel region to safely have a + different mask with which to call nested functions, if it's not set explicitly + then the inherited mask from the "main thread" is used +* threading layers which have dynamic scheduling with threads potentially + joining and leaving the active pool during a ``parallel_for`` execution are + successfully accommodated +* any "main thread" thread mask is entirely decoupled from the in-flux nature + of the thread masks of the threads in the active thread pool + +Python threads independently invoking parallel functions +******************************************************** + +The threading layer launch sequence is heavily guarded to ensure that the +launch is both thread and process safe and run once per process. In a system +with numerous Python ``threading`` module threads all using Numba, the first +thread through the launch sequence will get its thread mask set +appropriately, but no further threads can run the launch sequence. This means +that other threads will need their initial thread mask set some other way, +this is achieved when ``get_num_threads`` is called and no thread mask is +present, in this case the thread mask will be set to the default. 
+ +OS ``fork()`` calls +******************* + +The use of TLS was also in part driven by the Linux (the most popular +platform for Numba use by far) having a ``fork(2, 3P)`` call that will do TLS +propagation into child processes, see ``clone(2)``'s ``CLONE_SETTLS``. + diff --git a/docs/source/user/threading-layer.rst b/docs/source/user/threading-layer.rst index 0c887256377..30a52e35909 100644 --- a/docs/source/user/threading-layer.rst +++ b/docs/source/user/threading-layer.rst @@ -194,6 +194,8 @@ The current number of threads used by numba can be accessed with :func:`numba.get_num_threads`. Both functions work inside of a jitted function. +.. _numba-threading-layer-thread-masking: + Example of Limiting the Number of Threads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 58641147259fce0fa0bcf349382ad5607c94253d Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 14 Jan 2020 14:56:14 +0000 Subject: [PATCH 096/136] Add fork test --- numba/tests/test_num_threads.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/numba/tests/test_num_threads.py b/numba/tests/test_num_threads.py index 56f2e970b19..de44f2b93ff 100644 --- a/numba/tests/test_num_threads.py +++ b/numba/tests/test_num_threads.py @@ -4,6 +4,7 @@ import sys import os import re +import multiprocessing import numpy as np @@ -520,6 +521,29 @@ def test_func_guvectorize(total, lens): self.assertEqual(expected_acc, got_acc) self.check_mask(expected_thread_count, got_tc) + @skip_parfors_unsupported + @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") + @unittest.skipIf(not sys.platform.startswith('linux'), "Linux only") + def _test_threadmask_across_fork(self): + forkctx = multiprocessing.get_context('fork') + @njit + def foo(): + return get_num_threads() + + def wrap(queue): + queue.put(foo()) + + mask = 1 + self.assertEqual(foo(), config.NUMBA_NUM_THREADS) + set_num_threads(mask) + self.assertEqual(foo(), mask) + shared_queue = forkctx.Queue() + # check TLS 
slot inheritance in fork + p = forkctx.Process(target=wrap, args=(shared_queue,)) + p.start() + p.join() + self.assertEqual(shared_queue.get(), mask) + def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) From 73681fe28810626c83640607eaed9a20f76139e0 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 13:36:41 -0700 Subject: [PATCH 097/136] Guard against get_num_threads being -1 as well --- numba/npyufunc/parfor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 68925d306c8..0fd904d0790 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1343,7 +1343,7 @@ def load_range(v): num_threads = builder.call(get_num_threads, []) - with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, + with cgutils.if_unlikely(builder, builder.icmp_signed('<=', num_threads, num_threads.type(0))): cgutils.printf(builder, "num_threads: %d\n", num_threads) context.call_conv.return_user_exc(builder, RuntimeError, From 4bc33d338b8a5ef5d2056308867b784c55c2cb68 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:30:21 -0700 Subject: [PATCH 098/136] Add a short docstring to _get_thread_id() --- numba/npyufunc/parallel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 14c82d2e970..cb5bc488c29 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -594,7 +594,9 @@ def impl(): def _get_thread_id(): """ - docs + Returns a unique ID for each thread + + This function is private and should only be used for testing purposes. 
""" _launch_threads() return _get_thread_id() From b08e0918378e724cd1fe7e5add701e038f8ba7cb Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:30:48 -0700 Subject: [PATCH 099/136] Add some stuff to the threading implementation documentation --- .../developer/threading_implementation.rst | 98 +++++++++++++++++-- 1 file changed, 90 insertions(+), 8 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index 48eaeb879d4..e34587ab574 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -1,4 +1,3 @@ - ========================================= Notes on Numba's threading implementation ========================================= @@ -19,15 +18,29 @@ execute the parallel tasks. Thread masking -------------- + +In order to simplify the design, it was decided that Numba should never launch +new threads beyond the threads that are launched initially with +``_launch_threads`` when the first parallel execution is run. Consequently, +the programmatic setting of the number of threads can only be done by setting +the number of threads to a number less than the total number that have already +been launched. This is done by "masking" out unused threads, causing them to +do no work. For example, on a 16 core machine, if the user were to call +``set_num_threads(4)``, numba would always have 16 threads present, but 12 of +them would sit idle for parallel computations. A further call to +``set_num_threads(16)`` would cause those same threads to do work in later +computations. + :ref:`Thread masking ` was added to make it possible for a user to programmatically alter the number of threads performing work in the threading layer. 
Thread masking proved challenging to implement as it required the development of a programming model that is suitable for users, easy to reason about, and could be implemented safely, with -consistent behaviour across the various threading layers. +consistent behavior across the various threading layers. Programming model ~~~~~~~~~~~~~~~~~ + The programming model chosen is similar to that found in OpenMP. The reasons for this choice were that it is familiar to a lot of users, restricted in scope and also simple. The number of threads in use is specified by calling @@ -76,7 +89,7 @@ If the work contains a call to another parallel function (i.e. nested parallelism) it is necessary for the thread making the call to know what the "thread mask" of the main thread is so that it can propagate it into the ``parallel_for`` call it makes when executing the nested parallel function. -The implementation of this behaviour is threading layer specific but the general +The implementation of this behavior is threading layer specific but the general principle is for the "main thread" to always "send" the value of the thread mask from its TLS slot to all threads in the threading layer that are active in the parallel region. These active threads then update their TLS slots with this @@ -99,11 +112,16 @@ Python threads independently invoking parallel functions The threading layer launch sequence is heavily guarded to ensure that the launch is both thread and process safe and run once per process. In a system with numerous Python ``threading`` module threads all using Numba, the first -thread through the launch sequence will get its thread mask set -appropriately, but no further threads can run the launch sequence. This means -that other threads will need their initial thread mask set some other way, -this is achieved when ``get_num_threads`` is called and no thread mask is -present, in this case the thread mask will be set to the default. 
+thread through the launch sequence will get its thread mask set appropriately, +but no further threads can run the launch sequence. This means that other +threads will need their initial thread mask set some other way. This is +achieved when ``get_num_threads`` is called and no thread mask is present, in +this case the thread mask will be set to the default. In the implementation, +"no thread mask is present" is represented by the value -1 and the "default +thread mask" (unset) is represented by the value 0. The implementation also +immediately calls ``set_num_threads(NUMBA_NUM_THREADS)`` after doing this, so +if either -1 or 0 is encountered as a result from ``get_num_threads()`` it +indicates a bug in the above processes. OS ``fork()`` calls ******************* @@ -112,3 +130,67 @@ The use of TLS was also in part driven by the Linux (the most popular platform for Numba use by far) having a ``fork(2, 3P)`` call that will do TLS propagation into child processes, see ``clone(2)``'s ``CLONE_SETTLS``. +Thread ID +********* + +A private ``get_thread_id()`` function was added to each threading backend, +which returns a unique ID for each thread. This can be accessed from Python by +``numba.npyufunc.parallel._get_thread_id()`` (it can also be used inside of an +njitted function). The thread ID function is useful for testing that the +thread masking behavior is correct, but it should not be used outside of the +tests. For example, one can call ``set_num_threads(4)`` and then collect all +unique ``_get_thread_id()``s in a parallel region to verify that only 4 +threads are run. + +Caveats +~~~~~~~ + +Some caveats to be aware of when testing this: + +- The TBB backend may choose to schedule fewer than the given mask number of + threads. Thus a test such as the one described above may return fewer than 4 + unique threads. + +- The workqueue backend is not threadsafe, so attempts to do nested + parallelism with it may result in deadlocks or other undefined behavior. 
+ +- Certain backends may reuse the main thread for computation, but this + behavior shouldn't be relied on (for instance, for exceptions propagating). + +Use in Code Generation +~~~~~~~~~~~~~~~~~~~~~~ + +The general pattern for using ``get_num_threads`` in code generation is + +.. code:: python + + import llvmlite.llvmpy.core as lc + + get_num_threads = builder.module.get_or_insert_function( + lc.Type.function(lc.Type.int(types.intp.bitwidth), []), + name="get_num_threads") + + num_threads = builder.call(get_num_threads, []) + + with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, + num_threads.type(0))): + cgutils.printf(builder, "num_threads: %d\n", num_threads) + context.call_conv.return_user_exc(builder, RuntimeError, + ("Invalid number of threads. " + "This likely indicates a bug in Numba.",)) + + # Pass num_threads through to the appropriate backend function + +See the code in ``numba/npyufunc/parfor.py``. Here ``builder.module`` is the thread pool backend library, e.g., ``tbbpool``. + +The guard against ``num_threads`` being <= 0 is not strictly necessary, but it +can protect against accidentally incorrect behavior in case the thread masking +logic contains a bug. + +The ``num_threads`` variable should be passed through to the appropriate +backend function, such as ``do_scheduling`` or ``parallel_for``. If it's used +in some way other than passing it through to the backend function, the above +considerations should be taken into account to ensure the use of the +``num_threads`` variable is safe. It would probably be better to keep such +logic in the threading backends, rather than trying to do it in code +generation. 
From 2a7ae35f6795acfd4eac267ffcbe6308e5d9af02 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:34:27 -0700 Subject: [PATCH 100/136] Clarify some bits about OpenMP in the threading implementation docs --- .../developer/threading_implementation.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index e34587ab574..660ed2767d0 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -41,16 +41,16 @@ consistent behavior across the various threading layers. Programming model ~~~~~~~~~~~~~~~~~ -The programming model chosen is similar to that found in OpenMP. The reasons for -this choice were that it is familiar to a lot of users, restricted in scope and -also simple. The number of threads in use is specified by calling +The programming model chosen is similar to that found in OpenMP. The reasons +for this choice were that it is familiar to a lot of users, restricted in +scope and also simple. The number of threads in use is specified by calling ``set_num_threads`` and the number of threads in use can be queried by calling ``get_num_threads``.These two functions are synonymous with their OpenMP -counterparts. The execution semantic is also similar to OpenmP in that once a -parallel region is launched altering the thread mask has no impact on the -currently executing region but will have an impact on parallel regions executed -subsequently. - +counterparts (with the above restriction that the mask must be <= the number +of launched threads). The execution semantics are also similar to OpenmP in +that once a parallel region is launched altering the thread mask, it has no +impact on the currently executing region but will have an impact on parallel +regions executed subsequently. 
The Implementation ~~~~~~~~~~~~~~~~~~ From aab11a097623404b116997a58aad4488083e844a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:38:11 -0700 Subject: [PATCH 101/136] Fix docs build errors --- docs/source/developer/threading_implementation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index 660ed2767d0..a0b19e4ac31 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -128,7 +128,7 @@ OS ``fork()`` calls The use of TLS was also in part driven by the Linux (the most popular platform for Numba use by far) having a ``fork(2, 3P)`` call that will do TLS -propagation into child processes, see ``clone(2)``'s ``CLONE_SETTLS``. +propagation into child processes, see ``clone(2)``\ 's ``CLONE_SETTLS``. Thread ID ********* @@ -139,7 +139,7 @@ which returns a unique ID for each thread. This can be accessed from Python by njitted function). The thread ID function is useful for testing that the thread masking behavior is correct, but it should not be used outside of the tests. For example, one can call ``set_num_threads(4)`` and then collect all -unique ``_get_thread_id()``s in a parallel region to verify that only 4 +unique ``_get_thread_id()``\ 's in a parallel region to verify that only 4 threads are run. 
Caveats From a59a199abadca917c6a7b76b1d5edb7d80191037 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:38:56 -0700 Subject: [PATCH 102/136] Small fix in the threading implementation docs --- docs/source/developer/threading_implementation.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index a0b19e4ac31..aff452ed38c 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -172,7 +172,7 @@ The general pattern for using ``get_num_threads`` in code generation is num_threads = builder.call(get_num_threads, []) - with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, + with cgutils.if_unlikely(builder, builder.icmp_signed('<=', num_threads, num_threads.type(0))): cgutils.printf(builder, "num_threads: %d\n", num_threads) context.call_conv.return_user_exc(builder, RuntimeError, @@ -181,7 +181,8 @@ The general pattern for using ``get_num_threads`` in code generation is # Pass num_threads through to the appropriate backend function -See the code in ``numba/npyufunc/parfor.py``. Here ``builder.module`` is the thread pool backend library, e.g., ``tbbpool``. +See the code in ``numba/npyufunc/parfor.py``. Here ``builder.module`` is the +thread pool backend library, e.g., ``tbbpool``. 
The guard against ``num_threads`` being <= 0 is not strictly necessary, but it can protect against accidentally incorrect behavior in case the thread masking From ef90d089518ff47cdc0ede3f13cd1d311e0fcd68 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 13:36:41 -0700 Subject: [PATCH 103/136] Guard against get_num_threads being -1 as well --- numba/npyufunc/parfor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parfor.py b/numba/npyufunc/parfor.py index 68925d306c8..0fd904d0790 100644 --- a/numba/npyufunc/parfor.py +++ b/numba/npyufunc/parfor.py @@ -1343,7 +1343,7 @@ def load_range(v): num_threads = builder.call(get_num_threads, []) - with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, + with cgutils.if_unlikely(builder, builder.icmp_signed('<=', num_threads, num_threads.type(0))): cgutils.printf(builder, "num_threads: %d\n", num_threads) context.call_conv.return_user_exc(builder, RuntimeError, From 99212851bf4167bb39e6bbd060f78b3bb934cbe4 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:30:21 -0700 Subject: [PATCH 104/136] Add a short docstring to _get_thread_id() --- numba/npyufunc/parallel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 14c82d2e970..cb5bc488c29 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -594,7 +594,9 @@ def impl(): def _get_thread_id(): """ - docs + Returns a unique ID for each thread + + This function is private and should only be used for testing purposes. 
""" _launch_threads() return _get_thread_id() From fac8c4db2e8fa23dcec41f7b7f87f00a2976d963 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:30:48 -0700 Subject: [PATCH 105/136] Add some stuff to the threading implementation documentation --- .../developer/threading_implementation.rst | 98 +++++++++++++++++-- 1 file changed, 90 insertions(+), 8 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index 48eaeb879d4..e34587ab574 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -1,4 +1,3 @@ - ========================================= Notes on Numba's threading implementation ========================================= @@ -19,15 +18,29 @@ execute the parallel tasks. Thread masking -------------- + +In order to simplify the design, it was decided that Numba should never launch +new threads beyond the threads that are launched initially with +``_launch_threads`` when the first parallel execution is run. Consequently, +the programmatic setting of the number of threads can only be done by setting +the number of threads to a number less than the total number that have already +been launched. This is done by "masking" out unused threads, causing them to +do no work. For example, on a 16 core machine, if the user were to call +``set_num_threads(4)``, numba would always have 16 threads present, but 12 of +them would sit idle for parallel computations. A further call to +``set_num_threads(16)`` would cause those same threads to do work in later +computations. + :ref:`Thread masking ` was added to make it possible for a user to programmatically alter the number of threads performing work in the threading layer. 
Thread masking proved challenging to implement as it required the development of a programming model that is suitable for users, easy to reason about, and could be implemented safely, with -consistent behaviour across the various threading layers. +consistent behavior across the various threading layers. Programming model ~~~~~~~~~~~~~~~~~ + The programming model chosen is similar to that found in OpenMP. The reasons for this choice were that it is familiar to a lot of users, restricted in scope and also simple. The number of threads in use is specified by calling @@ -76,7 +89,7 @@ If the work contains a call to another parallel function (i.e. nested parallelism) it is necessary for the thread making the call to know what the "thread mask" of the main thread is so that it can propagate it into the ``parallel_for`` call it makes when executing the nested parallel function. -The implementation of this behaviour is threading layer specific but the general +The implementation of this behavior is threading layer specific but the general principle is for the "main thread" to always "send" the value of the thread mask from its TLS slot to all threads in the threading layer that are active in the parallel region. These active threads then update their TLS slots with this @@ -99,11 +112,16 @@ Python threads independently invoking parallel functions The threading layer launch sequence is heavily guarded to ensure that the launch is both thread and process safe and run once per process. In a system with numerous Python ``threading`` module threads all using Numba, the first -thread through the launch sequence will get its thread mask set -appropriately, but no further threads can run the launch sequence. This means -that other threads will need their initial thread mask set some other way, -this is achieved when ``get_num_threads`` is called and no thread mask is -present, in this case the thread mask will be set to the default. 
+thread through the launch sequence will get its thread mask set appropriately, +but no further threads can run the launch sequence. This means that other +threads will need their initial thread mask set some other way. This is +achieved when ``get_num_threads`` is called and no thread mask is present, in +this case the thread mask will be set to the default. In the implementation, +"no thread mask is present" is represented by the value -1 and the "default +thread mask" (unset) is represented by the value 0. The implementation also +immediately calls ``set_num_threads(NUMBA_NUM_THREADS)`` after doing this, so +if either -1 or 0 is encountered as a result from ``get_num_threads()`` it +indicates a bug in the above processes. OS ``fork()`` calls ******************* @@ -112,3 +130,67 @@ The use of TLS was also in part driven by the Linux (the most popular platform for Numba use by far) having a ``fork(2, 3P)`` call that will do TLS propagation into child processes, see ``clone(2)``'s ``CLONE_SETTLS``. +Thread ID +********* + +A private ``get_thread_id()`` function was added to each threading backend, +which returns a unique ID for each thread. This can be accessed from Python by +``numba.npyufunc.parallel._get_thread_id()`` (it can also be used inside of an +njitted function). The thread ID function is useful for testing that the +thread masking behavior is correct, but it should not be used outside of the +tests. For example, one can call ``set_num_threads(4)`` and then collect all +unique ``_get_thread_id()``s in a parallel region to verify that only 4 +threads are run. + +Caveats +~~~~~~~ + +Some caveats to be aware of when testing this: + +- The TBB backend may choose to schedule fewer than the given mask number of + threads. Thus a test such as the one described above may return fewer than 4 + unique threads. + +- The workqueue backend is not threadsafe, so attempts to do nested + parallelism with it may result in deadlocks or other undefined behavior. 
+ +- Certain backends may reuse the main thread for computation, but this + behavior shouldn't be relied on (for instance, for exceptions propagating). + +Use in Code Generation +~~~~~~~~~~~~~~~~~~~~~~ + +The general pattern for using ``get_num_threads`` in code generation is + +.. code:: python + + import llvmlite.llvmpy.core as lc + + get_num_threads = builder.module.get_or_insert_function( + lc.Type.function(lc.Type.int(types.intp.bitwidth), []), + name="get_num_threads") + + num_threads = builder.call(get_num_threads, []) + + with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, + num_threads.type(0))): + cgutils.printf(builder, "num_threads: %d\n", num_threads) + context.call_conv.return_user_exc(builder, RuntimeError, + ("Invalid number of threads. " + "This likely indicates a bug in Numba.",)) + + # Pass num_threads through to the appropriate backend function + +See the code in ``numba/npyufunc/parfor.py``. Here ``builder.module`` is the thread pool backend library, e.g., ``tbbpool``. + +The guard against ``num_threads`` being <= 0 is not strictly necessary, but it +can protect against accidentally incorrect behavior in case the thread masking +logic contains a bug. + +The ``num_threads`` variable should be passed through to the appropriate +backend function, such as ``do_scheduling`` or ``parallel_for``. If it's used +in some way other than passing it through to the backend function, the above +considerations should be taken into account to ensure the use of the +``num_threads`` variable is safe. It would probably be better to keep such +logic in the threading backends, rather than trying to do it in code +generation. 
From 8011b14b2ac422d7bba62650a79af588d79a2f7e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:34:27 -0700 Subject: [PATCH 106/136] Clarify some bits about OpenMP in the threading implementation docs --- .../developer/threading_implementation.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index e34587ab574..660ed2767d0 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -41,16 +41,16 @@ consistent behavior across the various threading layers. Programming model ~~~~~~~~~~~~~~~~~ -The programming model chosen is similar to that found in OpenMP. The reasons for -this choice were that it is familiar to a lot of users, restricted in scope and -also simple. The number of threads in use is specified by calling +The programming model chosen is similar to that found in OpenMP. The reasons +for this choice were that it is familiar to a lot of users, restricted in +scope and also simple. The number of threads in use is specified by calling ``set_num_threads`` and the number of threads in use can be queried by calling ``get_num_threads``.These two functions are synonymous with their OpenMP -counterparts. The execution semantic is also similar to OpenmP in that once a -parallel region is launched altering the thread mask has no impact on the -currently executing region but will have an impact on parallel regions executed -subsequently. - +counterparts (with the above restriction that the mask must be <= the number +of launched threads). The execution semantics are also similar to OpenmP in +that once a parallel region is launched altering the thread mask, it has no +impact on the currently executing region but will have an impact on parallel +regions executed subsequently. 
The Implementation ~~~~~~~~~~~~~~~~~~ From b394b3204bbe5d4506799e0fc3f37419b2127b9e Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:38:11 -0700 Subject: [PATCH 107/136] Fix docs build errors --- docs/source/developer/threading_implementation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index 660ed2767d0..a0b19e4ac31 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -128,7 +128,7 @@ OS ``fork()`` calls The use of TLS was also in part driven by the Linux (the most popular platform for Numba use by far) having a ``fork(2, 3P)`` call that will do TLS -propagation into child processes, see ``clone(2)``'s ``CLONE_SETTLS``. +propagation into child processes, see ``clone(2)``\ 's ``CLONE_SETTLS``. Thread ID ********* @@ -139,7 +139,7 @@ which returns a unique ID for each thread. This can be accessed from Python by njitted function). The thread ID function is useful for testing that the thread masking behavior is correct, but it should not be used outside of the tests. For example, one can call ``set_num_threads(4)`` and then collect all -unique ``_get_thread_id()``s in a parallel region to verify that only 4 +unique ``_get_thread_id()``\ 's in a parallel region to verify that only 4 threads are run. 
Caveats From 8b8c1986c2fab904cbb328812a51b97b6b515e33 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 15 Jan 2020 15:38:56 -0700 Subject: [PATCH 108/136] Small fix in the threading implementation docs --- docs/source/developer/threading_implementation.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index a0b19e4ac31..aff452ed38c 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -172,7 +172,7 @@ The general pattern for using ``get_num_threads`` in code generation is num_threads = builder.call(get_num_threads, []) - with cgutils.if_unlikely(builder, builder.icmp_signed('==', num_threads, + with cgutils.if_unlikely(builder, builder.icmp_signed('<=', num_threads, num_threads.type(0))): cgutils.printf(builder, "num_threads: %d\n", num_threads) context.call_conv.return_user_exc(builder, RuntimeError, @@ -181,7 +181,8 @@ The general pattern for using ``get_num_threads`` in code generation is # Pass num_threads through to the appropriate backend function -See the code in ``numba/npyufunc/parfor.py``. Here ``builder.module`` is the thread pool backend library, e.g., ``tbbpool``. +See the code in ``numba/npyufunc/parfor.py``. Here ``builder.module`` is the +thread pool backend library, e.g., ``tbbpool``. The guard against ``num_threads`` being <= 0 is not strictly necessary, but it can protect against accidentally incorrect behavior in case the thread masking From 56a826c41d47dd518c62d701df3d9f123f3e6e2d Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 17 Jan 2020 13:11:09 +0000 Subject: [PATCH 109/136] Add TBB version checks at compile time. As title. 
--- buildscripts/condarecipe.local/meta.yaml | 6 +++--- numba/npyufunc/tbbpool.cpp | 20 +++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/buildscripts/condarecipe.local/meta.yaml b/buildscripts/condarecipe.local/meta.yaml index 90dcffdb898..de22640b696 100644 --- a/buildscripts/condarecipe.local/meta.yaml +++ b/buildscripts/condarecipe.local/meta.yaml @@ -36,7 +36,7 @@ requirements: - funcsigs # [py27] - singledispatch # [py27] # TBB devel version is to match TBB libs - - tbb-devel >=2018.0.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] + - tbb-devel >=2019.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] run: - python >=3.6 - numpy >=1.15 @@ -48,7 +48,7 @@ requirements: run_constrained: # If TBB is present it must be at least this version from Anaconda due to # build flag issues triggering UB - - tbb >=2018.0.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] + - tbb >=2019.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] # avoid confusion from openblas bugs - libopenblas !=0.3.6 # [x86_64] # CUDA 8.0 or later is required for CUDA support @@ -65,7 +65,7 @@ test: - ipython # [not (armv6l or armv7l or aarch64)] - setuptools - faulthandler # [py27 and (not (armv6l or armv7l or aarch64))] - - tbb >=2018.0.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] + - tbb >=2019.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] - intel-openmp # [osx] # Need these for AOT. Do not init msvc as it may not be present - {{ compiler('c') }} # [not (win or armv6l or armv7l or aarch64)] diff --git a/numba/npyufunc/tbbpool.cpp b/numba/npyufunc/tbbpool.cpp index b7fc8a9f0b1..a089ec24244 100644 --- a/numba/npyufunc/tbbpool.cpp +++ b/numba/npyufunc/tbbpool.cpp @@ -19,17 +19,19 @@ Implement parallel vectorize workqueue on top of Intel TBB. 
#include "gufunc_scheduler.h" -#if TBB_INTERFACE_VERSION >= 9106 +/* TBB 2019 U5 is the minimum required version as this is needed: + * https://github.com/intel/tbb/blob/18070344d755ece04d169e6cc40775cae9288cee/CHANGES#L133-L134 + * and therefore + * https://github.com/intel/tbb/blob/18070344d755ece04d169e6cc40775cae9288cee/CHANGES#L128-L129 + * from here: + * https://github.com/intel/tbb/blob/2019_U5/include/tbb/tbb_stddef.h#L29 + */ +#if TBB_INTERFACE_VERSION < 11006 +#error "TBB version is too old, 2019 update 5, i.e. TBB_INTERFACE_VERSION >= 11005 required" +#endif + #define TSI_INIT(count) tbb::task_scheduler_init(count) #define TSI_TERMINATE(tsi) tsi->blocking_terminate(std::nothrow) -#else -#if __TBB_SUPPORTS_WORKERS_WAITING_IN_TERMINATE -#define TSI_INIT(count) tbb::task_scheduler_init(count, 0, /*blocking termination*/true) -#define TSI_TERMINATE(tsi) tsi->terminate() -#else -#error This version of TBB does not support blocking terminate -#endif -#endif #define _DEBUG 0 #define _TRACE_SPLIT 0 From dc99a57f5fca596a14a295cb8379afd4fd218ff3 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 17 Jan 2020 13:20:44 +0000 Subject: [PATCH 110/136] Add runtime check on TBB version. As title. 
--- numba/npyufunc/parallel.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index cb5bc488c29..7bdb95dc0ac 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -12,12 +12,11 @@ from __future__ import print_function, absolute_import import os -import platform import sys import warnings from threading import RLock as threadRLock import multiprocessing -from ctypes import CFUNCTYPE, c_int +from ctypes import CFUNCTYPE, c_int, CDLL import numpy as np @@ -333,6 +332,10 @@ def _launch_threads(): if _is_initialized: return + _IS_OSX = sys.platform.startswith('darwin') + _IS_LINUX = sys.platform.startswith('linux') + _IS_WINDOWS = sys.platform.startswith('win32') + def select_known_backend(backend): """ Loads a specific threading layer backend based on string @@ -340,8 +343,29 @@ def select_known_backend(backend): lib = None if backend.startswith("tbb"): try: + # first check that the TBB version is new enough + if _IS_WINDOWS: + libtbb_name = 'tbb.lib' + elif _IS_OSX: + libtbb_name = 'libtbb.dylib' + elif _IS_LINUX: + libtbb_name = 'libtbb.so.2' + else: + raise ValueError("Unknown operating system") + libtbb = CDLL(libtbb_name) + version_func = libtbb.TBB_runtime_interface_version + version_func.argtypes = [] + version_func.restype = c_int + tbb_interface_version = version_func() + if tbb_interface_version < 11005: + msg = ("The TBB threading layer requires a version " + "of TBB greater than 2019 update 5, i.e. " + "TBB_INTERFACE_VERSION >= 11005, found " + "TBB_INTERFACE_VERSION = %s") + raise RuntimeError(msg % tbb_interface_version) + # now try and load the backend from . 
import tbbpool as lib - except ImportError: + except (ImportError, OSError): pass elif backend.startswith("omp"): # TODO: Check that if MKL is present that it is a version @@ -374,8 +398,6 @@ def select_from_backends(backends): namedbackends = ['tbb', 'omp', 'workqueue'] lib = None - _IS_OSX = platform.system() == "Darwin" - _IS_LINUX = platform.system() == "Linux" err_helpers = dict() err_helpers['TBB'] = ("Intel TBB is required, try:\n" "$ conda/pip install tbb") From 7b1db0bb2a719f4927a35efa16c2784270641d7a Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 17 Jan 2020 15:41:02 +0000 Subject: [PATCH 111/136] Make wrong TBB a warning and disable As title --- numba/npyufunc/parallel.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 7bdb95dc0ac..131e0456b28 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -356,13 +356,16 @@ def select_known_backend(backend): version_func = libtbb.TBB_runtime_interface_version version_func.argtypes = [] version_func.restype = c_int - tbb_interface_version = version_func() - if tbb_interface_version < 11005: - msg = ("The TBB threading layer requires a version " - "of TBB greater than 2019 update 5, i.e. " - "TBB_INTERFACE_VERSION >= 11005, found " - "TBB_INTERFACE_VERSION = %s") - raise RuntimeError(msg % tbb_interface_version) + tbb_iface_ver = version_func() + if tbb_iface_ver < 11005: # magic number from TBB + msg = ("The TBB threading layer requires TBB " + "version 2019 update 5 or later i.e. " + "TBB_INTERFACE_VERSION >= 11005. Found " + "TBB_INTERFACE_VERSION = %s. The TBB " + "threading layer is disabled.") + problem = errors.NumbaWarning(msg % tbb_iface_ver) + warnings.warn(problem) + raise ImportError # to trigger except + skip # now try and load the backend from . 
import tbbpool as lib except (ImportError, OSError): From bce47d4dbe4abc1843f6b55b109eb87f3f7c5e1b Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Fri, 17 Jan 2020 18:11:07 +0000 Subject: [PATCH 112/136] Change library spelling on windows As title --- numba/npyufunc/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 131e0456b28..fe2090ff055 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -345,7 +345,7 @@ def select_known_backend(backend): try: # first check that the TBB version is new enough if _IS_WINDOWS: - libtbb_name = 'tbb.lib' + libtbb_name = 'tbb' elif _IS_OSX: libtbb_name = 'libtbb.dylib' elif _IS_LINUX: From 33f7ef18830b27305f2c57ddd21d7d77a1876f75 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Mon, 20 Jan 2020 11:14:09 +0000 Subject: [PATCH 113/136] Add version check to tests using TBB. As title. --- numba/npyufunc/parallel.py | 62 ++++++++++++++++------------ numba/tests/test_parallel_backend.py | 4 ++ 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index fe2090ff055..a3f3258d0be 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -29,6 +29,10 @@ from numba.npyufunc.wrappers import _wrapper_info from numba.extending import overload +_IS_OSX = sys.platform.startswith('darwin') +_IS_LINUX = sys.platform.startswith('linux') +_IS_WINDOWS = sys.platform.startswith('win32') + def get_thread_count(): """ @@ -325,6 +329,35 @@ def threading_layer(): return _threading_layer +def _check_tbb_version_compatible(): + """ + Checks that if TBB is present it is of a compatible version. 
+ """ + # first check that the TBB version is new enough + if _IS_WINDOWS: + libtbb_name = 'tbb' + elif _IS_OSX: + libtbb_name = 'libtbb.dylib' + elif _IS_LINUX: + libtbb_name = 'libtbb.so.2' + else: + raise ValueError("Unknown operating system") + libtbb = CDLL(libtbb_name) + version_func = libtbb.TBB_runtime_interface_version + version_func.argtypes = [] + version_func.restype = c_int + tbb_iface_ver = version_func() + if tbb_iface_ver < 11005: # magic number from TBB + msg = ("The TBB threading layer requires TBB " + "version 2019 update 5 or later i.e. " + "TBB_INTERFACE_VERSION >= 11005. Found " + "TBB_INTERFACE_VERSION = %s. The TBB " + "threading layer is disabled.") + problem = errors.NumbaWarning(msg % tbb_iface_ver) + warnings.warn(problem) + raise ImportError("Incompatible TBB version") # to trigger except + skip + + def _launch_threads(): with _backend_init_process_lock: with _backend_init_thread_lock: @@ -332,10 +365,6 @@ def _launch_threads(): if _is_initialized: return - _IS_OSX = sys.platform.startswith('darwin') - _IS_LINUX = sys.platform.startswith('linux') - _IS_WINDOWS = sys.platform.startswith('win32') - def select_known_backend(backend): """ Loads a specific threading layer backend based on string @@ -343,29 +372,8 @@ def select_known_backend(backend): lib = None if backend.startswith("tbb"): try: - # first check that the TBB version is new enough - if _IS_WINDOWS: - libtbb_name = 'tbb' - elif _IS_OSX: - libtbb_name = 'libtbb.dylib' - elif _IS_LINUX: - libtbb_name = 'libtbb.so.2' - else: - raise ValueError("Unknown operating system") - libtbb = CDLL(libtbb_name) - version_func = libtbb.TBB_runtime_interface_version - version_func.argtypes = [] - version_func.restype = c_int - tbb_iface_ver = version_func() - if tbb_iface_ver < 11005: # magic number from TBB - msg = ("The TBB threading layer requires TBB " - "version 2019 update 5 or later i.e. " - "TBB_INTERFACE_VERSION >= 11005. Found " - "TBB_INTERFACE_VERSION = %s. 
The TBB " - "threading layer is disabled.") - problem = errors.NumbaWarning(msg % tbb_iface_ver) - warnings.warn(problem) - raise ImportError # to trigger except + skip + # check if TBB is present and compatible + _check_tbb_version_compatible() # now try and load the backend from . import tbbpool as lib except (ImportError, OSError): diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index a26c1fef999..4eb59231d48 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -34,6 +34,10 @@ # Check which backends are available # TODO: Put this in a subprocess so the address space is kept clean try: + # Check it's a compatible TBB before loading it + from numba.npyufunc.parallel import _check_tbb_version_compatible + _check_tbb_version_compatible() + from numba.npyufunc import tbbpool # noqa: F401 _HAVE_TBB_POOL = True except ImportError: From aa95d99d668ee14ea534b5752a45579d34335f92 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Mon, 20 Jan 2020 11:30:27 +0000 Subject: [PATCH 114/136] Update to translate to ImportError As title --- numba/npyufunc/parallel.py | 52 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index a3f3258d0be..34a5bf7d93e 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -333,29 +333,33 @@ def _check_tbb_version_compatible(): """ Checks that if TBB is present it is of a compatible version. 
""" - # first check that the TBB version is new enough - if _IS_WINDOWS: - libtbb_name = 'tbb' - elif _IS_OSX: - libtbb_name = 'libtbb.dylib' - elif _IS_LINUX: - libtbb_name = 'libtbb.so.2' - else: - raise ValueError("Unknown operating system") - libtbb = CDLL(libtbb_name) - version_func = libtbb.TBB_runtime_interface_version - version_func.argtypes = [] - version_func.restype = c_int - tbb_iface_ver = version_func() - if tbb_iface_ver < 11005: # magic number from TBB - msg = ("The TBB threading layer requires TBB " - "version 2019 update 5 or later i.e. " - "TBB_INTERFACE_VERSION >= 11005. Found " - "TBB_INTERFACE_VERSION = %s. The TBB " - "threading layer is disabled.") - problem = errors.NumbaWarning(msg % tbb_iface_ver) - warnings.warn(problem) - raise ImportError("Incompatible TBB version") # to trigger except + skip + try: + # first check that the TBB version is new enough + if _IS_WINDOWS: + libtbb_name = 'tbb' + elif _IS_OSX: + libtbb_name = 'libtbb.dylib' + elif _IS_LINUX: + libtbb_name = 'libtbb.so.2' + else: + raise ValueError("Unknown operating system") + libtbb = CDLL(libtbb_name) + version_func = libtbb.TBB_runtime_interface_version + version_func.argtypes = [] + version_func.restype = c_int + tbb_iface_ver = version_func() + if tbb_iface_ver < 11005: # magic number from TBB + msg = ("The TBB threading layer requires TBB " + "version 2019 update 5 or later i.e. " + "TBB_INTERFACE_VERSION >= 11005. Found " + "TBB_INTERFACE_VERSION = %s. The TBB " + "threading layer is disabled.") + problem = errors.NumbaWarning(msg % tbb_iface_ver) + warnings.warn(problem) + except (ValueError, OSError) as e: + # Translate as an ImportError for consistent error class use, this error + # will never materialise + raise ImportError("Problem with TBB. Reason: %s" % e) def _launch_threads(): @@ -376,7 +380,7 @@ def select_known_backend(backend): _check_tbb_version_compatible() # now try and load the backend from . 
import tbbpool as lib - except (ImportError, OSError): + except ImportError: pass elif backend.startswith("omp"): # TODO: Check that if MKL is present that it is a version From 46a96f342f1ecd7689313681ea99b9b0f771020a Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Mon, 20 Jan 2020 11:35:18 +0000 Subject: [PATCH 115/136] flake8 as title --- numba/npyufunc/parallel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index 34a5bf7d93e..d5ad4bcf89a 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -350,10 +350,10 @@ def _check_tbb_version_compatible(): tbb_iface_ver = version_func() if tbb_iface_ver < 11005: # magic number from TBB msg = ("The TBB threading layer requires TBB " - "version 2019 update 5 or later i.e. " - "TBB_INTERFACE_VERSION >= 11005. Found " - "TBB_INTERFACE_VERSION = %s. The TBB " - "threading layer is disabled.") + "version 2019 update 5 or later i.e. " + "TBB_INTERFACE_VERSION >= 11005. Found " + "TBB_INTERFACE_VERSION = %s. 
The TBB " + "threading layer is disabled.") problem = errors.NumbaWarning(msg % tbb_iface_ver) warnings.warn(problem) except (ValueError, OSError) as e: From f2e851edd748e341184bace6cee7f22baaa64247 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 20 Jan 2020 14:17:52 -0700 Subject: [PATCH 116/136] Apply suggestions to the threading implementation docs from code review Co-Authored-By: stuartarchibald --- docs/source/developer/threading_implementation.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index aff452ed38c..620b6827138 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -26,7 +26,7 @@ the programmatic setting of the number of threads can only be done by setting the number of threads to a number less than the total number that have already been launched. This is done by "masking" out unused threads, causing them to do no work. For example, on a 16 core machine, if the user were to call -``set_num_threads(4)``, numba would always have 16 threads present, but 12 of +``set_num_threads(4)``, Numba would always have 16 threads present, but 12 of them would sit idle for parallel computations. A further call to ``set_num_threads(16)`` would cause those same threads to do work in later computations. @@ -46,7 +46,7 @@ for this choice were that it is familiar to a lot of users, restricted in scope and also simple. The number of threads in use is specified by calling ``set_num_threads`` and the number of threads in use can be queried by calling ``get_num_threads``.These two functions are synonymous with their OpenMP -counterparts (with the above restriction that the mask must be <= the number +counterparts (with the above restriction that the mask must be less than or equal to the number of launched threads). 
The execution semantics are also similar to OpenmP in that once a parallel region is launched altering the thread mask, it has no impact on the currently executing region but will have an impact on parallel @@ -117,10 +117,10 @@ but no further threads can run the launch sequence. This means that other threads will need their initial thread mask set some other way. This is achieved when ``get_num_threads`` is called and no thread mask is present, in this case the thread mask will be set to the default. In the implementation, -"no thread mask is present" is represented by the value -1 and the "default -thread mask" (unset) is represented by the value 0. The implementation also +"no thread mask is present" is represented by the value ``-1`` and the "default +thread mask" (unset) is represented by the value ``0``. The implementation also immediately calls ``set_num_threads(NUMBA_NUM_THREADS)`` after doing this, so -if either -1 or 0 is encountered as a result from ``get_num_threads()`` it +if either ``-1`` or ``0`` is encountered as a result from ``get_num_threads()`` it indicates a bug in the above processes. OS ``fork()`` calls @@ -155,7 +155,7 @@ Some caveats to be aware of when testing this: parallelism with it may result in deadlocks or other undefined behavior. - Certain backends may reuse the main thread for computation, but this - behavior shouldn't be relied on (for instance, for exceptions propagating). + behavior shouldn't be relied upon (for instance, if propagating exceptions). 
Use in Code Generation ~~~~~~~~~~~~~~~~~~~~~~ From 5ac38ca7653402c111ab2b257aa6572767fba89a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 20 Jan 2020 15:35:41 -0700 Subject: [PATCH 117/136] Some modifications to the threading implementation docs from review --- .../developer/threading_implementation.rst | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index 620b6827138..f269f3fec03 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -19,15 +19,17 @@ execute the parallel tasks. Thread masking -------------- -In order to simplify the design, it was decided that Numba should never launch -new threads beyond the threads that are launched initially with -``_launch_threads`` when the first parallel execution is run. Consequently, -the programmatic setting of the number of threads can only be done by setting -the number of threads to a number less than the total number that have already -been launched. This is done by "masking" out unused threads, causing them to -do no work. For example, on a 16 core machine, if the user were to call -``set_num_threads(4)``, Numba would always have 16 threads present, but 12 of -them would sit idle for parallel computations. A further call to +As part of its design, Numba never launches new threads beyond the threads +that are launched initially with ``numba.npyufunc.parallel._launch_threads()`` +when the first parallel execution is run. This is due to the way threads were +already implemented in Numba prior to thread masking being implemented. This +restriction was kept to keep the design simple, although it could be removed +in the future. Consequently, it's possible to programmatically set the number +of threads, but only to less than or equal to the total number that have +already been launched. 
This is done by "masking" out unused threads, causing +them to do no work. For example, on a 16 core machine, if the user were to +call ``set_num_threads(4)``, Numba would always have 16 threads present, but +12 of them would sit idle for parallel computations. A further call to ``set_num_threads(16)`` would cause those same threads to do work in later computations. @@ -46,11 +48,11 @@ for this choice were that it is familiar to a lot of users, restricted in scope and also simple. The number of threads in use is specified by calling ``set_num_threads`` and the number of threads in use can be queried by calling ``get_num_threads``.These two functions are synonymous with their OpenMP -counterparts (with the above restriction that the mask must be less than or equal to the number -of launched threads). The execution semantics are also similar to OpenmP in -that once a parallel region is launched altering the thread mask, it has no -impact on the currently executing region but will have an impact on parallel -regions executed subsequently. +counterparts (with the above restriction that the mask must be less than or +equal to the number of launched threads). The execution semantics are also +similar to OpenmP in that once a parallel region is launched, altering the +thread mask has no impact on the currently executing region, but will have an +impact on parallel regions executed subsequently. The Implementation ~~~~~~~~~~~~~~~~~~ @@ -135,8 +137,8 @@ Thread ID A private ``get_thread_id()`` function was added to each threading backend, which returns a unique ID for each thread. This can be accessed from Python by -``numba.npyufunc.parallel._get_thread_id()`` (it can also be used inside of an -njitted function). The thread ID function is useful for testing that the +``numba.npyufunc.parallel._get_thread_id()`` (it can also be used inside of +JIT compiled function). 
The thread ID function is useful for testing that the thread masking behavior is correct, but it should not be used outside of the tests. For example, one can call ``set_num_threads(4)`` and then collect all unique ``_get_thread_id()``\ 's in a parallel region to verify that only 4 @@ -145,14 +147,16 @@ threads are run. Caveats ~~~~~~~ -Some caveats to be aware of when testing this: +Some caveats to be aware of when testing thread masking: - The TBB backend may choose to schedule fewer than the given mask number of threads. Thus a test such as the one described above may return fewer than 4 unique threads. -- The workqueue backend is not threadsafe, so attempts to do nested - parallelism with it may result in deadlocks or other undefined behavior. +- The workqueue backend is not threadsafe, so attempts to do multithreading + nested parallelism with it may result in deadlocks or other undefined + behavior. The workqueue backend will raise a SIGABRT signal if it detects + nested parallelism. - Certain backends may reuse the main thread for computation, but this behavior shouldn't be relied upon (for instance, if propagating exceptions). @@ -179,10 +183,9 @@ The general pattern for using ``get_num_threads`` in code generation is ("Invalid number of threads. " "This likely indicates a bug in Numba.",)) - # Pass num_threads through to the appropriate backend function + # Pass num_threads through to the appropriate backend function here -See the code in ``numba/npyufunc/parfor.py``. Here ``builder.module`` is the -thread pool backend library, e.g., ``tbbpool``. +See the code in ``numba/npyufunc/parfor.py``. 
The guard against ``num_threads`` being <= 0 is not strictly necessary, but it can protect against accidentally incorrect behavior in case the thread masking From 61f40a49751ec6123adf5486b4a3c2d2e6fcabc4 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 24 Jan 2020 11:27:33 -0600 Subject: [PATCH 118/136] Apply suggestions to threading docs from @gmarkall code review Co-Authored-By: Graham Markall <535640+gmarkall@users.noreply.github.com> --- docs/source/developer/threading_implementation.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index f269f3fec03..b2610118ab2 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -50,7 +50,7 @@ scope and also simple. The number of threads in use is specified by calling ``get_num_threads``.These two functions are synonymous with their OpenMP counterparts (with the above restriction that the mask must be less than or equal to the number of launched threads). The execution semantics are also -similar to OpenmP in that once a parallel region is launched, altering the +similar to OpenMP in that once a parallel region is launched, altering the thread mask has no impact on the currently executing region, but will have an impact on parallel regions executed subsequently. @@ -137,11 +137,11 @@ Thread ID A private ``get_thread_id()`` function was added to each threading backend, which returns a unique ID for each thread. This can be accessed from Python by -``numba.npyufunc.parallel._get_thread_id()`` (it can also be used inside of +``numba.npyufunc.parallel._get_thread_id()`` (it can also be used inside a JIT compiled function). The thread ID function is useful for testing that the thread masking behavior is correct, but it should not be used outside of the tests. 
For example, one can call ``set_num_threads(4)`` and then collect all -unique ``_get_thread_id()``\ 's in a parallel region to verify that only 4 +unique ``_get_thread_id()``\ s in a parallel region to verify that only 4 threads are run. Caveats From 43ca3fa15922ce72e6f6799afd03199b38c93d4a Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Fri, 24 Jan 2020 10:39:19 -0700 Subject: [PATCH 119/136] Use the same tbb version format as conda This makes the error message a little easier to read. --- numba/npyufunc/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/npyufunc/parallel.py b/numba/npyufunc/parallel.py index d5ad4bcf89a..e9fad3ac256 100644 --- a/numba/npyufunc/parallel.py +++ b/numba/npyufunc/parallel.py @@ -350,7 +350,7 @@ def _check_tbb_version_compatible(): tbb_iface_ver = version_func() if tbb_iface_ver < 11005: # magic number from TBB msg = ("The TBB threading layer requires TBB " - "version 2019 update 5 or later i.e. " + "version 2019.5 or later i.e., " "TBB_INTERFACE_VERSION >= 11005. Found " "TBB_INTERFACE_VERSION = %s. The TBB " "threading layer is disabled.") From f1e80c76d83753122bf27041ff5736ac51d1282c Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Mon, 27 Jan 2020 15:56:34 -0700 Subject: [PATCH 120/136] Note the relevant files in the threading implementation documentation --- .../developer/threading_implementation.rst | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/source/developer/threading_implementation.rst b/docs/source/developer/threading_implementation.rst index f269f3fec03..0866d55bd64 100644 --- a/docs/source/developer/threading_implementation.rst +++ b/docs/source/developer/threading_implementation.rst @@ -16,6 +16,33 @@ function in the threading library that performs the parallel execution is the ``parallel_for`` function. The job of this function is to both orchestrate and execute the parallel tasks. 
+The relevant source files referenced in this document are + +- ``numba/npyufunc/tbbpool.cpp`` +- ``numba/npyufunc/omppool.cpp`` +- ``numba/npyufunc/workqueue.c`` + + These files contain the TBB, OpenMP, and workqueue threadpool + implementations, respectively. Each includes the functions + ``set_num_threads()``, ``get_num_threads()``, and ``get_thread_id()``, as + well as the relevant logic for thread masking in their respective + schedulers. Note that the basic thread local variable logic is duplicated in + each of these files, and not shared between them. + +- ``numba/npyufunc/parallel.py`` + + This file contains the Python and JIT compatible wrappers for + ``set_num_threads()``, ``get_num_threads()``, and ``get_thread_id()``, as + well as the code that loads the above libraries into Python and launches the + threadpool. + +- ``numba/npyufunc/parfor.py`` + + This file contains the main logic for generating code for the parallel + backend. The thread mask is accessed in this file in the code that generates + scheduler code, and passed to the relevant backend scheduler function (see + below). + Thread masking -------------- From 4c24949321376e374a77fe186f48b44a61391460 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Tue, 11 Feb 2020 09:44:51 -0700 Subject: [PATCH 121/136] Update docs/source/user/threading-layer.rst Co-Authored-By: Graham Markall <535640+gmarkall@users.noreply.github.com> --- docs/source/user/threading-layer.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user/threading-layer.rst b/docs/source/user/threading-layer.rst index f6f26c2f523..0f69b1f5a03 100644 --- a/docs/source/user/threading-layer.rst +++ b/docs/source/user/threading-layer.rst @@ -256,7 +256,7 @@ API Reference The total (maximum) number of threads launched by numba. 
- Defaults :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`, but can be + Defaults to :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`, but can be overridden with the :envvar:`NUMBA_NUM_THREADS` environment variable. .. py:data:: numba.config.NUMBA_DEFAULT_NUM_THREADS From a82bc3189df9c50629ae25e71efa037ed886d142 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 11 Feb 2020 18:10:36 +0000 Subject: [PATCH 122/136] Prevent use of TBB if the interface is of too low version --- numba/np/ufunc/parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba/np/ufunc/parallel.py b/numba/np/ufunc/parallel.py index 0240df359e4..b5387c03722 100644 --- a/numba/np/ufunc/parallel.py +++ b/numba/np/ufunc/parallel.py @@ -353,6 +353,7 @@ def _check_tbb_version_compatible(): "threading layer is disabled.") problem = errors.NumbaWarning(msg % tbb_iface_ver) warnings.warn(problem) + raise ImportError("Problem with TBB. Reason: %s" % msg) except (ValueError, OSError) as e: # Translate as an ImportError for consistent error class use, this error # will never materialise From baaa14bb0423bc492c04eecf80b80cd793d6e093 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 11 Feb 2020 18:15:30 +0000 Subject: [PATCH 123/136] Add in TBB from defaults --- buildscripts/azure/azure-windows.yml | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/buildscripts/azure/azure-windows.yml b/buildscripts/azure/azure-windows.yml index dc053a8ddbc..d9b82d8c0ab 100644 --- a/buildscripts/azure/azure-windows.yml +++ b/buildscripts/azure/azure-windows.yml @@ -37,18 +37,11 @@ jobs: buildscripts\\incremental\\setup_conda_environment.cmd displayName: 'Before Install' - # VC 9.0 cannot build tbbpool.cpp in Numba, so we need to remove - # tbb from the environment before the build stage.
- script: | + # use TBB call activate %CONDA_ENV% - conda remove -y tbb tbb-devel - displayName: 'Remove TBB' - - - script: | - # temporarily patch this in to get a recent TBB - call activate %CONDA_ENV% - conda install -c conda-forge -y tbb tbb-devel - displayName: 'Add in conda-forge TBB' + conda install -y tbb tbb-devel + displayName: 'Add in TBB' - script: | buildscripts\\incremental\\build.cmd From 38bd9926af71d80ab9940b7e4cf2e1d37659e487 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 12 Feb 2020 14:27:04 +0000 Subject: [PATCH 124/136] tmp patch for debug --- numba/np/ufunc/parallel.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/numba/np/ufunc/parallel.py b/numba/np/ufunc/parallel.py index b5387c03722..a77bc200c25 100644 --- a/numba/np/ufunc/parallel.py +++ b/numba/np/ufunc/parallel.py @@ -23,10 +23,10 @@ import llvmlite.binding as ll from numba.np.numpy_support import as_dtype -from numba.core import types, config, errors +from numba.core import types, config, errors, cgutils from numba.np.ufunc.wrappers import _wrapper_info from numba.np.ufunc import ufuncbuilder -from numba.extending import overload +from numba.extending import overload, intrinsic _IS_OSX = sys.platform.startswith('darwin') @@ -518,6 +518,17 @@ def _load_num_threads_funcs(lib): # Some helpers to make set_num_threads jittable +@intrinsic +def debug(tyctx, max_t, ill_t): + max_t_ty = getattr(max_t, 'literal_type', max_t) + ill_t_ty = getattr(ill_t, 'literal_type', ill_t) + sig = types.void(max_t_ty, ill_t_ty) + def codegen(cgctx, builder, sig, args): + a, b = args + cgutils.printf(builder, "Max threads %d. 
Illegal thread count %d\n", a, + b) + return sig, codegen + def gen_snt_check(): from numba.core.config import NUMBA_NUM_THREADS @@ -525,6 +536,7 @@ def gen_snt_check(): def snt_check(n): if n > NUMBA_NUM_THREADS or n < 1: + debug(NUMBA_NUM_THREADS, n) raise ValueError(msg) return snt_check From a11048dcbfaeb04d40fbc2e2f333a090218b2480 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Wed, 12 Feb 2020 16:59:29 +0000 Subject: [PATCH 125/136] Make debug work in python code --- numba/np/ufunc/parallel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/numba/np/ufunc/parallel.py b/numba/np/ufunc/parallel.py index a77bc200c25..c8d176589d1 100644 --- a/numba/np/ufunc/parallel.py +++ b/numba/np/ufunc/parallel.py @@ -27,6 +27,7 @@ from numba.np.ufunc.wrappers import _wrapper_info from numba.np.ufunc import ufuncbuilder from numba.extending import overload, intrinsic +from numba import njit _IS_OSX = sys.platform.startswith('darwin') @@ -518,8 +519,9 @@ def _load_num_threads_funcs(lib): # Some helpers to make set_num_threads jittable + @intrinsic -def debug(tyctx, max_t, ill_t): +def _debug(tyctx, max_t, ill_t): max_t_ty = getattr(max_t, 'literal_type', max_t) ill_t_ty = getattr(ill_t, 'literal_type', ill_t) sig = types.void(max_t_ty, ill_t_ty) @@ -529,6 +531,9 @@ def codegen(cgctx, builder, sig, args): b) return sig, codegen +@njit +def debug(max_t, ill_t): + _debug(max_t, ill_t) def gen_snt_check(): from numba.core.config import NUMBA_NUM_THREADS From 6f72a7ff26b2fd92678eb08ff0e70cb2c053ffcf Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 12 Feb 2020 11:18:17 -0700 Subject: [PATCH 126/136] Don't allow reloading NUMBA_NUM_THREADS if threads have already been launched --- numba/core/config.py | 13 ++++++++++++- .../npyufunc/test_parallel_env_variable.py | 17 +++++++++++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/numba/core/config.py b/numba/core/config.py index aaeaafcba7f..b53b3f95a2e 100644 --- 
a/numba/core/config.py +++ b/numba/core/config.py @@ -322,8 +322,19 @@ def avx_default(): NUMBA_DEFAULT_NUM_THREADS = max(1, multiprocessing.cpu_count()) # Numba thread pool size (defaults to number of CPUs on the system). - NUMBA_NUM_THREADS = _readenv("NUMBA_NUM_THREADS", int, + _NUMBA_NUM_THREADS = _readenv("NUMBA_NUM_THREADS", int, NUMBA_DEFAULT_NUM_THREADS) + if ('NUMBA_NUM_THREADS' in globals() + and globals()['NUMBA_NUM_THREADS'] != _NUMBA_NUM_THREADS): + from numba.np.ufunc import parallel + if parallel._is_initialized: + raise RuntimeError("Cannot set NUMBA_NUM_THREADS to a " + "different value once the threads have been launched " + "(%s != %s)" % (_NUMBA_NUM_THREADS, + globals()['NUMBA_NUM_THREADS'])) + + NUMBA_NUM_THREADS = _NUMBA_NUM_THREADS + del _NUMBA_NUM_THREADS # Profiling support diff --git a/numba/tests/npyufunc/test_parallel_env_variable.py b/numba/tests/npyufunc/test_parallel_env_variable.py index 8cf58eaae23..3044ee383c8 100644 --- a/numba/tests/npyufunc/test_parallel_env_variable.py +++ b/numba/tests/npyufunc/test_parallel_env_variable.py @@ -20,14 +20,19 @@ def test_num_threads_variable(self): current = str(getattr(env, key, config.NUMBA_DEFAULT_NUM_THREADS)) threads = "3154" env[key] = threads - config.reload_config() try: - self.assertEqual(threads, str(get_thread_count())) - self.assertEqual(threads, str(config.NUMBA_NUM_THREADS)) - finally: - # reset the env variable/set to default - env[key] = current config.reload_config() + except RuntimeError as e: + # This test should fail if threads have already been launched + self.assertIn("Cannot set NUMBA_NUM_THREADS", e.args[0]) + else: + try: + self.assertEqual(threads, str(get_thread_count())) + self.assertEqual(threads, str(config.NUMBA_NUM_THREADS)) + finally: + # reset the env variable/set to default + env[key] = current + config.reload_config() if __name__ == '__main__': unittest.main() From 24333a8a5968c33cc7e0be62e09c1b1429b06e4c Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 
12 Feb 2020 11:24:51 -0700 Subject: [PATCH 127/136] Fix flake8 --- numba/core/config.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/numba/core/config.py b/numba/core/config.py index b53b3f95a2e..3456b33ff32 100644 --- a/numba/core/config.py +++ b/numba/core/config.py @@ -323,14 +323,16 @@ def avx_default(): # Numba thread pool size (defaults to number of CPUs on the system). _NUMBA_NUM_THREADS = _readenv("NUMBA_NUM_THREADS", int, - NUMBA_DEFAULT_NUM_THREADS) + NUMBA_DEFAULT_NUM_THREADS) if ('NUMBA_NUM_THREADS' in globals() - and globals()['NUMBA_NUM_THREADS'] != _NUMBA_NUM_THREADS): + and globals()['NUMBA_NUM_THREADS'] != _NUMBA_NUM_THREADS): + from numba.np.ufunc import parallel if parallel._is_initialized: raise RuntimeError("Cannot set NUMBA_NUM_THREADS to a " - "different value once the threads have been launched " - "(%s != %s)" % (_NUMBA_NUM_THREADS, + "different value once the threads have been " + "launched (%s != %s)" % + (_NUMBA_NUM_THREADS, globals()['NUMBA_NUM_THREADS'])) NUMBA_NUM_THREADS = _NUMBA_NUM_THREADS From 6167215e4ed49e8a4300f327d5b4ed4540d1a420 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 12 Feb 2020 12:03:56 -0700 Subject: [PATCH 128/136] Fix the parallel env variable test to reset the env correctly --- numba/tests/npyufunc/test_parallel_env_variable.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numba/tests/npyufunc/test_parallel_env_variable.py b/numba/tests/npyufunc/test_parallel_env_variable.py index 3044ee383c8..b391d2c1014 100644 --- a/numba/tests/npyufunc/test_parallel_env_variable.py +++ b/numba/tests/npyufunc/test_parallel_env_variable.py @@ -26,13 +26,13 @@ def test_num_threads_variable(self): # This test should fail if threads have already been launched self.assertIn("Cannot set NUMBA_NUM_THREADS", e.args[0]) else: - try: - self.assertEqual(threads, str(get_thread_count())) - self.assertEqual(threads, str(config.NUMBA_NUM_THREADS)) - finally: - # reset 
the env variable/set to default - env[key] = current - config.reload_config() + self.assertEqual(threads, str(get_thread_count())) + self.assertEqual(threads, str(config.NUMBA_NUM_THREADS)) + finally: + # reset the env variable/set to default. Should not fail even if + # threads are launched because the value is the same. + env[key] = current + config.reload_config() if __name__ == '__main__': unittest.main() From 554572f327e4b9c920f65b416bfc6a3a5b549846 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 12 Feb 2020 12:29:41 -0700 Subject: [PATCH 129/136] Reset the num threads to the env variable, not the default --- numba/tests/npyufunc/test_parallel_env_variable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/tests/npyufunc/test_parallel_env_variable.py b/numba/tests/npyufunc/test_parallel_env_variable.py index b391d2c1014..7d11692ad34 100644 --- a/numba/tests/npyufunc/test_parallel_env_variable.py +++ b/numba/tests/npyufunc/test_parallel_env_variable.py @@ -17,7 +17,7 @@ def test_num_threads_variable(self): Tests the NUMBA_NUM_THREADS env variable behaves as expected. 
""" key = 'NUMBA_NUM_THREADS' - current = str(getattr(env, key, config.NUMBA_DEFAULT_NUM_THREADS)) + current = str(getattr(env, key, config.NUMBA_NUM_THREADS)) threads = "3154" env[key] = threads try: From 185950bfada928476f3a1ec4ec8479b9583e13ef Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 12 Feb 2020 12:29:41 -0700 Subject: [PATCH 130/136] Reset the num threads to the env variable, not the default --- numba/tests/npyufunc/test_parallel_env_variable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/tests/npyufunc/test_parallel_env_variable.py b/numba/tests/npyufunc/test_parallel_env_variable.py index 8cf58eaae23..dd3f36bc890 100644 --- a/numba/tests/npyufunc/test_parallel_env_variable.py +++ b/numba/tests/npyufunc/test_parallel_env_variable.py @@ -17,7 +17,7 @@ def test_num_threads_variable(self): Tests the NUMBA_NUM_THREADS env variable behaves as expected. """ key = 'NUMBA_NUM_THREADS' - current = str(getattr(env, key, config.NUMBA_DEFAULT_NUM_THREADS)) + current = str(getattr(env, key, config.NUMBA_NUM_THREADS)) threads = "3154" env[key] = threads config.reload_config() From ffff5044b462dd7d3d841692a744dc595aba973b Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 12 Feb 2020 13:51:35 -0700 Subject: [PATCH 131/136] Update numba/core/config.py Co-Authored-By: stuartarchibald --- numba/core/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/core/config.py b/numba/core/config.py index 3456b33ff32..1303ec79f81 100644 --- a/numba/core/config.py +++ b/numba/core/config.py @@ -331,7 +331,7 @@ def avx_default(): if parallel._is_initialized: raise RuntimeError("Cannot set NUMBA_NUM_THREADS to a " "different value once the threads have been " - "launched (%s != %s)" % + "launched (currently have %s, trying to set %s)" % (_NUMBA_NUM_THREADS, globals()['NUMBA_NUM_THREADS'])) From 6d0a050898deff8612ce2b28ab812c1292daaef7 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 12 Feb 2020 13:58:32 -0700 
Subject: [PATCH 132/136] Fix flake8 --- numba/core/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numba/core/config.py b/numba/core/config.py index 1303ec79f81..8dada97f6ae 100644 --- a/numba/core/config.py +++ b/numba/core/config.py @@ -331,7 +331,8 @@ def avx_default(): if parallel._is_initialized: raise RuntimeError("Cannot set NUMBA_NUM_THREADS to a " "different value once the threads have been " - "launched (currently have %s, trying to set %s)" % + "launched (currently have %s, " + "trying to set %s)" % (_NUMBA_NUM_THREADS, globals()['NUMBA_NUM_THREADS'])) From fb10f425263541149de73de8d049f6c3d0dacbc5 Mon Sep 17 00:00:00 2001 From: Aaron Meurer Date: Wed, 12 Feb 2020 15:42:46 -0700 Subject: [PATCH 133/136] Use set_num_threads instead of NUMBA_NUM_THREADS in TestParforsVectorizer --- numba/tests/test_parfors.py | 43 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/numba/tests/test_parfors.py b/numba/tests/test_parfors.py index c07b6c50aae..79a8d74088e 100644 --- a/numba/tests/test_parfors.py +++ b/numba/tests/test_parfors.py @@ -19,7 +19,7 @@ from collections import defaultdict import numba.parfors.parfor -from numba import njit, prange +from numba import njit, prange, set_num_threads, get_num_threads from numba.core import types, utils, typing, errors, ir, rewrites, typed_passes, inline_closurecall, config, compiler, cpu from numba.core.registry import cpu_target from numba.core.annotations import type_annotations @@ -2417,35 +2417,40 @@ def get_gufunc_asm(self, func, schedule_type, *args, **kwargs): fastmath = kwargs.pop('fastmath', False) nthreads = kwargs.pop('nthreads', 2) + old_nthreads = get_num_threads() cpu_name = kwargs.pop('cpu_name', 'skylake-avx512') assertions = kwargs.pop('assertions', True) env_opts = {'NUMBA_CPU_NAME': cpu_name, 'NUMBA_CPU_FEATURES': '', - 'NUMBA_NUM_THREADS': str(nthreads) } overrides = [] for k, v in env_opts.items(): 
overrides.append(override_env_config(k, v)) - with overrides[0], overrides[1], overrides[2]: - sig = tuple([numba.typeof(x) for x in args]) - pfunc_vectorizable = self.generate_prange_func(func, None) - if fastmath == True: - cres = self.compile_parallel_fastmath(pfunc_vectorizable, sig) - else: - cres = self.compile_parallel(pfunc_vectorizable, sig) - - # get the gufunc asm - asm = self._get_gufunc_asm(cres) - - if assertions: - schedty = re.compile('call\s+\w+\*\s+@do_scheduling_(\w+)\(') - matches = schedty.findall(cres.library.get_llvm_str()) - self.assertGreaterEqual(len(matches), 1) # at least 1 parfor call - self.assertEqual(matches[0], schedule_type) - self.assertTrue(asm != {}) + with overrides[0], overrides[1]: + # Replace this with set_num_threads as a context manager when that exists + try: + set_num_threads(nthreads) + sig = tuple([numba.typeof(x) for x in args]) + pfunc_vectorizable = self.generate_prange_func(func, None) + if fastmath == True: + cres = self.compile_parallel_fastmath(pfunc_vectorizable, sig) + else: + cres = self.compile_parallel(pfunc_vectorizable, sig) + + # get the gufunc asm + asm = self._get_gufunc_asm(cres) + + if assertions: + schedty = re.compile('call\s+\w+\*\s+@do_scheduling_(\w+)\(') + matches = schedty.findall(cres.library.get_llvm_str()) + self.assertGreaterEqual(len(matches), 1) # at least 1 parfor call + self.assertEqual(matches[0], schedule_type) + self.assertTrue(asm != {}) + finally: + set_num_threads(old_nthreads) return asm From 282333a1901b08d142565c28641131c3e9b8328d Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 13 Feb 2020 12:54:04 +0000 Subject: [PATCH 134/136] Undo debug code --- numba/np/ufunc/parallel.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/numba/np/ufunc/parallel.py b/numba/np/ufunc/parallel.py index c8d176589d1..b5387c03722 100644 --- a/numba/np/ufunc/parallel.py +++ b/numba/np/ufunc/parallel.py @@ -23,11 +23,10 @@ import llvmlite.binding as 
ll from numba.np.numpy_support import as_dtype -from numba.core import types, config, errors, cgutils +from numba.core import types, config, errors from numba.np.ufunc.wrappers import _wrapper_info from numba.np.ufunc import ufuncbuilder -from numba.extending import overload, intrinsic -from numba import njit +from numba.extending import overload _IS_OSX = sys.platform.startswith('darwin') @@ -520,28 +519,12 @@ def _load_num_threads_funcs(lib): # Some helpers to make set_num_threads jittable -@intrinsic -def _debug(tyctx, max_t, ill_t): - max_t_ty = getattr(max_t, 'literal_type', max_t) - ill_t_ty = getattr(ill_t, 'literal_type', ill_t) - sig = types.void(max_t_ty, ill_t_ty) - def codegen(cgctx, builder, sig, args): - a, b = args - cgutils.printf(builder, "Max threads %d. Illegal thread count %d\n", a, - b) - return sig, codegen - -@njit -def debug(max_t, ill_t): - _debug(max_t, ill_t) - def gen_snt_check(): from numba.core.config import NUMBA_NUM_THREADS msg = "The number of threads must be between 1 and %s" % NUMBA_NUM_THREADS def snt_check(n): if n > NUMBA_NUM_THREADS or n < 1: - debug(NUMBA_NUM_THREADS, n) raise ValueError(msg) return snt_check From e520c809aa666dc45f997465de17f0b49337716a Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 13 Feb 2020 12:48:46 +0000 Subject: [PATCH 135/136] Disable TBB on linux x86 --- buildscripts/condarecipe.local/meta.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/buildscripts/condarecipe.local/meta.yaml b/buildscripts/condarecipe.local/meta.yaml index 6b6e52bd8c5..53a51f5961d 100644 --- a/buildscripts/condarecipe.local/meta.yaml +++ b/buildscripts/condarecipe.local/meta.yaml @@ -36,7 +36,7 @@ requirements: - funcsigs # [py27] - singledispatch # [py27] # TBB devel version is to match TBB libs - - tbb-devel >=2019.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] + - tbb-devel >=2019.5 # [not (armv6l or armv7l or aarch64 or linux32)] run: - python >=3.6 - numpy >=1.15 @@ -48,7 
+48,7 @@ requirements: run_constrained: # If TBB is present it must be at least this version from Anaconda due to # build flag issues triggering UB - - tbb >=2019.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] + - tbb >=2019.5 # [not (armv6l or armv7l or aarch64 or linux32)] # avoid confusion from openblas bugs - libopenblas !=0.3.6 # [x86_64] # CUDA 8.0 or later is required for CUDA support @@ -65,7 +65,7 @@ test: - ipython # [not (armv6l or armv7l or aarch64)] - setuptools - faulthandler # [py27 and (not (armv6l or armv7l or aarch64))] - - tbb >=2019.5 # [not ((armv6l or armv7l or aarch64) or (win and py27))] + - tbb >=2019.5 # [not (armv6l or armv7l or aarch64 or linux32)] - intel-openmp # [osx] # Need these for AOT. Do not init msvc as it may not be present - {{ compiler('c') }} # [not (win or armv6l or armv7l or aarch64)] From 7fe82e9ff53f4c0dfe11e69d9536ccc25785176a Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 13 Feb 2020 13:17:15 +0000 Subject: [PATCH 136/136] Remove dead code in gufunc asm test --- numba/tests/test_parfors.py | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/numba/tests/test_parfors.py b/numba/tests/test_parfors.py index 79a8d74088e..890dfd857d7 100644 --- a/numba/tests/test_parfors.py +++ b/numba/tests/test_parfors.py @@ -2416,8 +2416,6 @@ class TestParforsVectorizer(TestPrangeBase): def get_gufunc_asm(self, func, schedule_type, *args, **kwargs): fastmath = kwargs.pop('fastmath', False) - nthreads = kwargs.pop('nthreads', 2) - old_nthreads = get_num_threads() cpu_name = kwargs.pop('cpu_name', 'skylake-avx512') assertions = kwargs.pop('assertions', True) @@ -2430,29 +2428,24 @@ def get_gufunc_asm(self, func, schedule_type, *args, **kwargs): overrides.append(override_env_config(k, v)) with overrides[0], overrides[1]: - # Replace this with set_num_threads as a context manager when that exists - try: - set_num_threads(nthreads) - sig = tuple([numba.typeof(x) for 
x in args]) - pfunc_vectorizable = self.generate_prange_func(func, None) - if fastmath == True: - cres = self.compile_parallel_fastmath(pfunc_vectorizable, sig) - else: - cres = self.compile_parallel(pfunc_vectorizable, sig) + sig = tuple([numba.typeof(x) for x in args]) + pfunc_vectorizable = self.generate_prange_func(func, None) + if fastmath == True: + cres = self.compile_parallel_fastmath(pfunc_vectorizable, sig) + else: + cres = self.compile_parallel(pfunc_vectorizable, sig) - # get the gufunc asm - asm = self._get_gufunc_asm(cres) + # get the gufunc asm + asm = self._get_gufunc_asm(cres) - if assertions: - schedty = re.compile('call\s+\w+\*\s+@do_scheduling_(\w+)\(') - matches = schedty.findall(cres.library.get_llvm_str()) - self.assertGreaterEqual(len(matches), 1) # at least 1 parfor call - self.assertEqual(matches[0], schedule_type) - self.assertTrue(asm != {}) - finally: - set_num_threads(old_nthreads) + if assertions: + schedty = re.compile('call\s+\w+\*\s+@do_scheduling_(\w+)\(') + matches = schedty.findall(cres.library.get_llvm_str()) + self.assertGreaterEqual(len(matches), 1) # at least 1 parfor call + self.assertEqual(matches[0], schedule_type) + self.assertTrue(asm != {}) - return asm + return asm # this is a common match pattern for something like: # \n\tvsqrtpd\t-192(%rbx,%rsi,8), %zmm0\n