diff --git a/doc/release/1.17.0-notes.rst b/doc/release/1.17.0-notes.rst
index ccb44d32c75c..645e0b95abda 100644
--- a/doc/release/1.17.0-notes.rst
+++ b/doc/release/1.17.0-notes.rst
@@ -176,6 +176,10 @@ thereby saving a level of indentation
 In some cases where ``np.interp`` would previously return ``np.nan``, it now
 returns an appropriate infinity.
 
+``np.block`` is ported to C for performance and supports ``out`` argument
+-------------------------------------------------------------------------
+``np.block`` should have increased performance, particularly for inputs with
+many small arrays. It also has a new ``out`` keyword argument.
 
 Changes
 =======
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 42fee4eb7002..7c9d396b3bfe 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -26,7 +26,7 @@
     inner, int_asbuffer, lexsort, matmul, may_share_memory,
     min_scalar_type, ndarray, nditer, nested_iters, promote_types,
     putmask, result_type, set_numeric_ops, shares_memory, vdot, where,
-    zeros, normalize_axis_index)
+    zeros, normalize_axis_index, block)
 if sys.version_info[0] < 3:
     from .multiarray import newbuffer, getbuffer
 
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 0eac772e86a7..89b2a6a96640 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -416,238 +416,8 @@ def stack(arrays, axis=0, out=None):
     return _nx.concatenate(expanded_arrays, axis=axis, out=out)
 
 
-def _block_format_index(index):
-    """
-    Convert a list of indices ``[0, 1, 2]`` into ``"arrays[0][1][2]"``.
-    """
-    idx_str = ''.join('[{}]'.format(i) for i in index if i is not None)
-    return 'arrays' + idx_str
-
-
-def _block_check_depths_match(arrays, parent_index=[]):
-    """
-    Recursive function checking that the depths of nested lists in `arrays`
-    all match. Mismatch raises a ValueError as described in the block
-    docstring below.
-
-    The entire index (rather than just the depth) needs to be calculated
-    for each innermost list, in case an error needs to be raised, so that
-    the index of the offending list can be printed as part of the error.
-
-    Parameters
-    ----------
-    arrays : nested list of arrays
-        The arrays to check
-    parent_index : list of int
-        The full index of `arrays` within the nested lists passed to
-        `_block_check_depths_match` at the top of the recursion.
-
-    Returns
-    -------
-    first_index : list of int
-        The full index of an element from the bottom of the nesting in
-        `arrays`. If any element at the bottom is an empty list, this will
-        refer to it, and the last index along the empty axis will be `None`.
-    max_arr_ndim : int
-        The maximum of the ndims of the arrays nested in `arrays`.
-    final_size: int
-        The number of elements in the final array. This is used the motivate
-        the choice of algorithm used using benchmarking wisdom.
-
-    """
-    if type(arrays) is tuple:
-        # not strictly necessary, but saves us from:
-        #  - more than one way to do things - no point treating tuples like
-        #    lists
-        #  - horribly confusing behaviour that results when tuples are
-        #    treated like ndarray
-        raise TypeError(
-            '{} is a tuple. '
-            'Only lists can be used to arrange blocks, and np.block does '
-            'not allow implicit conversion from tuple to ndarray.'.format(
-                _block_format_index(parent_index)
-            )
-        )
-    elif type(arrays) is list and len(arrays) > 0:
-        idxs_ndims = (_block_check_depths_match(arr, parent_index + [i])
-                      for i, arr in enumerate(arrays))
-
-        first_index, max_arr_ndim, final_size = next(idxs_ndims)
-        for index, ndim, size in idxs_ndims:
-            final_size += size
-            if ndim > max_arr_ndim:
-                max_arr_ndim = ndim
-            if len(index) != len(first_index):
-                raise ValueError(
-                    "List depths are mismatched. First element was at depth "
-                    "{}, but there is an element at depth {} ({})".format(
-                        len(first_index),
-                        len(index),
-                        _block_format_index(index)
-                    )
-                )
-            # propagate our flag that indicates an empty list at the bottom
-            if index[-1] is None:
-                first_index = index
-
-        return first_index, max_arr_ndim, final_size
-    elif type(arrays) is list and len(arrays) == 0:
-        # We've 'bottomed out' on an empty list
-        return parent_index + [None], 0, 0
-    else:
-        # We've 'bottomed out' - arrays is either a scalar or an array
-        size = _nx.size(arrays)
-        return parent_index, _nx.ndim(arrays), size
-
-
-def _atleast_nd(a, ndim):
-    # Ensures `a` has at least `ndim` dimensions by prepending
-    # ones to `a.shape` as necessary
-    return array(a, ndmin=ndim, copy=False, subok=True)
-
-
-def _accumulate(values):
-    # Helper function because Python 2.7 doesn't have
-    # itertools.accumulate
-    value = 0
-    accumulated = []
-    for v in values:
-        value += v
-        accumulated.append(value)
-    return accumulated
-
-
-def _concatenate_shapes(shapes, axis):
-    """Given array shapes, return the resulting shape and slices prefixes.
-
-    These help in nested concatation.
-    Returns
-    -------
-    shape: tuple of int
-        This tuple satisfies:
-        ```
-        shape, _ = _concatenate_shapes([arr.shape for shape in arrs], axis)
-        shape == concatenate(arrs, axis).shape
-        ```
-
-    slice_prefixes: tuple of (slice(start, end), )
-        For a list of arrays being concatenated, this returns the slice
-        in the larger array at axis that needs to be sliced into.
-
-        For example, the following holds:
-        ```
-        ret = concatenate([a, b, c], axis)
-        _, (sl_a, sl_b, sl_c) = concatenate_slices([a, b, c], axis)
-
-        ret[(slice(None),) * axis + sl_a] == a
-        ret[(slice(None),) * axis + sl_b] == b
-        ret[(slice(None),) * axis + sl_c] == c
-        ```
-
-        Thses are called slice prefixes since they are used in the recursive
-        blocking algorithm to compute the left-most slices during the
-        recursion. Therefore, they must be prepended to rest of the slice
-        that was computed deeper in the recusion.
-
-        These are returned as tuples to ensure that they can quickly be added
-        to existing slice tuple without creating a new tuple everytime.
-
-    """
-    # Cache a result that will be reused.
-    shape_at_axis = [shape[axis] for shape in shapes]
-
-    # Take a shape, any shape
-    first_shape = shapes[0]
-    first_shape_pre = first_shape[:axis]
-    first_shape_post = first_shape[axis+1:]
-
-    if any(shape[:axis] != first_shape_pre or
-           shape[axis+1:] != first_shape_post for shape in shapes):
-        raise ValueError(
-            'Mismatched array shapes in block along axis {}.'.format(axis))
-
-    shape = (first_shape_pre + (sum(shape_at_axis),) + first_shape[axis+1:])
-
-    offsets_at_axis = _accumulate(shape_at_axis)
-    slice_prefixes = [(slice(start, end),)
-                      for start, end in zip([0] + offsets_at_axis,
-                                            offsets_at_axis)]
-    return shape, slice_prefixes
 
-
-def _block_info_recursion(arrays, max_depth, result_ndim, depth=0):
-    """
-    Returns the shape of the final array, along with a list
-    of slices and a list of arrays that can be used for assignment inside the
-    new array
-
-    Parameters
-    ----------
-    arrays : nested list of arrays
-        The arrays to check
-    max_depth : list of int
-        The number of nested lists
-    result_ndim: int
-        The number of dimensions in thefinal array.
-
-    Returns
-    -------
-    shape : tuple of int
-        The shape that the final array will take on.
-    slices: list of tuple of slices
-        The slices into the full array required for assignment. These are
-        required to be prepended with ``(Ellipsis, )`` to obtain to correct
-        final index.
-    arrays: list of ndarray
-        The data to assign to each slice of the full array
-
-    """
-    if depth < max_depth:
-        shapes, slices, arrays = zip(
-            *[_block_info_recursion(arr, max_depth, result_ndim, depth+1)
-              for arr in arrays])
-
-        axis = result_ndim - max_depth + depth
-        shape, slice_prefixes = _concatenate_shapes(shapes, axis)
-
-        # Prepend the slice prefix and flatten the slices
-        slices = [slice_prefix + the_slice
-                  for slice_prefix, inner_slices in zip(slice_prefixes, slices)
-                  for the_slice in inner_slices]
-
-        # Flatten the array list
-        arrays = functools.reduce(operator.add, arrays)
-
-        return shape, slices, arrays
-    else:
-        # We've 'bottomed out' - arrays is either a scalar or an array
-        # type(arrays) is not list
-        # Return the slice and the array inside a list to be consistent with
-        # the recursive case.
-        arr = _atleast_nd(arrays, result_ndim)
-        return arr.shape, [()], [arr]
-
-
-def _block(arrays, max_depth, result_ndim, depth=0):
-    """
-    Internal implementation of block based on repeated concatenation.
-    `arrays` is the argument passed to
-    block. `max_depth` is the depth of nested lists within `arrays` and
-    `result_ndim` is the greatest of the dimensions of the arrays in
-    `arrays` and the depth of the lists in `arrays` (see block docstring
-    for details).
-    """
-    if depth < max_depth:
-        arrs = [_block(arr, max_depth, result_ndim, depth+1)
-                for arr in arrays]
-        return _nx.concatenate(arrs, axis=-(max_depth-depth))
-    else:
-        # We've 'bottomed out' - arrays is either a scalar or an array
-        # type(arrays) is not list
-        return _atleast_nd(arrays, result_ndim)
-
-
-def _block_dispatcher(arrays):
+def _block_dispatcher(arrays, out=None):
     # Use type(...) is list to match the behavior of np.block(), which special
     # cases list specifically rather than allowing for generic iterables or
     # tuple. Also, we know that list.__array_function__ will never exist.
@@ -658,9 +428,12 @@ def _block_dispatcher(arrays):
     else:
         yield arrays
 
+    if out is not None:
+        yield out
+
 
 @array_function_dispatch(_block_dispatcher)
-def block(arrays):
+def block(arrays, out=None):
     """
     Assemble an nd-array from nested lists of blocks.
 
@@ -688,6 +461,8 @@ def block(arrays):
         Elements shapes must match along the appropriate axes (without
         broadcasting), but leading 1s will be prepended to the shape as
         necessary to make the dimensions match.
+    out : ndarray
+        ndarray to which to copy the blocks.
 
     Returns
     -------
@@ -808,74 +583,6 @@ def block(arrays):
 
 
     """
-    arrays, list_ndim, result_ndim, final_size = _block_setup(arrays)
-
-    # It was found through benchmarking that making an array of final size
-    # around 256x256 was faster by straight concatenation on a
-    # i7-7700HQ processor and dual channel ram 2400MHz.
-    # It didn't seem to matter heavily on the dtype used.
-    #
-    # A 2D array using repeated concatenation requires 2 copies of the array.
-    #
-    # The fastest algorithm will depend on the ratio of CPU power to memory
-    # speed.
-    # One can monitor the results of the benchmark
+    # This function is benchmarked at
     # https://pv.github.io/numpy-bench/#bench_shape_base.Block2D.time_block2d
-    # to tune this parameter until a C version of the `_block_info_recursion`
-    # algorithm is implemented which would likely be faster than the python
-    # version.
-    if list_ndim * final_size > (2 * 512 * 512):
-        return _block_slicing(arrays, list_ndim, result_ndim)
-    else:
-        return _block_concatenate(arrays, list_ndim, result_ndim)
-
-
-# Theses helper functions are mostly used for testing.
-# They allow us to write tests that directly call `_block_slicing`
-# or `_block_concatenate` wtihout blocking large arrays to forse the wisdom
-# to trigger the desired path.
-def _block_setup(arrays):
-    """
-    Returns
-    (`arrays`, list_ndim, result_ndim, final_size)
-    """
-    bottom_index, arr_ndim, final_size = _block_check_depths_match(arrays)
-    list_ndim = len(bottom_index)
-    if bottom_index and bottom_index[-1] is None:
-        raise ValueError(
-            'List at {} cannot be empty'.format(
-                _block_format_index(bottom_index)
-            )
-        )
-    result_ndim = max(arr_ndim, list_ndim)
-    return arrays, list_ndim, result_ndim, final_size
-
-
-def _block_slicing(arrays, list_ndim, result_ndim):
-    shape, slices, arrays = _block_info_recursion(
-        arrays, list_ndim, result_ndim)
-    dtype = _nx.result_type(*[arr.dtype for arr in arrays])
-
-    # Test preferring F only in the case that all input arrays are F
-    F_order = all(arr.flags['F_CONTIGUOUS'] for arr in arrays)
-    C_order =  all(arr.flags['C_CONTIGUOUS'] for arr in arrays)
-    order = 'F' if F_order and not C_order else 'C'
-    result = _nx.empty(shape=shape, dtype=dtype, order=order)
-    # Note: In a c implementation, the function
-    # PyArray_CreateMultiSortedStridePerm could be used for more advanced
-    # guessing of the desired order.
-
-    for the_slice, arr in zip(slices, arrays):
-        result[(Ellipsis,) + the_slice] = arr
-    return result
-
-
-def _block_concatenate(arrays, list_ndim, result_ndim):
-    result = _block(arrays, list_ndim, result_ndim)
-    if list_ndim == 0:
-        # Catch an edge case where _block returns a view because
-        # `arrays` is a single numpy array and not a list of numpy arrays.
-        # This might copy scalars or lists twice, but this isn't a likely
-        # usecase for those interested in performance
-        result = result.copy()
-    return result
+    return _nx.block(arrays, out=out)
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 1b825d318724..24bcdf0ae7a2 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -368,6 +368,400 @@ PyArray_GetSubType(int narrays, PyArrayObject **arrays) {
     return subtype;
 }
 
+/*
+ * Helper function for np.block to create error messages with depth info.
+ */
+void
+_concat_block_depth_str(PyObject **str, int depth, int *position){
+    int i;
+    PyUString_ConcatAndDel(str, PyUString_FromString("arrays"));
+    for (i = 0; i < depth; i++) {
+        PyObject *ind_str = PyUString_FromFormat("[%i]", position[i]);
+        PyUString_ConcatAndDel(str, ind_str);
+    }
+}
+
+/*
+ * Helper function for np.block which pads a shape with 1s.
+ */
+void
+_block_prepend_shape_ones(npy_intp *shape, int ndim, int n_ones){
+    int j;
+    for (j = ndim; j >= 0; j--) {
+        shape[j + n_ones] = shape[j];
+    }
+    for(j = 0; j < n_ones; j++) {
+        shape[j] = 1;
+    }
+}
+
+/*
+ * Helper function for np.block doing first pass to process input arrays. Given
+ * input of a nested list of ndarray/scalars, recursively compute the output
+ * ndim, dtype, shape, subtype, and also the max_depth of the input, and return
+ * a new nested list with all inner elements converted to ndarray.
+ *
+ * Returns a new reference to a nested list like the input nested list,
+ * converted to ndarrays. Sets dtype to a new reference, except on error in
+ * which case dtype will be null.
+ */
+PyObject *
+_get_block_info(PyObject *arrays, int *ndim, PyArray_Descr **dtype,
+                npy_intp *shape, double *sub_priority, PyTypeObject **subtype,
+                int *contigflag, int depth, int *concat_axis, int *position){
+
+    if (depth > NPY_MAXDIMS) {
+        PyErr_SetString(PyExc_ValueError, "arrays exceeded maximum dimensions");
+        return NULL;
+    }
+
+    *dtype = NULL;
+
+    if (PyTuple_Check(arrays)) {
+        PyObject *errmsg = PyUString_FromString("Element at ");
+        _concat_block_depth_str(&errmsg, depth, position);
+        PyUString_ConcatAndDel(&errmsg, PyUString_FromString(
+                        " is a tuple. Only lists can be used to arrange "
+                        "blocks, and np.block does not allow implicit "
+                        "conversion from tuple to ndarray."));
+        PyErr_SetObject(PyExc_TypeError, errmsg);
+        Py_DECREF(errmsg);
+        return NULL;
+    }
+    else if (PyList_Check(arrays)) {
+        int i, idim;
+        PyObject *arrlist, *seq, *item;
+        PyObject *sub_block;
+        int len;
+
+        /* Error if we get an empty list */
+        if (PyList_Size(arrays) == 0){
+            PyObject *errmsg = PyUString_FromString("List at ");
+            _concat_block_depth_str(&errmsg, depth, position);
+            PyUString_ConcatAndDel(&errmsg,
+                                   PyUString_FromString(" cannot be empty"));
+            PyErr_SetObject(PyExc_ValueError, errmsg);
+            Py_DECREF(errmsg);
+            return NULL;
+        }
+
+        /* begin constructing output nested list */
+        seq = PySequence_Fast(arrays, "expected a sequence");
+        if(seq == NULL){
+            return NULL;
+        }
+        len = PySequence_Fast_GET_SIZE(seq);
+        arrlist = PyList_New(len);
+
+        /* take care of first element */
+        item = PySequence_Fast_GET_ITEM(seq, 0);
+        if (item == NULL) {
+            goto block_fail;
+        }
+        position[depth] = 0;
+        sub_block = _get_block_info(item, ndim, dtype, shape, sub_priority,
+                                    subtype, contigflag, depth+1, concat_axis,
+                                    position);
+        if (sub_block == NULL) {
+            goto block_fail;
+        }
+        PyList_SET_ITEM(arrlist, 0, sub_block);
+
+        /* loop over remaining elements */
+        for (i = 1; i < len; i++) {
+            int ndim_i;
+            double priority_i;
+            int concat_axis_i;
+            int contigflag_i;
+            PyTypeObject *subtype_i;
+            PyArray_Descr *dtype_i, *promoted_dtype;
+            npy_intp shape_i[NPY_MAXDIMS];
+
+            item = PySequence_Fast_GET_ITEM(seq, i);
+            position[depth] = i;
+            sub_block = _get_block_info(item, &ndim_i, &dtype_i, shape_i,
+                                        &priority_i, &subtype_i, &contigflag_i,
+                                        depth+1, &concat_axis_i, position);
+            if(sub_block == NULL){
+                goto block_fail;
+            }
+            PyList_SET_ITEM(arrlist, i, sub_block);
+
+            /* update total dtype */
+            promoted_dtype = PyArray_PromoteTypes(*dtype, dtype_i);
+            Py_DECREF(dtype_i);
+            Py_DECREF(*dtype);
+            *dtype = promoted_dtype;
+            if (promoted_dtype == NULL) {
+                goto block_fail;
+            }
+
+            /* update total ndim (prepend ones if needed) */
+            if (ndim_i > *ndim) {
+                _block_prepend_shape_ones(shape, *ndim, ndim_i - *ndim);
+                *concat_axis += ndim_i - *ndim;
+                *ndim = ndim_i;
+            }
+            else if (ndim_i < *ndim) {
+                _block_prepend_shape_ones(shape_i, ndim_i, *ndim - ndim_i);
+                concat_axis_i += *ndim - ndim_i;
+                ndim_i = *ndim;
+            }
+
+            /* check depths */
+            if (concat_axis_i != *concat_axis) {
+                PyObject *errmsg = PyUString_FromString(
+                    "List depths are mismatched. Element ");
+                _concat_block_depth_str(&errmsg, depth+1, position);
+                PyUString_ConcatAndDel(&errmsg, PyUString_FromFormat(
+                    " has depth %i but previous elements have depth %i.",
+                    depth + (ndim_i - concat_axis_i),
+                    depth + (*ndim - *concat_axis)));
+                PyErr_SetObject(PyExc_ValueError, errmsg);
+                Py_DECREF(errmsg);
+                goto block_fail;
+            }
+
+            /* update total shape */
+            for (idim = 0; idim < *ndim; idim++) {
+                /* Build up the size of the concatenation axis */
+                if (idim == concat_axis_i) {
+                    shape[idim] += shape_i[idim];
+                }
+                /* Validate that the rest of the dimensions match */
+                else if (shape[idim] != shape_i[idim]) {
+                    PyObject *errmsg = PyUString_FromString(
+                        "Shape mismatch at ");
+                    _concat_block_depth_str(&errmsg, depth+1, position);
+                    PyUString_ConcatAndDel(&errmsg, PyUString_FromFormat(
+                        " along broadcasted axis %i. Previous elemement's "
+                        "length is %i but this element's is %i.",
+                        idim, shape[idim], shape_i[idim]));
+                    PyErr_SetObject(PyExc_ValueError, errmsg);
+                    Py_DECREF(errmsg);
+                    goto block_fail;
+                }
+            }
+
+            /* update subtype */
+            if (priority_i > *sub_priority) {
+                *sub_priority = priority_i;
+                *subtype = subtype_i;
+            }
+
+            *contigflag = *contigflag & contigflag_i;
+        }
+        Py_DECREF(seq);
+        *concat_axis -= 1;
+        return arrlist;
+
+block_fail:
+        Py_XDECREF(*dtype);
+        *dtype = NULL;
+        Py_DECREF(arrlist);
+        Py_DECREF(seq);
+        return NULL;
+    }
+    else {
+        int arr_ndim;
+
+        /* We've 'bottomed out' - arrays is either a scalar or an array */
+        PyArrayObject *ret = (PyArrayObject *)PyArray_FromAny(arrays, NULL,
+                                                              0, 0, 0, NULL);
+        if (ret == NULL) {
+            return NULL;
+        }
+
+        arr_ndim = PyArray_NDIM(ret);
+        memcpy(shape, PyArray_SHAPE(ret), arr_ndim * sizeof(shape[0]));
+
+        /* pad dimensions with 1s if deeply nested */
+        if (depth > arr_ndim) {
+            _block_prepend_shape_ones(shape, arr_ndim, depth - arr_ndim);
+            arr_ndim = depth;
+        }
+
+        *concat_axis = arr_ndim - 1;
+
+        *ndim = arr_ndim;
+        *dtype = PyArray_DESCR(ret);
+        Py_INCREF(*dtype);
+        *sub_priority = PyArray_GetPriority((PyObject *)(ret), 0.0);
+        *subtype = Py_TYPE(ret);
+        *contigflag = PyArray_FLAGS(ret) &
+                      (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
+        return (PyObject *)ret;
+    }
+}
+
+/*
+ * Helper function for np.block. Recursively copies a list of lists of
+ * ndarrays to the appropriate location in the out ndarray.
+ *
+ * Clobbers out's shape. Returns -1 on failure.
+ */
+int
+_block_copy_blocks(PyObject *blocks, PyArrayObject *out, int axis){
+    PyObject *seq, *item;
+    int len, i;
+    void *orig_data;
+
+    /* if we bottomed out at an ndarray, copy to out at the current offset */
+    if (PyArray_Check(blocks)) {
+        PyArrayObject *arr = (PyArrayObject *)blocks;
+        int dim_diff = PyArray_NDIM(out) - PyArray_NDIM(arr);
+
+        /* set out's shape to the block shape (prepend ones if needed) */
+        for (i = 0; i < dim_diff; i++) {
+            PyArray_SHAPE(out)[i] = 1;
+        }
+        for ( ; i < PyArray_NDIM(out); i++) {
+            PyArray_SHAPE(out)[i] = PyArray_SHAPE(arr)[i - dim_diff];
+        }
+
+        /* do the copy */
+        if (PyArray_AssignArray(out, arr, NULL, NPY_SAME_KIND_CASTING) < 0) {
+            return -1;
+        }
+
+        return 0;
+    }
+
+    /* otherwise, iterate through the list at this level and recurse */
+    seq = PySequence_Fast(blocks, "expected a sequence");
+    if(seq == NULL){
+        return -1;
+    }
+    len = PySequence_Fast_GET_SIZE(blocks);
+    orig_data = ((PyArrayObject_fields *)out)->data;
+
+    for (i = 0; i < len; i++) {
+        item = PySequence_Fast_GET_ITEM(seq, i);
+
+        if (item == NULL || _block_copy_blocks(item, out, axis+1) < 0) {
+            Py_DECREF(seq);
+            return -1;
+        }
+
+        /*
+         * move datapointer offset based on the shape of the block we just
+         * copied, which happens to be what out's shape was last set to.
+         */
+        ((PyArrayObject_fields *)out)->data +=
+                PyArray_SHAPE(out)[axis] * PyArray_STRIDES(out)[axis];
+    }
+
+    ((PyArrayObject_fields *)out)->data = orig_data;  /* reset data pointer */
+    Py_DECREF(seq);
+    return 0;
+}
+
+/*
+ * Core implementation of np.block, in 2 passes.
+ *
+ * First pass calls _get_block_info, which processes input nested list.
+ * Second pass calls _block_copy_blocks, which copies input arrays to out.
+ *
+ * Returns -1 on failure.
+ */
+PyObject *
+_block_arrays(PyObject *arrays, PyArrayObject *out){
+    /* Compare to PyArray_ConcatenateArrays. */
+    PyArray_Descr *dtype;
+    int ndim;
+    npy_intp shape[NPY_MAXDIMS];
+    double sub_priority;
+    PyTypeObject *subtype;
+    int contigflag;
+    int start_axis;
+    PyObject *blocks;
+    int position[NPY_MAXDIMS];
+    int ret;
+
+    /*
+     * First-pass through the input arrays to get ndim, output shape, dtype,
+     * etc (with validation). Convert input to a nested list of ndarrays.
+     */
+    blocks = _get_block_info(arrays, &ndim, &dtype, shape, &sub_priority,
+                             &subtype, &contigflag, 0, &start_axis, position);
+    if (blocks == NULL) {
+        return NULL;
+    }
+
+    /* Set up the output array with the detected shape and dtype. */
+    if (out == NULL) {
+        int f_flag =  (contigflag & NPY_ARRAY_F_CONTIGUOUS) &&
+                     !(contigflag & NPY_ARRAY_C_CONTIGUOUS);
+        out = (PyArrayObject *)PyArray_NewFromDescr(subtype, dtype, ndim, shape,
+                                                    NULL, NULL, f_flag, NULL);
+        if (out == NULL) {
+            return NULL;
+        }
+    }
+    else {
+        if (PyArray_NDIM(out) != ndim ||
+                !PyArray_CompareLists(shape, PyArray_SHAPE(out), ndim)) {
+            int i;
+            PyObject *ind_str, *errmsg;
+
+            errmsg = PyUString_FromFormat(
+                              "Output array has wrong shape. Expected shape (");
+            for (i = 0; i < ndim-1; i++) {
+                ind_str = PyUString_FromFormat("%i, ", shape[i]);
+                PyUString_ConcatAndDel(&errmsg, ind_str);
+            }
+            ind_str = PyUString_FromFormat("%i)", shape[ndim-1]);
+            PyUString_ConcatAndDel(&errmsg, ind_str);
+            PyErr_SetObject(PyExc_ValueError, errmsg);
+            Py_DECREF(errmsg);
+
+            Py_DECREF(blocks);
+            return NULL;
+        }
+        Py_INCREF(out);
+    }
+
+    /*
+     * Second pass through the nested list of ndarrays, doing the copy. Rather
+     * than slicing, we (temporarily) clobber out's shape and data pointer
+     * using the fact that the increment in memory position at each axis level
+     * can be determined from the shape of the last ndarray copied.
+     */
+    ret = _block_copy_blocks(blocks, out, start_axis + 1);
+    memcpy(PyArray_SHAPE(out), shape, ndim*sizeof(shape[0])); /* reset shape */
+    Py_DECREF(blocks);
+
+    if (ret < 0) {
+        Py_DECREF(out);
+        return NULL;
+    }
+
+    return (PyObject *)out;
+}
+
+static PyObject *
+array_block(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+{
+    PyObject *arrays = NULL;
+    PyObject *out = NULL;
+    static char *kwlist[] = {"arrays", "out", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O:block", kwlist,
+                &arrays, &out)) {
+        return NULL;
+    }
+    if (out != NULL) {
+        if (out == Py_None) {
+            out = NULL;
+        }
+        else if (!PyArray_Check(out)) {
+            PyErr_SetString(PyExc_TypeError, "'out' must be an array");
+            return NULL;
+        }
+    }
+
+    return _block_arrays(arrays, (PyArrayObject *)out);
+}
 
 /*
  * Concatenates a list of ndarrays.
@@ -4140,6 +4534,9 @@ static struct PyMethodDef array_module_methods[] = {
     {"concatenate",
         (PyCFunction)array_concatenate,
         METH_VARARGS|METH_KEYWORDS, NULL},
+    {"block",
+        (PyCFunction)array_block,
+        METH_VARARGS|METH_KEYWORDS, NULL},
     {"inner",
         (PyCFunction)array_innerproduct,
         METH_VARARGS, NULL},
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index 53d272fc5c0d..c3f4c4e4cfd7 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -7,8 +7,6 @@
     array, arange, atleast_1d, atleast_2d, atleast_3d, block, vstack, hstack,
     newaxis, concatenate, stack
     )
-from numpy.core.shape_base import (_block_dispatcher, _block_setup,
-                                   _block_concatenate, _block_slicing)
 from numpy.testing import (
     assert_, assert_raises, assert_array_equal, assert_equal,
     assert_raises_regex, assert_warns
@@ -410,55 +408,13 @@ def test_stack():
 
 
 class TestBlock(object):
-    @pytest.fixture(params=['block', 'force_concatenate', 'force_slicing'])
-    def block(self, request):
-        # blocking small arrays and large arrays go through different paths.
-        # the algorithm is triggered depending on the number of element
-        # copies required.
-        # We define a test fixture that forces most tests to go through
-        # both code paths.
-        # Ultimately, this should be removed if a single algorithm is found
-        # to be faster for both small and large arrays.
-        def _block_force_concatenate(arrays):
-            arrays, list_ndim, result_ndim, _ = _block_setup(arrays)
-            return _block_concatenate(arrays, list_ndim, result_ndim)
-
-        def _block_force_slicing(arrays):
-            arrays, list_ndim, result_ndim, _ = _block_setup(arrays)
-            return _block_slicing(arrays, list_ndim, result_ndim)
-
-        if request.param == 'force_concatenate':
-            return _block_force_concatenate
-        elif request.param == 'force_slicing':
-            return _block_force_slicing
-        elif request.param == 'block':
-            return block
-        else:
-            raise ValueError('Unknown blocking request. There is a typo in the tests.')
-
-    def test_returns_copy(self, block):
+    def test_returns_copy(self):
         a = np.eye(3)
         b = block(a)
         b[0, 0] = 2
         assert b[0, 0] != a[0, 0]
 
-    def test_block_total_size_estimate(self, block):
-        _, _, _, total_size = _block_setup([1])
-        assert total_size == 1
-
-        _, _, _, total_size = _block_setup([[1]])
-        assert total_size == 1
-
-        _, _, _, total_size = _block_setup([[1, 1]])
-        assert total_size == 2
-
-        _, _, _, total_size = _block_setup([[1], [1]])
-        assert total_size == 2
-
-        _, _, _, total_size = _block_setup([[1, 2], [3, 4]])
-        assert total_size == 4
-
-    def test_block_simple_row_wise(self, block):
+    def test_block_simple_row_wise(self):
         a_2d = np.ones((2, 2))
         b_2d = 2 * a_2d
         desired = np.array([[1, 1, 2, 2],
@@ -466,7 +422,7 @@ def test_block_simple_row_wise(self, block):
         result = block([a_2d, b_2d])
         assert_equal(desired, result)
 
-    def test_block_simple_column_wise(self, block):
+    def test_block_simple_column_wise(self):
         a_2d = np.ones((2, 2))
         b_2d = 2 * a_2d
         expected = np.array([[1, 1],
@@ -476,7 +432,7 @@ def test_block_simple_column_wise(self, block):
         result = block([[a_2d], [b_2d]])
         assert_equal(expected, result)
 
-    def test_block_with_1d_arrays_row_wise(self, block):
+    def test_block_with_1d_arrays_row_wise(self):
         # # # 1-D vectors are treated as row arrays
         a = np.array([1, 2, 3])
         b = np.array([2, 3, 4])
@@ -484,7 +440,7 @@ def test_block_with_1d_arrays_row_wise(self, block):
         result = block([a, b])
         assert_equal(expected, result)
 
-    def test_block_with_1d_arrays_multiple_rows(self, block):
+    def test_block_with_1d_arrays_multiple_rows(self):
         a = np.array([1, 2, 3])
         b = np.array([2, 3, 4])
         expected = np.array([[1, 2, 3, 2, 3, 4],
@@ -492,7 +448,7 @@ def test_block_with_1d_arrays_multiple_rows(self, block):
         result = block([[a, b], [a, b]])
         assert_equal(expected, result)
 
-    def test_block_with_1d_arrays_column_wise(self, block):
+    def test_block_with_1d_arrays_column_wise(self):
         # # # 1-D vectors are treated as row arrays
         a_1d = np.array([1, 2, 3])
         b_1d = np.array([2, 3, 4])
@@ -501,7 +457,7 @@ def test_block_with_1d_arrays_column_wise(self, block):
         result = block([[a_1d], [b_1d]])
         assert_equal(expected, result)
 
-    def test_block_mixed_1d_and_2d(self, block):
+    def test_block_mixed_1d_and_2d(self):
         a_2d = np.ones((2, 2))
         b_1d = np.array([2, 2])
         result = block([[a_2d], [b_1d]])
@@ -510,7 +466,7 @@ def test_block_mixed_1d_and_2d(self, block):
                              [2, 2]])
         assert_equal(expected, result)
 
-    def test_block_complicated(self, block):
+    def test_block_complicated(self):
         # a bit more complicated
         one_2d = np.array([[1, 1, 1]])
         two_2d = np.array([[2, 2, 2]])
@@ -534,7 +490,7 @@ def test_block_complicated(self, block):
                         [zero_2d]])
         assert_equal(result, expected)
 
-    def test_nested(self, block):
+    def test_nested(self):
         one = np.array([1, 1, 1])
         two = np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]])
         three = np.array([3, 3, 3])
@@ -564,7 +520,7 @@ def test_nested(self, block):
 
         assert_equal(result, expected)
 
-    def test_3d(self, block):
+    def test_3d(self):
         a000 = np.ones((2, 2, 2), int) * 1
 
         a100 = np.ones((3, 2, 2), int) * 2
@@ -619,7 +575,7 @@ def test_3d(self, block):
 
         assert_array_equal(result, expected)
 
-    def test_block_with_mismatched_shape(self, block):
+    def test_block_with_mismatched_shape(self):
         a = np.array([0, 0])
         b = np.eye(2)
         assert_raises(ValueError, block, [a, b])
@@ -628,32 +584,40 @@ def test_block_with_mismatched_shape(self, block):
         to_block = [[np.ones((2,3)), np.ones((2,2))],
                     [np.ones((2,2)), np.ones((2,2))]]
         assert_raises(ValueError, block, to_block)
-    def test_no_lists(self, block):
+
+    def test_no_lists(self):
         assert_equal(block(1),         np.array(1))
         assert_equal(block(np.eye(3)), np.eye(3))
 
-    def test_invalid_nesting(self, block):
-        msg = 'depths are mismatched'
-        assert_raises_regex(ValueError, msg, block, [1, [2]])
-        assert_raises_regex(ValueError, msg, block, [1, []])
-        assert_raises_regex(ValueError, msg, block, [[1], 2])
-        assert_raises_regex(ValueError, msg, block, [[], 2])
-        assert_raises_regex(ValueError, msg, block, [
+    def test_invalid_nesting(self):
+        depth_msg = 'List depths are mismatched'
+        shape_msg = 'Shape mismatch'
+        empty_msg = 'List at'
+
+        assert_raises_regex(ValueError, depth_msg, block, [1, [2]])
+        assert_raises_regex(ValueError, empty_msg, block, [1, []])
+        assert_raises_regex(ValueError, depth_msg, block, [[1], 2])
+        assert_raises_regex(ValueError, empty_msg, block, [[], 2])
+        assert_raises_regex(ValueError, shape_msg, block, [
             [[1], [2]],
             [[3, 4]],
+        ])
+        assert_raises_regex(ValueError, depth_msg, block, [
+            [[1], [2]],
+            [[3], [4]],
             [5]  # missing brackets
         ])
 
-    def test_empty_lists(self, block):
+    def test_empty_lists(self):
         assert_raises_regex(ValueError, 'empty', block, [])
         assert_raises_regex(ValueError, 'empty', block, [[]])
         assert_raises_regex(ValueError, 'empty', block, [[1], []])
 
-    def test_tuple(self, block):
+    def test_tuple(self):
         assert_raises_regex(TypeError, 'tuple', block, ([1, 2], [3, 4]))
         assert_raises_regex(TypeError, 'tuple', block, [(1, 2), (3, 4)])
 
-    def test_different_ndims(self, block):
+    def test_different_ndims(self):
         a = 1.
         b = 2 * np.ones((1, 2))
         c = 3 * np.ones((1, 1, 3))
@@ -663,7 +627,7 @@ def test_different_ndims(self, block):
 
         assert_equal(result, expected)
 
-    def test_different_ndims_depths(self, block):
+    def test_different_ndims_depths(self):
         a = 1.
         b = 2 * np.ones((1, 2))
         c = 3 * np.ones((1, 2, 3))
@@ -675,7 +639,7 @@ def test_different_ndims_depths(self, block):
 
         assert_equal(result, expected)
 
-    def test_block_memory_order(self, block):
+    def test_block_memory_order(self):
         # 3D
         arr_c = np.zeros((3,)*3, order='C')
         arr_f = np.zeros((3,)*3, order='F')
@@ -705,16 +669,10 @@ def test_block_memory_order(self, block):
         assert block(b_c).flags['C_CONTIGUOUS']
         assert block(b_f).flags['F_CONTIGUOUS']
 
+    def test_out(self):
+        x = np.zeros((3,4))
+        np.block([np.ones((3,3)), np.ones((3,1))], out=x)
+        assert_equal(x, np.ones((3,4)))
+        assert_raises(ValueError, np.block, [np.ones((3,3)), np.ones((3,1))],
+                                            out=np.ones((3,3)))
 
-def test_block_dispatcher():
-    class ArrayLike(object):
-        pass
-    a = ArrayLike()
-    b = ArrayLike()
-    c = ArrayLike()
-    assert_equal(list(_block_dispatcher(a)), [a])
-    assert_equal(list(_block_dispatcher([a])), [a])
-    assert_equal(list(_block_dispatcher([a, b])), [a, b])
-    assert_equal(list(_block_dispatcher([[a], [b, [c]]])), [a, b, c])
-    # don't recurse into non-lists
-    assert_equal(list(_block_dispatcher((a, b))), [(a, b)])