diff --git a/CHANGES.rst b/CHANGES.rst
index 72abcd5a..aafc3637 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -17,7 +17,7 @@ This is a bug-fix release and contains general code quality improvements.
 
 Enhancements
 ------------
-
+- A new helper routine to find the combination of (RA, DEC) refinements that produces fastest runtime in ``DDtheta_mocks`` [#216]
 
 
 Bug fixes
diff --git a/Corrfunc/mocks/DDtheta_mocks.py b/Corrfunc/mocks/DDtheta_mocks.py
index 9e23dac7..52d6ea8f 100644
--- a/Corrfunc/mocks/DDtheta_mocks.py
+++ b/Corrfunc/mocks/DDtheta_mocks.py
@@ -11,8 +11,8 @@
 from __future__ import (division, print_function, absolute_import,
                         unicode_literals)
 
-__author__ = ('Manodeep Sinha')
-__all__ = ('DDtheta_mocks',)
+__author__ = ('Manodeep Sinha', 'Kris Akira Stern')
+__all__ = ('DDtheta_mocks', 'find_fastest_DDtheta_mocks_bin_refs')
 
 
 def DDtheta_mocks(autocorr, nthreads, binfile,
@@ -158,7 +158,7 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
 
     max_cells_per_dim : integer, default is 100, typical values in [50-300]
         Controls the maximum number of cells per dimension. Total number of
-        cells can be up to (max_cells_per_dim)^3. Only increase if ``thetamax``
+        cells can be up to (max_cells_per_dim)^2. Only increase if ``thetamax``
         is too small relative to the boxsize (and increasing helps the
         runtime).
 
@@ -211,7 +211,7 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
         the time spent within the C library and ignores all python overhead.
 
     Example
-    --------
+    -------
 
     >>> from __future__ import print_function
     >>> import numpy as np
@@ -239,7 +239,8 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
     ...                         weights1=weights, weight_type='pair_product',
     ...                         link_in_dec=link_in_dec, link_in_ra=link_in_ra,
     ...                         isa=isa, verbose=True)
-    >>> for r in results: print("{0:10.6f} {1:10.6f} {2:10.6f} {3:10d} {4:10.6f}".
+    >>> for r in results:
+    ...    print("{0:10.6f} {1:10.6f} {2:10.6f} {3:10d} {4:10.6f}".
     ...                         format(r['thetamin'], r['thetamax'],
     ...                         r['thetaavg'], r['npairs'], r['weightavg']))
     ...                         # doctest: +NORMALIZE_WHITESPACE
@@ -265,7 +266,6 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
       7.079458  10.000000   8.622400   37842502   1.000000
 
     """
-
     try:
         from Corrfunc._countpairs_mocks import countpairs_theta_mocks as\
             DDtheta_mocks_extn
@@ -289,7 +289,8 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
         RA2 = np.empty(1)
         DEC2 = np.empty(1)
 
-    weights1, weights2 = process_weights(weights1, weights2, RA1, RA2, weight_type, autocorr)
+    weights1, weights2 = process_weights(weights1, weights2,
+                                         RA1, RA2, weight_type, autocorr)
 
     # Ensure all input arrays are native endian
     RA1, DEC1, weights1, RA2, DEC2, weights2 = [
@@ -300,10 +301,9 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
     if autocorr == 0:
         fix_ra_dec(RA2, DEC2)
 
-    if link_in_ra is True:
+    if link_in_ra:
         link_in_dec = True
 
-
     # Passing None parameters breaks the parsing code, so avoid this
     kwargs = {}
     for k in ['weights1', 'weights2', 'weight_type', 'RA2', 'DEC2']:
@@ -314,20 +314,20 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
     integer_isa = translate_isa_string_to_enum(isa)
     rbinfile, delete_after_use = return_file_with_rbins(binfile)
     with sys_pipes():
-      extn_results = DDtheta_mocks_extn(autocorr, nthreads, rbinfile,
-                                        RA1, DEC1,
-                                        verbose=verbose,
-                                        link_in_dec=link_in_dec,
-                                        link_in_ra=link_in_ra,
-                                        output_thetaavg=output_thetaavg,
-                                        fast_acos=fast_acos,
-                                        ra_refine_factor=ra_refine_factor,
-                                        dec_refine_factor=dec_refine_factor,
-                                        max_cells_per_dim=max_cells_per_dim,
-                                        copy_particles=copy_particles,
-                                        enable_min_sep_opt=enable_min_sep_opt,
-                                        c_api_timer=c_api_timer,
-                                        isa=integer_isa, **kwargs)
+        extn_results = DDtheta_mocks_extn(autocorr, nthreads, rbinfile,
+                                          RA1, DEC1,
+                                          verbose=verbose,
+                                          link_in_dec=link_in_dec,
+                                          link_in_ra=link_in_ra,
+                                          output_thetaavg=output_thetaavg,
+                                          fast_acos=fast_acos,
+                                          ra_refine_factor=ra_refine_factor,
+                                          dec_refine_factor=dec_refine_factor,
+                                          max_cells_per_dim=max_cells_per_dim,
+                                          copy_particles=copy_particles,
+                                          enable_min_sep_opt=enable_min_sep_opt,
+                                          c_api_timer=c_api_timer,
+                                          isa=integer_isa, **kwargs)
     if extn_results is None:
         msg = "RuntimeError occurred"
         raise RuntimeError(msg)
@@ -351,6 +351,300 @@ def DDtheta_mocks(autocorr, nthreads, binfile,
         return results, api_time
 
 
+def find_fastest_DDtheta_mocks_bin_refs(autocorr, nthreads, binfile,
+                                        RA1, DEC1, RA2=None, DEC2=None,
+                                        link_in_dec=True, link_in_ra=True,
+                                        verbose=False, output_thetaavg=False,
+                                        max_cells_per_dim=100,
+                                        isa=r'fastest',
+                                        maxbinref=3, nrepeats=3,
+                                        return_runtimes=False):
+    """
+
+    Parameters
+    ----------
+    autocorr : boolean, required
+        Boolean flag for auto/cross-correlation. If autocorr is set to 1,
+        then the second set of particle positions are not required.
+
+    nthreads : integer
+        Number of threads to use.
+
+    binfile: string or an list/array of floats. Units: degrees.
+        For string input: filename specifying the ``theta`` bins for
+        ``DDtheta_mocks``. The file should contain white-space separated values
+        of (thetamin, thetamax)  for each ``theta`` wanted. The bins need to be
+        contiguous and sorted in increasing order (smallest bins come first).
+
+        For array-like input: A sequence of ``theta`` values that provides the
+        bin-edges. For example,
+        ``np.logspace(np.log10(0.1), np.log10(10.0), 15)`` is a valid
+        input specifying **14** (logarithmic) bins between 0.1 and 10.0
+        degrees. This array does not need to be sorted.
+
+    RA1 : array-like, real (float/double)
+        The array of Right Ascensions for the first set of points. RA1's
+        are expected to be in [0.0, 360.0], but the code will try to fix cases
+        where the RA1's are in [-180, 180.0]. For peace of mind, always supply
+        RA1's in [0.0, 360.0].
+
+        Calculations are done in the precision of the supplied arrays.
+
+    DEC1 : array-like, real (float/double)
+        Array of Declinations for the first set of points. DEC1's are expected
+        to be in the [-90.0, 90.0], but the code will try to fix cases where
+        the DEC1's are in [0.0, 180.0]. Again, for peace of mind, always supply
+        DEC1's in [-90.0, 90.0].
+        Must be of same precision type as RA1.
+
+    RA2 : array-like, real (float/double)
+        The array of Right Ascensions for the second set of points. RA2's
+        are expected to be in [0.0, 360.0], but the code will try to fix cases
+        where the RA2's are in [-180, 180.0]. For peace of mind, always supply
+        RA2's in [0.0, 360.0].
+        Must be of same precision type as RA1/DEC1.
+
+    DEC2 : array-like, real (float/double)
+        Array of Declinations for the second set of points. DEC2's are expected
+        to be in the [-90.0, 90.0], but the code will try to fix cases where
+        the DEC2's are in [0.0, 180.0]. Again, for peace of mind, always supply
+        DEC2's in [-90.0, 90.0].
+        Must be of same precision type as RA1/DEC1.
+
+    verbose : boolean (default false)
+        Boolean flag to control output of informational messages
+
+    output_thetaavg : boolean (default false)
+        Boolean flag to output the average ``\theta`` for each bin. Code will
+        run slower if you set this flag.
+
+        If you are calculating in single-precision, ``thetaavg`` will
+        suffer from numerical loss of precision and can not be trusted. If you
+        need accurate ``thetaavg`` values, then pass in double precision arrays
+        for ``RA/DEC``.
+
+    isa: string, case-insensitive (default ``fastest``)
+        Controls the runtime dispatch for the instruction set to use. Options
+        are: [``fastest``, ``avx512f``, ``avx``, ``sse42``, ``fallback``]
+
+        Setting isa to ``fastest`` will pick the fastest available instruction
+        set on the current computer. However, if you set ``isa`` to, say,
+        ``avx`` and ``avx`` is not available on the computer, then the code
+        will revert to using ``fallback`` (even though ``sse42`` might be
+        available).
+
+        Unless you are benchmarking the different instruction sets, you should
+        always leave ``isa`` to the default value. And if you *are*
+        benchmarking, then the string supplied here gets translated into an
+        ``enum`` for the instruction set defined in ``utils/defs.h``.
+
+    max_cells_per_dim: integer, default is 100, typical values in [50-300]
+        Controls the maximum number of cells per dimension. Total number of
+        cells can be up to (max_cells_per_dim)^2. Only increase if ``rpmax`` is
+        too small relative to the boxsize (and increasing helps the runtime).
+
+    maxbinref: integer (default 3)
+        The maximum ``bin refine factor`` to use along each dimension.
+
+        Runtime of module scales as ``maxbinref^2``, so change the value of
+        ``maxbinref`` with caution.
+
+        Note that ``max_cells_per_dim`` might need to be increased
+        to accommodate really large ``maxbinref``.
+
+    nrepeats: integer (default 3)
+        Number of times to repeat the timing for an individual run. Accounts
+        for the dispersion in runtimes on computers with multiple user
+        processes.
+
+    return_runtimes: boolean (default ``false``)
+        If set, also returns the array of runtimes.
+
+    Returns
+    -------
+    (nRA, nDEC) : tuple of integers
+        The combination of ``bin refine factors`` along each dimension that
+        produces the fastest code.
+
+    runtimes : numpy structured array
+
+        Only returned if ``return_runtimes`` is set, then the return value
+        is a tuple containing ((nRA, nDEC), runtimes). ``runtimes`` is a
+        ``numpy`` structured array containing the fields, [``nRA``, ``nDEC``,
+        ``avg_runtime``, ``sigma_time``]. Here, ``avg_runtime`` is the
+        average time, measured over ``nrepeats`` invocations, spent in
+        the python extension. ``sigma_time`` is the dispersion of the
+        run times across those ``nrepeats`` invocations.
+
+    Example
+    -------
+    >>> from __future__ import print_function
+    >>> import numpy as np
+    >>> import time
+    >>> from math import pi
+    >>> from os.path import dirname, abspath, join as pjoin
+    >>> import Corrfunc
+    >>> from Corrfunc.mocks.DDtheta_mocks \
+        import find_fastest_DDtheta_mocks_bin_refs
+    >>> binfile = pjoin(dirname(abspath(Corrfunc.__file__)),
+    ...                 "../mocks/tests/", "angular_bins")
+    >>> N = 100000
+    >>> nthreads = 4
+    >>> seed = 42
+    >>> np.random.seed(seed)
+    >>> RA1 = np.random.uniform(0.0, 2.0*pi, N)*180.0/pi
+    >>> cos_theta = np.random.uniform(-1.0, 1.0, N)
+    >>> DEC1 = 90.0 - np.arccos(cos_theta)*180.0/pi
+    >>> autocorr = 1
+    >>> best, _ = find_fastest_DDtheta_mocks_bin_refs(autocorr, nthreads,\
+    ...                                               binfile, RA1, DEC1,\
+    ...                                               return_runtimes=True)
+    >>> print(best) # doctest:+SKIP
+    (2, 1)
+
+    .. note:: Since the result might change depending on the computer,
+    doctest is skipped for this function.
+    """
+
+    weights1 = None
+    weights2 = None
+    weight_type = None
+
+    try:
+        from Corrfunc._countpairs_mocks import countpairs_theta_mocks as\
+            DDtheta_mocks_extn
+    except ImportError:
+        msg = "Could not import the C extension for the angular "\
+              "correlation function for mocks."
+        raise ImportError(msg)
+
+    import numpy as np
+    from Corrfunc.utils import translate_isa_string_to_enum, fix_ra_dec,\
+        return_file_with_rbins, convert_to_native_endian, process_weights
+    from future.utils import bytes_to_native_str
+    import itertools
+    import time
+
+    weights1, weights2 = process_weights(weights1, weights2, RA1, RA2,
+                                         weight_type, autocorr)
+
+    # Ensure all input arrays are native endian
+    RA1, DEC1, weights1, RA2, DEC2, weights2 = [
+        convert_to_native_endian(arr, warn=True) for arr in
+        [RA1, DEC1, weights1, RA2, DEC2, weights2]]
+
+    fix_ra_dec(RA1, DEC1)
+    if autocorr == 0:
+        fix_ra_dec(RA2, DEC2)
+
+    # Passing None parameters breaks the parsing code, so avoid this
+    kwargs = {}
+    for k in ['weights1', 'weights2', 'weight_type', 'RA2', 'DEC2']:
+        v = locals()[k]
+        if v is not None:
+            kwargs[k] = v
+
+    integer_isa = translate_isa_string_to_enum(isa)
+    rbinfile, delete_after_use = return_file_with_rbins(binfile)
+    bin_refs = np.arange(1, maxbinref + 1)
+    if link_in_ra:
+        bin_ref_perms = itertools.product(bin_refs, bin_refs)
+        nperms = maxbinref ** 2
+    else:
+        bin_ref_perms = [(1, binref) for binref in bin_refs]
+        nperms = maxbinref
+
+    dtype = np.dtype([(bytes_to_native_str(b'nRA'), np.int),
+                      (bytes_to_native_str(b'nDEC'), np.int),
+                      (bytes_to_native_str(b'avg_time'), np.float),
+                      (bytes_to_native_str(b'sigma_time'), np.float)])
+    all_runtimes = np.zeros(nperms, dtype=dtype)
+    all_runtimes[:] = np.inf
+
+    if autocorr == 0:
+        if RA2 is None or DEC2 is None:
+            msg = "Must pass valid arrays for RA2/DEC2 for "\
+                  "computing cross-correlation."
+            raise ValueError(msg)
+    else:
+        RA2 = np.empty(1)
+        DEC2 = np.empty(1)
+
+    if link_in_ra:
+        link_in_dec = True
+
+    if not link_in_dec:
+        msg = "Error: Brute-force calculation without any gridding " \
+              "is forced, as link_in_dec and link_in_ra are both set " \
+              "to False. Please set at least one of link_in_dec, link_in_ra=True " \
+              "to enable gridding along DEC or along both RA and DEC."
+        raise ValueError(msg)
+
+    if verbose and not link_in_ra:
+        msg = "INFO: Since ``link_in_ra`` is not set, only gridding in declination " \
+              "Checking with refinements in declination ranging from [1, {}] and a " \
+              "maximum of {} bins".format(maxbinref, max_cells_per_dim)
+        print(msg)
+
+    for ii, (nRA, nDEC) in enumerate(bin_ref_perms):
+        total_runtime = 0.0
+        total_sqr_runtime = 0.0
+
+        for _ in range(nrepeats):
+            t0 = time.perf_counter()
+            extn_results = DDtheta_mocks_extn(autocorr, nthreads, rbinfile,
+                                              RA1, DEC1, RA2, DEC2,
+                                              link_in_dec=link_in_dec,
+                                              link_in_ra=link_in_ra,
+                                              verbose=verbose,
+                                              output_thetaavg=output_thetaavg,
+                                              ra_refine_factor=nRA,
+                                              dec_refine_factor=nDEC,
+                                              max_cells_per_dim=max_cells_per_dim,
+                                              isa=integer_isa)
+            t1 = time.perf_counter()
+
+            if extn_results is None:
+                msg = "RuntimeError occurred with perms = ({0}, {1})".\
+                    format(nRA, nDEC)
+                raise ValueError(msg)
+
+            dt = (t1 - t0)
+            total_runtime += dt
+            total_sqr_runtime += dt * dt
+
+        avg_runtime = total_runtime / nrepeats
+
+        # variance = E(X^2) - E^2(X)
+        # disp = sqrt(variance)
+        runtime_disp = np.sqrt(total_sqr_runtime / nrepeats -
+                               avg_runtime * avg_runtime)
+
+        all_runtimes[ii]['nRA'] = nRA
+        all_runtimes[ii]['nDEC'] = nDEC
+        all_runtimes[ii]['avg_time'] = avg_runtime
+        all_runtimes[ii]['sigma_time'] = runtime_disp
+
+    if delete_after_use:
+        import os
+        os.remove(rbinfile)
+
+    all_runtimes.sort(order=('avg_time', 'sigma_time'))
+    results = (all_runtimes[0]['nRA'],
+               all_runtimes[0]['nDEC'])
+
+    optional_returns = return_runtimes
+    if not optional_returns:
+        ret = results
+    else:
+        ret = (results,)
+        if return_runtimes:
+            ret += (all_runtimes,)
+
+    return ret
+
+
 if __name__ == '__main__':
     import doctest
+
     doctest.testmod()
diff --git a/Corrfunc/theory/wp.py b/Corrfunc/theory/wp.py
index 31bf9269..64d3c71e 100644
--- a/Corrfunc/theory/wp.py
+++ b/Corrfunc/theory/wp.py
@@ -116,7 +116,7 @@ def find_fastest_wp_bin_refs(boxsize, pimax, nthreads, binfile, X, Y, Z,
 
     runtimes : numpy structured array
 
-        if ``return_runtimes`` is set, then the return value is a tuple
+        Only returned if ``return_runtimes`` is set, then the return value is a tuple
         containing ((nx, ny, nz), runtimes). ``runtimes`` is a ``numpy``
         structured array containing the fields, [``nx``, ``ny``, ``nz``,
         ``avg_runtime``, ``sigma_time``]. Here, ``avg_runtime`` is the
diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src
index 1fd759b2..345b659e 100644
--- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src
+++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src
@@ -545,14 +545,19 @@ int countpairs_theta_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1,
         options->bin_refine_factors[1]=numthreads;
     }
 #endif
-    /* Only check the ra and dec bin refine factors (not all 3 bin refs)*/
-    for(int i=0;i<2;i++) {
-        if(options->bin_refine_factors[i] < 1) {
-            fprintf(stderr,"Warning: bin refine factor along axis = %d *must* be >=1. Instead found bin refine factor =%d\n",
-                    i, options->bin_refine_factors[i]);
-            reset_bin_refine_factors(options);
-            break;/* all factors have been reset -> no point continuing with the loop */
-        }
+
+    /* Only check the ra and dec bin refine factors (not all 3 bin refs) */
+    /* As evidenced by the PR #216, the error-message and resetting is not quite right! */
+    if(options->link_in_ra && options->bin_refine_factors[0] < 1) {
+        fprintf(stderr,"Warning: Linking in RA is requested, so the RA-bin refine factor *must* be >=1. Instead found bin refine factor =%d...resetting\n",
+                options->bin_refine_factors[0]);
+        reset_bin_refine_factors(options);
+    }
+
+    if(options->link_in_dec && options->bin_refine_factors[1] < 1) {
+        fprintf(stderr,"Warning: Linking in DEC is requested, so the DEC-bin refine factor *must* be >=1. Instead found bin refine factor =%d...resetting\n",
+                options->bin_refine_factors[1]);
+        reset_bin_refine_factors(options);
     }
 
     if(options->max_cells_per_dim == 0) {