Skip to content

Commit

Permalink
Merge pull request #9 from bashtage/choice-dtype
Browse files Browse the repository at this point in the history
MAINT: Simplify return types
  • Loading branch information
mattip committed Apr 13, 2019
2 parents cb6f40f + 6c4bc0c commit 0e7589e
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 43 deletions.
120 changes: 83 additions & 37 deletions numpy/random/generator.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -368,26 +368,11 @@ cdef class RandomGenerator:
[ True, True]]])
"""
cdef np.npy_intp n
cdef np.ndarray randoms
cdef int64_t *randoms_data

if size is None:
with self.lock:
return random_positive_int(self._brng)

randoms = <np.ndarray>np.empty(size, dtype=np.int64)
randoms_data = <int64_t*>np.PyArray_DATA(randoms)
n = np.PyArray_SIZE(randoms)

for i in range(n):
with self.lock, nogil:
randoms_data[i] = random_positive_int(self._brng)
return randoms
return self.randint(0, np.iinfo(np.int).max + 1, dtype=np.int, size=size)

def randint(self, low, high=None, size=None, dtype=int, use_masked=True):
def randint(self, low, high=None, size=None, dtype=np.int64, use_masked=True):
"""
randint(low, high=None, size=None, dtype='l', use_masked=True)
randint(low, high=None, size=None, dtype='int64', use_masked=True)
Return random integers from `low` (inclusive) to `high` (exclusive).
Expand Down Expand Up @@ -530,9 +515,9 @@ cdef class RandomGenerator:
return self.randint(0, 4294967296, size=n_uint32, dtype=np.uint32).tobytes()[:length]

@cython.wraparound(True)
def choice(self, a, size=None, replace=True, p=None):
def choice(self, a, size=None, replace=True, p=None, axis=0):
"""
choice(a, size=None, replace=True, p=None)
choice(a, size=None, replace=True, p=None, axis=0)
Generates a random sample from a given 1-D array
Expand All @@ -553,6 +538,9 @@ cdef class RandomGenerator:
The probabilities associated with each entry in a.
If not given the sample assumes a uniform distribution over all
entries in a.
axis : int, optional
The axis along which the selection is performed. The default, 0,
selects by row.
Returns
-------
Expand All @@ -562,11 +550,11 @@ cdef class RandomGenerator:
Raises
------
ValueError
If a is an int and less than zero, if a or p are not 1-dimensional,
if a is an array-like of size 0, if p is not a vector of
If a is an int and less than zero, if p is not 1-dimensional, if
a is array-like with size 0, if p is not a vector of
probabilities, if a and p have different lengths, or if
replace=False and the sample size is greater than the population
size
size.
See Also
--------
Expand Down Expand Up @@ -607,7 +595,14 @@ cdef class RandomGenerator:
dtype='<U11')
"""

cdef char* idx_ptr
cdef int64_t buf
cdef char* buf_ptr

cdef set idx_set
cdef int64_t val, t, loc, size_i, pop_size_i
cdef int64_t *idx_data
cdef np.npy_intp j
# Format and Verify input
a = np.array(a, copy=False)
if a.ndim == 0:
Expand All @@ -618,11 +613,9 @@ cdef class RandomGenerator:
raise ValueError("a must be 1-dimensional or an integer")
if pop_size <= 0 and np.prod(size) != 0:
raise ValueError("a must be greater than 0 unless no samples are taken")
elif a.ndim != 1:
raise ValueError("a must be 1-dimensional")
else:
pop_size = a.shape[0]
if pop_size is 0 and np.prod(size) != 0:
pop_size = a.shape[axis]
if pop_size == 0 and np.prod(size) != 0:
raise ValueError("'a' cannot be empty unless no samples are taken")

if p is not None:
Expand Down Expand Up @@ -661,9 +654,9 @@ cdef class RandomGenerator:
cdf /= cdf[-1]
uniform_samples = self.random_sample(shape)
idx = cdf.searchsorted(uniform_samples, side='right')
idx = np.array(idx, copy=False) # searchsorted returns a scalar
idx = np.array(idx, copy=False, dtype=np.int64) # searchsorted returns a scalar
else:
idx = self.randint(0, pop_size, size=shape)
idx = self.randint(0, pop_size, size=shape, dtype=np.int64)
else:
if size > pop_size:
raise ValueError("Cannot take a larger sample than "
Expand Down Expand Up @@ -692,7 +685,39 @@ cdef class RandomGenerator:
n_uniq += new.size
idx = found
else:
idx = self.permutation(pop_size)[:size]
size_i = size
pop_size_i = pop_size
# This threshold is a heuristic; it can likely be tuned further
if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)):
# Tail shuffle size elements
idx = np.arange(pop_size, dtype=np.int64)
idx_ptr = np.PyArray_BYTES(<np.ndarray>idx)
buf_ptr = <char*>&buf
self._shuffle_raw(pop_size_i, max(pop_size_i - size_i,1),
8, 8, idx_ptr, buf_ptr)
# Copy so the potentially large array backing idx can be garbage collected
idx = idx[(pop_size - size):].copy()
else:
# Floyd's algorithm with precomputed indices
# Worst case, O(n**2) when size is close to pop_size
idx = np.empty(size, dtype=np.int64)
idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
idx_set = set()
loc = 0
# Sample indices with one pass to avoid reacquiring the lock
with self.lock:
for j in range(pop_size_i - size_i, pop_size_i):
idx_data[loc] = random_interval(self._brng, j)
loc += 1
loc = 0
while len(idx_set) < size_i:
for j in range(pop_size_i - size_i, pop_size_i):
if idx_data[loc] not in idx_set:
val = idx_data[loc]
else:
idx_data[loc] = val = j
idx_set.add(val)
loc += 1
if shape is not None:
idx.shape = shape

Expand All @@ -714,7 +739,9 @@ cdef class RandomGenerator:
res[()] = a[idx]
return res

return a[idx]
# asarray downcasts on 32-bit platforms, always safe
# no-op on 64-bit platforms
return a.take(np.asarray(idx, dtype=np.intp), axis=axis)

def uniform(self, low=0.0, high=1.0, size=None):
"""
Expand Down Expand Up @@ -3986,9 +4013,9 @@ cdef class RandomGenerator:
# the most common case, yielding a ~33% performance improvement.
# Note that apparently, only one branch can ever be specialized.
if itemsize == sizeof(np.npy_intp):
self._shuffle_raw(n, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
self._shuffle_raw(n, 1, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
else:
self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
self._shuffle_raw(n, 1, itemsize, stride, x_ptr, buf_ptr)
elif isinstance(x, np.ndarray) and x.ndim and x.size:
buf = np.empty_like(x[0, ...])
with self.lock:
Expand All @@ -4007,10 +4034,29 @@ cdef class RandomGenerator:
j = random_interval(self._brng, i)
x[i], x[j] = x[j], x[i]

cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp itemsize,
np.npy_intp stride, char* data, char* buf):
cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp first,
np.npy_intp itemsize, np.npy_intp stride,
char* data, char* buf):
"""
Parameters
----------
n
Number of elements in data
first
First observation to shuffle. Shuffles n-1,
n-2, ..., first, so that when first=1 the entire
array is shuffled
itemsize
Size in bytes of item
stride
Array stride
data
Location of data
buf
Location of buffer (itemsize)
"""
cdef np.npy_intp i, j
for i in reversed(range(1, n)):
for i in reversed(range(first, n)):
j = random_interval(self._brng, i)
string.memcpy(buf, data + j * stride, itemsize)
string.memcpy(data + j * stride, data + i * stride, itemsize)
Expand Down
2 changes: 1 addition & 1 deletion numpy/random/src/distributions/distributions.c
Original file line number Diff line number Diff line change
Expand Up @@ -1070,7 +1070,7 @@ int64_t random_zipf(brng_t *brng_state, double a) {

T = pow(1.0 + 1.0 / X, am1);
if (V * X * (T - 1.0) / (b - 1.0) <= T / b) {
return (long)X;
return (int64_t)X;
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions numpy/random/tests/test_against_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def test_standard_exponential(self):
self.rs.standard_exponential)
self._is_state_common_legacy()

@pytest.mark.xfail(reason='Stream broken for simplicity')
def test_tomaxint(self):
self._set_common_state()
self._is_state_common()
Expand Down Expand Up @@ -327,6 +328,7 @@ def test_multinomial(self):
g(100, np.array(p), size=(7, 23)))
self._is_state_common()

@pytest.mark.xfail(reason='Stream broken for performance')
def test_choice(self):
self._set_common_state()
self._is_state_common()
Expand Down
44 changes: 39 additions & 5 deletions numpy/random/tests/test_generator_mt19937.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,25 +542,25 @@ def test_random_sample_unsupported_type(self):
def test_choice_uniform_replace(self):
random.brng.seed(self.seed)
actual = random.choice(4, 4)
desired = np.array([2, 3, 2, 3])
desired = np.array([2, 3, 2, 3], dtype=np.int64)
assert_array_equal(actual, desired)

def test_choice_nonuniform_replace(self):
random.brng.seed(self.seed)
actual = random.choice(4, 4, p=[0.4, 0.4, 0.1, 0.1])
desired = np.array([1, 1, 2, 2])
desired = np.array([1, 1, 2, 2], dtype=np.int64)
assert_array_equal(actual, desired)

def test_choice_uniform_noreplace(self):
random.brng.seed(self.seed)
actual = random.choice(4, 3, replace=False)
desired = np.array([0, 1, 3])
desired = np.array([0, 2, 3], dtype=np.int64)
assert_array_equal(actual, desired)

def test_choice_nonuniform_noreplace(self):
random.brng.seed(self.seed)
actual = random.choice(4, 3, replace=False, p=[0.1, 0.3, 0.5, 0.1])
desired = np.array([2, 3, 1])
desired = np.array([2, 3, 1], dtype=np.int64)
assert_array_equal(actual, desired)

def test_choice_noninteger(self):
Expand All @@ -569,11 +569,22 @@ def test_choice_noninteger(self):
desired = np.array(['c', 'd', 'c', 'd'])
assert_array_equal(actual, desired)

def test_choice_multidimensional_default_axis(self):
random.brng.seed(self.seed)
actual = random.choice([[0, 1], [2, 3], [4, 5], [6, 7]], 3)
desired = np.array([[4, 5], [6, 7], [4, 5]])
assert_array_equal(actual, desired)

def test_choice_multidimensional_custom_axis(self):
random.brng.seed(self.seed)
actual = random.choice([[0, 1], [2, 3], [4, 5], [6, 7]], 1, axis=1)
desired = np.array([[0], [2], [4], [6]])
assert_array_equal(actual, desired)

def test_choice_exceptions(self):
sample = random.choice
assert_raises(ValueError, sample, -1, 3)
assert_raises(ValueError, sample, 3., 3)
assert_raises(ValueError, sample, [[1, 2], [3, 4]], 3)
assert_raises(ValueError, sample, [], 3)
assert_raises(ValueError, sample, [1, 2, 3, 4], 3,
p=[[0.25, 0.25], [0.25, 0.25]])
Expand Down Expand Up @@ -639,6 +650,29 @@ def test_choice_nan_probabilities(self):
p = [None, None, None]
assert_raises(ValueError, random.choice, a, p=p)

def test_choice_return_type(self):
# gh 9867
p = np.ones(4) / 4.
actual = random.choice(4, 2)
assert actual.dtype == np.int64
actual = random.choice(4, 2, replace=False)
assert actual.dtype == np.int64
actual = random.choice(4, 2, p=p)
assert actual.dtype == np.int64
actual = random.choice(4, 2, p=p, replace=False)
assert actual.dtype == np.int64

def test_choice_large_sample(self):
import hashlib

choice_hash = '6395868be877d27518c832213c17977c'
random.brng.seed(self.seed)
actual = random.choice(10000, 5000, replace=False)
if sys.byteorder != 'little':
actual = actual.byteswap()
res = hashlib.md5(actual.view(np.int8)).hexdigest()
assert_(choice_hash == res)

def test_bytes(self):
random.brng.seed(self.seed)
actual = random.bytes(10)
Expand Down

0 comments on commit 0e7589e

Please sign in to comment.