-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
Copy pathrandom.py
292 lines (225 loc) · 10.2 KB
/
random.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
import math
from numba import (config, cuda, float32, float64, uint32, int64, uint64,
from_dtype, jit)
import numpy as np
# This implementation is based upon the xoroshiro128+ and splitmix64 algorithms
# described at:
#
# http://xoroshiro.di.unimi.it/
#
# and originally implemented by David Blackman and Sebastiano Vigna.
#
# The implementations below are based on the C source code:
#
# * http://xoroshiro.di.unimi.it/xoroshiro128plus.c
# * http://xoroshiro.di.unimi.it/splitmix64.c
#
# Splitmix64 is used to generate the initial state of the xoroshiro128+
# generator to ensure that small seeds don't result in predictable output.
# **WARNING**: There is a lot of verbose casting in this file to ensure that
# NumPy casting conventions (which cast uint64 [op] int32 to float64) don't
# turn integers into floats when using these functions in the CUDA simulator.
#
# There are also no function type signatures to ensure that compilation is
# deferred so that import is quick, and Sphinx autodoc works. We are also
# using the CPU @jit decorator everywhere to create functions that work as
# both CPU and CUDA device functions.
# Record layout of one generator: the two unsigned 64-bit words ``s0`` and
# ``s1`` of xoroshiro128+ state, declared as an aligned structured dtype.
xoroshiro128p_dtype = np.dtype(
    [(field, np.uint64) for field in ('s0', 's1')],
    align=True,
)
# Numba type derived from the NumPy record dtype above.
xoroshiro128p_type = from_dtype(xoroshiro128p_dtype)
# Under the CUDA simulator, fake CUDA arrays are passed to some of the
# @jit-decorated functions in this module, which requires a fallback to object
# mode. Since Numba 0.59.0 object mode has to be requested explicitly:
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
# To avoid the warning / future error, object mode with loop lifting is
# explicitly declared acceptable whenever the simulator is in use, and
# nopython mode is requested otherwise.
_nopython = not config.ENABLE_CUDASIM
_forceobj = config.ENABLE_CUDASIM
_looplift = config.ENABLE_CUDASIM
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_state(states, index, seed):
    '''Seed ``states[index]`` from a 64-bit seed using a SplitMix64 step.

    Mixing the seed first means that small, hand-picked seeds do not lead to
    a predictable initial sequence from the random number generator.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: integer (converted to int64)
    :param index: offset in states to update
    :type seed: integer (converted to uint64)
    :param seed: seed value to use when initializing state
    '''
    slot = int64(index)

    # SplitMix64: add the increment constant, then two xor-shift-multiply
    # rounds and a final xor-shift.
    mixed = uint64(seed) + uint64(0x9E3779B97F4A7C15)
    mixed ^= mixed >> uint32(30)
    mixed *= uint64(0xBF58476D1CE4E5B9)
    mixed ^= mixed >> uint32(27)
    mixed *= uint64(0x94D049BB133111EB)
    mixed ^= mixed >> uint32(31)

    # Both state words receive the same mixed value.
    states[slot]['s0'] = mixed
    states[slot]['s1'] = mixed
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def rotl(x, k):
    '''Rotate the 64-bit value x left by k bits.'''
    value = uint64(x)
    shift = uint32(k)
    # Bits shifted out on the left re-enter on the right.
    upper = value << shift
    lower = value >> uint32(64 - shift)
    return upper | lower
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_next(states, index):
    '''Produce the next random uint64 from ``states[index]`` and step it.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: uint64
    '''
    slot = int64(index)
    word0 = states[slot]['s0']
    word1 = states[slot]['s1']

    # The output is the sum of the two state words *before* they are updated.
    output = word0 + word1

    word1 ^= word0
    rotated0 = uint64(rotl(word0, uint32(55)))
    shifted1 = word1 << uint32(14)
    states[slot]['s0'] = rotated0 ^ word1 ^ shifted1
    states[slot]['s1'] = uint64(rotl(word1, uint32(36)))

    return output
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_jump(states, index):
    '''Move the RNG in ``states[index]`` forward by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    slot = int64(index)

    # Two 64-bit jump constants (presumably those of the reference
    # xoroshiro128plus.c this module is based on).
    jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))

    acc0 = uint64(0)
    acc1 = uint64(0)

    # Walk all 128 bits of the jump constants, least significant bit first.
    # For each set bit, fold the current state into the accumulators; the
    # generator is stepped once per bit regardless.
    for word in range(2):
        for bit in range(64):
            selected = jump[word] & (uint64(1) << uint32(bit))
            if selected:
                acc0 ^= states[slot]['s0']
                acc1 ^= states[slot]['s1']
            xoroshiro128p_next(states, slot)

    states[slot]['s0'] = acc0
    states[slot]['s1'] = acc1
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float64(x):
    '''Map a uint64 onto a float64 in the half-open range [0.0, 1.0).'''
    # Keep the 53 most significant bits and scale them by 2**-53.
    top_bits = uint64(x) >> uint32(11)
    scale = float64(1) / (uint64(1) << uint32(53))
    return top_bits * scale
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float32(x):
    '''Convert uint64 to float32 value in the range [0.0, 1.0)

    Only the 24 most significant bits of ``x`` are used, which is exactly
    the precision of a float32 significand. Every intermediate value is
    therefore exactly representable and the largest possible result is
    ``1 - 2**-24``.

    Going through a float64 and narrowing it to float32 instead would round
    to nearest, so inputs close to ``2**64`` would come back as exactly
    ``1.0`` and violate the half-open range.

    :type x: uint64
    :param x: random bits to convert
    :rtype: float32
    '''
    x = uint64(x)
    # 64 - 24 = 40: keep the top 24 bits, then scale by 2**-24.
    return float32(x >> uint32(40)) * float32(1.0 / 16777216.0)
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float32(states, index):
    '''Draw a uniform float32 and advance ``states[index]`` by one step.

    The next uint64 from the generator is converted with
    ``uint64_to_unit_float32``, whose documented range is [0.0, 1.0).

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    slot = int64(index)
    raw_bits = xoroshiro128p_next(states, slot)
    return uint64_to_unit_float32(raw_bits)
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float64(states, index):
    '''Draw a uniform float64 and advance ``states[index]`` by one step.

    The next uint64 from the generator is converted with
    ``uint64_to_unit_float64``, whose documented range is [0.0, 1.0).

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    '''
    slot = int64(index)
    raw_bits = xoroshiro128p_next(states, slot)
    return uint64_to_unit_float64(raw_bits)
# 2*pi (one full turn in radians) at single and double precision, used as the
# angle scale by the normal-distribution samplers in this module.
TWO_PI_FLOAT32 = np.float32(math.tau)
TWO_PI_FLOAT64 = np.float64(math.tau)
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float32(states, index):
    '''Draw a normally distributed float32 and advance ``states[index]``.

    The value is sampled from a Gaussian with mean=0 and sigma=1 via the
    Box-Muller transform. Two uniform samples are consumed, so the RNG
    sequence advances by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    slot = int64(index)

    # The draw order matters: the first uniform feeds the radius, the second
    # feeds the angle.
    radial_sample = xoroshiro128p_uniform_float32(states, slot)
    angular_sample = xoroshiro128p_uniform_float32(states, slot)

    radius = math.sqrt(-float32(2.0) * math.log(radial_sample))
    angle = TWO_PI_FLOAT32 * angular_sample

    # Box-Muller yields a second independent value (radius * sin(angle));
    # it is not computed here.
    return radius * math.cos(angle)
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float64(states, index):
    '''Return a normally distributed float64 and advance ``states[index]``.

    The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
    Box-Muller transform.  This advances the RNG sequence by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    '''
    index = int64(index)

    # Draw double-precision uniforms so the result carries full float64
    # resolution rather than that of a float32 sample.
    #
    # xoroshiro128p_uniform_float64 returns multiples of 2**-53 in [0, 1), so
    # 1 - u is exact and lies in (0, 1]; the logarithm below is therefore
    # never evaluated at zero.
    u1 = float64(1.0) - xoroshiro128p_uniform_float64(states, index)
    u2 = xoroshiro128p_uniform_float64(states, index)

    radius = math.sqrt(-float64(2.0) * math.log(u1))
    # Box-Muller also yields radius * sin(angle); that second value is
    # discarded.
    z0 = radius * math.cos(TWO_PI_FLOAT64 * u2)
    return z0
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
    '''Fill ``states`` with generator states spaced one jump apart.

    ``states[0]`` is seeded from ``seed`` and then jumped
    ``subsequence_start`` times. Each following entry is a copy of its
    predecessor advanced by one further jump. An empty array is left
    untouched.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states to initialize in place
    :type seed: integer (converted to uint64)
    :param seed: starting seed for the first generator
    :type subsequence_start: integer (converted to uint64)
    :param subsequence_start: number of jumps applied to the first generator
    '''
    count = states.shape[0]
    seed = uint64(seed)
    subsequence_start = uint64(subsequence_start)

    # Nothing to initialize for an empty array.
    if count < 1:
        return

    init_xoroshiro128p_state(states, 0, seed)

    # Move the first generator to the requested starting subsequence.
    for _skipped in range(subsequence_start):
        xoroshiro128p_jump(states, 0)

    # Each remaining generator starts from its predecessor's state and is
    # then jumped forward 2**64 steps.
    for pos in range(1, count):
        states[pos] = states[pos - 1]
        xoroshiro128p_jump(states, pos)
def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
    '''Initialize RNG states on the GPU for parallel generators.

    The states are set up so that each entry of the array corresponds to a
    subsequence separated by 2**64 steps from its neighbours in the main
    sequence.  Therefore, as long as no CUDA thread requests more than 2**64
    random numbers, all of the RNG states produced by this function are
    guaranteed to be independent.

    The subsequence_start parameter can be used to advance the first RNG state
    by a multiple of 2**64 steps.

    :type states: 1D DeviceNDArray, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the first
        generator
    :type stream: CUDA stream
    :param stream: stream passed to ``states.copy_to_device``
    '''
    # The states are computed in a host buffer of the same shape and then
    # copied over in one transfer: initialization on the CPU is much faster
    # than on the GPU.
    host_states = np.empty(states.shape, dtype=xoroshiro128p_dtype)
    init_xoroshiro128p_states_cpu(host_states, seed, subsequence_start)
    states.copy_to_device(host_states, stream=stream)
def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
    '''Return a new device array initialized for n random number generators.

    The states are set up so that each entry of the array corresponds to a
    subsequence separated by 2**64 steps from its neighbours in the main
    sequence.  Therefore, as long as no CUDA thread requests more than 2**64
    random numbers, all of the RNG states produced by this function are
    guaranteed to be independent.

    The subsequence_start parameter can be used to advance the first RNG state
    by a multiple of 2**64 steps.

    :type n: int
    :param n: number of RNG states to create
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the first
        generator
    :type stream: CUDA stream
    :param stream: stream passed to the device allocation and to
        ``init_xoroshiro128p_states``
    '''
    device_states = cuda.device_array(
        n, dtype=xoroshiro128p_dtype, stream=stream
    )
    init_xoroshiro128p_states(
        device_states,
        seed,
        subsequence_start=subsequence_start,
        stream=stream,
    )
    return device_states