In [1]:
import tohu
from tohu import *
from utils import print_generated_sequence

In [2]:
# NBVAL_IGNORE_OUTPUT
tohu.__version__

'v0.5.0+17.g2ac1895.dirty'

This notebook contains high-level tests for `tohu`'s "standard" generators.

## Class `Integer`

Generates random integers in the range [`low`, `high`].

In [3]:
g = Integer(low=100, high=200)

In [4]:
g.reset(seed=12345); print_generated_sequence(g, num=15)
g.reset(seed=9999); print_generated_sequence(g, num=15)

Generated sequence: 153, 193, 101, 138, 147, 124, 134, 172, 155, 120, 147, 115, 155, 133, 171
Generated sequence: 115, 120, 196, 109, 116, 124, 136, 124, 187, 199, 176, 174, 138, 180, 170


In [5]:
some_integers = g.generate(5, seed=99999)

In [6]:
for x in some_integers:
    print(x)

115
139
164
183
194


The default distribution is "uniform", but in principle any of the distributions [supported](https://docs.scipy.org/doc/numpy/reference/routines.random.html) by numpy can be used instead (which of them actually work here still needs to be confirmed).

In [7]:
# NOTE(review): this example of passing a `distribution` argument is disabled —
# presumably the feature is not supported yet; confirm, then enable or remove it.
#g = Integer(low=100, high=200, distribution=None)

## Class `Float`

Generates random floating point numbers in the range [`low`, `high`].

In [8]:
g = Float(low=2.71828, high=3.14159)

In [9]:
g.reset(seed=12345); print_generated_sequence(g, num=4)
g.reset(seed=9999); print_generated_sequence(g, num=4)

Generated sequence: 2.8946393582471686, 2.7225847111228716, 3.0675981674322017, 2.8446972371045396
Generated sequence: 3.0716413078479454, 2.785006097591815, 2.750284761944705, 3.0530348312992466


## Class `NumpyRandomGenerator`

Generates random numbers using one of the random number generators [supported](https://docs.scipy.org/doc/numpy/reference/routines.random.html) by numpy.

In [10]:
g1 = NumpyRandomGenerator(method="normal", loc=3.0, scale=5.0)
g2 = NumpyRandomGenerator(method="poisson", lam=30)
g3 = NumpyRandomGenerator(method="exponential", scale=0.3)

In [11]:
g1.reset(seed=12345); print_generated_sequence(g1, num=4)
g2.reset(seed=12345); print_generated_sequence(g2, num=15)
g3.reset(seed=12345); print_generated_sequence(g3, num=4)

Generated sequence: 1.9764617025764353, 5.394716690287741, 0.40280642471630923, 0.22134847826254989
Generated sequence: 40, 24, 31, 34, 27, 32, 29, 29, 35, 38, 30, 32, 38, 36, 36
Generated sequence: 0.7961371899305246, 0.11410397056571128, 0.060972430042086474, 0.06865806254932436


## Class `FakerGenerator`

It is also possible to use any generator provided by the [faker](http://faker.readthedocs.io/) library.

In [12]:
g1 = FakerGenerator(method="name")
g2 = FakerGenerator(method="name", locale='hi_IN')
g3 = FakerGenerator(method="phone_number")
g4 = FakerGenerator(method="job")

In [13]:
g1.reset(seed=12345); print_generated_sequence(g1, num=4)
g2.reset(seed=12345); print_generated_sequence(g2, num=4)
g3.reset(seed=12345); print_generated_sequence(g3, num=4)
g4.reset(seed=12345); print_generated_sequence(g4, num=4)

Generated sequence: Adam Bryan, Jacob Lee, Candice Martinez, Justin Thompson
Generated sequence: आदित्य ढींगरा, ललित दीक्षित, कुण्डा, कैलाश, ईश कुण्डा
Generated sequence: (045)349-6251x648, 298-251-8698x22313, 1-507-508-6002, 1-241-619-2638x9503
Generated sequence: Pension scheme manager, Administrator, Hydrogeologist, Merchandiser, retail


## Class `Constant`

Generates a sequence repeating the same element indefinitely.

In [14]:
g = Constant("Foobar"); print_generated_sequence(g, num=10)
g = Constant(42); print_generated_sequence(g, num=20)

Generated sequence: Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar
Generated sequence: 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42


## Class `Sequential`

Generates a sequence of sequentially numbered strings with a given prefix.

In [15]:
g = Sequential(prefix='Foo_', digits=3)

Calling `reset()` on the generator makes the numbering start from 1 again.

In [16]:
g.reset()
print_generated_sequence(g, num=5)
# No reset in between: the numbering continues where the previous call stopped.
print_generated_sequence(g, num=5)
print("-----------------------------")
# After an explicit reset the numbering starts over from the first item.
g.reset()
print_generated_sequence(g, num=5)

Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005
Generated sequence: Foo_006, Foo_007, Foo_008, Foo_009, Foo_010
-----------------------------
Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005


**Note**: the method `Sequential.reset()` supports the `seed` argument for consistency with other generators, but its value is ignored - the generator is simply reset to its initial value. This is illustrated here:

In [17]:
g.reset(seed=12345); print_generated_sequence(g, num=5)
g.reset(seed=9999); print_generated_sequence(g, num=5)

Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005
Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005


If a new `Sequential` generator is created from an existing one via the `_spawn()` method then its count will start again from 1.

In [18]:
g1 = Sequential(prefix="Quux_", digits=2)
g1.reset(seed=12345)
print_generated_sequence(g1, num=5)

# Spawn a second generator after g1 has already produced five items.
g2 = g1._spawn()
# g1 keeps counting from its current position ...
print_generated_sequence(g1, num=5)
# ... while the spawned g2 starts at the beginning of the numbering.
print_generated_sequence(g2, num=5)

Generated sequence: Quux_01, Quux_02, Quux_03, Quux_04, Quux_05
Generated sequence: Quux_06, Quux_07, Quux_08, Quux_09, Quux_10
Generated sequence: Quux_01, Quux_02, Quux_03, Quux_04, Quux_05


## Class `SelectOne`

In [19]:
g = SelectOne(values=['foobar', 42, 'quux', True, 1.2345])

In [20]:
g.reset(seed=12345); print_generated_sequence(g, num=15)
g.reset(seed=9999); print_generated_sequence(g, num=15)

Generated sequence: True, foobar, quux, quux, 42, quux, 1.2345, True, 42, quux, foobar, True, quux, 1.2345, 42
Generated sequence: foobar, 42, foobar, 42, 42, quux, 42, 1.2345, 1.2345, quux, 1.2345, 42, foobar, 1.2345, 1.2345


## Class `SelectMultiple`

In [21]:
g = SelectMultiple(values=['foobar', 42, 'quux', True, 1.2345], size=3)

In [22]:
g.reset(seed=12345); print_generated_sequence(g, num=4)
g.reset(seed=99999); print_generated_sequence(g, num=4)

Generated sequence: (True, 1.2345, True), (42, True, 'foobar'), (42, True, 'quux'), (1.2345, 42, 'foobar')
Generated sequence: (42, 'quux', 1.2345), (True, 42, 1.2345), ('quux', 1.2345, True), ('foobar', 1.2345, 1.2345)


It is possible to pass a random generator for the argument `size`. This produces tuples of _varying_ length, where the length of each tuple is determined by the values produced by this generator.

In [23]:
rand_nums = Integer(low=2, high=5)

In [24]:
g = SelectMultiple(values=['a', 'b', 'c', 'd', 'e'], size=rand_nums)

In [25]:
g.reset(seed=11111); print_generated_sequence(g, num=10, sep='\n')

Generated sequence:
('b', 'c', 'c', 'b', 'd')
('c', 'b', 'd', 'b')
('a', 'a', 'e', 'e')
('a', 'c', 'a')
('c', 'e', 'b', 'd')
('c', 'c')
('d', 'c', 'd', 'a')
('c', 'a', 'b', 'c', 'e')
('c', 'd', 'e', 'e')
('c', 'a', 'e', 'e', 'c')


## Class `Subsample`

The `Subsample` generator can extract a subsample from a given set of values, where each individual element is chosen with a given probability `p`.

In [26]:
values = list(range(50))

In [27]:
g = Subsample(values, p=0.3)

In [28]:
g.reset(seed=12345); print_generated_sequence(g, num=10, sep='\n')

Generated sequence:
[ 2  3 12 13 14 29 30 33 37 43 44 45 49]
[ 1  5 13 14 25 26 32 36 44 49]
[ 0  1  3  4 11 14 22 27 28 31 36 39 43 44 46]
[ 2  3  4  8 11 21 29 30 33 34 36 39 42 43 45 46 48]
[ 5 16 19 20 21 22 24 25 31 32 33 34 35 36 37 44]
[ 0  5  9 17 21 22 29 35 38 45 49]
[ 0  2  4  6  9 13 14 17 18 20 23 25 28 29 31 33 38 41 42 44 45 46 48]
[ 0  2  7  9 11 12 21 22 23 29 36 38 47]
[ 1  2  4  8  9 10 16 24 26 28 35 36 38 44 46 47]
[ 8 11 13 18 19 23 27 33 48]


## Class `CharString`

In [29]:
g = CharString(length=15)
g.reset(seed=12345); print_generated_sequence(g, num=5)
g.reset(seed=9999); print_generated_sequence(g, num=5)

Generated sequence: bG0jQ9PmyDaEFN1, 7welW2WuxdAOOSK, Hx7dVhiOYF2otR7, lVgNqllmNM7opRa, vomxeG7d9Gy4q57
Generated sequence: rfDq4wZcjZo5c0D, 7sYliztRTZfr2sK, 0PHz3tPuM7mCtFK, 1jcbYuFHWM2cC6u, mVM17b17JmPT7c4


It is possible to vary the length of generated character strings, and to specify the character set.

In [30]:
g = CharString(min_length=4, max_length=12, charset="ABCDEFGHIJKLMNOPQRSTUVWXYZ")

In [31]:
g.reset(seed=12345); print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
AQEVUGMOAP
PTLCFYY
KLBNUU
WSQLBXDEUZPH
JVFXDT


## Class `DigitString`

In [32]:
g = DigitString(length=15)
g.reset(seed=12345); print_generated_sequence(g, num=5)
g.reset(seed=9999); print_generated_sequence(g, num=5)

Generated sequence: 082367077951255, 069850127342194, 223993305335180, 864906629411901, 764575636684616
Generated sequence: 417450230742264, 144986459374792, 005789075390830, 553780210864241, 328935916592898


## Class `HashDigest`

In [33]:
g = HashDigest(length=8)
g.reset(seed=12345); print_generated_sequence(g, num=9)
g.reset(seed=9999); print_generated_sequence(g, num=9)

Generated sequence: 046CE0FF, B25AB1DB, 134F7953, 8556770A, 76B21C80, DD593213, FD8AFBC6, CD8C2CD5, 822BB6E4
Generated sequence: 82E8B147, 1E954C92, 89C9A6E9, F410AF1E, A6061AB7, E0431C85, 93746B2D, B52D432D, A1371A57


In [34]:
g = HashDigest(length=20)
g.reset(seed=12345); print_generated_sequence(g, num=4)
g.reset(seed=9999); print_generated_sequence(g, num=4)

Generated sequence: 046CE0FFB25AB1DB134F, 79538556770A76B21C80, DD593213FD8AFBC6CD8C, 2CD5822BB6E4C6E93E7E
Generated sequence: 82E8B1471E954C9289C9, A6E9F410AF1EA6061AB7, E0431C8593746B2DB52D, 432DA1371A57CCF6A691


In [35]:
g = HashDigest(length=16, as_bytes=True)

In [36]:
g.reset(seed=12345); print_generated_sequence(g, num=3, sep='\n')

Generated sequence:
b'\x04l\xe0\xff\xb2Z\xb1\xdb'
b'\x13OyS\x85Vw\n'
b'v\xb2\x1c\x80\xddY2\x13'


## Class `GeolocationPair`

In [37]:
g = GeolocationPair()
g.reset(seed=12345); print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
(-30.016845883677178, -15.008422941838589)
(-176.3390989954554, -88.1695494977277)
(117.07434333134756, 58.53717166567378)
(-72.48965212814659, -36.244826064073294)
(-47.37179178414874, -23.68589589207437)


## Class `TimestampNEW`

In [38]:
from tohu.generators import TimestampNEW

In [39]:
g = TimestampNEW(start='2016-02-14', end='2016-02-18')

In [40]:
g.reset(seed=12345); print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
2016-02-16 12:40:28
2016-02-18 10:42:18
2016-02-14 01:28:51
2016-02-18 23:26:47
2016-02-18 20:55:23


In [41]:
g = TimestampNEW(start='1998-03-01 00:02:00', end='1998-03-01 00:02:15')

In [42]:
g.reset(seed=99999); print_generated_sequence(g, num=10, sep='\n')

Generated sequence:
1998-03-01 00:02:03
1998-03-01 00:02:09
1998-03-01 00:02:07
1998-03-01 00:02:11
1998-03-01 00:02:13
1998-03-01 00:02:06
1998-03-01 00:02:08
1998-03-01 00:02:12
1998-03-01 00:02:06
1998-03-01 00:02:01


Note that the generated items are `datetime` objects (even though they appear as strings when printed above).

In [43]:
type(next(g))

datetime.datetime

## Class `ExtractAttribute`

In [44]:
class QuuxGenerator(CustomGenerator):
    """Custom generator whose items carry the two fields `aaa` and `bbb`."""

    # Integer field; bounds are passed by keyword, as in the `Integer` section.
    aaa = Integer(low=0, high=100)
    # Six-character digest string field.
    bbb = HashDigest(length=6)

In [45]:
g = QuuxGenerator()

Using `ExtractAttribute` we can produce "derived" generators which extract the attributes `aaa`, `bbb` from the elements produced by `g`.

In [46]:
h1 = ExtractAttribute(g, 'aaa')
h2 = ExtractAttribute(g, 'bbb')

In [47]:
g.reset(seed=99999); print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
Quux(aaa=20, bbb='0F0C79')
Quux(aaa=24, bbb='93846A')
Quux(aaa=70, bbb='5398AE')
Quux(aaa=47, bbb='FB860B')
Quux(aaa=92, bbb='2B6F91')


In [48]:
h1.reset(seed=99999); print_generated_sequence(h1, num=5)
h2.reset(seed=99999); print_generated_sequence(h2, num=5)

Generated sequence: 20, 24, 70, 47, 92
Generated sequence: 0F0C79, 93846A, 5398AE, FB860B, 2B6F91


## Class `IterateOver`

In [49]:
seq = ['aa', 'bb', 'cc', 'dd', 'ee']

In [50]:
g = IterateOver(seq)

In [51]:
# Ask for fewer items than the sequence holds.
g.reset(); print(list(g.generate(N=3)))
# Ask for more items than are available: only the five existing items come back.
g.reset(); print(list(g.generate(N=10)))
# Plain iteration ends once the underlying sequence is exhausted.
g.reset(); print(list(g))

['aa', 'bb', 'cc']
['aa', 'bb', 'cc', 'dd', 'ee']
['aa', 'bb', 'cc', 'dd', 'ee']


## Using tohu generators as iterators

Each `tohu` generator can also be used as a Python iterator producing an (infinite) series of elements.

In [52]:
# This generator keeps yielding elements indefinitely, so the loop has to
# stop itself: here after the first 21 elements have been printed.
int_generator = Integer(low=100, high=500).reset(seed=99999)

for position, value in enumerate(int_generator):
    if position >= 21:
        break
    print(value, end=" ")

161 258 356 432 478 221 281 311 203 229 307 470 410 410 367 203 130 455 270 370 296 

## `ItemList`

The `.generate()` method produces an `ItemList` instance.

In [53]:
g = HashDigest(length=6)

In [54]:
item_list = g.generate(N=10, seed=12345)
print(item_list)

<ItemList containing 10 items>


Fundamentally an `ItemList` behaves like a regular list.

In [55]:
print(list(item_list))

['046CE0', 'FFB25A', 'B1DB13', '4F7953', '855677', '0A76B2', '1C80DD', '593213', 'FD8AFB', 'C6CD8C']


In [56]:
# Seed the item list once; every subsample(num=6) call below then draws
# six of the ten items, and consecutive calls produce different selections.
item_list.reset(seed=999999)
print(list(item_list.subsample(num=6)))
print(list(item_list.subsample(num=6)))
print(list(item_list.subsample(num=6)))

['B1DB13', '4F7953', '046CE0', 'FD8AFB', '1C80DD', 'C6CD8C']
['C6CD8C', 'B1DB13', 'FFB25A', '593213', '4F7953', 'FD8AFB']
['0A76B2', 'B1DB13', 'FFB25A', '855677', 'C6CD8C', '046CE0']


In [57]:
# With `p` instead of `num`, the size of each subsample varies between calls.
# NOTE(review): presumably each item is kept independently with probability p,
# as for the `Subsample` generator — confirm against the ItemList implementation.
item_list.reset(seed=99999)
print(list(item_list.subsample(p=0.4)))
print(list(item_list.subsample(p=0.4)))
print(list(item_list.subsample(p=0.4)))

['046CE0', '4F7953', '593213', 'C6CD8C']
['046CE0', 'FFB25A', 'B1DB13', '855677']
['046CE0', 'FFB25A', '4F7953', '855677', 'C6CD8C']
