In [1]:
import tohu
from tohu.generators import *
from utils import print_generated_sequence

In [2]:
# NBVAL_IGNORE_OUTPUT
tohu.__version__

'v0.2.0+23.gf82f08f.dirty'

## Class `Integer`

Generates random integers in the range [`lo`, `hi`].

In [3]:
g = Integer(lo=100, hi=200)

In [4]:
# Show the sequence obtained after resetting the generator with two different seeds.
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=15)

Generated sequence: 153, 193, 101, 138, 147, 124, 134, 172, 155, 120, 147, 115, 155, 133, 171
Generated sequence: 115, 120, 196, 109, 116, 124, 136, 124, 187, 199, 176, 174, 138, 180, 170


In [5]:
some_integers = g.generate(5, seed=99999)

In [6]:
for x in some_integers:
    print(x)

115
139
164
183
194


## Class `Float`

Generates random floating point numbers in the range [`lo`, `hi`].

In [7]:
g = Float(lo=2.71828, hi=3.14159)

In [8]:
# Show the sequence obtained after resetting the generator with two different seeds.
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=4)

Generated sequence: 2.8946393582471686, 2.7225847111228716, 3.0675981674322017, 2.8446972371045396
Generated sequence: 3.0716413078479454, 2.785006097591815, 2.750284761944705, 3.0530348312992466


## Class `Constant`

Generates a sequence repeating the same element indefinitely.

In [9]:
# Each Constant generator repeats the value it was created with.
for value, count in (("Foobar", 10), (42, 20)):
    g = Constant(value)
    print_generated_sequence(g, num=count)

Generated sequence: Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar, Foobar
Generated sequence: 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42


## Class `Sequential`

Generates a sequence of sequentially numbered strings with a given prefix.

In [10]:
g = Sequential(prefix='Foo_', digits=3)

Calling `reset()` on the generator makes the numbering start from 1 again.

In [11]:
# Two consecutive batches continue the numbering; after reset() it starts over.
g.reset()
for _ in range(2):
    print_generated_sequence(g, num=5)
print("-----------------------------")
g.reset()
print_generated_sequence(g, num=5)

Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005
Generated sequence: Foo_006, Foo_007, Foo_008, Foo_009, Foo_010
-----------------------------
Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005


**Note**: the method `Sequential.reset()` supports the `seed` argument for consistency with other generators, but its value is ignored - the generator is simply reset to its initial value. This is illustrated here:

In [12]:
# Different seeds yield the same output here, since Sequential ignores the seed value.
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=5)

Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005
Generated sequence: Foo_001, Foo_002, Foo_003, Foo_004, Foo_005


If a new `Sequential` generator is created from an existing one via the `_spawn()` method then its count will start again from 1.

In [13]:
# Advance g1 past its first five values before spawning a second generator from it.
g1 = Sequential(prefix="Quux_", digits=2)
g1.reset(seed=12345)
print_generated_sequence(g1, num=5)

# g2 is spawned from g1 after g1 has already produced five items.
g2 = g1._spawn()
print_generated_sequence(g1, num=5)  # g1 carries on counting (Quux_06 ...)
print_generated_sequence(g2, num=5)  # g2 starts from Quux_01 again

Generated sequence: Quux_01, Quux_02, Quux_03, Quux_04, Quux_05
Generated sequence: Quux_06, Quux_07, Quux_08, Quux_09, Quux_10
Generated sequence: Quux_01, Quux_02, Quux_03, Quux_04, Quux_05


## Class `ChooseFrom`

In [14]:
g = ChooseFrom(values=['foobar', 42, 'quux', True, 1.2345])

In [15]:
# Show the sequence obtained after resetting the generator with two different seeds.
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=15)

Generated sequence: 1.2345, 42, True, True, quux, True, 1.2345, quux, True, 42, 1.2345, True, quux, quux, True
Generated sequence: 42, quux, 42, quux, quux, True, quux, True, quux, 42, 42, 1.2345, 1.2345, 42, True


## Class `CharString`

In [16]:
chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,.?!"

In [17]:
# Generate strings of length 15 from `chars`, once for each of two seeds.
g = CharString(length=15, chars=chars)
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=5)

Generated sequence: 2cNWzJ4vWq4IxyU, m1!wtBkzSQe7Se2, bbvxLn2oyXERgEq, ?CyH4rgwNcuETSv, YQchi5lYxcWCq,i
Generated sequence: qvkrzLzNDil95jM, ByLv0b.4FyIeXep, 1PHr,yhcR!7O,xB, 3ALLCwMMk0tpfl6, ,pLtCfVBk4hoVMg


## Class `DigitString`

In [18]:
# Generate digit strings of length 15, once for each of two seeds.
g = DigitString(length=15)
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=5)

Generated sequence: 715645736275939, 362979334246618, 619711335272374, 961428493579313, 513469637611182
Generated sequence: 232345459419288, 254353718974351, 971927653831916, 985834749554355, 273219288253416


## Class `HashDigest`

In [19]:
# Generate hash digests of length 8, once for each of two seeds.
g = HashDigest(length=8)
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=9)

Generated sequence: 7C1EED5E, 6F45A736, 2E759B3A, 936CC29C, 7A9334C2, 4E661861, 9CC711BE, 3DA352B7, 2FBA3C74
Generated sequence: E23D23D4, 54BDAFA5, B9419A2A, 88ED2A54, FBD3AF5A, F3718FA9, 7F4DAAB3, CAC51971, C92B76EC


In [20]:
# Generate hash digests of length 20, once for each of two seeds.
g = HashDigest(length=20)
for seed in (12345, 9999):
    g.reset(seed=seed)
    print_generated_sequence(g, num=4)

Generated sequence: 7C1EED5E6F45A7362E75, 9B3A936CC29C7A9334C2, 4E6618619CC711BE3DA3, 52B72FBA3C749ACFE6D1
Generated sequence: E23D23D454BDAFA5B941, 9A2A88ED2A54FBD3AF5A, F3718FA97F4DAAB3CAC5, 1971C92B76ECFD538319


## Class `Geolocation`

In [21]:
g = Geolocation()
g.reset(seed=12345); print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
(-30.016845883677178, -15.008422941838589)
(-176.3390989954554, -88.1695494977277)
(117.07434333134756, 58.53717166567378)
(-72.48965212814659, -36.244826064073294)
(-47.37179178414874, -23.68589589207437)


## Class `Timestamp`

In [22]:
%aimport tohu
%aimport tohu.generators
from tohu import *

In [23]:
g = Timestamp(start='2016-02-14', end='2016-02-18')

In [24]:
g.reset(seed=12345); print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
2016-02-16 12:40:28
2016-02-18 10:42:18
2016-02-14 01:28:51
2016-02-18 23:26:47
2016-02-18 20:55:23


In [25]:
g = Timestamp(start='1998-03-01 00:02:00', end='1998-03-01 00:02:15')

In [26]:
g.reset(seed=99999); print_generated_sequence(g, num=10, sep='\n')

Generated sequence:
1998-03-01 00:02:03
1998-03-01 00:02:09
1998-03-01 00:02:07
1998-03-01 00:02:11
1998-03-01 00:02:13
1998-03-01 00:02:06
1998-03-01 00:02:08
1998-03-01 00:02:12
1998-03-01 00:02:06
1998-03-01 00:02:01


## Defining a custom generator

A custom "compound" generator can be defined by subclassing `CustomGenerator`. Any class attributes which are `tohu` generators will be used to create the fields of each random item whenever `next()` is called.

In [27]:
class FoobarGenerator(CustomGenerator):
    """Compound generator whose items have the three fields `a`, `b` and `c`.

    Each class attribute below is a `tohu` generator that supplies one field
    of the generated items; the fields appear in declaration order in the
    outputs shown in this notebook.
    """
    a = Integer(lo=1000, hi=3000)            # random integer in [1000, 3000]
    b = Sequential(prefix="Foo_", digits=2)  # 'Foo_01', 'Foo_02', ...
    c = Float(lo=1.0, hi=4.0)                # random float in [1.0, 4.0]

Custom generators accept a `seed` argument during creation. In addition, the `reset()` method can be used to bring the generator into a well-defined state.

In [28]:
g = FoobarGenerator(seed=12345)

In [29]:
# Equivalent to the previous cell
g = FoobarGenerator()
g.reset(seed=12345)

A custom generator `g` produces namedtuples whose fields are populated from the individual generators used in the definition of `g`.

In [30]:
next(g)

Foobar(a=1853, b='Foo_01', c=2.2498596176360235)

In [31]:
next(g)

Foobar(a=2500, b='Foo_02', c=1.0305075083712052)

In [32]:
item = next(g)
print("Attributes: a={}, b={}, c={}".format(item.a, item.b, item.c))

Attributes: a=1020, b=Foo_03, c=3.4756195277612294


Test that the generated sequence is reproduced exactly if the same seed is used to reset the custom generator.

In [33]:
# The first two runs use the same seed and must produce identical output; the
# third run uses a different seed.
divider = "-------------------------------------------------"
for position, seed in enumerate((12345, 12345, 9999)):
    if position:
        print(divider)
    g.reset(seed=seed)
    print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
Foobar(a=1853, b='Foo_01', c=2.2498596176360235)
Foobar(a=2500, b='Foo_02', c=1.0305075083712052)
Foobar(a=1020, b='Foo_03', c=3.4756195277612294)
Foobar(a=2679, b='Foo_04', c=1.8959195655987784)
Foobar(a=2690, b='Foo_05', c=2.105235068465427)
-------------------------------------------------
Generated sequence:
Foobar(a=1853, b='Foo_01', c=2.2498596176360235)
Foobar(a=2500, b='Foo_02', c=1.0305075083712052)
Foobar(a=1020, b='Foo_03', c=3.4756195277612294)
Foobar(a=2679, b='Foo_04', c=1.8959195655987784)
Foobar(a=2690, b='Foo_05', c=2.105235068465427)
-------------------------------------------------
Generated sequence:
Foobar(a=2709, b='Foo_01', c=3.5042732832766457)
Foobar(a=1254, b='Foo_02', c=1.472888173620857)
Foobar(a=1322, b='Foo_03', c=1.226817901382237)
Foobar(a=2543, b='Foo_04', c=3.3724090947479155)
Foobar(a=1154, b='Foo_05', c=1.8561874915530612)


## Using tohu generators as iterators

Each `tohu` generator can also be used as a Python iterator producing an (infinite) series of elements.

In [34]:
# Number of items to take from the (infinite) generator in this demo.
NUM_ITEMS = 21

int_generator = Integer(lo=100, hi=500, seed=99999)

# zip() stops as soon as range() is exhausted, so exactly NUM_ITEMS values are
# drawn from the generator and no manual counter/break is needed.
for _, x in zip(range(NUM_ITEMS), int_generator):
    print(x, end=" ")

161 258 356 432 478 221 281 311 203 229 307 470 410 410 367 203 130 455 270 370 296 

## Nesting custom generators

It is possible to "nest" custom generators, in the sense that one generator can be used as an attribute of another one.
Here is an example where the generator `FoobarGenerator` defined above is re-used in a second custom generator called `QuuxGenerator`.

In [35]:
class QuuxGenerator(CustomGenerator):
    """Compound generator that nests a `FoobarGenerator` as its field `y`.

    The fields `x` and `y` are declared on the class, whereas `z` is attached
    to each instance in `__init__` so that its range can be chosen at
    construction time.
    """
    x = Integer(lo=400, hi=499)  # random integer in [400, 499]
    y = FoobarGenerator()        # nested custom generator
    
    def __init__(self, z_min, z_max):
        """Attach the field `z`, a `Float` generator over [`z_min`, `z_max`].

        NOTE(review): this notebook also passes `seed=...` when constructing
        the class although it is not declared in this signature -- presumably
        `CustomGenerator` takes care of it; confirm against the library.
        """
        self.z = Float(lo=z_min, hi=z_max)

Let's check that the `seed` argument provided during creation of the custom generator is consistent with calling `g.reset(seed)` later.

In [36]:
g = QuuxGenerator(z_min=2.0, z_max=3.0, seed=99999)
print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
Quux(x=415, y=Foobar(a=1246, b='Foo_01', c=1.3614714283377922), z=2.1204904761125976)
Quux(x=439, y=Foobar(a=2944, b='Foo_02', c=1.9278550110777513), z=2.3092850036925836)
Quux(x=464, y=Foobar(a=1633, b='Foo_03', c=3.858759671075081), z=2.95291989035836)
Quux(x=483, y=Foobar(a=2024, b='Foo_04', c=3.2182710149419016), z=2.739423671647301)
Quux(x=494, y=Foobar(a=2951, b='Foo_05', c=2.065540039845949), z=2.355180013281983)


In [37]:
g.reset(seed=99999); print_generated_sequence(g, num=5, sep='\n')

Generated sequence:
Quux(x=415, y=Foobar(a=1246, b='Foo_01', c=1.3614714283377922), z=2.1204904761125976)
Quux(x=439, y=Foobar(a=2944, b='Foo_02', c=1.9278550110777513), z=2.3092850036925836)
Quux(x=464, y=Foobar(a=1633, b='Foo_03', c=3.858759671075081), z=2.95291989035836)
Quux(x=483, y=Foobar(a=2024, b='Foo_04', c=3.2182710149419016), z=2.739423671647301)
Quux(x=494, y=Foobar(a=2951, b='Foo_05', c=2.065540039845949), z=2.355180013281983)


## Formatting with custom generators

In [38]:
class FoobarGenerator(CustomGenerator):
    """Compound generator with the fields `quux`, `blabla` and `lalalalala`.

    Note: this rebinds the name `FoobarGenerator`, which was already defined
    earlier in this notebook with different field names.
    """
    quux = Integer(lo=1000, hi=3000)                  # random integer in [1000, 3000]
    blabla = Float(lo=1.0, hi=4.0)                    # random float in [1.0, 4.0]
    lalalalala = Sequential(prefix="Foo_", digits=2)  # 'Foo_01', 'Foo_02', ...

In [39]:
g = FoobarGenerator(seed=99999)

In [40]:
item1 = next(g)
item2 = next(g)
item3 = next(g)

Items produced by generators can be printed in the usual way.

In [41]:
print(item1)
print(item2)
print(item3)

Foobar(quux=1246, blabla=1.3614714283377922, lalalalala='Foo_01')
Foobar(quux=2944, blabla=1.9278550110777513, lalalalala='Foo_02')
Foobar(quux=1633, blabla=3.858759671075081, lalalalala='Foo_03')


Each item also implements the `__format__()` method, which makes it possible to call `format()` on it. This will produce a string in which all field values of the item are joined by commas, followed by a newline (which is why the printed output below contains blank lines).

In [42]:
print(format(item1))
print(format(item2))
print(format(item3))

1246,1.3614714283377922,Foo_01

2944,1.9278550110777513,Foo_02

1633,3.858759671075081,Foo_03



**TODO:** It would be nice to make it possible to customise the formatting using a different field separator (e.g. "|"), or by setting custom start/end strings. However, let's see whether we really need this.

Example:
```python
>>> g.SEPARATOR = "|"
>>> g.START_DELIM = "--> Item: "
>>> g.END_DELIM = " ||"
>>> print(format(item1))
>>> print(format(item2))
>>> print(format(item3))
```
```
--> Item: 1246|1.3614714283377922|Foo_01 ||
--> Item: 2944|1.9278550110777513|Foo_02 ||
--> Item: 1633|3.858759671075081|Foo_03 ||
```

### Writing generator outputs to a file

Each custom generator comes with an `export()` method which makes it easy to write a number of items produced by the generator to a file.

In [43]:
import tempfile

In [44]:
# Scratch file whose name is passed to the `export()` calls in the following cells.
# NOTE(review): the handle is never closed explicitly. Per the stdlib docs the
# default `delete=True` removes the file once the object is closed -- consider
# calling `f.close()` at the end of the export examples.
f = tempfile.NamedTemporaryFile()

In [45]:
g.export(f.name, N=3, seed=99999, progressbar=False)

!cat $f.name

1246,1.3614714283377922,Foo_01
2944,1.9278550110777513,Foo_02
1633,3.858759671075081,Foo_03


By default, the contents of an existing file are overwritten.

In [46]:
g.export(f.name, N=3, seed=12345, progressbar=False)

!cat $f.name

1853,2.2498596176360235,Foo_01
2500,1.0305075083712052,Foo_02
1020,3.4756195277612294,Foo_03


However, it is possible to append lines to an existing file by using `mode='a'` or `mode='append'`.

In [47]:
g.export(f.name, N=4, seed=99999, mode='a', progressbar=False)

!cat $f.name

1853,2.2498596176360235,Foo_01
2500,1.0305075083712052,Foo_02
1020,3.4756195277612294,Foo_03
1246,1.3614714283377922,Foo_01
2944,1.9278550110777513,Foo_02
1633,3.858759671075081,Foo_03
2024,3.2182710149419016,Foo_04


### Tweaking the output formatting

By default, the formatting simply joins the values of all fields in the custom generator with commas. However, sometimes you may want to change the order in which fields are printed, omit certain fields or otherwise customise the output.

This can be achieved by setting the `FORMAT_STR` attribute on the generator (either in the class definition or later). Any field values are interpolated using the `${fieldname}` syntax.

Note: In most cases `FORMAT_STR` should end with `\n`. Otherwise the `export()` method will export all items in a single line in the output file.

In [48]:
class FoobarGenerator(CustomGenerator):
    """Compound generator with fields `aaa`, `bbb`, `ccc` and a custom output format.

    Setting `FORMAT_STR` changes what `format(item)` produces: here `bbb` is
    printed before `aaa`, and `ccc` is left out altogether.

    Note: this rebinds the name `FoobarGenerator` defined earlier in this notebook.
    """
    aaa = Integer(lo=1000, hi=3000)            # random integer in [1000, 3000]
    bbb = Float(lo=1.0, hi=4.0)                # random float in [1.0, 4.0]
    ccc = Sequential(prefix="Foo_", digits=2)  # 'Foo_01', 'Foo_02', ...
    
    # Template used when formatting items; no trailing newline in this example.
    FORMAT_STR = "bbb=${bbb}, aaa=${aaa} - [omitting 'ccc']"

In [49]:
g = FoobarGenerator(seed=99999)

In [50]:
item1 = next(g)
item2 = next(g)
item3 = next(g)

In [51]:
print(format(item1))
print(format(item2))
print(format(item3))

bbb=1.3614714283377922, aaa=1246 - [omitting 'ccc']
bbb=1.9278550110777513, aaa=2944 - [omitting 'ccc']
bbb=3.858759671075081, aaa=1633 - [omitting 'ccc']


The `FORMAT_STR` attribute can also be set on an existing generator.

In [52]:
g.FORMAT_STR = 'aaa = ${aaa} | ccc = ${ccc}'

In [53]:
print(format(item1))
print(format(item2))
print(format(item3))

aaa = 1246 | ccc = Foo_01
aaa = 2944 | ccc = Foo_02
aaa = 1633 | ccc = Foo_03


Of course, the value of `FORMAT_STR` is also used when exporting to a file. Note that for file export, `FORMAT_STR` should end in `\n`, otherwise the resulting file will only contain a single long line.

In [54]:
g.FORMAT_STR = 'ccc = ${ccc} -- bbb = ${bbb} -- aaa=${aaa} -- ccc=${ccc}\n'

In [55]:
g.export(f.name, N=5, seed=99999, progressbar=False,
         header='# Each line below contains ccc, bbb, aaa, then ccc again.\n\n')

!cat $f.name

# Each line below contains ccc, bbb, aaa, then ccc again.

ccc = Foo_01 -- bbb = 1.3614714283377922 -- aaa=1246 -- ccc=Foo_01
ccc = Foo_02 -- bbb = 1.9278550110777513 -- aaa=2944 -- ccc=Foo_02
ccc = Foo_03 -- bbb = 3.858759671075081 -- aaa=1633 -- ccc=Foo_03
ccc = Foo_04 -- bbb = 3.2182710149419016 -- aaa=2024 -- ccc=Foo_04
ccc = Foo_05 -- bbb = 2.065540039845949 -- aaa=2951 -- ccc=Foo_05


## Formatting nested generators

In [56]:
class FoobarGenerator(CustomGenerator):
    """Inner generator for the nested-formatting example; fields `aaa`, `bbb`, `ccc`.

    Unlike the previous definition of `FoobarGenerator` in this notebook, this
    one does not set `FORMAT_STR`.
    """
    aaa = Integer(lo=1000, hi=3000)            # random integer in [1000, 3000]
    bbb = Float(lo=1.0, hi=4.0)                # random float in [1.0, 4.0]
    ccc = Sequential(prefix="Foo_", digits=2)  # 'Foo_01', 'Foo_02', ...

In [57]:
class QuuxGenerator(CustomGenerator):
    """Outer generator for the nested-formatting example.

    Its field `y` is itself produced by a custom generator (`FoobarGenerator`).
    """
    x = Integer(lo=400, hi=499)  # random integer in [400, 499]
    y = FoobarGenerator()        # nested custom generator

In [58]:
g = QuuxGenerator()

By default, attributes that are themselves produced by custom generators are interpolated using their standard representation.

In [59]:
# Print the default formatting of the first three items for this seed.
g.reset(seed=99999)
for _ in range(3):
    print(format(next(g)))

415,Foobar(aaa=1246, bbb=1.3614714283377922, ccc='Foo_01')

439,Foobar(aaa=2944, bbb=1.9278550110777513, ccc='Foo_02')

464,Foobar(aaa=1633, bbb=3.858759671075081, ccc='Foo_03')



In most cases, this is not what you want, so you should set the `FORMAT_STR` attribute. Nested attributes can be accessed as you would expect:

In [60]:
g.FORMAT_STR = "x=${x},y.aaa=${y.aaa},y.bbb=${y.bbb},y.ccc=${y.ccc}\n"

In [61]:
# Print the first three items for this seed again, now using the custom FORMAT_STR.
g.reset(seed=99999)
for _ in range(3):
    print(format(next(g)))

x=415,y.aaa=1246,y.bbb=1.3614714283377922,y.ccc=Foo_01

x=439,y.aaa=2944,y.bbb=1.9278550110777513,y.ccc=Foo_02

x=464,y.aaa=1633,y.bbb=3.858759671075081,y.ccc=Foo_03



## Working with ItemCollections

The `generate()` method of a `CustomGenerator` returns an iterable of type `ItemCollection`.

In [62]:
class FoobarGenerator(CustomGenerator):
    """Compound generator with fields `a`, `b`, `c`, used for the `ItemCollection` examples.

    Note: this rebinds the name `FoobarGenerator` defined earlier in this notebook.
    """
    a = Integer(lo=1000, hi=3000)            # random integer in [1000, 3000]
    b = Sequential(prefix="Foo_", digits=2)  # 'Foo_01', 'Foo_02', ...
    c = Float(lo=1.0, hi=4.0)                # random float in [1.0, 4.0]

In [63]:
g = FoobarGenerator()

In [64]:
item_collection = g.generate(5, seed=99999)
item_collection

<ItemCollection of length 5>

We can iterate over this as we would with a regular list.

In [65]:
for x in item_collection:
    print(x)

Foobar(a=1246, b='Foo_01', c=1.3614714283377922)
Foobar(a=2944, b='Foo_02', c=1.9278550110777513)
Foobar(a=1633, b='Foo_03', c=3.858759671075081)
Foobar(a=2024, b='Foo_04', c=3.2182710149419016)
Foobar(a=2951, b='Foo_05', c=2.065540039845949)


In addition, we can also export it as a pandas DataFrame.

In [66]:
item_collection.to_df()

Unnamed: 0,a,b,c
0,1246,Foo_01,1.361471
1,2944,Foo_02,1.927855
2,1633,Foo_03,3.85876
3,2024,Foo_04,3.218271
4,2951,Foo_05,2.06554
