This repository has been archived by the owner on Sep 1, 2023. It is now read-only.
/
make_datasets.py
executable file
·428 lines (336 loc) · 15 KB
/
make_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
# ----------------------------------------------------------------------
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2013, Numenta, Inc. Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# following terms and conditions apply:
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero Public License version 3 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Affero Public License for more details.
#
# You should have received a copy of the GNU Affero Public License
# along with this program. If not, see http://www.gnu.org/licenses.
#
# http://numenta.org/licenses/
# ----------------------------------------------------------------------
"""
Generate artificial datasets for the multi-step prediction experiments
"""
import os
import numpy
import random
from optparse import OptionParser
from nupic.data.file_record_stream import FileRecordStream
def _generateSimple(filename="simple.csv", numSequences=2, elementsPerSeq=1,
                    numRepeats=10, resets=False):
    """Generate a simple dataset of non-overlapping sequences.

    Each sequence owns its own disjoint range of element values, so no element
    ever appears in more than one sequence.

    Parameters:
    ----------------------------------------------------
    filename:       name of the file to produce, including extension. It will
                    be created in a 'datasets' sub-directory within the
                    directory containing this script.
    numSequences:   how many sequences to generate
    elementsPerSeq: length of each sequence
    numRepeats:     how many times to repeat each sequence in the output
    resets:         if True, turn on reset at start of each sequence
    """
    # Create the output file. print() form works on both Python 2 and 3.
    scriptDir = os.path.dirname(__file__)
    pathname = os.path.join(scriptDir, 'datasets', filename)
    print("Creating %s..." % pathname)
    fields = [('reset', 'int', 'R'),
              ('field1', 'string', ''),
              ('field2', 'float', '')]
    outFile = FileRecordStream(pathname, write=True, fields=fields)
    try:
        # Sequence i owns the contiguous value range
        # [i*elementsPerSeq, (i+1)*elementsPerSeq).
        sequences = [list(range(i * elementsPerSeq, (i + 1) * elementsPerSeq))
                     for i in range(numSequences)]

        # Write out the sequences in random order: numRepeats copies of each
        # sequence index, shuffled together.
        seqIdxs = list(range(numSequences)) * numRepeats
        random.shuffle(seqIdxs)
        for seqIdx in seqIdxs:
            # Reset (if enabled) is asserted only on the first element of
            # each sequence.
            reset = int(resets)
            for x in sequences[seqIdx]:
                outFile.appendRecord([reset, str(x), x])
                reset = 0
    finally:
        # Close the stream even if an append fails, so the file isn't leaked.
        outFile.close()
def _generateOverlapping(filename="overlap.csv", numSequences=2, elementsPerSeq=3,
                         numRepeats=10, hub=None, hubOffset=1, resets=False):
    """Generate a temporal dataset containing sequences that overlap one or
    more elements with other sequences.

    Parameters:
    ----------------------------------------------------
    filename:       name of the file to produce, including extension. It will
                    be created in a 'datasets' sub-directory within the
                    directory containing this script.
    numSequences:   how many sequences to generate
    elementsPerSeq: length of each sequence
    numRepeats:     how many times to repeat each sequence in the output
    hub:            sub-sequence to place within each other sequence
                    (defaults to [0, 1]; None avoids a mutable default arg)
    hubOffset:      where, within each sequence, to place the hub
    resets:         if True, turn on reset at start of each sequence

    Raises:
        ValueError: if the hub does not fit within elementsPerSeq at
                    hubOffset. (Was an `assert`, which is stripped under -O.)
    """
    if hub is None:
        hub = [0, 1]
    # Validate explicitly rather than with assert so the check survives -O.
    if hubOffset + len(hub) > elementsPerSeq:
        raise ValueError("hub of length %d at offset %d does not fit in a "
                         "sequence of %d elements"
                         % (len(hub), hubOffset, elementsPerSeq))

    # Create the output file
    scriptDir = os.path.dirname(__file__)
    pathname = os.path.join(scriptDir, 'datasets', filename)
    print("Creating %s..." % pathname)
    fields = [('reset', 'int', 'R'),
              ('field1', 'string', ''),
              ('field2', 'float', '')]
    outFile = FileRecordStream(pathname, write=True, fields=fields)
    try:
        # Create the sequences with the hub embedded at hubOffset. Unique
        # (non-hub) element values start just above the largest hub value so
        # they never collide with it.
        sequences = []
        nextElemIdx = max(hub) + 1
        for _ in range(numSequences):
            seq = []
            # hubOffset unique elements before the hub
            for _ in range(hubOffset):
                seq.append(nextElemIdx)
                nextElemIdx += 1
            # the shared hub sub-sequence
            seq.extend(hub)
            # fill the remainder of the sequence with unique elements
            for _ in range(elementsPerSeq - hubOffset - len(hub)):
                seq.append(nextElemIdx)
                nextElemIdx += 1
            sequences.append(seq)

        # Write out the sequences in random order
        seqIdxs = list(range(numSequences)) * numRepeats
        random.shuffle(seqIdxs)
        for seqIdx in seqIdxs:
            # Reset (if enabled) is asserted only on the first element of
            # each sequence.
            reset = int(resets)
            for x in sequences[seqIdx]:
                outFile.appendRecord([reset, str(x), x])
                reset = 0
    finally:
        # Close the stream even if an append fails, so the file isn't leaked.
        outFile.close()
def _generateFirstOrder0():
""" Generate the initial, first order, and second order transition
probabilities for 'probability0'. For this model, we generate the following
set of sequences:
.1 .75
0----1-----2
\ \
\ \ .25
\ \-----3
\
\ .9 .5
\--- 4--------- 2
\
\ .5
\---------3
Parameters:
----------------------------------------------------------------------
retval: (initProb, firstOrder, secondOrder, seqLen)
initProb: Initial probability for each category. This is a vector
of length len(categoryList).
firstOrder: A dictionary of the 1st order probabilities. The key
is the 1st element of the sequence, the value is
the probability of each 2nd element given the first.
secondOrder: A dictionary of the 2nd order probabilities. The key
is the first 2 elements of the sequence, the value is
the probability of each possible 3rd element given the
first two.
seqLen: Desired length of each sequence. The 1st element will
be generated using the initProb, the 2nd element by the
firstOrder table, and the 3rd and all successive
elements by the secondOrder table.
categoryList: list of category names to use
Here is an example of some return values when there are 3 categories
initProb: [0.7, 0.2, 0.1]
firstOrder: {'[0]': [0.3, 0.3, 0.4],
'[1]': [0.3, 0.3, 0.4],
'[2]': [0.3, 0.3, 0.4]}
secondOrder: {'[0,0]': [0.3, 0.3, 0.4],
'[0,1]': [0.3, 0.3, 0.4],
'[0,2]': [0.3, 0.3, 0.4],
'[1,0]': [0.3, 0.3, 0.4],
'[1,1]': [0.3, 0.3, 0.4],
'[1,2]': [0.3, 0.3, 0.4],
'[2,0]': [0.3, 0.3, 0.4],
'[2,1]': [0.3, 0.3, 0.4],
'[2,2]': [0.3, 0.3, 0.4]}
"""
# --------------------------------------------------------------------
# Initial probabilities, 'a' and 'e' equally likely
numCategories = 5
initProb = numpy.zeros(numCategories)
initProb[0] = 1.0
# --------------------------------------------------------------------
# 1st order transitions
firstOrder = dict()
firstOrder['0'] = numpy.array([0, 0.1, 0, 0, 0.9])
firstOrder['1'] = numpy.array([0, 0, 0.75, 0.25, 0])
firstOrder['2'] = numpy.array([1.0, 0, 0, 0, 0])
firstOrder['3'] = numpy.array([1.0, 0, 0, 0, 0])
firstOrder['4'] = numpy.array([0, 0, 0.5, 0.5, 0])
# --------------------------------------------------------------------
# 2nd order transitions don't apply
secondOrder = None
# Generate the category list
categoryList = ['%d' % x for x in range(5)]
return (initProb, firstOrder, secondOrder, 3, categoryList)
def _generateFileFromProb(filename, numRecords, categoryList, initProb,
                          firstOrderProb, secondOrderProb, seqLen, numNoise=0,
                          resetsEvery=None):
    """Generate a set of records reflecting a set of probabilities.

    Parameters:
    ----------------------------------------------------------------
    filename:        name of .csv file to generate
    numRecords:      number of records to generate
    categoryList:    list of category names
    initProb:        Initial probability for each category. This is a vector
                     of length len(categoryList).
    firstOrderProb:  A dictionary of the 1st order probabilities. The key
                     is the 1st element of the sequence, the value is
                     the probability of each 2nd element given the first.
    secondOrderProb: A dictionary of the 2nd order probabilities. The key
                     is the first 2 elements of the sequence, the value is
                     the probability of each possible 3rd element given the
                     first two. If this is None, then the sequences will be
                     first order only.
    seqLen:          Desired length of each sequence. The 1st element will
                     be generated using the initProb, the 2nd element by the
                     firstOrder table, and the 3rd and all successive
                     elements by the secondOrder table. None means infinite
                     length.
    numNoise:        Number of noise elements to place between each
                     sequence. The noise elements are evenly distributed from
                     all categories.
    resetsEvery:     If not None, generate a reset every N records

    Here is an example of some parameters:
      categoryList:    ['cat1', 'cat2', 'cat3']
      initProb:        [0.7, 0.2, 0.1]
      firstOrderProb:  {'[0]': [0.3, 0.3, 0.4],
                        '[1]': [0.3, 0.3, 0.4],
                        '[2]': [0.3, 0.3, 0.4]}
      secondOrderProb: {'[0,0]': [0.3, 0.3, 0.4],
                        '[0,1]': [0.3, 0.3, 0.4],
                        '[0,2]': [0.3, 0.3, 0.4],
                        '[1,0]': [0.3, 0.3, 0.4],
                        '[1,1]': [0.3, 0.3, 0.4],
                        '[1,2]': [0.3, 0.3, 0.4],
                        '[2,0]': [0.3, 0.3, 0.4],
                        '[2,1]': [0.3, 0.3, 0.4],
                        '[2,2]': [0.3, 0.3, 0.4]}
    """
    # Create the file. print() form works on both Python 2 and 3.
    print("Creating %s..." % filename)
    fields = [('reset', 'int', 'R'),
              ('field1', 'string', ''),
              ('field2', 'float', '')]
    scriptDir = os.path.dirname(__file__)
    pathname = os.path.join(scriptDir, 'datasets', filename)
    outFile = FileRecordStream(pathname, write=True, fields=fields)
    try:
        # ------------------------------------------------------------------
        # Convert the probability tables into cumulative probabilities so a
        # uniform random draw can be mapped to a category via searchsorted.
        # .items() instead of the Python-2-only .iteritems().
        initCumProb = initProb.cumsum()
        firstOrderCumProb = dict()
        for (key, value) in firstOrderProb.items():
            firstOrderCumProb[key] = value.cumsum()
        if secondOrderProb is not None:
            secondOrderCumProb = dict()
            for (key, value) in secondOrderProb.items():
                secondOrderCumProb[key] = value.cumsum()
        else:
            secondOrderCumProb = None

        # ------------------------------------------------------------------
        # Write out the sequences
        elementsInSeq = []
        numElementsSinceReset = 0
        maxCatIdx = len(categoryList) - 1
        # range instead of the Python-2-only xrange
        for _ in range(numRecords):
            # A reset is generated at the start of the stream and after each
            # resetsEvery-record window.
            reset = 1 if numElementsSinceReset == 0 else 0

            # Pick the next element, based on how far we are into the
            # sequence.
            rand = numpy.random.rand()

            if secondOrderCumProb is None:
                # First-order-only sequences
                if len(elementsInSeq) == 0:
                    catIdx = numpy.searchsorted(initCumProb, rand)
                elif len(elementsInSeq) >= 1 and \
                        (seqLen is None or
                         len(elementsInSeq) < seqLen - numNoise):
                    catIdx = numpy.searchsorted(
                        firstOrderCumProb[str(elementsInSeq[-1])], rand)
                else:
                    # random "noise" element between sequences
                    catIdx = numpy.random.randint(len(categoryList))
            else:
                # Second-order sequences
                if len(elementsInSeq) == 0:
                    catIdx = numpy.searchsorted(initCumProb, rand)
                elif len(elementsInSeq) == 1:
                    catIdx = numpy.searchsorted(
                        firstOrderCumProb[str(elementsInSeq)], rand)
                elif (len(elementsInSeq) >= 2) and \
                        (seqLen is None or
                         len(elementsInSeq) < seqLen - numNoise):
                    catIdx = numpy.searchsorted(
                        secondOrderCumProb[str(elementsInSeq[-2:])], rand)
                else:
                    # random "noise" element between sequences
                    catIdx = numpy.random.randint(len(categoryList))

            # --------------------------------------------------------------
            # Write out the record. Clamp in case rand fell past the last
            # cumulative bucket due to float rounding.
            catIdx = min(maxCatIdx, catIdx)
            outFile.appendRecord([reset, categoryList[catIdx], catIdx])

            # --------------------------------------------------------------
            # Increment counters
            elementsInSeq.append(catIdx)
            numElementsSinceReset += 1

            # Generate another reset?
            if resetsEvery is not None and \
                    numElementsSinceReset == resetsEvery:
                numElementsSinceReset = 0
                elementsInSeq = []

            # Start another sequence?
            if seqLen is not None and \
                    (len(elementsInSeq) == seqLen + numNoise):
                elementsInSeq = []
    finally:
        # Close the stream even if an append fails, so the file isn't leaked.
        outFile.close()
if __name__ == '__main__':
    helpString = \
    """%prog [options]
    Generate artificial datasets for testing multi-step prediction """

    # ==========================================================================
    # Process command line arguments
    parser = OptionParser(helpString)
    parser.add_option("--verbosity", default=0, type="int",
          help="Verbosity level, either 0, 1, 2, or 3 [default: %default].")

    (options, args) = parser.parse_args()
    if len(args) != 0:
        parser.error("No arguments accepted")

    # Seed BOTH random generators for reproducible output:
    # the simple/overlapping generators draw from `random`, while
    # _generateFileFromProb draws from `numpy.random` (previously unseeded,
    # so first_order_0.csv differed from run to run).
    random.seed(42)
    numpy.random.seed(42)

    # Create the dataset directory if necessary
    datasetsDir = os.path.join(os.path.dirname(__file__), 'datasets')
    if not os.path.exists(datasetsDir):
        os.mkdir(datasetsDir)

    # Generate the sample datasets
    _generateSimple('simple_0.csv', numSequences=2, elementsPerSeq=5,
                    numRepeats=30)
    _generateSimple('simple_1.csv', numSequences=10, elementsPerSeq=5,
                    numRepeats=20)
    _generateOverlapping('simple_2.csv', numSequences=10, elementsPerSeq=5,
                         numRepeats=20, hub=[0, 1], hubOffset=1, resets=False)
    _generateSimple('simple_3.csv', numSequences=2, elementsPerSeq=10,
                    numRepeats=30, resets=False)

    # The first order 0 dataset
    (initProb, firstOrderProb, secondOrderProb, seqLen, categoryList) = \
        _generateFirstOrder0()
    _generateFileFromProb(filename='first_order_0.csv', numRecords=1000,
        categoryList=categoryList, initProb=initProb,
        firstOrderProb=firstOrderProb, secondOrderProb=secondOrderProb,
        seqLen=seqLen, numNoise=0, resetsEvery=None)