# Simple array example

## Array setup

In [1]:
# Need parent directory in path (presumably where the `clientlib` package
# imported further down lives — adjust if the notebook is moved).
import sys
if '..' not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append('..')

In [2]:
import numpy

Create a simple numpy ndarray to query later:

In [3]:
# Structured array with five rows and two float32 columns.
# numpy.empty is the documented way to allocate an uninitialised array
# (numpy.ndarray(...) is the low-level constructor); both columns are fully
# assigned below, so no uninitialised memory survives.
simple_array = numpy.empty(5, dtype=[('col1', numpy.float32), ('col2', numpy.float32)])
# Row i holds (2*i, 2*i + 1), i.e. the values 0..9 laid out row by row.
simple_array['col1'] = numpy.arange(0, 10, 2)
simple_array['col2'] = numpy.arange(1, 10, 2)

**Contents of the example array, which has a couple of simple columns:**

In [4]:
simple_array

array([(0., 1.), (2., 3.), (4., 5.), (6., 7.), (8., 9.)],
      dtype=[('col1', '<f4'), ('col2', '<f4')])

Save this array to a file so that the query has an input file to run on:

In [5]:
simple_input_pathname = 'simple_data.npy'

In [6]:
numpy.save(simple_input_pathname, simple_array)

## Query

Now we want to make a query to pull some selection out of this array.

In [7]:
# Use array-type backend
from clientlib.DataSets import ArrayDataSet

In [8]:
# Create objects for the dataset and array stream.
# ArrayDataSet is given the path of the .npy file written by numpy.save above;
# AsNumpyArray() returns the stream object that the query methods are chained onto.
simple_dataset = ArrayDataSet(simple_input_pathname)
simple_array_stream = simple_dataset.AsNumpyArray()

**Create query to just pull the first column out:**

In [9]:
# The selector is passed as source text (a string), not as a Python callable.
# Presumably `e` stands for one row of the array, so `e.col1` picks its 'col1' field.
simple_col1_query = simple_array_stream.Select("lambda e: e.col1")

Then actually run the query:

In [10]:
simple_output = simple_col1_query.value()

**Output array contents:**

In [11]:
simple_output

array([0., 2., 4., 6., 8.], dtype=float32)

Clean up by deleting the temporary input file:

In [12]:
import os

In [13]:
os.remove(simple_input_pathname)

# Example with multi-entry column in array

## Array setup

In [14]:
# create another example array: each row has an event number and a
# fixed-length (2) sub-array of jets, each jet having `pt` and `nTracks`
multidim_dtype = numpy.dtype([
    ('eventNumber', numpy.uint),
    ('jets', [('pt', numpy.single), ('nTracks', numpy.uint)], 2),
])
# numpy.empty is the documented way to allocate an uninitialised array;
# every field is assigned below.
multidim_array = numpy.empty(5, dtype=multidim_dtype)
multidim_array['eventNumber'] = numpy.arange(5)
# Running jet index 0..9 arranged as (event, jet-within-event), i.e. i * 2 + j.
jet_index = numpy.arange(10).reshape(5, 2)
# Field access returns views, so these assignments write into multidim_array.
multidim_array['jets']['pt'] = jet_index * 0.3
multidim_array['jets']['nTracks'] = jet_index

**Now we have a _jets_ column with a couple of properties per jet:**

In [15]:
multidim_array

array([(0, [(0. , 0), (0.3, 1)]), (1, [(0.6, 2), (0.9, 3)]),
       (2, [(1.2, 4), (1.5, 5)]), (3, [(1.8, 6), (2.1, 7)]),
       (4, [(2.4, 8), (2.7, 9)])],
      dtype=[('eventNumber', '<u8'), ('jets', [('pt', '<f4'), ('nTracks', '<u8')], (2,))])

In [16]:
multidim_input_pathname = 'multidim_data.npy'

In [17]:
# export array
numpy.save(multidim_input_pathname, multidim_array)

## Query

In [18]:
multidim_dataset = ArrayDataSet(multidim_input_pathname)
multidim_array_stream = multidim_dataset.AsNumpyArray()

**Query to get one property across all jets:**

In [19]:
# SelectMany flattens the per-event `jets` entries into one sequence of jets
# (the output below has 10 values: 5 events x 2 jets); Select then takes each jet's pt.
multidim_pt_query = multidim_array_stream.SelectMany("lambda e: e.jets").Select("lambda j: j.pt")

In [20]:
# run query
multidim_output = multidim_pt_query.value()

**Output array:**

In [21]:
multidim_output

array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7], dtype=float32)

In [22]:
# cleanup
os.remove(multidim_input_pathname)

# Examples with awkward array

## Array setup

In [23]:
import awkward

**Use lists and dictionaries to import as an awkward array:**

In [24]:
# One plain-Python record per event. The `jets` list can hold any number of
# entries, including none (event 1).
awkward_events = [
    {
        'eventNumber': 0,
        'jets': [
            {'pt': 250000, 'nTracks': 3},
            {'pt': 10000, 'nTracks': 2},
        ],
    },
    {
        'eventNumber': 1,
        'jets': [],
    },
    {
        'eventNumber': 2,
        'jets': [
            {'pt': 70000, 'nTracks': 10},
        ],
    },
    {
        'eventNumber': 3,
        'jets': [
            {'pt': 20000, 'nTracks': 1},
        ],
    },
]
# Convert the list of records into an awkward array.
awkward_array = awkward.fromiter(awkward_events)

Array contents:

In [25]:
awkward_array.tolist()

[{'eventNumber': 0,
  'jets': [{'nTracks': 3, 'pt': 250000}, {'nTracks': 2, 'pt': 10000}]},
 {'eventNumber': 1, 'jets': []},
 {'eventNumber': 2, 'jets': [{'nTracks': 10, 'pt': 70000}]},
 {'eventNumber': 3, 'jets': [{'nTracks': 1, 'pt': 20000}]}]

**Note that _jets_ now has a variable length.**

In [26]:
awkward_input_pathname = 'awkward_data.awkd'

In [27]:
# export array
awkward.save(awkward_input_pathname, awkward_array, mode='w')

## Query

In [28]:
awkward_dataset = ArrayDataSet(awkward_input_pathname)
awkward_array_stream = awkward_dataset.AsAwkwardArray()

**Get _nTracks_ for each jet this time:**

In [29]:
# Same SelectMany/Select pattern as the numpy example, now over variable-length
# jet lists: the output below has 4 values (2 + 0 + 1 + 1 jets across the four events).
awkward_nTracks_query = awkward_array_stream.SelectMany("lambda e: e.jets").Select("lambda j: j.nTracks")

In [30]:
# run query
awkward_output = awkward_nTracks_query.value()

**Output array contents:**

In [31]:
awkward_output

array([ 3,  2, 10,  1])

A more complicated query: a `Where` condition uses `len` to count the jets in each event with pt > 50 GeV (the stored pt values are divided by 1000 before the comparison), and the event number is then selected only for events with at least one such jet:

In [32]:
# This might actually be breaking some of the rules of the frontend... but it is fairly compact
# pt is divided by 1000.0 before being compared with 50 (the GeV threshold from the text).
# NOTE(review): in plain Python/NumPy, len() of a boolean mask is the total
# number of jets, not the number passing the cut. The output shown below
# ([0, 2], which excludes event 3 and its single 20000-pt jet) implies the
# backend counts only the passing jets here — confirm this is intended
# behaviour and not an accident of the translation.
awkward_where_query = awkward_array_stream.Where('lambda e: len(e.jets.pt / 1000.0 > 50) > 0').Select('lambda e1: e1.eventNumber')

In [33]:
# run query
awkward_output = awkward_where_query.value()

**Output array contents:**

In [34]:
awkward_output

array([0, 2])

In [35]:
# cleanup
os.remove(awkward_input_pathname)

# Example with a real ntuple

Test that we can get out a branch of an analysis ntuple:

In [36]:
# NOTE(review): hard-coded absolute path to a ROOT ntuple; this cell and the
# query cells that follow only run on a machine where this file exists.
# Here AsAwkwardArray() is used on a .root file, whereas the earlier example
# used it on a .awkd file.
real_awkward_dataset = ArrayDataSet('/LLPData/Outputs/CalRatio_ntuples2016dataAnalysis_paperVersion/signal_HSS_LLP_mH125_mS25_lt9m_ntuples2016dataAnalysis_paperVersion.root')
real_awkward_array_stream = real_awkward_dataset.AsAwkwardArray()

In [37]:
real_awkward_query = real_awkward_array_stream.Select('lambda e: e.eventNumber')

In [38]:
real_awkward_query.value()

array([102474, 102256, 102530, ..., 101286, 101027, 101189], dtype=uint64)