In [1]:
import kosh
from sina.utils import DataRange, has_all, has_any, all_in, any_in
import os
import numpy as np
import random

# Simulation Workflow

This notebook will take a user through a simple simulation workflow with an inner loop (single simulations) and outer loop (grouping simulations into an ensemble).

## Create Overall Store

This will house all the simulation data

In [2]:
# Start from a clean store so the notebook can be re-run from the top.
store_path = 'my_store.sqlite' # "/usr/workspace/group_dir/group.sqlite"
try:
    os.remove(store_path)
except FileNotFoundError:
    # First run: there is no previous store file to delete.
    pass
store = kosh.connect(store_path)

  import pkg_resources


## Inner Loop

This inner loop contains the data for the individual simulations

### Create Dataset for a Simulation

We create our first dataset and add `metadata` to it which can be used to filter datasets using the `store.find()` method later on. These are single value items and are dataset attributes.

See [Example_00_Open_Store_And_Add_Datasets.ipynb](Example_00_Open_Store_And_Add_Datasets.ipynb) for more examples on how to add data. 

See [Example_04_Schemas.ipynb](Example_04_Schemas.ipynb) for examples on how to set up a schema for the metadata.

In [3]:
# ---------------------------------------------------------------
# Option 1: build a dataset from scratch
# ---------------------------------------------------------------
# Single-value attributes/metadata that store.find() can filter on later.
my_metadata = {
    'param4': 1,
    'param5': 'test',
    'param6': 3.14,
}

# The name is what lets us find this dataset again later on.
dataset_from_scratch = store.create(
    name='My Example Dataset From Scratch',
    metadata=my_metadata,
)

print("From Scratch:\n", dataset_from_scratch, "\n")

######################################################
# You can also import a dataset from various formats #
######################################################

#########################
# Sina Record object(s) #
#########################
import sina
from sina.model import CurveSet

# Candidate values for the randomly chosen 'mode' and 'machine' entries
# (also reused by the pandas example further down in this cell)
possible_mode = ["quick", "standard", "test", "high-def"]
possible_machine = ["Quartz", "Catalyst", "local", "Sierra", "Lassen", "Ruby"]

# Build a Sina record carrying four scalar data entries
record = sina.model.Record(id="sina_rec_0", type="foo_type")
record.add_data('total_energy', random.randint(0, 1000) / 10.0)
record.add_data('start_time', 0)
record.add_data('mode', random.choice(possible_mode))
record.add_data('machine', random.choice(possible_machine))

# One curve set: "time" is the independent curve, the two densities are
# dependent curves holding 4 distinct random integers drawn from 1..9
cs1 = CurveSet("quick_sample")
cs1.add_independent("time", [1, 2, 3, 4])
cs1.add_dependent("local_density", random.sample(range(1, 10), 4))
cs1.add_dependent("est_overall_density", random.sample(range(1, 10), 4))
record.add_curve_set(cs1)

# Import the record into the store; the result is indexed below to print the first dataset
datasets_from_sina_record = store.import_dataset(record)

print("From Sina Record:\n", datasets_from_sina_record[0], "\n")

####################
# Pandas DataFrame #
####################
import pandas as pd

num_rows = 10

# One dataset per row, one attribute/metadata entry per column.
# Without an 'id' column each dataset is given a random id via uuid.uuid4().hex
data = dict(
    total_energy=np.random.randint(0, 1000, size=num_rows) / 10.0,
    start_time=np.random.rand(num_rows),
    mode=np.random.choice(possible_mode, size=num_rows),
    D=np.random.choice(possible_machine, size=num_rows),
)
df = pd.DataFrame(data)

# This is a list of datasets if more than one row
datasets_from_pandas = store.import_dataset(
    df,
    match_attributes=['id'],
)

print("From Pandas:\n", datasets_from_pandas[0][0], "\n")

############
# CSV File #
############
# One dataset per row, one attribute/metadata entry per column.
# Without an 'id' column each dataset is given a random id via uuid.uuid4().hex
datasets_from_csv = store.import_dataset(
    "../tests/baselines/csv/my_csv_file.csv"
)  # This is a list of datasets if more than one row

print("From CSV:\n", datasets_from_csv[0][0], "\n")

#############
# Sina HDF5 #
#############
# A single hdf5 file can hold multiple records
datasets_from_sina_hdf5 = store.import_dataset(
    "sina_rec.hdf5",
    match_attributes=['id'],
)
print("From Sina HDF5:\n", datasets_from_sina_hdf5[0], "\n")

#############
# Sina JSON #
#############
# Keep the first imported dataset; the following cells work on `dataset`.
dataset = store.import_dataset(
    "sina_curve_rec.json",
    match_attributes=['id'],
)[0]

# Copy every entry of my_metadata onto the dataset as an attribute.
for key, val in my_metadata.items():
    setattr(dataset, key, val)

# The `metadata` will be in the attributes.
# The sina record already has some metadata pre-populated.
# It also has some associated data which will be discussed in the upcoming sections.
print("From Sina JSON:\n", dataset)

From Scratch:
 KOSH DATASET
	id: d0996640cf2d438c8e6e4c8949ae0575
	name: My Example Dataset From Scratch
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: My Example D... From Scratch
	param4: 1
	param5: test
	param6: 3.14
--- Associated Data (0)---
--- Ensembles (0)---
	[]
--- Ensemble Attributes ---
--- Alias Feature Dictionary --- 

From Sina Record:
 KOSH DATASET
	id: d0996640cf2d438c8e6e4c8949ae0575
	name: My Example Dataset From Scratch
	creator: moreno45

--- Attributes ---
	creator: moreno45
	machine: Catalyst
	mode: standard
	name: My Example D... From Scratch
	param4: 1
	param5: test
	param6: 3.14
	start_time: 0
	total_energy: 18.8
--- Associated Data (1)---
	Mime_type: sina/curve
		internal ( quick_sample )
--- Ensembles (0)---
	[]
--- Ensemble Attributes ---
--- Alias Feature Dictionary --- 

From Pandas:
 KOSH DATASET
	id: 57e8a2e6df3e44a68d675977e91712ba_0
	name: ???
	creator: ???

--- Attributes ---
	D: Lassen
	mode: high-def
	start_time: 0.444973640081527

### Adding Data to a Dataset

You can also add time history data to the dataset. Time history data is grouped into `curve_sets`, each with one `independent` curve and one or more `dependent` curves.

See [Example_01_Add_Data_To_Datasets.ipynb](Example_01_Add_Data_To_Datasets.ipynb) for more examples on how to add curve sets.

In [4]:
# Example with a numpy array:
# dataset.add_curve(np.array(my_array).tolist(), 'my_time_series', 'x_pos')
dataset.add_curve([1, 2, 3, 4], "my_curves", "time")
dataset.add_curve([2.3, 3.4, 5.6, 7.8], "my_curves", "some_variable")
dataset.add_curve([3, 4, 5], "my_other_curves", "time")

# Derive a single value from a curve and store it in the metadata
some_variable_mean = float(
    np.mean(dataset['my_curves/some_variable'][:])
)
setattr(dataset, 'some_variable_mean', some_variable_mean)

# The `curve_sets` are listed in the associated data section under `mime_type="sina/curve"`.
print(dataset)

KOSH DATASET
	id: obj1
	name: ???
	creator: ???

--- Attributes ---
	param1: 1
	param2: 2
	param3: 3.3
	param4: 1
	param5: test
	param6: 3.14
	some_variable_mean: 4.7749999999999995
--- Associated Data (2)---
	Mime_type: image/png
		foo.png ( obj1 )
	Mime_type: sina/curve
		internal ( timeplot_1, my_curves, my_other_curves )
--- Ensembles (0)---
	[]
--- Ensemble Attributes ---
--- Alias Feature Dictionary ---


### Associate Data to a Dataset

Associating data to a dataset means that you can reference that file and its data. The dataset doesn't store any of the data when you associate a file; it only references the file, which means that if the associated file is deleted, the data will no longer be "in" the dataset.

Below are a couple of common file formats and how to associate them to the correct `loader` through the `mime_type` so Kosh knows how to access the data. See [Example_02_Read_Data.ipynb](Example_02_Read_Data.ipynb) for more examples on how to associate data. 

If there is a file format that is not supported by Kosh, you can create your own custom loader [Example_Custom_Loader.ipynb](Example_Custom_Loader.ipynb).

In [5]:
# hdf5
# Associate an HDF5 file together with some metadata describing the file itself
dataset.associate("../tests/baselines/node_extracts2/node_extracts2.hdf5",
                  mime_type="hdf5",
                  metadata={"param10": "my value",
                            "my other param": "Example Text"},
                  absolute_path=False)

# csv
# The "pandas/*" mime types use the `pandas.read_*()` methods behind the scenes so you can pass in its arguments through `loader_kwargs`
dataset.associate("../tests/baselines/csv/my_csv_file.csv",
                  mime_type="pandas/csv",
                  metadata={"param20": "my other value",
                            "my param": 10},
                  loader_kwargs={'index_col': 0},
                  absolute_path=False)

# ultra
# NOTE(review): unlike the two calls above, absolute_path=False is not passed here,
# and the printed output lists this file with an absolute path - confirm this is intended
dataset.associate("my_ult_file.ult",
                  metadata={"param30": 45,
                            "my param": 560},
                  mime_type="ultra")

# These will show up in associated data with their corresponding `mime_type`
print(dataset)

KOSH DATASET
	id: obj1
	name: ???
	creator: ???

--- Attributes ---
	param1: 1
	param2: 2
	param3: 3.3
	param4: 1
	param5: test
	param6: 3.14
	some_variable_mean: 4.7749999999999995
--- Associated Data (5)---
	Mime_type: hdf5
		../tests/baselines/node_extracts2/node_extracts2.hdf5 ( b8a7bd3c354f4af0b2b612b87895a389 )
	Mime_type: image/png
		foo.png ( obj1 )
	Mime_type: pandas/csv
		../tests/baselines/csv/my_csv_file.csv ( 6a2390092c3f42fb96a869c67baa1fdc )
	Mime_type: sina/curve
		internal ( timeplot_1, my_curves, my_other_curves )
	Mime_type: ultra
		/g/g20/moreno45/Projects/ASCAML/kosh/examples/my_ult_file.ult ( 5aa6593ccfa749b29c50be6feb91be8f )
--- Ensembles (0)---
	[]
--- Ensemble Attributes ---
--- Alias Feature Dictionary ---


### Available Data

The commands below allow a user to see what data is available in a dataset

In [6]:
# Attributes can be a single value or a list
print('Attributes:')
print('\t', dataset.list_attributes())
print('\n')
# Feature sets: only for time history data
print('Features Sets:')
print('\t', dataset.list_features())

# With a lot of files and features, use_cache=True keeps Kosh from
# searching through all the files again:
# print('\t', dataset.list_features(use_cache=True))

Attributes:
	 ['id', 'param1', 'param2', 'param3', 'param4', 'param5', 'param6', 'some_variable_mean']


Features Sets:
	 ['my_curves', 'my_curves/some_variable', 'my_curves/time', 'my_other_curves', 'my_other_curves/time', 'timeplot_1', 'timeplot_1/feature_a', 'timeplot_1/feature_b', 'timeplot_1/time', 'cycles', 'direction', 'elements', 'node', 'node/metrics_0', 'node/metrics_1', 'node/metrics_10', 'node/metrics_11', 'node/metrics_12', 'node/metrics_2', 'node/metrics_3', 'node/metrics_4', 'node/metrics_5', 'node/metrics_6', 'node/metrics_7', 'node/metrics_8', 'node/metrics_9', 'zone', 'zone/metrics_0', 'zone/metrics_1', 'zone/metrics_2', 'zone/metrics_3', 'zone/metrics_4', 'id', 'name', 'creator', 'mynewattribute', 'myotherattribute', 'myparam10', 'myparam20', 'myparam30', 'myparam40', 'myparam50', 'myparam60', 'Gaussian (a: 5.0 w: 5.0 c: 0.0)', 'Gaussian (a: 5.0 w: 5.0 c: 50.0)', 'A + B', 'Straight Line (m: 0.125 b: -2.5 xmin: 60.0 xmax: 40.0)', 'a.y+numpy.random.normal(size=100)', '

### Accessing Data

Kosh knows which data is located in which file so all the user has to do is call the feature name to acquire that data. You can post-process this data and add it to the dataset through `setattr()` or `dataset.add_curve()`.

See [Example_05a_Transformers.ipynb](Example_05a_Transformers.ipynb) and [Example_05b_Transformers-SKL.ipynb](Example_05b_Transformers-SKL.ipynb) for more ways to post-process the data.

#### Whole File

In [7]:
# --- hdf5 ---
# Locate the associated file by mime type, then open it through its id
associated_hdf5 = list(dataset.find(mime_type="hdf5"))[0]
h5_file = dataset.open(Id=associated_hdf5.id)
print('HDF5')
print(h5_file, '\n\n')

# --- csv ---
# The "pandas/*" mime types use the `pandas.read_*()` methods behind the scenes,
# so opening the file gives a `pandas.DataFrame()`
associated_csv_pandas = list(dataset.find(mime_type="pandas/csv"))[0]
df = dataset.open(Id=associated_csv_pandas.id)
print('CSV')
print(df, '\n\n')

# --- ultra ---
# Returns a list of PyDV curve objects; the data is in curve.x and curve.y
# (see curve.__dict__ for more info)
associated_ultra = list(dataset.find(mime_type="ultra"))[0]
ultra = dataset.open(Id=associated_ultra.id)
print('ULTRA')
print(ultra, '\n\n')

HDF5
<HDF5 file "node_extracts2.hdf5" (mode r)> 


CSV
                                  id          name  \
0   c40699ca067a4e29ba0f25470cf29e57   new_dataset   
1   21ef0cd592d24c5c8fcc89b066fb7418            15   
2   f970bd06a67a4956bd475fc48bd5a214             8   
3   3a79fc992f664a88a08151826f352cdf  new_dataset2   
4   6c35fb8c25034483af7031fada507062            16   
5   b634fce83dfd4e73b1f7f92c43b5ee1d             7   
6   1c14f05f44d846499b390fd435827f7d             2   
7   95d16b109ab64c6680af6f0bdc02aa00             9   
8   c0215e9c4080430da59979f71009c045            14   
9   b1e66c735ac6483d88fe041ab70dab2c             0   
10  4b77902d818a4980b8e55ad2106cd73e            21   
11  8554af7b60a34492a202a5f6fd468da1             5   
12  f05a324c3693494b8d0692ee6fd4b4bc             4   
13  ea0e89d9cb9a476d8f6dd683e8fc2666             1   
14  c941ee7ecd0a48b48ef6c3bad13c0216            11   
15  50677728c8dd4844b54f179ea23f09cf            10   
16  373fbc3883504fcb92c9b9c

#### Multiple Features

In [8]:
# hdf5
# Passing a list of feature names requests several features at once
data1 = dataset[["node/metrics_5",'node/metrics_11']][:]
print('HDF5')
print(data1,'\n\n')

# csv
# The "pandas/*" mime types use the `pandas.read_*()` methods behind the scenes so you will get a `pandas.DataFrame()`
data2 = dataset[["creator", "myparam40"]][:]
print('CSV')
print(data2,'\n\n')

# ultra
# returns a list with PyDV curve objects where data is in curve.x and curve.y see curve.__dict__ for more info
# NOTE(review): 'O + R' does not appear in the (truncated) feature listing printed
# earlier in this notebook, which shows 'A + B' - confirm this curve exists in my_ult_file.ult
data3 = dataset[['Gaussian (a: 5.0 w: 5.0 c: 0.0)', 'O + R']][:]
print('ULTRA')
print(data3,'\n\n')

HDF5
[<HDF5 dataset "metrics_5": shape (2, 18), type "<f4">, <HDF5 dataset "metrics_11": shape (2, 18), type "<f4">] 


CSV
                             creator  myparam40
0   9b7d60f394284459a1ae979bb0af019f        NaN
1   9b7d60f394284459a1ae979bb0af019f   0.950918
2   9b7d60f394284459a1ae979bb0af019f   2.270797
3   9b7d60f394284459a1ae979bb0af019f        NaN
4   9b7d60f394284459a1ae979bb0af019f   2.466016
5   9b7d60f394284459a1ae979bb0af019f   0.020335
6   9b7d60f394284459a1ae979bb0af019f   1.064911
7   9b7d60f394284459a1ae979bb0af019f   0.108305
8   9b7d60f394284459a1ae979bb0af019f   1.364071
9   9b7d60f394284459a1ae979bb0af019f   0.686090
10  9b7d60f394284459a1ae979bb0af019f   2.985315
11  9b7d60f394284459a1ae979bb0af019f   0.454322
12  9b7d60f394284459a1ae979bb0af019f   0.371130
13  9b7d60f394284459a1ae979bb0af019f   0.709315
14  9b7d60f394284459a1ae979bb0af019f   2.333544
15  9b7d60f394284459a1ae979bb0af019f   2.803816
16  9b7d60f394284459a1ae979bb0af019f   2.337639
17  9b7d60f3

#### Single Feature

In [9]:
# --- hdf5 ---
# A single feature name requests just that feature
data1 = dataset["node/metrics_5"][:]
print('HDF5')
print(data1, '\n\n')

# --- csv ---
# The "pandas/*" mime types use the `pandas.read_*()` methods behind the scenes,
# so you will get a `pandas.DataFrame()`
data2 = dataset["creator"][:]
print('CSV')
print(data2, '\n\n')

# --- ultra ---
# PyDV curve data lives in curve.x and curve.y (see curve.__dict__ for more info)
data3 = dataset['Gaussian (a: 5.0 w: 5.0 c: 0.0)'][:]
print('ULTRA')
print(data3, '\n\n')

HDF5
<HDF5 dataset "metrics_5": shape (2, 18), type "<f4"> 


CSV
                             creator
0   9b7d60f394284459a1ae979bb0af019f
1   9b7d60f394284459a1ae979bb0af019f
2   9b7d60f394284459a1ae979bb0af019f
3   9b7d60f394284459a1ae979bb0af019f
4   9b7d60f394284459a1ae979bb0af019f
5   9b7d60f394284459a1ae979bb0af019f
6   9b7d60f394284459a1ae979bb0af019f
7   9b7d60f394284459a1ae979bb0af019f
8   9b7d60f394284459a1ae979bb0af019f
9   9b7d60f394284459a1ae979bb0af019f
10  9b7d60f394284459a1ae979bb0af019f
11  9b7d60f394284459a1ae979bb0af019f
12  9b7d60f394284459a1ae979bb0af019f
13  9b7d60f394284459a1ae979bb0af019f
14  9b7d60f394284459a1ae979bb0af019f
15  9b7d60f394284459a1ae979bb0af019f
16  9b7d60f394284459a1ae979bb0af019f
17  9b7d60f394284459a1ae979bb0af019f
18  9b7d60f394284459a1ae979bb0af019f
19  9b7d60f394284459a1ae979bb0af019f
20  9b7d60f394284459a1ae979bb0af019f
21  9b7d60f394284459a1ae979bb0af019f
22  9b7d60f394284459a1ae979bb0af019f
23  9b7d60f394284459a1ae979bb0af019f
24  9b7d6

#### Describe Feature

In [10]:
# --- hdf5 ---
print('HDF5')
print(
    dataset.describe_feature(Id=associated_hdf5.id, feature="node/metrics_5"),
    "\n\n",
)

# --- csv ---
# The "pandas/*" mime types use the `pandas.read_*()` methods behind the scenes,
# so you will get `pandas.DataFrame.describe()`
print('CSV')
print(
    dataset.describe_feature(Id=associated_csv_pandas.id, feature="creator"),
    "\n\n",
)

# --- ultra ---
print('ULTRA')
print(
    dataset.describe_feature(Id=associated_ultra.id,
                             feature='Gaussian (a: 5.0 w: 5.0 c: 0.0)'),
    "\n\n",
)

HDF5
{'size': (2, 18), 'format': 'hdf5', 'type': dtype('<f4'), 'dimensions': [{'name': 'cycles', 'first': np.int64(11), 'last': np.int64(8), 'length': 2}, {'name': 'elements', 'first': np.int64(17), 'last': np.int64(15), 'length': 18}]} 


CSV
count                                   25
unique                                   1
top       9b7d60f394284459a1ae979bb0af019f
freq                                    25
Name: creator, dtype: object 


ULTRA
{'name': 'Gaussian (a: 5.0 w: 5.0 c: 0.0)', 'size': 10, 'first_time': np.float64(-15.0), 'last_time': np.float64(12.272727272727257), 'min': np.float64(0.0006170490204333978), 'max': np.float64(4.995410739193376), 'type': dtype('float64')} 




## Outer Loop

This outer loop groups the individual simulations into ensembles for organization purposes.

### Adding Dataset to Ensembles

Once there are a lot of simulations that have been completed and their datasets created, we can group them together.

See [Example_Ensembles.ipynb](Example_Ensembles.ipynb) for more information on ensembles.

In [11]:
# Look the ensemble up by name; the result list is empty if it does not exist yet
ensemble = list(store.find_ensembles(name="My Example Ensemble"))

if ensemble:
    # Already exists: take the first ensemble out of the find results
    ensemble = ensemble[0]
else:
    # Does not exist yet: create it
    ensemble = store.create_ensemble(
        name="My Example Ensemble",
        metadata={"root": "/root/path/for/ensemble",
                  "project": "Example"},
    )

# Add this dataset to the ensemble
ensemble.add(dataset)

print('----- Ensemble -----')
print(ensemble, "\n\n")  # This will display the ids of the datasets in the ensemble

print('----- Dataset -----')
print(dataset)  # This will now display the ensembles this dataset is a member of

----- Ensemble -----
KOSH ENSEMBLE
	id: ae36aa8f8e49479c93bef45d440f8347
	name: My Example Ensemble
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: My Example Ensemble
	project: Example
	root: /root/path/for/ensemble
--- Associated Data (0)---
--- Member Datasets (1)---
	['obj1'] 


----- Dataset -----
KOSH DATASET
	id: obj1
	name: ???
	creator: ???

--- Attributes ---
	param1: 1
	param2: 2
	param3: 3.3
	param4: 1
	param5: test
	param6: 3.14
	some_variable_mean: 4.7749999999999995
--- Associated Data (5)---
	Mime_type: hdf5
		../tests/baselines/node_extracts2/node_extracts2.hdf5 ( b8a7bd3c354f4af0b2b612b87895a389 )
	Mime_type: image/png
		foo.png ( obj1 )
	Mime_type: pandas/csv
		../tests/baselines/csv/my_csv_file.csv ( 6a2390092c3f42fb96a869c67baa1fdc )
	Mime_type: sina/curve
		internal ( timeplot_1, my_curves, my_other_curves )
	Mime_type: ultra
		/g/g20/moreno45/Projects/ASCAML/kosh/examples/my_ult_file.ult ( 5aa6593ccfa749b29c50be6feb91be8f )
--- Ensembles (1)---
	

### Adding Multiple Datasets to Ensembles with the same attributes and organizing with ensemble tags

Datasets also can be part of multiple ensembles and they can be further organized within a single ensemble using `ensemble_tags`. 

For example, say you want to add your train, validation, and test datasets to a single ensemble but need to organize them as such. Adding an attribute to the dataset would make that attribute the same across all ensembles but the train, validation, and test split is randomized for each ensemble. Adding an attribute to the ensemble would be at the ensemble level and thus you would need three ensembles one for train, validation, and test. `ensemble_tags` allow the user to organize the datasets within the ensemble.

We also use `inherit_attributes=False` so that datasets and ensembles, as well as different ensembles containing the same datasets, can have attributes with the same names; otherwise there will be a clash because the same attributes are seen more than once.

**Note:** If a dataset was added to another ensemble using the default parameter `inherit_attributes=True` and the new ensemble and/or dataset attributes have the same name, there will be a conflict. In order to fix this you need to update the special ensemble tag `'INHERIT_ATTRIBUTES'` to `False` for that other dataset ensemble relation. This means that the dataset attributes will no longer be tied to that other ensemble so there will no longer be a conflict. If the dataset belongs to multiple ensembles with `inherit_attributes=True` (and there are attribute conflicts), this will need to be done for all those different ensembles: `dataset.add_ensemble_tags(ensemble_id, {'INHERIT_ATTRIBUTES': False})`

In [12]:
# Create 20 datasets, each with six random integer attributes
temp_datasets = []
for i in range(20):
    metadata = {
        "param1": random.randint(0, 1),
        "param2": random.randint(-10, 10),
        "param3": random.randint(-100, 100),
        "param4": random.randint(-1000, 1000),
        "param5": random.randint(-10000, 10000),
        "param6": random.randint(-100000, 100000),
    }

    temp_dataset = store.create(id=f"ds_{i}", metadata=metadata)
    temp_datasets.append(temp_dataset)

# Create 10 ensembles and add every dataset to each of them with ensemble tags
for i in range(10):
    ensemble = store.create_ensemble(
        id=f"ens_{i}",
        metadata={"root": f"/root/path/for/ensemble{i}/",
                  "project": f"Example {i}"},
    )
    for j, temp_ds in enumerate(temp_datasets):
        # Tag by parity of the position, and by split:
        # positions 0-11 train, 12-15 validation, 16 and above test
        ensemble_tags = {
            "even_or_odd": "even" if j % 2 == 0 else "odd",
            "data_type": (
                "train data" if j <= 11
                else "validation data" if j <= 15
                else "test data"
            ),
        }

        ensemble.add(temp_ds, inherit_attributes=False, ensemble_tags=ensemble_tags)

# Show the last ensemble and the last dataset created above
print(ensemble)
print(temp_dataset)

KOSH ENSEMBLE
	id: ens_9
	name: Unnamed Ensemble
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: Unnamed Ensemble
	project: Example 9
	root: /root/path/for/ensemble9/
--- Associated Data (0)---
--- Member Datasets (20)---
	['ds_0', 'ds_1', 'ds_2', 'ds_3', 'ds_4', 'ds_5', 'ds_6', 'ds_7', 'ds_8', 'ds_9', 'ds_10', 'ds_11', 'ds_12', 'ds_13', 'ds_14', 'ds_15', 'ds_16', 'ds_17', 'ds_18', 'ds_19']
KOSH DATASET
	id: ds_19
	name: Unnamed Dataset
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: Unnamed Dataset
	param1: 0
	param2: 2
	param3: -82
	param4: 849
	param5: 2000
	param6: -48008
--- Associated Data (0)---
--- Ensembles (10)---
	['ens_0', 'ens_1', 'ens_2', 'ens_3', 'ens_4', 'ens_5', 'ens_6', 'ens_7', 'ens_8', 'ens_9']
--- Ensemble Attributes ---
	--- Ensemble ens_0 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_1 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensem

### Finding Datasets within Ensembles
We can use `ensemble.find_datasets()` to narrow down the search to datasets only within the ensemble instead of searching the whole store with `store.find()`. We can filter by attributes just as with `store.find()`, with the added benefit of filtering by `ensemble_tags`.

In [13]:
# Attribute filter: param1 equal to 1, param3 in the 0-100 range (upper bound inclusive)
target_data = {
    'param1': 1,
    'param3': DataRange(min=0, max=100, max_inclusive=True),
}
# Ensemble-tag filter: only datasets tagged as training data in this ensemble
target_ensemble_tags = {"data_type": "train data"}

found_datasets = list(
    ensemble.find_datasets(data=target_data, ensemble_tags=target_ensemble_tags)
)
for fd in found_datasets:
    # Set a new attribute on each of the found datasets
    setattr(fd, 'test_attr', 42)
    print(fd)



KOSH DATASET
	id: ds_7
	name: Unnamed Dataset
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: Unnamed Dataset
	param1: 1
	param2: -1
	param3: 58
	param4: -613
	param5: 7372
	param6: -91469
	test_attr: 42
--- Associated Data (0)---
--- Ensembles (10)---
	['ens_0', 'ens_1', 'ens_2', 'ens_3', 'ens_4', 'ens_5', 'ens_6', 'ens_7', 'ens_8', 'ens_9']
--- Ensemble Attributes ---
	--- Ensemble ens_0 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_1 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_2 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_3 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_4 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_5 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd'

### Finding Datasets within whole Store

You can also filter datasets at the overall store level

In [14]:
# Attribute filter applied across the whole store
target_data = {
    'param1': 1,
    'param3': DataRange(min=0, max=100, max_inclusive=True),
    'param6': DataRange(min=-1000, max=100000),
}

# Use list(store.find()) to get all datasets instead
found_datasets = list(store.find(data=target_data))

for fd in found_datasets:
    # Set a new attribute on each of the found datasets
    setattr(fd, 'total', 10)
    print(fd)


KOSH DATASET
	id: ds_17
	name: Unnamed Dataset
	creator: moreno45

--- Attributes ---
	creator: moreno45
	name: Unnamed Dataset
	param1: 1
	param2: 1
	param3: 87
	param4: 824
	param5: 5570
	param6: 94725
	total: 10
--- Associated Data (0)---
--- Ensembles (10)---
	['ens_0', 'ens_1', 'ens_2', 'ens_3', 'ens_4', 'ens_5', 'ens_6', 'ens_7', 'ens_8', 'ens_9']
--- Ensemble Attributes ---
	--- Ensemble ens_0 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_1 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_2 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_3 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_4 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	--- Ensemble ens_5 ---
		['project', 'root']
		--- Ensemble Tags ---
			['data_type', 'even_or_odd']
	---

### Converting `store.find()` method to Pandas DataFrame

You can also pass the same arguments you would give to `store.find()` to the `store.to_dataframe()` method to get the attributes of the filtered datasets. By default, it will always include ['id', 'name', 'creator'].

In [15]:
# 1) Every dataset in the store
df = store.to_dataframe()
print('All Datasets')
print(df, '\n\n')

# 2) Only the datasets matching the filter
df = store.to_dataframe(data=target_data)
print('Filtered Datasets')
print(df, '\n\n')

# 3) Matching datasets, restricted to specific columns
df = store.to_dataframe(
    data=target_data,
    data_columns=['param1', 'param6'],
)
print('Filtered Datasets with specific columns')
print(df, '\n\n')

All Datasets
                                    id             name  \
0                                ds_18  Unnamed Dataset   
1     6c35fb8c25034483af7031fada507062             16.0   
2     f03bfb80e755442bb59e46da9038aa3b             18.0   
3     c0215e9c4080430da59979f71009c045             14.0   
4     b1e66c735ac6483d88fe041ab70dab2c              0.0   
..                                 ...              ...   
57                                ds_8  Unnamed Dataset   
58    23cecefa740346d8970bca303ce0f38b             19.0   
59  281bdb406b99412880af991e9dd77047_3             <NA>   
60  57e8a2e6df3e44a68d675977e91712ba_6             <NA>   
61                                ds_0  Unnamed Dataset   

                             creator  data_type  my_other_att  passed  tempF  \
0   9b7d60f394284459a1ae979bb0af019f       <NA>          <NA>    <NA>   <NA>   
1   9b7d60f394284459a1ae979bb0af019f       <NA>          <NA>    <NA>   <NA>   
2   9b7d60f394284459a1ae979bb0af019f  

### Converting `dataset.find()` method to Pandas DataFrame

You can also pass the same arguments you would give to `dataset.find()` to the `dataset.to_dataframe()` method to get the attributes of the filtered associated files. By default, it will always include ['id', 'mime_type', 'uri', 'associated'].

In [16]:
# 1) Every file associated with the dataset
df = dataset.to_dataframe()
print('All Associated Files')
print(df, '\n\n')

# 2) Only the associated files whose metadata matches the filter
target_data = {'my param': 10}
df = dataset.to_dataframe(data=target_data)
print('Filtered Associated Files')
print(df, '\n\n')

# 3) Matching associated files, restricted to specific columns
df = dataset.to_dataframe(
    data=target_data,
    data_columns=['param20'],
)
print('Filtered Associated Files with specific columns')
print(df, '\n\n')

All Associated Files
                                 id   mime_type  \
0  6a2390092c3f42fb96a869c67baa1fdc  pandas/csv   
1  b8a7bd3c354f4af0b2b612b87895a389        hdf5   
2  5aa6593ccfa749b29c50be6feb91be8f       ultra   
3                              obj1        <NA>   

                                                 uri associated  \
0             ../tests/baselines/csv/my_csv_file.csv     [obj1]   
1  ../tests/baselines/node_extracts2/node_extract...     [obj1]   
2  /g/g20/moreno45/Projects/ASCAML/kosh/examples/...     [obj1]   
3                                               <NA>       <NA>   

                                            fast_sha  \
0  6ae16fcd8a5bfc197d94451a64e3aa76b3cdce1f2af548...   
1  2c0f45d3ab840e47510a3fc1e463884de3765191cb3d07...   
2  448a457f7344ece8c8be9b4ff383ab8258cb3401bc391e...   
3                                               <NA>   

                    loader_kwargs my other param my param param1   param10  \
0  €•       }”Œ\tindex_col

### Converting `ensemble.find()` method to Pandas DataFrame

You can also pass the same arguments you would give to `ensemble.find()` to the `ensemble.to_dataframe()` method to get the attributes of the filtered datasets within that specific ensemble. By default, it will always include ['id', 'name', 'creator'] as well as both the ensemble attributes and the ensemble tags, but these can be turned off.

In [17]:
# All Datasets in Ensemble
df = ensemble.to_dataframe()
print('All Datasets in Ensemble')
print(df,'\n\n')

# Filtered Datasets in Ensemble
target_data = {'param1': 1,
               'param3': DataRange(min=0, max=100, max_inclusive=True)}
target_ensemble_tags = {"data_type": "train data"}
df = ensemble.to_dataframe(data=target_data, ensemble_tags=target_ensemble_tags)
print('Filtered Datasets in Ensemble')
print(df,'\n\n')

# Specific columns without ensemble attributes or ensemble tags
# The columns requested are dataset attributes (param1..param6 were set when the
# datasets were created); 'param20' only exists on an associated file.
df = ensemble.to_dataframe(data=target_data, ensemble_tags=target_ensemble_tags,
                           data_columns=['param1', 'param3'],
                           include_ensemble_attributes=False, include_ensemble_tags=False)
print('Filtered Datasets in Ensemble with specific columns')
print(df,'\n\n')

All Datasets in Ensemble
       id             name                           creator  param1  param2  \
0   ds_18  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       1      -5   
1    ds_4  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       1      -9   
2   ds_17  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       1       1   
3    ds_5  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       1       0   
4   ds_12  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       0      -6   
5   ds_10  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       1      -1   
6   ds_19  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       0       2   
7   ds_11  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       0      -8   
8   ds_16  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       1       3   
9    ds_0  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       1       2   
10   ds_9  Unnamed Dataset  9b7d60f394284459a1ae979bb0af019f       0      -6   
11  ds_13  Unna



### Other Capabilities

#### Moving Datasets
If you want to move datasets around see [Example_07_Transferring_Datasets.ipynb](Example_07_Transferring_Datasets.ipynb) and [Example_Moving_Datasets.ipynb](Example_Moving_Datasets.ipynb).

#### Parallel Access to Kosh Store
If you are running a lot of simulations in parallel (e.g. through Maestro or Merlin) and need to access the Kosh store in parallel as well see [Example_ThreadSafe.ipynb](Example_ThreadSafe.ipynb).


## Using PyDV for Further Analysis

You can convert the features of a dataset into PyDV curves **even if they are NOT ultra files** by providing their x and y data to `pydvpy.makecurve()`. Then you can use PyDV for further analysis. 

See PyDV API Specification documentation:
* https://lc.llnl.gov/weave/pydv/html/pydv.html
* https://pydv.readthedocs.io/en/latest/pydv.html

In [18]:
# Import PyDV's Python interface, preferring the most current install
try:
    # Most Current
    import sys
    sys.path.append("/usr/gapps/pydv/current")
    import pydvpy
except ImportError:
    # PyPi or WEAVE Environment
    from pydv import pydvpy


curves = []

# Ultra files output PyDV curve objects by default as seen in the beginning of this tutorial
for ds in store.find():
    for associated_ultra in ds.find(mime_type="ultra"):
        print(associated_ultra.uri)
        # Open the file through the dataset it was found on (`ds`), not the
        # notebook-level `dataset`, so every dataset's ultra files can be read
        curves.extend(ds.open(Id=associated_ultra.id))


# Other data
# hdf5
data1 = dataset["node/metrics_5"][:]
print('HDF5')
print(data1,'\n\n')
# Use the first row as x and the second row as y of the new curve
curves.append(pydvpy.makecurve(x=data1[0],
                               y=data1[1],
                               name="node/metrics_5",
                               filename="../tests/baselines/node_extracts2/node_extracts2.hdf5",
                               record_id=dataset.id))

# csv
# The "pandas/*" mime types use the `pandas.read_*()` methods behind the scenes so you will get a `pandas.DataFrame()`
data2 = dataset[["myparam10", "myparam40"]][:]
print('CSV')
print(data2,'\n\n')
curves.append(pydvpy.makecurve(x=data2["myparam10"],
                               y=data2["myparam40"],
                               name="myparam40 vs myparam10",
                               filename="../tests/baselines/csv/my_csv_file.csv",
                               record_id=dataset.id))


print(curves)

# Data is in curve.x and curve.y see curve.__dict__ for more info
for curve in curves:
    print(curve.name)
    print("\t", curve.filename)
    # print(curve.__dict__)
    print()


/g/g20/moreno45/Projects/ASCAML/kosh/examples/my_ult_file.ult
HDF5
<HDF5 dataset "metrics_5": shape (2, 18), type "<f4"> 


CSV
    myparam10  myparam40
0         NaN        NaN
1    1.279276   0.950918
2    1.743174   2.270797
3         NaN        NaN
4    0.144626   2.466016
5    1.382974   0.020335
6    0.051635   1.064911
7    1.391787   0.108305
8    0.289573   1.364071
9    0.294249   0.686090
10   1.079470   2.985315
11   1.914257   0.454322
12   0.396960   0.371130
13   1.513962   0.709315
14   0.295615   2.333544
15   1.340525   2.803816
16   0.026997   2.337639
17   1.795573   2.264304
18   0.664151   1.470181
19   1.176025   2.899851
20   1.501699   1.566512
21   1.366581   2.970164
22   1.579366   2.014768
23   0.819499   2.752715
24   0.204605   0.826005 


[<curve.Curve object at 0x1554d3be4820>, <curve.Curve object at 0x1554d3be4e20>, <curve.Curve object at 0x1554d3be4970>, <curve.Curve object at 0x1554d3be4370>, <curve.Curve object at 0x1554d3be4d60>, <curve.Curve objec