In [3]:
import os
import sys
# Make the directory two levels above the current working directory importable.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

# Setup only: this cell just supports the rest of the workbook, so you can skip reading it.

# Example "do" folder, located directly under the current working directory.
EXAMPLE_DO_FOLDER = os.path.join(os.getcwd(), "example_do_folder")
# Sub-folder ("hello/do_examples") whose files show_loadable() reads and prints.
INSERTION_FOLDER = os.path.join(EXAMPLE_DO_FOLDER, "hello/do_examples")

def show_loadable(at):
    """Print the text of a file under INSERTION_FOLDER, with a '  | ' gutter on every line.

    Args:
        at: path of the file relative to INSERTION_FOLDER.
    """
    module_path = os.path.join(INSERTION_FOLDER, at)
    with open(module_path) as handle:
        text = handle.read()
    # Prefixing every line break with the gutter is the same as split-then-join on "\n".
    barred = text.replace("\n", "\n  | ")
    label = repr(".../" + at)
    print(f"\n### SHOWING MODULE {label}\n  | {barred}\n\n")

def run_do(*args, **kwargs):
    """Echo a `do(...)` call, run it, and print the repr of what it returned.

    Positional and keyword arguments are forwarded to `do` unchanged.  Nothing is
    returned; the result is only printed.
    """
    positional = [repr(value) for value in args]
    keyword = [f"{name}={value!r}" for name, value in kwargs.items()]
    call_text = ", ".join(positional + keyword)
    print(f"do({call_text})")
    outcome = do(*args, **kwargs)
    print(f"--> {outcome!r}")
    print()


# Report the folders this workbook expects to use.
print(F"### INSERTION FOLDER        = {INSERTION_FOLDER!r}")
print(F"### DO FOLDER (in Jupyter)  = {EXAMPLE_DO_FOLDER}")

# Put the directory two levels above EXAMPLE_DO_FOLDER on sys.path before importing ml_dat
# (presumably the repository root that contains the ml_dat package -- confirm).
sys.path.append(os.path.dirname(os.path.dirname(EXAMPLE_DO_FOLDER)))
from ml_dat import do, DoManager, dat_config   # Add all loadables BEFORE loading this module
do.set_do_folder(dat_config.do_folder)
print(F"## DO FOLDER (in do module) = {do.do_folder!r}")

# Flag a mismatch between the folder this notebook assumes and the one `do` reports.
if EXAMPLE_DO_FOLDER != do.do_folder:
    print(f"WARNING: EXAMPLE_DO_FOLDER used by jupyter = {EXAMPLE_DO_FOLDER} does not match {do.do_folder}")

# Create the insertion folder when it is missing.  The warning is printed instead of being
# passed to input(): a blocking prompt stalls "Restart & Run All" and raises on kernels that
# have no stdin (e.g. nbconvert / papermill).  exist_ok=True keeps the cell from failing if
# the folder appears between the existence check and the creation.
if not os.path.exists(INSERTION_FOLDER):
    print(f"WARNING: INSERTION_FOLDER {INSERTION_FOLDER!r} not found.")
    os.makedirs(INSERTION_FOLDER, exist_ok=True)
print("\n\n\n\n")




### INSERTION FOLDER        = '/Users/oblinger/ob/proj/ml-dat/examples/example_do_folder/hello/do_examples'
### DO FOLDER (in Jupyter)  = /Users/oblinger/ob/proj/ml-dat/examples/example_do_folder
## DO FOLDER (in do module) = '/Users/oblinger/ob/proj/ml-dat/examples/example_do_folder'







# Example Data

### Imports

In [4]:
from ml_dat import dat_tools as dt
from ml_dat import Inst, do

### Some Example Insts
These Insts and point_fns are used to build and display DataFrames.


In [5]:
# Two example Insts, each built from a spec dict and a folder path under /tmp.

# inst1: the spec holds only a "main" section.
TMP_PATH1 = "/tmp/job_test1"
spec1 = {"main": {"my_key1": "my_val1", "my_key2": "my_val2"}}
inst1 = Inst(spec=spec1, path=TMP_PATH1)

# inst2: the same "main" section plus an extra top-level "other" entry.
TMP_PATH2 = "/tmp/job_test2"
spec2 = {"main": {"my_key1": "my_val1", "my_key2": "my_val2"}, "other": "key_value"}
inst2 = Inst(spec=spec2, path=TMP_PATH2)


### Some example point fns

In [6]:
def always_17(_inst):
    """Example point function: ignores its argument and yields the constant 17."""
    constant_metric = 17
    return constant_metric


def always_18(_inst):
    """Example point function: ignores its argument and yields the constant 18."""
    constant_metric = 18
    return constant_metric


.

## Creating DataFrames from Insts and point_fns

#### Zero point_fns applied to zero Insts yields an empty DataFrame

In [7]:
dt.from_inst([], [])  # No Insts and no point_fns: an empty DataFrame comes back

#### 2 metrics applied to 2 insts yields a DataFrame w/ 2 rows & 2 cols

In [8]:
# always_17 and always_18 are the point functions defined in the
# "Some example point fns" cell above; they are reused here rather than redefined,
# so there is a single definition of each in the notebook.
dt.from_inst([inst1, inst2], [always_17, always_18])

Unnamed: 0,always_17,always_18,list
0,17,18,job_test1
1,17,18,job_test2


#### Metric functions can also be expressed as strings and loaded using 'do'

In [12]:
# Register the example module under the short name "registered_cube", then refer to
# one of its functions by the dotted string "registered_cube.always_5".
do.register_module(
    "registered_cube",
    "example_do_folder.df_tools_examples.cube_hello",
    allow_redefine=True,
)
dt.from_inst([inst1], ["registered_cube.always_5"])

Unnamed: 0,always_5,list
0,5,job_test1


#### Multi-valued Metric Functions
Metric functions (also called point_fns) can return a dict of metric values instead of just returning a single value.

In [15]:
# Mix a scalar-valued point_fn with one that returns a dict; in the output below
# the dict keys ("val1", "val2") show up as their own columns.
fns = [always_17, lambda inst: {"val1": 111, "val2": 2222}]
dt.from_inst([inst1, inst2], point_fns=fns)

Unnamed: 0,always_17,val1,val2,list
0,17,111,2222,job_test1
1,17,111,2222,job_test2


#### Multi-POINT Metric Functions
Finally, a point_fn can return a LIST of dicts, where each dict represents a row in the DataFrame.  In all cases, all point_fns are applied to all Insts, and the return values are appended to the DataFrame.  These different kinds of point functions can be combined: scalar values and dict values are merged into a single row for each Inst, and lists of dicts are expanded into multiple rows.

In [16]:
# The lambda returns a LIST of dicts; in the output below each dict becomes its own row,
# in addition to the row that holds the scalar always_17 / always_18 values.
point_fns = [always_17, always_18, lambda inst: [{"val1": 1, "val2": 2}, {"val3": 3}]]
dt.from_inst([inst1], point_fns)

Unnamed: 0,val1,val2,list,val3,always_17,always_18
0,1.0,2.0,job_test1,,,
1,,,job_test1,3.0,,
2,,,job_test1,,17.0,18.0


Above you can see that the results of the always_17 and always_18 functions were merged into a single row as before, and the lambda returned two more data points, expressed as two additional rows in the DataFrame.

# Excel Output
