In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Parsing the notebook

## Finding tags and filtering cells

In [None]:
#export
#default_exp om2
import re
regex_tag = re.compile(r"^\s*#([a-zA-Z_]+).*$")

def extract_tag(line):
    """Return the tag name if `line` starts with `#name`, otherwise None.

    Whitespace before the `#` is allowed, but the name has to follow the
    `#` directly, so "# nbx" is an ordinary comment and yields None.
    """
    found = regex_tag.match(line)
    return found.group(1) if found else None

In [None]:
assert extract_tag("#nbx ") == "nbx"
assert extract_tag("#nbx  something else ") == "nbx"
# A space between `#` and the name makes the line an ordinary comment, not a tag.
assert extract_tag("# nbx something else ") is None
assert extract_tag("#xarg ") == "xarg"

In [None]:
#export
def contains_tag(name):
    """Build a predicate telling whether a line starts with the tag `#name`."""
    def line_has_tag(line):
        return extract_tag(line) == name
    return line_has_tag

# Predicate for lines that start with the `#nbx` tag.
is_nbx = contains_tag("nbx")

In [None]:
assert is_nbx("#nbx")
# `# nbx` (with a space) is a plain comment, not the tag.
assert not is_nbx("# nbx")
# Leading whitespace before the tag is accepted.
assert is_nbx(" #nbx")
assert is_nbx("  #nbx")

In [None]:
#export 
def is_nbx_cell(cell):
    """Tell whether `cell` is a non-empty code cell whose first source line carries the `#nbx` tag."""
    is_code = cell['cell_type'] == 'code'
    # Short-circuit: `source` is only looked at for code cells.
    if not is_code or not cell['source']:
        return False
    return is_nbx(cell['source'][0])

In [None]:
#export
regex_magic =  re.compile(r"^\s*%{1,2}|^\s*!")

def is_magic_or_shell(line):
    """Tell whether `line` starts (after optional whitespace) with a
    Jupyter magic (`%` or `%%`) or a shell escape (`!`)."""
    return regex_magic.match(line) is not None

In [None]:
assert is_magic_or_shell("%pwd ")
assert is_magic_or_shell("%%capture ")
assert is_magic_or_shell("!ls")
# Ordinary code must not be mistaken for a magic, even if it contains `%` or `!`.
assert not is_magic_or_shell("x = 7 % 2")
assert not is_magic_or_shell("ok = a != b")

## Parsing "xargs"

We need to parse the line below `#xarg`, and decompose it into a variable declaration and the parameter range for the sweep.

In [None]:
#export
regex_xarg = re.compile(r"""
^
([^=]+)
=
([^;]+)
;?
(.*)
$""", re.VERBOSE)

def strip(s):
    """Return `s` without leading and trailing whitespace."""
    return s.strip()

def parse_xarg_expr(line):
    """Parse the line below an `xarg` tag into `(name, value, sweep)`.

    The expected shape is `name = value [; sweep]`, e.g.
        ('x', '0', '[1,2,3,4]') == parse_xarg_expr("x = 0; [1,2,3,4]")
    All three parts are returned as stripped strings; `sweep` is the empty
    string when nothing follows the value.

    Raises:
        ValueError: if `line` does not have the form `name = value`.
    """
    m = regex_xarg.match(line)
    if m is None:
        # Without this check the failure surfaced as an AttributeError on None.
        raise ValueError(
            f"Cannot parse xarg line {line!r}: expected `name = value [; sweep]`")
    name, val, sweep = map(strip, m.groups())
    return name, val, sweep

In [None]:
# Example: all three parts come back as stripped, unevaluated strings.
parse_xarg_expr("x = f(a) ; [f(1),f(a),6,8]")

('x', 'f(a)', '[f(1),f(a),6,8]')

## Parsing "nbx" cells

First, let's load the notebook.

In [None]:
#export
import json
from argparse import Namespace

class Bunch(object):
    """Minimal attribute container: the items of a dict become attributes."""
    def __init__(self, adict=None):
        # `None` replaces the shared mutable `{}` default; an omitted or
        # `None` argument yields an empty Bunch.
        if adict is not None:
            self.__dict__.update(adict)
        
    def __repr__(self):
        # Only the attribute names are shown, not their values.
        return str(self.__dict__.keys())

def load_nb(fname):
    """Load the notebook file `fname` (JSON, UTF-8) and return it as a `Bunch`.

    The top-level keys of the JSON document become attributes of the
    result; the file name is stored in the additional attribute `name`.
    """
    # Context manager so the file handle is closed, even if parsing fails.
    with open(fname, 'r', encoding="utf-8") as f:
        nbdict = json.load(f)
    nb = Bunch(nbdict)
    nb.name = fname
    return nb

In [None]:
nb = load_nb("om2.ipynb")
# Displays the Bunch repr, i.e. only the attribute names.
nb

dict_keys(['cells', 'metadata', 'nbformat', 'nbformat_minor', 'name'])

In [None]:
#export
def parse_src_with_parse_dict(a, src, parse_dict):
    """Sort the lines of `src` into the baskets of `a`.

    The first remaining line decides what happens next: if it starts with a
    tag that is a key of `parse_dict`, the handler stored under that tag is
    called, otherwise `parse_none`. Every handler takes `(a, src)` and
    returns `(a, rest)`, where `rest` holds the lines it did not consume.

    Args:
        a: dict of baskets (lists) that the handlers append to.
        src: sequence of source lines.
        parse_dict: maps tag names to handlers.

    Returns:
        `(a, [])` once all lines are consumed.
    """
    # Loop instead of one recursive call per line: the recursion ran into
    # Python's recursion limit on long cells.
    rest = src
    while len(rest) > 0:
        tag = extract_tag(rest[0])
        if tag is None or tag not in parse_dict:
            a, rest = parse_none(a, rest)
        else:
            a, rest = parse_dict[tag](a, rest)

    return a, []


def parse_none(a, src):
    """Handle an untagged first line: store it in `a['none']` unless it is a
    magic or shell line, then hand back the remaining lines."""
    head = src[0]
    if is_magic_or_shell(head):
        return a, src[1:]
    a['none'].append(head)
    return a, src[1:]

def parse_nbx(a, src):
    """Store the first line of `src` in `a["nbx"]` and return `a` with the remaining lines."""
    first_line = src[0]
    a["nbx"].append(first_line)
    return a, src[1:]

def parse_xarg(a, src):
    """Store the line below the tag line (`src[1]`) in `a["xarg"]` and
    return `a` with the lines after it."""
    line_below = src[1]
    a["xarg"].append(line_below)
    return a, src[2:]

def parse_xuse(a, src):
    """Store the line below the tag line (`src[1]`) in `a["xuse"]` and
    return `a` with the lines after it."""
    line_below = src[1]
    a["xuse"].append(line_below)
    return a, src[2:]

def consume_line_below(tag, basket=None):
    """Build a handler for `tag` that collects the line following the tag line.

    Args:
        tag: name of the tag the handler is meant for (used as the default
            basket name and in error messages).
        basket: key of the list in `a` that receives the line; defaults
            to `tag`.

    Returns:
        A function `parse(a, src)` that appends `src[1]` to `a[basket]` and
        returns `(a, src[2:])`. It raises IndexError when the tag line is the
        last line of `src`.
    """
    if basket is None: basket = tag
    
    def parse(a, src):
        if len(src) < 2:
            # Same exception type as the bare `src[1]` lookup would raise,
            # but with a message that names the offending tag.
            raise IndexError(
                f"Tag `#{tag}` is the last line of its cell; "
                f"it must be followed by the line to consume")
        a[basket].append(src[1])
        rest = src[2:]
        return a, rest
    
    return parse


# Default handlers: `#xarg` and `#ximp` each claim the line below them.
PARSE_DICT = {
    'xarg': consume_line_below('xarg'),
    'ximp': consume_line_below('ximp')
}
    
def parse_nbx_cell_with_parse_dict(cell, parse_dict=PARSE_DICT):
    """Parse the source lines of one cell.

    Returns a dict with one list per tag in `parse_dict`, plus the list
    `'none'` that `parse_src_with_parse_dict` fills with the untagged lines.
    """
    baskets = {tag: [] for tag in parse_dict.keys()}
    baskets['none'] = []
    parsed, _ = parse_src_with_parse_dict(baskets, cell['source'], parse_dict)
    return parsed


In [None]:
nb = load_nb("om2.ipynb")
# Parse every `#nbx` cell of this notebook and print the baskets it was split into.
for cell in filter(is_nbx_cell, nb.cells):
    print("*****************\n** Parsed Cell **\n*****************")
    a = parse_nbx_cell_with_parse_dict(cell)
    for key, vals in a.items():
        print(f">> Parsed... {key}'s: ... ")
        # Plain loop: a list comprehension used only for printing builds a throwaway list.
        for v in vals:
            print(v.strip())

*****************
** Parsed Cell **
*****************
>> Parsed... xarg's: ... 
x = 0 ; [0,1,2,3,4]
y = 0 ;
task_id = 0
results_dir = "./"
>> Parsed... ximp's: ... 
import numpy as np
from numpy import *
>> Parsed... none's: ... 
#nbx







#watch
z = 1
*****************
** Parsed Cell **
*****************
>> Parsed... xarg's: ... 
>> Parsed... ximp's: ... 
>> Parsed... none's: ... 
#nbx
print("some result")


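Let's see how it works. The following two cells are tagged with `#nbx`; they are the cells that were parsed in the output above.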

In [None]:
#nbx
#ximp
import numpy as np


#ximp
from numpy import *


#xarg 
x = 0 ; [0,1,2,3,4]  
    
#xarg 
y = 0 ;

#xarg
task_id = 0
#xarg
results_dir = "./"

#watch
z = 1

In [None]:
#nbx
print("some result")

some result


## Parsing the whole thing

In [None]:
#export
from functools import reduce

def concat(list1, list2):
    """Return `list1 + list2`."""
    return list1 + list2

def unzip(zipped):
    """Inverse of `zip`: return an iterator over the transposed items of `zipped`."""
    return zip(*zipped)

def negate(func):
    """Return a one-argument function that computes `not func(x)`."""
    def negated(x):
        return not func(x)
    return negated

def is_constarg(a):
    """Tell whether the xarg triple `a` has an empty third entry, i.e. no sweep."""
    sweep = a[2]
    return not len(sweep)

# Complement of `is_constarg`: true for triples with a non-empty third entry.
not_constarg = negate(is_constarg)

def get_item(i):
    """Return a function that maps `x` to `x[i]`."""
    def pick(x):
        return x[i]
    return pick

def get_items(*I):
    """Return a function that maps `x` to the tuple `(x[i] for i in I)`."""
    def pick_all(x):
        return tuple(x[i] for i in I)
    return pick_all

In [None]:
#export
def parse_nb_with_parse_dict(nb, parse_dict=PARSE_DICT):
    """Parse all `#nbx` cells of a loaded notebook into one dict.

    Args:
        nb: loaded notebook; its attributes `cells` and `name` are read.
        parse_dict: maps tag names to handlers, as expected by
            `parse_nbx_cell_with_parse_dict`.

    Returns:
        A dict with
        - one list per tag of `parse_dict`, collected over all nbx cells,
        - 'func_body': the untagged lines of all nbx cells,
        - 'xarg': the collected xarg lines, parsed into
          `(name, value, sweep)` triples,
        - 'args': `(name, value)` of every xarg,
        - 'const_args': `(name, value)` of the xargs with an empty sweep,
        - 'sweep_args': `(name, sweep)` of the xargs with a non-empty sweep,
        - 'name': `nb.name`.
    """
    A = {k: [] for k in parse_dict.keys()}
    A['func_body'] = []
    for cell in filter(is_nbx_cell, nb.cells):
        a = parse_nbx_cell_with_parse_dict(cell, parse_dict)

        for k in parse_dict.keys():
            A[k].extend(a[k])
        A['func_body'].extend(a['none'])

    # `.get` so that a custom parse_dict without an 'xarg' handler yields
    # empty argument lists instead of a KeyError.
    A['xarg'] = [parse_xarg_expr(line) for line in A.get('xarg', [])]
    A['args'] = [(x[0], x[1]) for x in A['xarg']]
    A['const_args'] = [(x[0], x[1]) for x in A['xarg'] if len(x[2]) == 0]
    A['sweep_args'] = [(x[0], x[2]) for x in A['xarg'] if len(x[2]) != 0]
    A['name'] = nb.name

    return A
        

In [None]:
nb = load_nb("om2.ipynb")
# From here on `nb` is the parsed dict, no longer the loaded Bunch.
nb = parse_nb_with_parse_dict(nb, parse_dict=PARSE_DICT)
nb

{'xarg': [('x', '0', '[0,1,2,3,4]'),
  ('y', '0', ''),
  ('task_id', '0', ''),
  ('results_dir', '"./"', '')],
 'ximp': ['import numpy as np\n', 'from numpy import *\n'],
 'func_body': ['#nbx\n',
  '\n',
  '\n',
  '\n',
  '\n',
  '    \n',
  '\n',
  '\n',
  '#watch\n',
  'z = 1',
  '#nbx\n',
  'print("some result")'],
 'args': [('x', '0'), ('y', '0'), ('task_id', '0'), ('results_dir', '"./"')],
 'const_args': [('y', '0'), ('task_id', '0'), ('results_dir', '"./"')],
 'sweep_args': [('x', '[0,1,2,3,4]')],
 'name': 'om2.ipynb'}

# Creating the file bundle 

In [None]:
#export
def get_arrays(num, m=1000):
    """Split the ids 1..num into consecutive `[first, last]` ranges.

    For `num < m` the single range `[1, num]` is returned. Otherwise the
    result holds `num // m` ranges of `m` ids each, followed by one shorter
    range for the remaining ids, if any.
    """
    if num < m:
        return [[1, num]]

    arrays = [[k*m + 1, (k + 1)*m] for k in range(num // m)]
    covered = arrays[-1][1]
    if covered < num:
        arrays.append([covered + 1, num])
    return arrays

def get_arrays_2(num, m=1000):
    """Split `num` ids into `[1, size]` ranges that each restart at 1.

    For `num < m` the single range `[1, num]` is returned. Otherwise the
    result holds `num // m` ranges `[1, m]`, followed by `[1, num % m]` if
    `num` is not a multiple of `m`.
    """
    if num < m: return [[1,num]]

    full, remainder = divmod(num, m)
    arrays = [[1, m] for _ in range(full)]
    if remainder != 0: arrays.append([1, remainder])

    return arrays

In [None]:
# get_arrays numbers the ids consecutively; get_arrays_2 restarts every range at 1.
print(get_arrays(1543, 1000))
get_arrays_2(1543, 1000)

[[1, 1000], [1001, 1543]]


[[1, 1000], [1, 543]]

In [None]:
#export
def init_job(start, end, step):
    """Return the shell line that submits `job_0.sh` with
    `sbatch --array=start-end%step` and stores the 4th field of sbatch's
    output in the shell variable `job_0`."""
    return f"job_0=`sbatch --array={start}-{end}%{step} job_0.sh | awk '{{ print $4 }}'`"

def cont_job(j, start, end, step):
    """Return the shell line that submits `job_{j}.sh` like `init_job` does,
    with `--dependency=afterok:$job_{j-1}` added, storing the result in `job_{j}`."""
    return f"job_{j}=`sbatch --array={start}-{end}%{step} --dependency=afterok:$job_{j-1} job_{j}.sh | awk '{{ print $4 }}'`"

def chain_jobs(arrays, step):
    """Return the submission lines for all `arrays`, one per line.

    Args:
        arrays: sequence of ranges; the first two entries of each range are
            used as start and end of the job array.
        step: value placed after the `%` of every `--array` option.

    Returns:
        A string with one newline-terminated line per range: `init_job` for
        the first range, `cont_job` for all later ones. Empty for no ranges.
    """
    lines = [
        init_job(arr[0], arr[1], step) if i == 0 else cont_job(i, arr[0], arr[1], step)
        for i, arr in enumerate(arrays)
    ]
    return "".join(line + "\n" for line in lines)

def chain_jobs_2(arrays, step):
    """Same as `chain_jobs`; kept as a separate name for existing callers."""
    # Was a verbatim copy of `chain_jobs`; delegating keeps the two in sync.
    return chain_jobs(arrays, step)




In [None]:
# 2543 jobs split into arrays of at most 1000, each array depending on the previous one.
print(chain_jobs_2(get_arrays_2(2543, 1000), step=10))

job_0=`sbatch --array=1-1000%10 job_0.sh | awk '{ print $4 }'`
job_1=`sbatch --array=1-1000%10 --dependency=afterok:$job_0 job_1.sh | awk '{ print $4 }'`
job_2=`sbatch --array=1-543%10 --dependency=afterok:$job_1 job_2.sh | awk '{ print $4 }'`



## Bundle functions

In [None]:
#export
from pathlib import PurePosixPath as Path
import pkg_resources
import importlib
from nbx.templ import *
import os

def add_if_necessary(d, k, v):
    """Set `d[k] = v` unless `d` already has the key `k`; existing values are kept."""
    d.setdefault(k, v)
    
    
def create_script(tpath, tname, fname, vars):
    """Announce on stdout which file is created from which template, then
    delegate to `create_file_from_template(tpath/tname, fname, vars)`."""
    announcement = f"Creating... {fname} \n\tfrom {tname}"
    print(announcement)
    template_file = tpath/tname
    create_file_from_template(template_file, fname, vars)
    

# Default template directory: the `templates/` resource of this module, as resolved by pkg_resources.
tpath = Path(pkg_resources.resource_filename(__name__, "templates/"))


def create_om_files(target_dir,  lang, num_jobs, simg, job_header, experiment="experiment.py",
                    arr_size=900, step=100, tpath=tpath, copy_folders=["data", "src"], job_abbr="nbxjob", bind=[], sym=[]):
    """
    Create a bundle folder and the scripts needed to run an
    experiment script on OM.

    The work is delegated to `create_folders` (folder layout) and
    `create_run_and_job_script` (`run.sh` and `job.sh`); finally the
    rendered `INSTRUCTIONS` are printed.

    Example usage:

    >> create_om_files(
            target_dir = "EXAMPLE_BUNDLE",
            lang = "py",
            num_jobs = 10,
            simg = "pytorch.simg",
            job_header = {
                "time": "01:20:00",
                "partition": "fiete",
                "mem": "32gb",
                "cpus-per-task": 4,
                "mail-user": "me@somewhere.com"})

    >> create_experiment_script(
            nbname = "my_notebook.ipynb",
            target_dir = "EXAMPLE_BUNDLE",
            lang = "py")

    """
    print(f"Creating om ... files...\n\tfrom {tpath}")

    create_folders(target_dir, lang, copy_folders=copy_folders)
    create_run_and_job_script(
        target_dir, lang, simg, job_header, num_jobs, arr_size, step,
        tpath=tpath, job_abbr=job_abbr, bind=bind, sym=sym, experiment=experiment)

    instructions = render_template_from_string(INSTRUCTIONS, {"path": target_dir, "lang": lang})
    print(instructions)
    
    
# Text printed at the end of create_om_files; `{{path}}` and `{{lang}}` are
# filled in via render_template_from_string.
INSTRUCTIONS = """
** Instructions: **
    Copy to remote, run, and pull the results:
    - `!scp -r {{path}} $om:$omx`
   (- `!scp -r {{path}}/experiment.{{lang}} $om:$omx/experiment.{{lang}}`)
    - `!ssh $om sbatch -D $omx/{{path}} $omx/{{path}}/run.sh`
    - `!scp -r $om:$omx/{{path}}/results/* ./results`
    
    For this to work you have to set a few environment variables...
    
"""


def create_folders(path, lang, copy_folders):
    """Create the bundle folder layout below `path`.

    Creates `path`, `path/io` and `path/results`, copies the content of
    every folder of `copy_folders` that exists in the current directory to
    `path/<folder>`, and for Python bundles touches `path/__init__.py`.

    Args:
        path: bundle directory.
        lang: language of the bundle; "py" (the value used by the other
            bundle functions) and ".py" both count as Python.
        copy_folders: names of folders in the current directory to copy.
    """
    import shlex  # local import: only needed to quote the shell command below

    path = Path(path)

    for p in [path, path/'io', path/'results']:
        os.makedirs(p, exist_ok=True)

    for folder in copy_folders:
        if os.path.exists(f"./{folder}"):
            os.makedirs(path/folder, exist_ok=True)
            # Quote both operands so names with spaces or shell metacharacters
            # survive; the `/*` stays unquoted so the shell still expands it.
            os.system(f"cp -r {shlex.quote(str(folder))}/* {shlex.quote(str(path/folder))}")

    # The check used to be `lang==".py"` only, which never matched the "py"
    # that callers pass, so the package marker was never created.
    if lang in ("py", ".py"):
        open(path/'__init__.py', 'a').close()
                           

def create_run_and_job_script(target_dir, lang, simg, job_header, 
                              num_jobs, arr_size, step,  tpath=tpath, 
                              job_abbr="nbxjob", bind=[], sym=[], experiment="experiment.py"):
    """Create `run.sh` and `job.sh` in `target_dir`.

    `run.sh` is rendered from `run_sh.tpl` with the job arrays computed by
    `get_arrays_2(num_jobs, arr_size)`; `job.sh` is created by
    `create_job_script`, to which `lang`, `simg`, `job_header`, `bind`,
    `sym` and `experiment` are passed on unchanged.

    Raises:
        AssertionError: if `arr_size` exceeds 1000.
    """
    assert arr_size <= 1000, "Maximum number of queued jobs on OM is 1000"

    job_arrays = get_arrays_2(num_jobs, arr_size)

    # Run script
    # One row per job array: [array index, first id, last id, j*arr_size].
    # The last entry is presumably the task-id offset of that array --
    # confirm against run_sh.tpl.
    specs_array = [[j] + arr + [j*arr_size] for j, arr in enumerate(job_arrays)]
    create_script(tpath, "run_sh.tpl", Path(target_dir)/"run.sh", {
        'specs_array': specs_array,
        'job_name': f"{job_abbr}",
        'step': step
    })

    # Job script: a single job.sh is created, independent of the number of arrays.
    create_job_script(target_dir, lang, simg, job_header, tpath, bind, sym, experiment)
        
        

def create_job_script(target_dir, lang, simg, job_header, tpath, bind=[], sym=[], experiment="experiment.py"):
    """Create `job.sh` in `target_dir` from the template `job_{lang}.tpl`.

    Reads the environment variables `omsimg` (directory that `simg` is
    resolved against) and `omx` (passed to the template as `nbx_folder`).
    Header entries "out", "error", "mail-type" and "exclude" get default
    values when `job_header` does not set them; `job_header` itself is
    left unmodified.

    Args:
        target_dir: directory that receives `job.sh`.
        lang: selects the template `job_{lang}.tpl`.
        simg: image name, joined onto `$omsimg`.
        job_header: dict of header options for the job script.
        tpath: template directory.
        bind, sym, experiment: passed to the template as `bind`,
            `symlinks` and `experiment`.

    Raises:
        KeyError: if `omsimg` or `omx` is not set in the environment.
    """
    # Report every missing variable by name before doing any work.
    missing = [var for var in ("omsimg", "omx") if var not in os.environ]
    if missing:
        raise KeyError(f"Environment variable(s) not set: {', '.join(missing)}")

    simg        = Path(os.environ['omsimg'])/simg
    nbx_folder  = Path(os.environ['omx'])
    results_dir = Path("./results")

    print(f"\nNBX folder: {nbx_folder}\n")

    # Work on a copy so the defaults do not leak into the caller's dict.
    job_header = dict(job_header)
    add_if_necessary(job_header, "out", "io/out_%a")
    add_if_necessary(job_header, "error", "io/err_%a")
    add_if_necessary(job_header, "mail-type", "END")
    add_if_necessary(job_header, "exclude", "node030,node016,node015")

    tname = f"job_{lang}.tpl"
    fname = Path(target_dir)/"job.sh"
    create_script(tpath, tname, fname, {
        'job_header': job_header.items(),
        'nbx_folder': nbx_folder,
        'bind': bind,
        'simg': simg,
        'symlinks': sym,
        'results_dir': results_dir,
        'experiment': experiment
    })

    
def check_nb(pnb):
    """Make sure the parsed notebook `pnb` declares the arguments
    `task_id` and `results_dir`; raise KeyError for the first one missing."""
    declared = [arg[0] for arg in pnb['args']]
    if "task_id" not in declared:
        raise KeyError("You didn't specify `task_id`!!")
    if "results_dir" not in declared:
        raise KeyError("You didn't specify `results_dir`!!")
    
                  
def create_experiment_script(nbname, target_dir=".", lang="py", tpath=tpath):
    """Create `experiment.{lang}` in `target_dir` from the notebook `nbname`.

    The notebook is loaded, parsed with `PARSE_DICT` and checked with
    `check_nb`; the result is rendered with the template
    `experiment_{lang}.tpl`. For `lang == "py"` the generated module is
    imported to count its sweep parameters.

    Args:
        nbname: file name of the notebook.
        target_dir: directory that receives the script (created if missing).
        lang: language of the script; selects template and file extension.
        tpath: template directory.

    Returns:
        `{"num_jobs": ..., "target_dir": target_dir, "lang": lang}`, where
        `num_jobs` is `len(sweep_params)` of the generated module for
        `lang == "py"` and None for any other language.

    Raises:
        KeyError: from `check_nb`, if `task_id` or `results_dir` is missing.
    """
    import sys  # local import: only needed for the module-cache check below

    print("** Creating Experiment script and folder **")
    nb = load_nb(nbname)
    nb = parse_nb_with_parse_dict(nb, parse_dict=PARSE_DICT)
    check_nb(nb)

    path = Path(target_dir)
    os.makedirs(path, exist_ok=True)

    tname = f"experiment_{lang}.tpl"
    fname = path/f"experiment.{lang}"
    create_script(tpath, tname, fname, nb)

    # Stays None for non-Python bundles; the name used to be unbound in
    # that case and the print below crashed.
    num_params = None
    if lang == "py":
        open(path/'__init__.py', 'a').close()
        exp = ".".join((path/'experiment').parts)
        # The module file was just (re)written: make the import system see
        # new files, and re-execute the module if an earlier call in this
        # kernel already imported it (otherwise the stale copy is returned).
        importlib.invalidate_caches()
        if exp in sys.modules:
            m = importlib.reload(sys.modules[exp])
        else:
            m = importlib.import_module(exp)
        # Assumes the template defines a module-level `sweep_params` -- TODO confirm.
        num_params = len(m.sweep_params)

    print(f"Number of params: {num_params}")

    return {"num_jobs": num_params, "target_dir": target_dir, "lang": lang}
                   
    
def create_raw_experiment(fname="experiment.py", lang="py", tpath=tpath):
    """Create the file `fname` from the template `experiment_raw_{lang}.tpl`,
    rendered without any variables."""
    print("** Creating Raw Experiment**")
    template_name = f"experiment_raw_{lang}.tpl"
    create_script(tpath, template_name, Path(fname), {})

                
                   
    
        



In [None]:
# Default template directory used by the bundle functions.
tpath

PurePosixPath('templates')

In [None]:
 # NOTE(review): `tpath` below is an absolute path on the author's machine; adjust it to your checkout.
 create_om_files(target_dir="OM2_EXAMPLE", lang="py", 
                 num_jobs=3453, 
                 arr_size=500, step=17, 
                 simg="pytorch.simg", 
                 bind=[],
                 job_header={
                    "time": "10:00:00",
                    "partition": "fiete",
                    "mail-user": "me@somewhere.com"}, 
                 tpath=Path("/Users/mirko/Workspace/nbx/nbx/templates"))

Creating om ... files...
	from /Users/mirko/Workspace/nbx/nbx/templates
Creating... OM2_EXAMPLE/run.sh 
	from run_sh.tpl

NBX folder: /om2/user/mklukas/nbx-experiments

Creating... OM2_EXAMPLE/job.sh 
	from job_py.tpl

** Instructions: **
    Copy to remote, run, and pull the results:
    - `!scp -r OM2_EXAMPLE $om:$omx`
   (- `!scp -r OM2_EXAMPLE/experiment.py $om:$omx/experiment.py`)
    - `!ssh $om sbatch -D $omx/OM2_EXAMPLE $omx/OM2_EXAMPLE/run.sh`
    - `!scp -r $om:$omx/OM2_EXAMPLE/results/* ./results`
    
    For this to work you have to set a few environment variables...
    


In [None]:
# NOTE(review): `tpath` below is an absolute path on the author's machine; adjust it to your checkout.
create_experiment_script("om2.ipynb", target_dir="OM2_EXAMPLE", lang="py", tpath=Path("/Users/mirko/Workspace/nbx/nbx/templates"))

** Creating Experiment script and folder **
Creating... OM2_EXAMPLE/experiment.py 
	from experiment_py.tpl
Number of params: 5


{'num_jobs': 5, 'target_dir': 'OM2_EXAMPLE', 'lang': 'py'}