In [18]:
# General
import sys
import os.path as op
from time import time
from glob import glob
import warnings
from importlib import reload
from cluster_helper.cluster import cluster_view

# Personal
sys.path.append('/home1/dscho/code/projects')
from brandon import code_examples

### We will parallelize the function _save_some_file()_ in _/home1/dscho/code/projects/brandon/code_examples_

---
First let's take a look at the function.

In [49]:
# IPython-only syntax: `obj??` displays the source code of `obj`.
# (This line is not valid plain Python; it only works in IPython/Jupyter.)
code_examples.save_some_file??

---
Here's what it looks like to run this function in serial (i.e. one function call at a time).

In [50]:
# Serial baseline: one ordinary call to save_some_file() for a single name.
# The return value is kept in `output_file`; later cells use it as a file path.
name = 'daniel'
output_file = code_examples.save_some_file(name)

print('Saved {}'.format(output_file))

In [51]:
# Read the file produced by the serial call, then display its contents.
with open(output_file, 'r') as fh:
    contents = fh.read()
print(contents)

---
Okay, now time to parallelize.

There are two steps to this process:
1. First, we write a simple "calling function" that takes a single argument as input \
   and calls the function that we want to parallelize with a specific set of input values.
   - Although cluster_helper requires you to pass only a single argument as input, \
     this object can be anything...including e.g. a list or dictionary that contains \
     more than one piece of info. But basically, you want to pass whichever parameters \
     are going to vary from one instance of the parallelized function to another. \
     (E.g. a subject ID, EEG file path, wavelet frequency, etc.)
   - When this function is called during parallelization, it cannot access \
     any modules that have been imported within the Jupyter notebook. So you must \
     re-import any modules that it depends on at the top of the calling function. \
     Forgetting about this rule is a common mistake behind many parallelization failures :_)
2. Next, we initialize the parallelization process using some boilerplate code \
   that calls the cluster_helper function *cluster_view()*, and gives it (1) the calling \
   function that we just wrote and (2) a list of things that we want to pass to it as input.
   - E.g., if each run of the calling function takes a single subject ID as input, \
     then you would pass a list of subject IDs that you want to run the function on.

In [26]:
# Define the calling function
def save_some_file_parallel(name):
    """Call code_examples.save_some_file() for a single input item.

    This is the single-argument "calling function" that is handed to the
    parallel map. All imports it needs are done inside the function body,
    because modules imported at notebook level are not available when the
    function runs during parallelization.

    Parameters
    ----------
    name
        Passed unchanged to ``code_examples.save_some_file``.

    Returns
    -------
    Whatever ``code_examples.save_some_file(name)`` returns (the notebook
    treats it as the path of the saved file).
    """
    import sys

    projects_dir = '/home1/dscho/code/projects'
    # Guard the append so that repeated calls within the same process do not
    # keep adding duplicate entries to sys.path.
    if projects_dir not in sys.path:
        sys.path.append(projects_dir)
    from brandon import code_examples

    return code_examples.save_some_file(name)

In [52]:
# Names to process; this list is what gets mapped over the calling function.
input_items = [
    'brandon',
    'joey',
    'john sakon',
]

print('{} names to process'.format(len(input_items)))

In [53]:
# Parallelize your function
calling_function = save_some_file_parallel # the name of your calling function
n_jobs = len(input_items) # the number of jobs we are going to parallelize, in total
max_jobs = 100 # this is the maximum number of jobs that we will allow to run at once.
               # check qstat -g c before choosing, but generally don't go over 100,
               # and use <100 if there are fewer than 250 cores available.
               # if max_jobs < n_jobs, then some jobs will go in the queue and run
               # automatically as soon as other jobs finish running.
cores_per_job = 1 # this is how many cores to use per job. pretty much should always be 1


# -----------------------
print('Running code for {} operations.\n'.format(n_jobs))
start_time = time()
map_results = None # filled with the return value of view.map() if the map completes
try:
    # Never request more jobs than there are items to process.
    with cluster_view(scheduler="sge", queue="RAM.q", num_jobs=min((n_jobs, max_jobs)), cores_per_job=cores_per_job) as view:
        map_results = view.map(calling_function, input_items)
except OSError as err:
    # An OSError is deliberately not fatal here (kept from the original cell).
    # NOTE(review): presumably it is raised while the cluster is being torn
    # down -- confirm. The message is printed so a real failure is not hidden.
    print('Ignoring OSError raised while running cluster_view: {}'.format(err))

# Report the elapsed time whether or not an OSError was swallowed above.
print('Done in {:.1f}s'.format(time() - start_time))

---
Let's see if our code worked...

In [54]:
# Collect every 'message_for_*' file that sits next to the serial output file.
# glob() returns matches in arbitrary order, so sort them to make the printed
# listing reproducible from run to run.
output_files = sorted(glob(op.join(op.dirname(output_file), 'message_for_*')))

print('Found {} output files'.format(len(output_files)),
      '---------------------', sep='\n', end='\n'*2)

# Print each file's name followed by its contents.
for fpath in output_files:
    print(op.basename(fpath))
    with open(fpath, 'r') as file:
        print(file.read(), end='\n'*2)