# Session 4:  Parallel Processing
This section preserves the RF estimation material that was originally in session 1; we moved it here because it was too much for an overview session. The workflow is nonetheless a good prototype: it does something useful and demonstrates MsPASS in a parallel setting.  

## RF Estimation workflow:  Serial version
Above we assembled data into Seismogram objects and saved them to the database.  In this example workflow we will generate a set of receiver function estimates driven by Seismogram inputs.  The serial job is a data driven loop over all Seismogram objects stored in the database.  For each seismogram we will do the following calculations:
1.  Detrend the data (for a Seismogram that means channel by channel)
2.  Lightly taper the ends to reduce filter startup transients.
3.  Bandpass filter the data.
4.  Window the data around the P wave arrival time.
5.  Run the deconvolution algorithm.
6.  Save the results.

In [None]:
from mspasspy.db.database import Database
from mspasspy.db.client import DBClient
# Create the MsPASS database client and, from it, a handle to the database
# named 'shortcourse'.  The processing cells below do all of their reads and
# writes through the db handle.
dbclient = DBClient()
db = Database(dbclient, 'shortcourse')

In [None]:
from obspy.geodetics import degrees2kilometers
import math
from mspasspy.ccore.seismic import SlownessVector
def arrival_slowness_vector(obspy_arrival,azimuth=0.0):
    """
    Given an obspy arrival array member return the mspass SlownessVector.
    
    Obspy's taup calculator returns travel time data as a list with one class member for 
    each seismic phase. Each member carries a ray parameter, which is slowness in sec/degree.  
    A slowness vector has direction so we need to compute the direction from the azimuth.
    
    :param obspy_arrival: list member for which the slowness vector is to be computed.
      Only its ray_param_sec_degree attribute is read.
    :param azimuth:  azimuth in degrees of propagation direction at receiver 
      (default 0.0). 
    
    :return: SlownessVector from model estimate for this phase.
    
    """
    # theta is the standard angle in math definition of polar coordinate angle (degrees)
    theta=90.0-azimuth
    rtheta=math.radians(theta)   # radians needed for math calculations
    # Read the ray parameter from the argument itself so the function does not 
    # depend on any variable defined outside of it
    p=obspy_arrival.ray_param_sec_degree
    # Dividing by the length of one degree in km changes the units from 
    # sec/degree to sec/km
    u=p/degrees2kilometers(1.0)
    ux=u*math.cos(rtheta)
    uy=u*math.sin(rtheta)
    return SlownessVector(ux,uy,0.0)

In [None]:
import time
from mspasspy.algorithms.RFdeconProcessor import RFdeconProcessor
from mspasspy.algorithms.RFdeconProcessor import RFdecon
from mspasspy.ccore.utility import AntelopePf
# These are repeated from above, but useful to make this box standalone so one can more 
# easily just cut and paste to use it in another workflow
from mspasspy.algorithms.window import WindowData
# NOTE: after this import the name filter refers to the mspass function, not 
# the Python builtin of the same name
from mspasspy.algorithms.signals import (filter, detrend)
from mspasspy.ccore.algorithms.basic import TimeWindow,CosineTaper
from mspasspy.ccore.utility import ErrorSeverity
from obspy.taup import TauPyModel
# Travel time calculator used below to predict P wave arrival times
model = TauPyModel(model="iasp91")
from obspy.geodetics import gps2dist_azimuth,kilometers2degrees
# Collections passed as the normalize argument of the reader.  The loop below 
# reads keys with source_ and site_ prefixes from each datum.
normlist=['source','site']

# MsPASS allows parameters to be placed in a Antelope Pf format file.  We use 
# that here as an example of how to put parameters for a workflow in one place
pfhandle=AntelopePf('session1.pf')
# When using a pf to define constants always do that up front in case there are
# errors in the file
dtaperlength=pfhandle.get_double("data_taper_length")
fmax=pfhandle.get_double("filter_high_corner")
fmin=pfhandle.get_double("filter_low_corner")
awin_start=pfhandle.get_double("analysis_window_starttime")
awin_end=pfhandle.get_double("analysis_window_endtime")
vp0=pfhandle.get_double('vp0')
vs0=pfhandle.get_double('vs0')

# There is a fair amount of overhead to create the slepian tapers used in 
# the multitaper method.   We create an instance that defines the operator
# once and use it in the loop below
decon_operator=RFdeconProcessor(alg="MultiTaperXcor")
# The analysis window is the same for every datum so it is also built only once
decon_twin=TimeWindow(awin_start,awin_end)
# Single name used to tag every error log entry posted by this script
algname='session1_serial_script'

# NOTE(review): query is not defined anywhere in this notebook.  It must be set 
# to the query document selecting the input Seismograms before this cell is run.
cursor=db.wf_Seismogram.find(query)
t0=time.time()
nlive=0
for doc in cursor:
    d=db.read_data(doc,collection='wf_Seismogram',normalize=normlist)
    # NOTE(review): presumably this fails if the key is missing from a datum - confirm 
    # that every input document carries it
    print('working on data for station=',d['READONLYERROR_sta'])
    # detrend
    detrend(d)
    # bandpass filter
    filter(d,'bandpass',freqmax=fmax,freqmin=fmin)
    # cosine taper ends
    dtaper=CosineTaper(d.t0,d.t0+dtaperlength,d.endtime()-dtaperlength,d.endtime())
    dtaper.apply(d)
    # Time windowing - variant of above example 
    stalat=d['site_lat']
    stalon=d['site_lon']
    srclat=d['source_lat']
    srclon=d['source_lon']
    depth=d['source_depth']
    otime=d['source_time']
    georesult=gps2dist_azimuth(srclat,srclon,stalat,stalon)
    # obspy's function we just called returns distance in m in element 0 of a tuple
    # With their travel time calculator it is degrees so we need this conversion
    dist=kilometers2degrees(georesult[0]/1000.0)
    baz=georesult[2]  # gps2dist_azimuth returns back azimuth as 2 of tuple.  We need azimuth
    azimuth=baz+180.0
    if azimuth>360.0:
        azimuth -= 360.0
    #print('DEBUG:  delta=',dist,' azimuth=',azimuth)
    if dist>95.0:
        d.kill()
        d.elog.log_error(algname,'No P wave - station is in the core shadow',ErrorSeverity.Invalid)
        print('Killed this datum - core shadow')
        # Dead data are saved too so the error log entry is preserved
        db.save_data(d,data_tag='decon_output')
        continue
    arrivals=model.get_travel_times(source_depth_in_km=depth,distance_in_degree=dist,phase_list=['P'])
    # The distance test above is only a rough screen.  If the calculator found no 
    # P arrival, indexing the list below would abort the entire job, so treat 
    # this datum like the core shadow case instead.
    if len(arrivals)==0:
        d.kill()
        d.elog.log_error(algname,'Travel time calculator returned no P arrival',ErrorSeverity.Invalid)
        print('Killed this datum - no P arrival')
        db.save_data(d,data_tag='decon_output')
        continue
    # Arrivals are returned in time order 0 is always the first arrival
    # This computes arrival time as an epoch time and shifts the data to put 0 at that time
    a=arrivals[0]
    atime=a.time
    # Shift time 0 to the P wave arrival time
    d.ator(otime+atime)
    # Post the time used to Metadata
    d['P_iasp91']=atime   # Illustrates a made up key for Metadata
    #print('DEBUG')
    #print(decon_twin.start,decon_twin.end)
    #print(d.t0,d.endtime())
    #print('sample interval=',d.dt,' and number of points=',d.npts)
    if decon_twin.start < d.t0:
        d.kill()
        d.elog.log_error(algname,
                         'Windowing failure - window start is before data starttime',ErrorSeverity.Invalid)
        print('killed this datum - windowing error')
        db.save_data(d,data_tag='decon_output')
        continue
    d=WindowData(d,decon_twin)
    # We transform the data to R,T,L using Kennett's free surface transformation matrix, which 
    # is implemented as a method in Seismogram
    u=arrival_slowness_vector(a,azimuth)
    d.free_surface_transformation(u,vp0,vs0)
    #  run deconvolution 
    decondata=RFdecon(d,decon_operator)
    # save result with a different data tag - automatically will go to wf_Seismogram
    db.save_data(decondata,data_tag='decon_output')
    # Only count results that are still marked live after windowing and deconvolution
    if not decondata.dead():
        nlive+=1
print('Total processing time=',time.time()-t0)
print('Number live data save=',nlive)

## RF Estimation:  parallel job using Dask
MsPASS has support for two schedulers:  Dask and Spark.  In this exercise we are going to use Dask because it is slightly simpler to use.  In a later section we will talk about details of this job script, but for now a key point is to demonstrate that a job script to run a parallel job in MsPASS has only minor differences from the serial version.

One point will help you understand this job script: a fundamental concept in both Spark and Dask is the map operator.  A map operator can be thought of as a function that takes a list of data objects (the dataset), does something to each of them, and creates a new list (dataset) of the modified data.  The schedulers handle the memory operations so the entire data set does not have to live in memory simultaneously. 

With that background, here is the above in parallel form (Note that for this notebook we could have dropped most of the initialization, but we retain it to emphasize the parallel structure):

In [None]:
import time

from obspy.geodetics import gps2dist_azimuth,kilometers2degrees
from obspy.taup import TauPyModel
# These are repeated from above, but useful to make this box standalone so one can more 
# easily just cut and paste to use it in another workflow
from mspasspy.algorithms.RFdeconProcessor import RFdeconProcessor,RFdecon
from mspasspy.algorithms.signals import (filter, detrend)
from mspasspy.algorithms.window import WindowData
from mspasspy.ccore.algorithms.basic import TimeWindow,CosineTaper
from mspasspy.ccore.utility import AntelopePf,ErrorSeverity

# Travel time calculator handed to the P time function defined below
model = TauPyModel(model="iasp91")
# Collections passed as the normalize argument of the reader
normlist=['source','site']

# We need this function to handle setting arrival times. The serial script should be 
# changed to use this same function.
def Ptime_shift(d,model):
    """
    Sets a predicted P wave arrival time using source and receiver coordinates and 
    model passed as arg 1 and time shifts data so time 0 is the predicted P wave arrival time.
    
    On success the travel time is posted to the Metadata of d with the key P_iasp91 and 
    the components of the predicted slowness vector are posted with the keys ux and uy.
    A datum that arrives already marked dead is returned untouched.  A datum for which 
    no P time can be computed is killed with a message posted to its error log.
    
    :param d: datum to process.  It must supply the Metadata keys site_lat, site_lon, 
      source_lat, source_lon, source_depth, and source_time.
    :param model: travel time calculator whose get_travel_times method is called.
    
    :return: d, altered in place.
    """
    # Dead data may be missing the Metadata read below, so pass them through 
    # unaltered like the other functions used in this workflow
    if d.dead():
        return d
    stalat=d['site_lat']
    stalon=d['site_lon']
    srclat=d['source_lat']
    srclon=d['source_lon']
    depth=d['source_depth']
    otime=d['source_time']
    georesult=gps2dist_azimuth(srclat,srclon,stalat,stalon)
    # obspy's function we just called returns distance in m in element 0 of a tuple
    # With their travel time calculator it is degrees so we need this conversion
    dist=kilometers2degrees(georesult[0]/1000.0)
    baz=georesult[2]  # gps2dist_azimuth returns back azimuth as 2 of tuple.  We need azimuth
    azimuth=baz+180.0
    if azimuth>360.0:
        azimuth -= 360.0
    # the taup calculator fails if we ask for P in the core shadow.  This is a rough 
    # way to handle this for this example that works for the one event we are processing here
    # A more elegant method would worry about source depth
    if dist>95.0:
        d.kill()
        d.elog.log_error('session1_RF_script','No P wave - station is in the core shadow',
                         ErrorSeverity.Invalid)
        return d
    arrivals=model.get_travel_times(source_depth_in_km=depth,distance_in_degree=dist,phase_list=['P'])
    # The distance test is only a rough screen.  An empty list would make the 
    # index operation below raise and abort the whole job, so kill this one 
    # datum instead.
    if len(arrivals)==0:
        d.kill()
        d.elog.log_error('session1_RF_script','Travel time calculator returned no P arrival',
                         ErrorSeverity.Invalid)
        return d
    # Arrivals are returned in time order 0 is always the first arrival
    # This computes arrival time as an epoch time and shifts the data to put 0 at that time
    a=arrivals[0]
    atime=a.time
    # Post the time used to Metadata
    d['P_iasp91']=atime   # Illustrates a made up key for Metadata
    d.ator(otime+atime)
    # We also post the slowness data - computed by this function
    u=arrival_slowness_vector(a,azimuth)
    d['ux']=u.ux
    d['uy']=u.uy
    return d
def apply_free_surface_transformation(d,vp0,vs0):
    """
    Thin wrapper for free_surface_transformation method of Seismogram that assumes
    the components of a slowness vector for the transformation are in the Metadata 
    of d stored with the keys ux and uy.
    
    A datum marked dead is returned untouched.  A live datum that is missing either 
    key is killed with a message posted to its error log.
    
    :param d: datum to be transformed.
    :param vp0: passed unaltered as the second argument of free_surface_transformation.
    :param vs0: passed unaltered as the third argument of free_surface_transformation.
    
    :return: d, altered in place.
    """
    if d.dead():
        return d
    if 'ux' in d and 'uy' in d:
        ux=d['ux']
        uy=d['uy']
        # Give all three constructor arguments explicitly, matching how 
        # arrival_slowness_vector builds the same type.
        # NOTE(review): the two argument form may not be accepted by the python 
        # bindings - confirm against the SlownessVector API documentation
        u = SlownessVector(ux,uy,0.0)
        d.free_surface_transformation(u,vp0,vs0)
    else:
        d.elog.log_error('session1_RF_script','Slowness vector components were not set',
                         ErrorSeverity.Invalid)
        d.kill()
    return d
# These initializations are identical to the serial version

# MsPASS allows parameters to be placed in a Antelope Pf format file.  We use 
# that here as an example of how to put parameters for a workflow in one place
pfhandle=AntelopePf('session1.pf')
# When using a pf to define constants always do that up front in case there are
# errors in the file
dtaperlength=pfhandle.get_double("data_taper_length")
fmax=pfhandle.get_double("filter_high_corner")
fmin=pfhandle.get_double("filter_low_corner")
awin_start=pfhandle.get_double("analysis_window_starttime")
awin_end=pfhandle.get_double("analysis_window_endtime")
vp0=pfhandle.get_double('vp0')
vs0=pfhandle.get_double('vs0')

# There is a fair amount of overhead to create the slepian tapers used in 
# the multitaper method.   We create an instance that defines the operator
# once and use it in the map calls below
decon_operator=RFdeconProcessor(alg="MultiTaperXcor")
# The analysis window is the same for every datum.  It has to be created in 
# this cell because the windowing step below needs it.
decon_twin=TimeWindow(awin_start,awin_end)

def apply_end_taper(d,taperlength):
    """
    Apply a cosine taper of length taperlength to both ends of d.
    
    The taper is defined from the start and end time of each datum, so it has 
    to be built inside the function the map operator runs; no datum exists at 
    the time the workflow is being defined.
    
    :param d: datum to taper.  A datum marked dead is returned untouched.
    :param taperlength: length of the taper at each end, in the time units of d.
    
    :return: d, altered in place.
    """
    if d.dead():
        return d
    dtaper=CosineTaper(d.t0,d.t0+taperlength,d.endtime()-taperlength,d.endtime())
    dtaper.apply(d)
    return d

# NOTE(review): query is not defined anywhere in this notebook.  It must be set 
# to the query document selecting the input Seismograms before this cell is run.
cursor=db.wf_Seismogram.find(query)
t0=time.time()

# this script is identical to the serial script prior to this point.  
# Here is the first fundamental change:  our for loop is replaced by 
# this parallel reader that builds a Dask bag used to define the data set
# NOTE(review): confirm read_distributed_data is a method of Database in the 
# installed MsPASS version
dataset=db.read_distributed_data(cursor,collection='wf_Seismogram',normalize=normlist)
dataset=dataset.map(detrend)
dataset=dataset.map(filter,'bandpass',freqmax=fmax,freqmin=fmin)
dataset=dataset.map(apply_end_taper,dtaperlength)
dataset=dataset.map(Ptime_shift,model)
dataset=dataset.map(WindowData,decon_twin)
dataset=dataset.map(apply_free_surface_transformation,vp0,vs0)
dataset=dataset.map(RFdecon,decon_operator)
# The save has to be assigned back to dataset.  A map call only defines work 
# to be done, so a save step that is not part of the bag being computed never runs.
dataset=dataset.map(db.save_data,collection='wf_Seismogram',data_tag='parallel_decon_output')
dataset.compute()
print('Total processing time=',time.time()-t0)