# Script 3:

FMF only returns time series of correlation coefficients. In this script, we show how to use these correlation coefficient time series in order to detect new earthquakes, and extract their waveforms. The functions defined in this script can be reused in another template matching earthquake detection framework.

In [None]:
import sys
import os
sys.path.append(os.getcwd())

import h5py as h5
import numpy as np
import utils

from obspy.core import UTCDateTime as udt
from time import time as give_time

## First, define the function we will call to extract the waveforms of the newly detected earthquakes

This function expects 7 arguments:
- data (a dictionary holding the continuous waveforms in `data['waveforms']` $(n_{\mathrm{stations}} \times n_{\mathrm{components}} \times n_{\mathrm{samples-in-data}})$ and their metadata in `data['metadata']`)
- cc_sums $(n_{\mathrm{templates}} \times n_{\mathrm{correlations}})$ (*i.e.* the output of FMF)
- moveout_array $(n_{\mathrm{templates}} \times n_{\mathrm{stations}} \times n_{\mathrm{components}})$
- n_mad (determines the detection threshold: $n_{\mathrm{mad}} \times \mathrm{MAD}(\mathrm{CC}(t))$)
- template_duration (in seconds, used for choosing the minimum distance between consecutive detections)
- step (in samples, the step between two consecutive correlation coefficients; it converts an index in $\mathrm{CC}(t)$ into a sample index in the data)
- extracted_duration (in seconds, the duration extracted on each station / channel)

Without going into details, this function returns two Python lists, one of metadata and one of waveforms, each of length $n_{\mathrm{templates}}$. Each element of these lists is a Python dictionary.

In [None]:
def extract_new_detections(data,
                           cc_sums,
                           moveout_array,
                           n_mad=10.,
                           template_duration=8.,
                           step=1,
                           extracted_duration=60.):
    """Detect events in correlation coefficient series and extract them.

    For each template, the median is removed from its correlation
    coefficient (CC) time series and every sample exceeding ``n_mad`` times
    the median absolute value of the result is flagged. Flagged samples are
    then grouped so that only the highest CC is kept within a search
    window, and the matching waveforms are cut out of the continuous data
    (zero-padded where the window runs past either end of the data).

    Parameters
    ----------
    data : dict
        ``data['waveforms']`` is indexed as (station, component, sample);
        ``data['metadata']`` must provide ``'sampling_rate'``, ``'date'``,
        ``'stations'`` and ``'components'``.
    cc_sums : numpy.ndarray
        (n_templates, n_correlations) CC time series. Left unmodified.
    moveout_array : numpy.ndarray
        (n_templates, n_stations, n_components) moveouts. The difference
        between the first and the last component is used as an S-P time and
        is treated as a number of samples (no sampling-rate conversion is
        applied here).
    n_mad : float
        Detection threshold, in units of the median absolute deviation.
    template_duration : float
        Template duration in seconds; sets the minimum (1x) and maximum (3x)
        length of the grouping window.
    step : int
        Number of data samples between two consecutive CC values.
    extracted_duration : float
        Duration, in seconds, of the waveforms extracted for each detection.

    Returns
    -------
    list_metadata : list of dict
        One dictionary per template with the keys ``'template_id'``,
        ``'stations'``, ``'components'``, ``'origin_times'`` (timestamps of
        the start of each extracted window, i.e. detection time minus a
        10-second buffer) and ``'correlation_coefficients'``.
    list_waveforms : list of dict
        One dictionary per template with the key ``'waveforms'``, an array
        of shape (n_detections, n_stations, n_components, n_samples).
    """
    sampling_rate = data['metadata']['sampling_rate']
    n_templates = cc_sums.shape[0]
    n_stations = moveout_array.shape[1]
    n_components = moveout_array.shape[2]
    n_extracted_samples = np.int32(extracted_duration * sampling_rate)
    # seconds of data kept before each detection in the extracted window
    buffer_extracted_events = 10.
    n_buffer_samples = np.int32(buffer_extracted_events * sampling_rate)
    idx_min = 0  # can't extract continuous data before index 0
    idx_max = data['waveforms'].shape[-1]  # nor after the last sample

    # bounds of the grouping window, expressed in CC samples:
    # at least 1 time and at most 3 times the template duration
    min_win = np.int32(template_duration * sampling_rate / step)
    max_win = np.int32(3. * template_duration * sampling_rate / step)

    list_metadata = []
    list_waveforms = []
    for i in range(n_templates):
        # build a new array: subtracting in place on the cc_sums[i, :] view
        # would alter the caller's data
        cc_sum = cc_sums[i, :] - np.median(cc_sums[i, :])
        threshold = n_mad * np.median(np.abs(cc_sum))
        # ------------------
        # 1-D array of the CC samples above threshold
        cc_idx = np.flatnonzero(cc_sum > threshold)

        # only keep highest correlation coefficient for grouped detections
        # we assume the last component is the vertical component
        d_mv = moveout_array[i, :, 0] - moveout_array[i, :, -1]
        d_mv = d_mv[d_mv != 0]
        # in between the two bounds: choose an adaptive size based on the
        # median P-S time; without any non-zero P-S time, fall back on the
        # minimum window instead of taking the median of an empty array
        if d_mv.size > 0:
            adaptive_win = np.int32(np.median(d_mv) / step)
        else:
            adaptive_win = min_win
        search_win = min(max_win, max(adaptive_win, min_win))
        half_win = search_win // 2

        for j in range(cc_idx.size):
            current = cc_idx[j]
            start = max(0, current - half_win)
            # clip at cc_sum.size (exclusive stop) so that the last CC
            # sample can be selected, and keep the window non-empty
            stop = min(cc_sum.size, max(start + 1, current + half_win))
            best = start + np.argmax(cc_sum[start:stop])
            # every detection sitting on this sample moves to the local max
            cc_idx[cc_idx == current] = best

        cc_idx = np.unique(cc_idx)

        # after this step, we can have detections closer than search_win / 2:
        # walk through the sorted detections and, for each pair that is too
        # close, keep the one with the highest CC (the earliest on a tie)
        kept = []
        for candidate in cc_idx:
            if kept and (candidate - kept[-1]) < half_win:
                if cc_sum[candidate] > cc_sum[kept[-1]]:
                    kept[-1] = candidate
            else:
                kept.append(candidate)
        cc_idx = np.asarray(kept, dtype=np.int64)
        detections = cc_idx * step

        n_multiplets = len(detections)
        # ------------------------------------------------------
        origin_times = np.zeros(n_multiplets, dtype=np.float64)
        correlation_coefficients = np.zeros(n_multiplets, dtype=np.float32)
        # pre-filled with zeros: samples falling outside the continuous data
        # are simply left untouched (zero-padding)
        waveforms = np.zeros((n_multiplets, n_stations,
                              n_components, n_extracted_samples),
                             dtype=np.float32)
        for d in range(n_multiplets):
            origin_time = udt(data['metadata']['date']) \
                + detections[d] / sampling_rate
            origin_times[d] = origin_time.timestamp - buffer_extracted_events
            correlation_coefficients[d] = cc_sum[cc_idx[d]]
            # -----------------------------------------
            # take care of not selecting out-of-bound indexes:
            first = detections[d] - n_buffer_samples
            last = first + n_extracted_samples
            id1 = max(first, idx_min)
            id2 = min(last, idx_max)
            dn_b = id1 - first  # samples missing at the beginning
            dn_e = last - id2   # samples missing at the end
            waveforms[d, :, :, dn_b:n_extracted_samples - dn_e] = \
                data['waveforms'][:, :, id1:id2]
            # -----------------------------------------
        metadata_events = {
            'template_id': np.array([i]),
            'stations': np.asarray(data['metadata']['stations']).astype('S'),
            'components': np.asarray(data['metadata']['components']).astype('S'),
            'origin_times': origin_times,
            'correlation_coefficients': correlation_coefficients,
        }
        waveforms_events = {'waveforms': waveforms}

        list_metadata.append(metadata_events)
        list_waveforms.append(waveforms_events)
    return list_metadata, list_waveforms

## Then, define a function to store the extracted events in a database

This function creates an h5 file, containing as many groups as there are templates. Each group contains datasets with metadata and waveforms for the corresponding template. In this example, there is only one group because we only use one template.

In [None]:
def write_new_detections(filename, metadata, waveforms, db_path='./output/'):
    """Store the detections of every template in two h5 files.

    The metadata go to ``db_path + filename + 'meta.h5'`` and the waveforms
    to ``db_path + filename + 'wav.h5'``; both files are opened with
    ``mode='w'``. Each file gets one group per template, named after the
    template id. Templates without any detection are skipped. The number of
    events stored for each template is printed.

    Parameters
    ----------
    filename : str
        Common prefix of the two output file names.
    metadata : list of dict
        One dictionary per template; every entry is written as a
        gzip-compressed dataset. Must contain 'origin_times' and
        'template_id'.
    waveforms : list of dict
        One dictionary per template; its 'waveforms' entry is written as an
        lzf-compressed dataset.
    db_path : str
        Prepended as is to the file names (no path separator is added).
    """
    meta_path = db_path + filename + 'meta.h5'
    wave_path = db_path + filename + 'wav.h5'

    with h5.File(meta_path, mode='w') as meta_file:
        for meta in metadata:
            if len(meta['origin_times']) == 0:
                # no detection
                continue
            group = meta_file.create_group(
                '{:d}'.format(meta['template_id'][0]))
            for key, values in meta.items():
                group.create_dataset(key, data=values, compression='gzip')

    with h5.File(wave_path, mode='w') as wave_file:
        for t, meta in enumerate(metadata):
            if len(meta['origin_times']) == 0:
                # no detection
                continue
            template_id = meta['template_id'][0]
            group = wave_file.create_group('{:d}'.format(template_id))
            event_waveforms = waveforms[t]['waveforms']
            group.create_dataset('waveforms',
                                 data=event_waveforms,
                                 compression='lzf')
            print('{:d} events detected with Template {:d}'.format(
                event_waveforms.shape[0], template_id))

## Load the data and the template

In [None]:
# load the data from day 2013-03-17
# (used below as a dictionary with the entries 'waveforms' and 'metadata')
data = utils.load_data('data_FMF_tutorial.h5')

# load the template event that we have just built, from ./output/
# (only its 'moveouts_S' and 'moveouts_P' entries are used in this script)
template = utils.load_template('template.h5', path='./output/')

## Format the moveouts

In [None]:
# One column per component, in the order (S, S, P): the S-wave moveouts are
# used on the first two components and the P-wave moveouts on the last one.
moveouts = np.concatenate(
    [template[phase].reshape(-1, 1)
     for phase in ('moveouts_S', 'moveouts_S', 'moveouts_P')],
    axis=1)
# Add a leading axis of length 1 for the (single) template.
moveout_array = np.expand_dims(moveouts, axis=0)

## Load the correlation coefficient time series that we saved previously

In [None]:
# correlation coefficient time series read from ./output/; passed below as
# the `cc_sums` argument of extract_new_detections, which indexes it as
# (template, correlation sample)
cc_sum = utils.load_cc('cc_sum.h5', path='./output/')

## Fix the detection threshold and extract the events

In [None]:
# extract the events with cc n_mad times higher than MAD
n_mad = 10.
metadata, waveforms = extract_new_detections(data, cc_sum, moveout_array, n_mad=n_mad)
print('metadata and waveforms are Python lists with lengths {:d} and {:d}\
 (respectively) because we use a single template.\n'.format(len(metadata), len(waveforms)))
print('Elements of metadata are dictionaries with information on:\n', list(metadata[0].keys()))
print('\n')
print('Elements of waveforms are numpy arrays with shape:\n', waveforms[0]['waveforms'].shape)

## Store the extracted events in an h5 database

In [None]:
# The function appends 'meta.h5' and 'wav.h5' directly to the given file name
# (no separator), so with n_mad = 10 this writes
# ./output/detections_20130317_10_madmeta.h5 and
# ./output/detections_20130317_10_madwav.h5
write_new_detections('detections_20130317_{:d}_mad'.format(int(n_mad)), metadata, waveforms, db_path='./output/')