# Continuous transfer learning

## Update QNet on a regular (daily) basis

Start with updated QNet using 4 days of data.
Next, update using all events predicted by QNet on day 5, with locations given by the catalog. Then use days 1-5, including the field data, to update the model (20-40 epochs) and select the best model based on the Dice coefficient. Apply it to day 6 and repeat the process.

In [2]:
import tensorflow as tf
from tensorflow.keras import backend as K

import random

import shutil
import os
import time

import xarray as xr
import pandas as pd

from sklearn.model_selection import train_test_split

## Data augmentations and normalization

In [3]:
# The data augmentation functions defined in this block are only used for the synthetic data

def random_t_shifts(d):
    """Circularly shift every column of ``d`` by its own random offset.

    Each column is rolled by an offset drawn with ``np.random.choice(1024)``.
    The array is modified in place and the same object is returned.
    """
    n_cols = d.shape[1]
    for col in range(n_cols):
        offset = np.random.choice(1024)
        d[:, col] = np.roll(d[:, col], offset)
    return d

def full_field_noise(data):
    """Add a randomly chosen, randomly scaled window of field noise to ``data``.

    A record (first axis) and a start sample (second axis) are picked at
    random from ``cfg.field_noise``; the ``cfg.N_TIMESAMPLES``-long window is
    normalised, multiplied by a factor drawn from ``np.random.uniform(4, 12)``
    and added to the normalised input.
    """
    n_records = cfg.field_noise.shape[0]
    n_noise_samples = cfg.field_noise.shape[1]

    # Pick the record first, then the start of the time window.
    record = random.choice(np.arange(n_records))
    start = random.choice(np.arange(0, n_noise_samples - cfg.N_TIMESAMPLES))
    window = cfg.field_noise[record, start:start + cfg.N_TIMESAMPLES, :]

    amplitude = np.random.uniform(4.0, 12.0)
    scaled_noise = amplitude * normalize(window)
    return normalize(data) + scaled_noise

def tf_full_field_noise(data: tf.Tensor) -> tf.Tensor:
    """Apply ``full_field_noise`` to a tensor through ``tf.py_function``.

    The input's static shape is re-attached to the result before returning.
    """
    static_shape = data.shape
    noisy = tf.py_function(full_field_noise, [data], [tf.float32])[0]
    noisy.set_shape(static_shape)
    return noisy

def station_dropout(data):
    """Zero out a random subset of the columns of ``data``.

    The number of dropped columns comes from
    ``random.randrange(cfg.MIN_DROP, cfg.MAX_DROP, 1)``; the columns
    themselves are chosen without replacement.
    """
    n_columns = data.shape[1]
    n_drop = random.randrange(cfg.MIN_DROP, cfg.MAX_DROP, 1)
    dropped = np.random.choice(np.arange(n_columns), n_drop, replace=False)

    # Multiplicative mask: 1 keeps a column, 0 silences it.
    keep_mask = np.ones((data.shape))
    keep_mask[:, dropped] = 0.0
    return np.multiply(data, keep_mask)

def tf_station_dropout(data: tf.Tensor) -> tf.Tensor:
    """Apply ``station_dropout`` to a tensor through ``tf.py_function``.

    The input's static shape is re-attached to the result before returning.
    """
    static_shape = data.shape
    masked = tf.py_function(station_dropout, [data], [tf.float32])[0]
    masked.set_shape(static_shape)
    return masked

def time_shift(data, z):
    """Shift ``data`` along axis 0 by a random offset whose range depends on ``z``.

    ``np.argmax(z)`` selects one of six (lower, upper) bound pairs; the offset
    is drawn with ``np.random.randint`` between them. Samples that wrap around
    during the roll are set to zero. The input array is not modified.
    """
    lower_bounds = np.array([-100, -150, -200, -240, -270, -310])*0.7
    upper_bounds = np.array([550, 500, 450, 400, 350, 300])*0.7

    label = np.argmax(z)
    shift = np.random.randint(lower_bounds[label], upper_bounds[label])

    shifted = np.roll(data, shift, axis=0)

    # np.roll wraps samples around the ends; blank the wrapped part.
    if shift > 0:
        shifted[:shift, :] = 0
    elif shift < 0:
        shifted[shift:, :] = 0

    return shifted

def tf_time_shift(data, z):
    """Apply ``time_shift`` to a tensor through ``tf.py_function``.

    The input's static shape is re-attached to the result before returning.
    """
    static_shape = data.shape
    shifted = tf.py_function(time_shift, [data, z], [tf.float32])[0]
    shifted.set_shape(static_shape)
    return shifted

def normalize(data):
    """Divide ``data`` by its largest absolute value."""
    peak = np.max(np.abs(data))
    return data / peak

def tf_normalize(data):
    """Divide a tensor by the maximum of its absolute values."""
    peak = tf.math.reduce_max(tf.math.abs(data))
    return tf.math.divide(data, peak)

def FCN_output(sx):
    """Build a (128, 96, 64) Gaussian volume centred on a source location.

    Args:
        sx: indexable whose first three entries are the x, y, z source
            coordinates; each is rounded before use.

    Returns:
        Array of shape (128, 96, 64) evaluated on the fixed grid
        x in [5500, 8500], y in [3500, 6100], z in [1400, 3600] with a
        standard deviation of 200 on every axis, scaled so its maximum is 1.
    """
    two_sigma_sq = 2*200**2

    # One broadcastable axis per spatial dimension.
    x_axis = np.reshape(np.linspace(5500, 8500, 128), (128, 1, 1))
    y_axis = np.reshape(np.linspace(3500, 6100, 96), (1, 96, 1))
    z_axis = np.reshape(np.linspace(1400, 3600, 64), (1, 1, 64))

    xc, yc, zc = np.round(sx[0]), np.round(sx[1]), np.round(sx[2])

    exponent = (x_axis-xc)**2/two_sigma_sq + (y_axis-yc)**2/two_sigma_sq + (z_axis-zc)**2/two_sigma_sq
    volume = np.exp(-exponent)
    return volume/np.max(volume)

def tf_FCN_output(xc,yc,zc):
    """Build the ``FCN_output`` label volume for one source through ``tf.py_function``.

    The three coordinates are passed as a single list argument and the result
    is given the static shape (128, 96, 64).
    """
    target = tf.py_function(FCN_output, [[xc, yc, zc]], [tf.float32])[0]
    target.set_shape((128, 96, 64))
    return target

def FCN_output_perturbed(sx):
    """Build a (128, 96, 64) Gaussian volume around a randomly perturbed source.

    Args:
        sx: indexable of six values: x, y, z source coordinates followed by
            the location error for each axis.

    Each coordinate is drawn from a normal distribution centred on the source
    and re-drawn until it lies within the axis' location error. The spread of
    the normal is the standard deviation of the integers in
    ``[-int(error), int(error)]``.

    Returns:
        Array of shape (128, 96, 64), scaled so its maximum is 1.
    """
    two_sigma_sq = 2*200**2

    x_axis = np.reshape(np.linspace(5500, 8500, 128), (128, 1, 1))
    y_axis = np.reshape(np.linspace(3500, 6100, 96), (1, 96, 1))
    z_axis = np.reshape(np.linspace(1400, 3600, 64), (1, 1, 64))

    centre = [sx[0], sx[1], sx[2]]
    loc_errors = [sx[3], sx[4], sx[5]]

    def draw(axis):
        half_width = int(loc_errors[axis])
        spread = np.std(np.arange(-half_width, half_width+1))
        return np.random.normal(loc=centre[axis], scale=spread)

    # One draw per axis first, then rejection loops axis by axis.
    picks = [draw(axis) for axis in range(3)]
    for axis in range(3):
        while abs(picks[axis]-centre[axis]) > loc_errors[axis]:
            picks[axis] = draw(axis)

    xc, yc, zc = picks

    exponent = (x_axis-xc)**2/two_sigma_sq + (y_axis-yc)**2/two_sigma_sq + (z_axis-zc)**2/two_sigma_sq
    volume = np.exp(-exponent)
    return volume/np.max(volume)

def tf_FCN_output_perturbed(xc,yc,zc, rx, ry, rz):
    """Build a perturbed label volume for one source through ``tf.py_function``.

    Args:
        xc, yc, zc: source coordinates.
        rx, ry, rz: location errors for the three axes.

    Returns:
        Float32 tensor with static shape (128, 96, 64).
    """
    # Must wrap FCN_output_perturbed: FCN_output reads only the first three
    # entries, so the location errors would be ignored and nothing perturbed.
    [fcn,] = tf.py_function(FCN_output_perturbed, [[xc,yc,zc, rx, ry, rz]], [tf.float32])
    fcn.set_shape((128,96,64))
    return fcn

## TFR reading pipeline

In [4]:
# Create an input function reading a file using the Dataset API
def read_dataset(prefix, batch_size, config):
  """Build a batched, prefetched ``tf.data`` pipeline from TFRecord files.

  Args:
    prefix: "train" or "val"; also used as the file-name prefix when globbing.
    batch_size: number of examples per batch.
    config: object providing N_TIMESAMPLES, N_TRACES, PATH_TO_SYNTH_DATA,
      PATH_TO_FIELD_DATA, TRANSFER_LEARNING and N_SYNTH_FILES.

  Returns:
    Dataset yielding ``(data, label)`` batches, where ``data`` has shape
    (N_TIMESAMPLES, N_TRACES, 1) per example and ``label`` is the volume
    produced by ``tf_FCN_output``.

  NOTE(review): any prefix other than "train" or "val" leaves ``dataset``
  unassigned, so the final ``return`` fails.
  """
  def _input_fn(example_serialized):
    # Parser for training records, including the location-error fields.
    feature_map = {
        "x_true": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "y_true": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "z_true": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "sx_error": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "sy_error": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "sz_error": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "z_id": tf.io.FixedLenFeature(shape=[6], dtype=tf.float32),
        "data": tf.io.FixedLenFeature(shape=[config.N_TIMESAMPLES, config.N_TRACES], dtype=tf.float32),
        }

    parsed = tf.io.parse_single_example(example_serialized, feature_map)
    data = parsed["data"]

    # Records whose z_id has a non-negative maximum get the augmentations.
    # The field records written by `parser` in this file carry z_id = -1
    # everywhere, so they take the else branch and are left un-augmented.
    if tf.math.reduce_max(parsed["z_id"])>=0:
      data = tf_time_shift(data, parsed["z_id"])
      data = tf_full_field_noise(data)
      data = tf_station_dropout(data)
      label = tf_FCN_output(parsed["x_true"], parsed["y_true"], parsed["z_true"])
    else:
      label = tf_FCN_output(parsed["x_true"], parsed["y_true"], parsed["z_true"])
      #label = tf_FCN_output_perturbed(parsed["x_true"], parsed["y_true"], parsed["z_true"],
      #                                parsed["sx_error"], parsed["sy_error"], parsed["sz_error"])

    data = tf_normalize(data)
    # Append a trailing channel axis.
    data = tf.reshape(data, (config.N_TIMESAMPLES, config.N_TRACES, 1))
    
    return (data, label)

  def _input_fn2(example_serialized):
    # Parser for validation records: no augmentation, no error fields.
    feature_map = {
        "x_true": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "y_true": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "z_true": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        "z_id": tf.io.FixedLenFeature(shape=[6], dtype=tf.float32),
        "data": tf.io.FixedLenFeature(shape=[config.N_TIMESAMPLES, config.N_TRACES], dtype=tf.float32),
        }

    parsed = tf.io.parse_single_example(example_serialized, feature_map)
    data = parsed["data"]

    label = tf_FCN_output(parsed["x_true"], parsed["y_true"], parsed["z_true"])

    data = tf_normalize(data)
    data = tf.reshape(data, (config.N_TIMESAMPLES, config.N_TRACES, 1))
    
    return (data, label)

  if prefix=="train":

    # Get synthetic and field data
    file_path_synth = os.path.join(config.PATH_TO_SYNTH_DATA,'%s*' % prefix)
    file_path_field = os.path.join(config.PATH_TO_FIELD_DATA, '%s*' % prefix)

    # Create list of files that match pattern and process synthetic dataset
    file_list_synth = tf.io.matching_files(file_path_synth)
    file_list_synth = tf.random.shuffle(file_list_synth)
    file_list_field = tf.io.matching_files(file_path_field)
    if config.TRANSFER_LEARNING == True:
      # Keep only the first N_SYNTH_FILES of the shuffled synthetic files.
      file_list_synth = tf.slice(file_list_synth, [0], [config.N_SYNTH_FILES])
      file_list = tf.concat([file_list_synth, file_list_field], axis=-1) # combine field and synthetic data list
      shards = tf.data.Dataset.from_tensor_slices(file_list)
      shards = shards.shuffle(tf.cast(tf.shape(file_list)[0], tf.int64))
      shards = shards.repeat()
    else:
      # Same as above but with every matching synthetic file.
      file_list = tf.concat([file_list_synth, file_list_field], axis=-1)
      shards = tf.data.Dataset.from_tensor_slices(file_list)
      shards = shards.shuffle(tf.cast(tf.shape(file_list)[0], tf.int64))
      shards = shards.repeat()
    # Feed the shards into TFRecordDataset and randomize again with interleave.
    dataset = shards.interleave(tf.data.TFRecordDataset, cycle_length=4, block_length=16, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=10*batch_size)
    # repeat(None) repeats indefinitely; the shard list is already repeated above.
    num_epochs=None
    dataset = dataset.map(_input_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(num_epochs).batch(batch_size)


  if prefix == "val":
    # Validation uses field data only, in a single un-shuffled pass.
    file_path_field = os.path.join(config.PATH_TO_FIELD_DATA, '%s*' % prefix)
    file_list_field = tf.io.matching_files(file_path_field)
    shards = tf.data.Dataset.from_tensor_slices(file_list_field)
    dataset = tf.data.TFRecordDataset(shards)
    num_epochs = 1
    dataset = dataset.map(_input_fn2, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(num_epochs).batch(batch_size)

  return dataset.prefetch(tf.data.experimental.AUTOTUNE)


## Model updating/training

In [5]:
def dice_coef_gaussian(y_true, y_pred, smooth=1):
  """Dice coefficient after thresholding both inputs at 0.1.

  Values above 0.1 become 1 and all others 0; the smoothed Dice ratio is
  computed per example over axes 1-3 and averaged over the batch axis.
  """
  def binarise(volume):
    as_int = tf.dtypes.cast(volume>0.1, tf.int32)
    return tf.dtypes.cast(as_int, tf.float32)

  true_mask = binarise(y_true)
  pred_mask = binarise(y_pred)
  overlap = K.sum(true_mask * pred_mask, axis=[1,2,3])
  total = K.sum(true_mask, axis=[1,2,3]) + K.sum(pred_mask, axis=[1,2,3])
  return K.mean((2. * overlap + smooth)/(total + smooth), axis=0)

def dice_coef(y_true, y_pred, smooth=1):
  """Smoothed Dice coefficient on the raw (un-thresholded) values.

  The ratio is computed per example over axes 1-3 and averaged over the
  batch axis.
  """
  truth = tf.dtypes.cast(y_true, tf.float32)
  guess = tf.dtypes.cast(y_pred, tf.float32)
  overlap = K.sum(truth * guess, axis=[1,2,3])
  total = K.sum(truth, axis=[1,2,3]) + K.sum(guess, axis=[1,2,3])
  return K.mean((2. * overlap + smooth)/(total + smooth), axis=0)

# Binarise both volumes at 0.1 (above -> 1, otherwise -> 0) before scoring
def iou_coef_gaussian(y_true, y_pred, smooth=1):
    """Smoothed intersection-over-union after thresholding both inputs at 0.1.

    The ratio is computed per example over axes 1-3 and averaged over the
    batch axis.
    """
    def binarise(volume):
        as_int = tf.dtypes.cast(volume>0.1, tf.int32)
        return tf.dtypes.cast(as_int, tf.float32)

    true_mask = binarise(y_true)
    pred_mask = binarise(y_pred)
    overlap = K.sum(K.abs(true_mask * pred_mask), axis=[1,2,3])
    combined = K.sum(true_mask,[1,2,3])+K.sum(pred_mask,[1,2,3])-overlap
    return K.mean((overlap + smooth) / (combined + smooth), axis=0)

def train_and_evaluate(config):
  """Load the model named by ``config.LOAD_MODEL`` and fine-tune it ``config.N_ITS`` times.

  After every ``fit`` call (``config.N_EPOCHS`` epochs each) the full model is
  saved to ``config.PATH_TO_TL_MODELS`` as ``<MODEL_NAME>_<i+82>.h5``.

  Args:
    config: object providing the paths, file-count constants, batch size,
      learning rate and iteration counts read below.

  NOTE(review): the ``checkpoint`` callback is created but never passed to
  ``fit``, so no best-weights checkpoint is written by this function.
  NOTE(review): the ``+82`` offset in the saved file name is hard-coded;
  ``model_eval`` in this file looks for indices starting at 0 — confirm.
  """

  # create checkpoint to save best model based on dice coef in validation set
  filepath = os.path.join(config.PATH_TO_CKPTS, 'ckpt')
  checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=filepath, 
                             monitor='val_dice_coef',
                             save_weights_only=True,
                             save_best_only=True,
                             mode='max')

  print("loading model...")
  generator = tf.keras.models.load_model(
      os.path.join(config.PATH_TO_MODELS, config.LOAD_MODEL),
      custom_objects={
          'ReLU':tf.keras.layers.ReLU,
          'dice_coef_gaussian': dice_coef_gaussian, 'dice_coef': dice_coef,
          'iou_coef_gaussian': iou_coef_gaussian,
          })
  
  # Re-compile with a fresh Adam optimizer at the configured learning rate.
  generator.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE),
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[dice_coef_gaussian, dice_coef])

  # Compute train and validation steps
  file_path = os.path.join(config.PATH_TO_SYNTH_DATA,'%s*' % config.TRAIN)
  file_list = tf.io.matching_files(file_path)

  # Synthetic example count = files used * examples per synthetic TFRecord.
  file_list = file_list[:config.N_SYNTH_FILES]
  n_train_examples_synth = len(file_list)*config.N_FILES_SYNTH_TFRECORD
  
  file_path = os.path.join(config.PATH_TO_FIELD_DATA, '%s*' % config.TRAIN)
  file_list = tf.io.matching_files(file_path)
  n_train_files_field = len(file_list)*config.N_FILES_FIELD_TFRECORD_TRAIN

  n_train_examples = n_train_files_field + n_train_examples_synth

  steps_per_epoch = n_train_examples // config.BATCH_SIZE

  file_path = file_path = os.path.join(config.PATH_TO_FIELD_DATA, '%s*' % config.VAL)
  file_list = tf.io.matching_files(file_path)
  val_steps = (len(file_list)*config.N_FILES_FIELD_TFRECORD_VAL) // config.BATCH_SIZE



  # Earlier single-run variant (checkpoint callback + best-weights reload),
  # kept for reference:
  #trainds = read_dataset(prefix=config.TRAIN, batch_size=config.BATCH_SIZE, config=config)
  #valds = read_dataset(prefix=config.VAL, batch_size=config.BATCH_SIZE, config=config)

  #callbacks = [checkpoint]
  #generator.fit(
  #    trainds,
  #    validation_data=valds,
  #    epochs=config.N_EPOCHS,
  #    steps_per_epoch=steps_per_epoch,
  #    validation_steps=val_steps,
  #    verbose=1,
  #    callbacks=callbacks)
  #
  ## After training load best model and save it to desired directory
  #generator.load_weights(filepath)
  #new_idx = config.model_idx + 1
  #config.model_idx = new_idx
  #fn_new_model = "QNet_updated_{}.h5".format(config.model_idx)
  #config.LOAD_MODEL = fn_new_model
  #generator.save(os.path.join(config.PATH_TO_MODELS, fn_new_model), overwrite=True, include_optimizer=True)
  
  for i in range(config.N_ITS):
    # Load the training data
    trainds = read_dataset(prefix=config.TRAIN, batch_size=config.BATCH_SIZE, config=config)
    valds = read_dataset(prefix=config.VAL, batch_size=config.BATCH_SIZE, config=config)
    generator.fit(
        trainds,
        validation_data=valds,
        epochs=config.N_EPOCHS,
        steps_per_epoch=steps_per_epoch,
        validation_steps=val_steps,
        verbose=1,)
    
    # Save a snapshot after every iteration (the same model keeps training).
    save_as = os.path.join(config.PATH_TO_TL_MODELS, "{}_{}.h5".format(config.MODEL_NAME, i+82))
    generator.save(save_as, overwrite=True, include_optimizer=True)

## Model evaluation

In [6]:

def run_validations(data_files, model):
    """Score ``model`` on every file in ``data_files`` against its stored source location.

    Each gather is resampled to 1024 time samples, normalised and passed to
    ``model.predict``. The prediction is compared with the target volume that
    ``FCN_output`` builds from ``ds.attrs['source_coordinates']``.

    Args:
        data_files: iterable of paths readable by ``xr.open_dataset``.
        model: object with a Keras-style ``predict`` method.

    Returns:
        Six lists with one entry per file, in this order:
        ``F1`` (``dice_coef``), ``F1_binary`` (``dice_coef_gaussian``),
        ``hypo_dist``, ``epi_dist``, ``depth_dist`` (distances between the
        stored source and the grid node with the highest prediction) and
        ``max_val`` (highest prediction value).
    """
    F1, F1_binary = [], []
    hypo_dist, epi_dist, depth_dist = [], [], []
    max_val = []

    # Grid axes of the prediction volume.
    x = np.linspace(5500, 8500, 128)
    y = np.linspace(3500, 6100, 96)
    z = np.linspace(1400, 3600, 64)

    # Source and target time axes are the same for every file.
    dt = 0.002
    t = np.arange(0, 1401)*dt
    tn = np.linspace(0, t[-1], 1024)

    for fn in data_files:

        # Read what is needed and close the file straight away.
        with xr.open_dataset(fn) as ds:
            data = ds['data'].values
            sx_new = ds.attrs['source_coordinates']

        data_new = np.zeros((len(tn), data.shape[1]))
        for trace in range(data_new.shape[1]):
            data_new[:, trace] = np.interp(tn, t, data[:, trace])

        data = normalize(data_new)
        data = np.reshape(data, (1, data.shape[0], data.shape[1], 1))
        prediction = model.predict(data)
        desired = FCN_output(sx_new)
        desired = np.reshape(desired, (1, desired.shape[0], desired.shape[1], desired.shape[2]))

        desired_tf = tf.convert_to_tensor(desired, np.float32)
        prediction_tf = tf.convert_to_tensor(prediction, np.float32)
        F1.append(dice_coef(desired_tf, prediction_tf).numpy())
        F1_binary.append(dice_coef_gaussian(desired_tf, prediction_tf).numpy())

        # Predicted location = grid node with the highest output value.
        prediction = prediction[0, :]
        maxIdx = np.unravel_index(np.argmax(prediction), prediction.shape)
        xp, yp, zp = x[maxIdx[0]], y[maxIdx[1]], z[maxIdx[2]]

        max_val.append(np.max(prediction))

        hypo_dist.append(np.sqrt((sx_new[0]-xp)**2+(sx_new[1]-yp)**2+(sx_new[2]-zp)**2))
        epi_dist.append(np.sqrt((sx_new[0]-xp)**2+(sx_new[1]-yp)**2))
        depth_dist.append(np.sqrt((sx_new[2]-zp)**2))

    return F1, F1_binary, hypo_dist, epi_dist, depth_dist, max_val

def model_eval(config):
  """Return the name of the saved model with the highest mean ``dice_coef``.

  Models ``<MODEL_NAME>_<i>.h5`` for ``i`` in ``range(config.N_ITS)`` are
  loaded from ``config.PATH_TO_TL_MODELS`` and scored with
  ``run_validations`` on every file in ``config.PATH_TO_FIELD_DATA_NC``.

  Args:
    config: object providing N_ITS, PATH_TO_TL_MODELS, MODEL_NAME and
      PATH_TO_FIELD_DATA_NC.

  Returns:
    ``"<MODEL_NAME>_<best index>"`` (without the ``.h5`` extension).

  NOTE(review): ``train_and_evaluate`` in this file saves its models with
  indices starting at 82, while this loop starts at 0 — confirm the files
  exist under these names.
  """

  F1 = []

  # The validation files come from the passed-in config (not a global) and
  # do not change between models, so list them once.
  files = glob.glob(config.PATH_TO_FIELD_DATA_NC + '/*')

  for i in range(config.N_ITS):

    # Load model
    path_model = os.path.join(config.PATH_TO_TL_MODELS, config.MODEL_NAME + '_{}.h5'.format(i))
    model = tf.keras.models.load_model(
        path_model,
        custom_objects={
            'LeakyReLU':tf.keras.layers.LeakyReLU, 'ReLU':tf.keras.layers.ReLU,
            'dice_coef_gaussian': dice_coef_gaussian, 'dice_coef': dice_coef,
            'iou_coef_gaussian': iou_coef_gaussian,
        })

    # Only the first returned list (dice_coef per file) is used for ranking.
    F1_tmp = run_validations(files, model)[0]
    F1.append(np.mean(F1_tmp))

  print(F1)
  print(len(F1))
  mod_idx = np.argmax(F1)

  mod_name = config.MODEL_NAME + '_{}'.format(mod_idx)

  return mod_name


## Load QNetPrev updated on 4 days of field data and make predictions on day 5

In [18]:
import glob
import numpy as np

# List the files in the continuous-noise folder.
files = glob.glob("drive/My Drive/Texas_TL/Data/contNoise_newShape/*")

# Stack every noise file into one array with a leading record axis.
# NOTE(review): this code is commented out, yet Config needs a `field_noise`
# array — confirm it is defined elsewhere before Config() is created.
#field_noise = np.load(files[0])
#n_size = field_noise.shape
#field_noise = np.reshape(field_noise, (1, n_size[0], n_size[1]))
#for i, fn in enumerate(files[1:]):
#  field_noise = np.append(field_noise, np.reshape(np.load(fn), (1,n_size[0], n_size[1])), axis=0)

class Config(object):
  """Paths, hyper-parameters and shared arrays used by the updating workflow.

  Args:
    field_noise: optional array of field-noise records. When omitted, the
      module-level ``field_noise`` variable is used, as before.

  Raises:
    NameError: if ``field_noise`` is omitted and no module-level
      ``field_noise`` exists.
  """
  def __init__(self, field_noise=None):
    if field_noise is None:
      # Fall back to the notebook-level variable so `Config()` keeps working.
      try:
        field_noise = globals()["field_noise"]
      except KeyError:
        raise NameError(
            "field_noise is not defined: pass Config(field_noise=...) or "
            "define a module-level `field_noise` array first") from None

    # PATHS TO MODELS AND OUTPUT
    self.PATH_TO_SYNTH_DATA = "drive/My Drive/Texas_TL/Data/TFR_new"
    self.PATH_TO_FIELD_DATA = "drive/My Drive/Texas_TL/Data/daily_updating_TFRs"#_days4"
    self.PATH_TO_FIELD_DATA_NC = "drive/My Drive/Texas_TL/Data/daily_updating_xarray"
    self.PATH_TO_MODELS = "drive/My Drive/Texas_TL/TL_models/TL_TFR_regular_updating"
    self.PATH_TO_TL_MODELS = "drive/My Drive/Texas_TL/TL_models/TL_TFR_all_temp"
    self.PATH_TO_CKPTS = 'drive/My Drive/Texas_TL/TL_models/ckpt'
    # DEFINE SIZE OF INPUT, BATCH SIZE, NUMBER OF EPOCHS (EVALS), LEARNING RATE, OTHER FIXED NUMBERS
    self.BATCH_SIZE = 10
    self.N_TRACES = 96
    self.N_TIMESAMPLES = 1024
    self.N_EPOCHS = 4
    self.N_ITS = 99
    self.LEARNING_RATE =  0.001
    # Examples per TFRecord file, used to compute steps per epoch.
    self.N_FILES_SYNTH_TFRECORD = 400
    self.N_FILES_FIELD_TFRECORD_TRAIN = 1
    self.N_FILES_FIELD_TFRECORD_VAL = 1
    # Bounds passed to random.randrange when choosing how many stations to drop.
    self.MIN_DROP = 5
    self.MAX_DROP = 25
    # PREFIX FOR TRAINING AND VALIDATION SETS
    self.TRAIN = "train"
    self.VAL = "val"
    # FIELD NOISE RECORDS USED BY full_field_noise
    self.field_noise = field_noise
    # NAME FOR CHECKPOINTS
    self.LOAD_MODEL = "QNet_update_81.h5" # name of model to load before starting training
    self.MODEL_NAME = "QNet_update"
    self.model_idx = 0
    self.TRANSFER_LEARNING = True
    self.N_SYNTH_FILES = 6
    self.detection_threshold = 0.7

In [8]:

def make_predictions(data_files, model, pd_dict_name, check_day, detection_threshold=0.6):

    '''
    Run ``model`` on every file whose ``day`` attribute lies in
    ``[check_day[0], check_day[1])`` and collect the results in a dataframe.

    For each selected file the gather is resampled to 1024 time samples,
    normalised and passed to ``model.predict``. The predicted location is the
    grid node with the highest output value; an event counts as detected when
    that value is at least ``detection_threshold``.

    Args:
        data_files: list of paths to input data (opened with xr.open_dataset)
        model: model used for predictions
        pd_dict_name: path of the CSV file the dataframe is written to
        check_day: pair (first_day, end_day); end_day is exclusive
        detection_threshold: minimum peak value for a detection

    Returns:
        df: pandas dataframe with one row per selected file (also saved to
            disk as CSV)

    '''
    
    distances_old, distances_new = [], []
    true_x_new, true_y_new, true_z_new, pred_x, pred_y, pred_z = [], [], [], [], [], []
    true_x_old, true_y_old, true_z_old = [], [], []
    x_error, y_error, z_error = [], [], []
    snr, magnitude = [], []
    vol, clvd, dc = [], [], []
    file_name = []
    detection = []
    fns = []
    max_val = []
    day = []

    # Grid axes of the prediction volume.
    x = np.reshape(np.linspace(5500,8500,128), (128,))
    y = np.reshape(np.linspace(3500,6100,96), (96,))
    z = np.reshape(np.linspace(1400,3600,64), (64,))
    
    print("making predictions on days >= {} and < {}".format(check_day[0], check_day[1]))
    for i, fn in enumerate(data_files):

        # NOTE(review): the dataset is never closed explicitly.
        ds = xr.open_dataset(fn)

        #if ds.attrs['day'] == check_day:
        if ds.attrs['day'] >= check_day[0] and ds.attrs['day'] <  check_day[1]:

          # Resample from 1401 samples (spacing 0.002) to 1024 samples.
          data = ds['data'].values
          dt = 0.002
          t = np.arange(0,1401)*dt
          tn = np.linspace(0,t[-1],1024)
          dtn = tn[1]-tn[0]
          data_new = np.zeros((len(tn),data.shape[1]))
          # NOTE: this inner loop reuses the name `i` of the outer loop.
          for i in range(data_new.shape[1]):
              data_new[:,i] = np.interp(tn, t, data[:,i])
          data = data_new
          # NOTE(review): "old" is read from 'source_coordinates_new' and "new"
          # from 'source_coordinates' — confirm the naming is intended.
          sx_old = ds.attrs['source_coordinates_new']
          sx_new = ds.attrs['source_coordinates']
          sx_error = ds.attrs['loc_errors']
          
          true_x_new.append(sx_new[0])
          true_y_new.append(sx_new[1])
          true_z_new.append(sx_new[2])
          true_x_old.append(sx_old[0])
          true_y_old.append(sx_old[1])
          true_z_old.append(sx_old[2])
          x_error.append(sx_error[0])
          y_error.append(sx_error[1])
          z_error.append(sx_error[2])
          snr.append(ds.attrs['SNR'])
          magnitude.append(ds.attrs['Magnitude'])
          vol.append(ds.attrs['VOL'])
          clvd.append(ds.attrs['CLVD'])
          dc.append(ds.attrs['DC'])
          file_name.append(fn.split('/')[-1])
          fns.append(fn)
          day.append(ds.attrs['day'])
          
          data = normalize(data)
          # Add batch and channel axes before predicting.
          data = np.reshape(data, (1, data.shape[0], data.shape[1],1))
          prediction = model.predict(data)[0,:]
      
          # Predicted location = grid node with the highest output value.
          maxIdx = np.unravel_index(np.argmax(prediction), prediction.shape)
          xp = x[maxIdx[0]]
          yp = y[maxIdx[1]]
          zp = z[maxIdx[2]]
          
          pred_x.append(xp)
          pred_y.append(yp)
          pred_z.append(zp)
          max_val.append(np.max(prediction))
          
          # Distance from the prediction to each of the two stored locations.
          dist_old = np.sqrt((sx_old[0]-x[maxIdx[0]])**2+(sx_old[1]-y[maxIdx[1]])**2+(sx_old[2]-z[maxIdx[2]])**2)
          dist_new = np.sqrt((sx_new[0]-x[maxIdx[0]])**2+(sx_new[1]-y[maxIdx[1]])**2+(sx_new[2]-z[maxIdx[2]])**2)
          
          distances_old.append(dist_old)
          distances_new.append(dist_new)

          if np.max(prediction) >= detection_threshold:
              detection.append(True)
          else:
              detection.append(False)
            
    df = pd.DataFrame(
        {
            'file': file_name,
            'detection': detection,
            'distances_old': distances_old,
            'distances_new': distances_new,
            'new_true_x': true_x_new,
            'new_true_y': true_y_new,
            'new_true_z': true_z_new,
            'old_true_x': true_x_old,
            'old_true_y': true_y_old,
            'old_true_z': true_z_old,
            'pred_x': pred_x,
            'pred_y': pred_y,
            'pred_z': pred_z,
            'max_val': max_val,
            'x_error': x_error,
            'y_error': y_error,
            'z_error': z_error,
            'SNR': snr,
            'Magnitude': magnitude,
            'Volume': vol,
            'DC': dc,
            'CLVD': clvd,
            'filenames': fns,
            'day': day
        })
    
    df.to_csv(pd_dict_name)

    n_detections = sum(bool(x) for x in detection)
    print("Number of events detected: {}".format(n_detections))

    return df

## Split predicted into a train and val set

Write the training and validation sets to TFRecords and copy the validation files to the daily-updating xarray folder.

In [9]:
def train_val_split(df, detection_threshold=0.7,
                    tfr_dir='drive/My Drive/Texas_TL/Data/daily_updating_TFRs',
                    xarray_dir='drive/My Drive/Texas_TL/Data/daily_updating_xarray',
                    test_size=0.2):
  """Split the detected events into train/val sets and write them to disk.

  Rows of ``df`` with ``max_val >= detection_threshold`` are split at random.
  Both parts are written as TFRecords into ``tfr_dir`` (prefixes ``train``
  and ``val``) and the validation source files are copied to ``xarray_dir``.

  Args:
    df: dataframe with the columns ``max_val``, ``filenames``,
      ``new_true_x/y/z`` and ``x/y/z_error``.
    detection_threshold: minimum ``max_val`` for an event to be used.
    tfr_dir: directory receiving the TFRecord files.
    xarray_dir: directory receiving copies of the validation files.
    test_size: fraction passed to ``train_test_split``.

  Returns:
    None. With fewer than two selected events nothing is written, because a
    split needs at least one event on each side.
  """

  df = df[df['max_val']>=detection_threshold]
  if len(df) < 2:
    print("Only {} event(s) passed the threshold; skipping train/val split".format(len(df)))
    return

  X = df[['filenames','new_true_x','new_true_y','new_true_z', 'x_error', 'y_error', 'z_error']]
  y = df['filenames']

  X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                      test_size=test_size)

  # Write train and val set to tfrecords
  writeTFRs(X_train, tfr_dir, 'train')
  writeTFRs(X_test, tfr_dir, 'val')

  # Copy the validation source files so they can be evaluated later
  for fn in y_test.to_list():
    shutil.copy(fn, os.path.join(xarray_dir, os.path.basename(fn)), follow_symlinks=True)

  
  
def get_data(fn):
  """Read the ``data`` variable of a dataset file and resample it in time.

  Args:
    fn: path readable by ``xr.open_dataset``.

  Returns:
    The array returned by ``interp_time`` for the file's ``data`` values
    (cast to float32 before resampling).
  """
  # The context manager closes the file once the values are in memory.
  with xr.open_dataset(fn) as ds:
    data = ds['data'].values.astype('float32')

  return interp_time(data)

def interp_time(data):
  """Resample every column of ``data`` onto 1024 evenly spaced time samples.

  The input is taken to have 1401 samples with a spacing of 0.002 along
  axis 0; the output covers the same time span with 1024 samples, using
  linear interpolation.

  Returns:
    Float64 array of shape ``(1024, data.shape[1])``.
  """
  src_times = np.arange(0, 1401)*0.002
  dst_times = np.linspace(0, src_times[-1], 1024)

  resampled = np.zeros((len(dst_times), data.shape[1]))
  for col, trace in enumerate(data.T):
    resampled[:, col] = np.interp(dst_times, src_times, trace)

  return resampled

def parser(data, label):
  """Flatten one gather and bundle it with its label values in a dict.

  Args:
    data: 2-D array; flattened in C order.
    label: dataframe slice; the first row of the columns ``new_true_x/y/z``
      and ``x/y/z_error`` is used.

  Returns:
    Dict with the keys ``data``, ``z_id`` (six values of -1), ``sx``, ``sy``,
    ``sz``, ``sx_error``, ``sy_error`` and ``sz_error``.
  """
  def first(column):
    return label[column].values[0]

  flat = np.reshape(data, (data.shape[0]*data.shape[1]), order="C")

  return {
      "data": flat,
      "z_id": np.array([-1,-1,-1,-1,-1,-1]),
      "sx": first("new_true_x"),
      "sy": first("new_true_y"),
      "sz": first("new_true_z"),
      "sx_error": first("x_error"),
      "sy_error": first("y_error"),
      "sz_error": first("z_error"),
  }


def get_tensor_object(single_gather):
  """Wrap a parsed gather dict into a ``tf.train.Example`` of float features.

  ``data`` and ``z_id`` are stored as given; the scalar entries ``sx``,
  ``sy``, ``sz`` are stored under ``x_true``, ``y_true``, ``z_true`` and the
  three error values under their own names.
  """
  def float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

  feature = {
      "data": float_feature(single_gather["data"]),
      "z_id": float_feature(single_gather["z_id"]),
      "x_true": float_feature([single_gather['sx']]),
      "y_true": float_feature([single_gather['sy']]),
      "z_true": float_feature([single_gather['sz']]),
      "sx_error": float_feature([single_gather['sx_error']]),
      "sy_error": float_feature([single_gather['sy_error']]),
      "sz_error": float_feature([single_gather['sz_error']]),
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))

def writeTFRs(xy_info, output_dir, split):
  """Write one TFRecord file per row of ``xy_info`` into ``output_dir``.

  Files are named ``<split>_field_data_<k>.tfrecord``, where ``k`` starts at
  the number of entries already in ``output_dir`` when the function is called.

  Args:
    xy_info: dataframe with a ``filenames`` column plus the label columns
      read by ``parser``.
    output_dir: existing directory for the TFRecord files.
    split: file-name prefix, e.g. ``'train'`` or ``'val'``.
  """
  offset = len(os.listdir(output_dir))
  source_files = xy_info['filenames'].values

  for row, source_fn in enumerate(source_files):
    record_name = '{}_field_data_{}.tfrecord'.format(split, offset+row)
    record_path = os.path.join(output_dir, record_name)

    with tf.io.TFRecordWriter(record_path) as tfwriter:
      gather = get_data(source_fn)
      # Positional one-row slice keeps the label as a dataframe for `parser`.
      fields = parser(gather, xy_info[row:row+1])
      example = get_tensor_object(fields)
      tfwriter.write(example.SerializeToString())

0) Empty the folders that are filled on the fly (for example the daily updating xarray folder), then fill them with the data needed to begin with.

In [None]:
# Remove every file from the daily-updating xarray folder.
# NOTE(review): the shell reads `My\Drive` as `MyDrive`, while the Python cells
# use "drive/My Drive" — confirm both refer to the same folder.
! rm drive/My\Drive/Texas_TL/Data/daily_updating_xarray/*

In [None]:
def fill_xarray(files, max_day=20,
                dst_dir='drive/My Drive/Texas_TL/Data/daily_updating_xarray'):
  """Copy every file whose ``day`` attribute is below ``max_day`` into ``dst_dir``.

  Args:
    files: iterable of paths readable by ``xr.open_dataset``.
    max_day: exclusive upper bound on the ``day`` attribute.
    dst_dir: destination directory; files keep their base name.
  """
  for fn in files:
    # Only the attribute is needed, so close the file before copying it.
    with xr.open_dataset(fn) as ds:
      day = ds.attrs['day']
    if day < max_day:
      shutil.copy(fn, os.path.join(dst_dir, os.path.basename(fn)), follow_symlinks=True)

In [None]:
# Seed the xarray folder from the test files.
# NOTE: `test_fns` is defined in the "Load QNet" cell further down, so that
# cell has to be run first.
fill_xarray(test_fns)

1) Load QNet

In [None]:
cfg = Config()

# Collect the field data files; the train and val lists are merged.
train_fns = glob.glob('drive/My Drive/cnn_gpu/texas_dataset' + '/train*.nc')
val_fns = glob.glob('drive/My Drive/cnn_gpu/texas_dataset' + '/val*.nc')
test_fns = glob.glob('drive/My Drive/cnn_gpu/texas_dataset' + '/test*.nc')
train_fns = np.append(train_fns,val_fns)

# files that are always used to make predictions on the following day
field_data_files = np.append(train_fns,test_fns)

# Load the current model together with the custom metrics it was saved with.
print("load model: {}".format(os.path.join(cfg.PATH_TO_MODELS, cfg.LOAD_MODEL)))
QNet = tf.keras.models.load_model(
      os.path.join(cfg.PATH_TO_MODELS, cfg.LOAD_MODEL),
      custom_objects={
          'ReLU':tf.keras.layers.ReLU,
          'dice_coef_gaussian': dice_coef_gaussian, 'iou_coef_gaussian': iou_coef_gaussian,
          'dice_coef': dice_coef,
          })

load model: drive/My Drive/Texas_TL/TL_models/TL_TFR_regular_updating/QNet_0.h5


2) Apply to following day

In [None]:
# Predict on days 20-22 (upper bound 23 is exclusive); results also go to tmp.csv.
df_day = make_predictions(data_files=field_data_files, model=QNet, pd_dict_name='tmp.csv', check_day=(20,23), detection_threshold=0.7)

making predictions on days >= 20 and < 23
Number of events detected: 285


3) Split predictions that passed threshold to train/val set

In [None]:
# Write the detected events to train/val TFRecords and copy the val files.
train_val_split(df_day)

4) Update QNet using field data up to current day

In [None]:
# Fresh config, then fine-tune; the prints show LOAD_MODEL/model_idx before and after.
cfg = Config()
print(cfg.LOAD_MODEL, cfg.model_idx)
train_and_evaluate(cfg)
print(cfg.LOAD_MODEL, cfg.model_idx)

5) Evaluate best model

In [None]:
# Score every saved model and keep the name of the best one.
cfg = Config()
best_model = model_eval(cfg)
best_model

6) Copy best model to new file and rename model to load in config file

In [20]:
def best_model_update(config, model_name, update_idx):
  """Copy the chosen model into the update directory and point the config at it.

  Args:
    config: object with PATH_TO_TL_MODELS and PATH_TO_MODELS; its LOAD_MODEL
      attribute is set to the new file name.
    model_name: name of the source model file without the ``.h5`` extension.
    update_idx: number used in the new file name ``QNet_updated_<idx>.h5``.
  """
  new_name = "QNet_updated_{}.h5".format(update_idx)

  # The config is updated first, then the file is copied.
  config.LOAD_MODEL = new_name

  source = os.path.join(config.PATH_TO_TL_MODELS, model_name+'.h5')
  destination = os.path.join(config.PATH_TO_MODELS, new_name)
  print(source)
  print(destination)
  shutil.copy(source, destination, follow_symlinks=True)

In [None]:
# Publish the best model as QNet_updated_69.h5 and make it the model to load.
best_model_update(cfg, best_model, 69)

# Full model updating iteration code

In [None]:
cfg = Config()

# Collect the field data files; the train and val lists are merged.
train_fns = glob.glob('drive/My Drive/cnn_gpu/texas_dataset' + '/train*.nc')
val_fns = glob.glob('drive/My Drive/cnn_gpu/texas_dataset' + '/val*.nc')
test_fns = glob.glob('drive/My Drive/cnn_gpu/texas_dataset' + '/test*.nc')
train_fns = np.append(train_fns,val_fns)

# files that are always used to make predictions on the following day
field_data_files = np.append(train_fns,test_fns)


def full_iterative_model_updating(config, field_data_files, update_days):
  """Run the predict / split / retrain cycle for each pair of consecutive days.

  Args:
    config: configuration object passed on to the loading and training steps.
    field_data_files: files handed to ``make_predictions`` every cycle.
    update_days: day boundaries; cycle ``k`` covers
      ``[update_days[k], update_days[k+1])``.

  NOTE(review): ``train_and_evaluate`` as written in this file does not
  change ``config.LOAD_MODEL``, so each cycle appears to reload the same
  model unless LOAD_MODEL is updated elsewhere — confirm.
  """
  custom_objects = {
      'ReLU':tf.keras.layers.ReLU,
      'dice_coef_gaussian': dice_coef_gaussian, 'iou_coef_gaussian': iou_coef_gaussian,
      'dice_coef': dice_coef,
  }

  for day_start, day_end in zip(update_days[:-1], update_days[1:]):

    print("load model: " + config.LOAD_MODEL)
    model_path = os.path.join(config.PATH_TO_MODELS, config.LOAD_MODEL)
    QNet = tf.keras.models.load_model(model_path, custom_objects=custom_objects)

    # "Real-time monitoring": predict on the current day window
    df_day = make_predictions(field_data_files, QNet, 'tmp.csv', (day_start, day_end), config.detection_threshold)

    # Events that passed the threshold would be relocated offline with a
    # classic method; those refined locations form the labelled data that
    # enriches the current field dataset
    train_val_split(df_day)

    # Wait so the new field training files are fully saved
    time.sleep(120)

    # Update the model with the enriched field dataset
    train_and_evaluate(config)

# Four update cycles: days 20, 21, 22 and 23 (24 is only the final upper bound).
full_iterative_model_updating(cfg, field_data_files, update_days=[20,21,22,23,24])

In [None]:
# Remove the field-data TFRecords written by writeTFRs (train and val).
# NOTE(review): the shell reads `My\Drive` as `MyDrive`, while the Python cells
# use "drive/My Drive" — confirm both refer to the same folder.
! rm drive/My\Drive/Texas_TL/Data/daily_updating_TFRs/train_field_data_*
! rm drive/My\Drive/Texas_TL/Data/daily_updating_TFRs/val_field_data_*