# Model Error Pipeline

## Introduction

The purpose of this notebook is to examine the traces on which the model performed poorly. In particular, it considers the 10 dB SNR dataset generated from ETHZ traces and Joachim noises, and looks at the traces where the pretrained model's pick was more than one second away from the label.

## Preparations - Imports, Config, Data and model loading etc.

In [1]:
import os
import seisbench.models as sbm
import torch
from evaluation.noisy_dataset_evaluation import find_large_error_traces
from experiments.filtering import fft_filter_experiment
from utils.common import load_dataset_and_labels, load_pretrained_model

In [2]:
# Values this notebook supports for the dataset origin and the model class.
DATASETS_ORIGINS = ['ethz', 'geofon']
SBM_CLASSES = [sbm.PhaseNet, sbm.EQTransformer]
# Number of samples per trace used with each model class.
MODEL_TO_NUM_SAMPLES = {
    sbm.EQTransformer: 6000,
    sbm.PhaseNet: 3001,
}

In [3]:
# Dataset the traces originate from; also passed to the pretrained-model loader further down.
dataset_origin = 'ethz'
assert dataset_origin in DATASETS_ORIGINS, (
    f'Expected dataset one of {DATASETS_ORIGINS}. Got {dataset_origin}.'
)

In [4]:
# Model class under investigation; it must be one of the supported classes.
SBM_CLASS = sbm.EQTransformer
assert SBM_CLASS in SBM_CLASSES
SBM_CLASS

seisbench.models.eqtransformer.EQTransformer

In [5]:
# Trace length (in samples) that goes with the selected model class.
NUM_SAMPLES = MODEL_TO_NUM_SAMPLES[SBM_CLASS]
NUM_SAMPLES

6000

In [6]:
NUM_SHIFTS = 5
SAMPLE_RATE = 100  # samples per second
LARGE_ERROR_THRESHOLD_SECONDS = 1
# The same threshold expressed in samples.
LARGE_ERROR_THRESHOLD_SAMPLES = SAMPLE_RATE * LARGE_ERROR_THRESHOLD_SECONDS
SYNTHESIZED_SNR = 10  # SNR value used in the noisy dataset directory name

In [7]:
# Root locations can be overridden with environment variables so the notebook is not tied to a
# single machine. The defaults reproduce the original hard-coded paths exactly.
SHIFTING_PLOTS_PATH = os.environ.get('SHIFTING_PLOTS_PATH', '/home/moshe/plots/shifting_plots')
DATA_ROOT = os.environ.get('GFZ_DATA_ROOT', '/home/moshe/datasets/GFZ')
# The trailing '' keeps the trailing path separator that the original DATASET_PATH value had.
DATASET_PATH = os.path.join(
    DATA_ROOT,
    'noisy_datasets',
    f'{dataset_origin}_{NUM_SAMPLES}_sample_joachim_noises_energy_ratio_snr',
    '',
)
NOISY_DATA_PATH = os.path.join(DATASET_PATH, f'noisy_dataset_snr_{SYNTHESIZED_SNR}')
DATASET_PATH, NOISY_DATA_PATH

('/home/moshe/datasets/GFZ/noisy_datasets/ethz_6000_sample_joachim_noises_energy_ratio_snr/',
 '/home/moshe/datasets/GFZ/noisy_datasets/ethz_6000_sample_joachim_noises_energy_ratio_snr/noisy_dataset_snr_10')

In [8]:
def assert_path_exists(path_str: str, name: str = ''):
    """Assert that ``path_str`` exists on disk.

    Args:
        path_str: File or directory path to check.
        name: Label for the path, used as the prefix of the failure message.

    Raises:
        AssertionError: If the path does not exist.
    """
    found = os.path.exists(path_str)
    assert found, f'{name} {path_str} does not exist'

In [9]:
# Fail early if either dataset directory is missing.
for _name, _path in (('DATASET_PATH', DATASET_PATH), ('NOISY_DATA_PATH', NOISY_DATA_PATH)):
    assert_path_exists(path_str=_path, name=_name)

### Load the original High SNR Traces

Load a dataset of high SNR traces taken from the original ETHZ/GEOFON dataset.

In [10]:
# SAVE_TO_PATH=f'/home/moshe/datasets/GFZ/noisy_datasets/all_traces_{dataset_origin}_{NUM_SAMPLES}_sample_joachim_noises_energy_ratio_snr/'

# Both files live directly under DATASET_PATH.
dataset_traces_path, dataset_labels_path = (
    os.path.join(DATASET_PATH, filename)
    for filename in ('original_dataset.pt', 'original_labels.pt')
)
original_dataset = torch.load(dataset_traces_path)   #[:num_traces]
original_labels = torch.load(dataset_labels_path)    #[:num_traces]

# First axis indexes traces/labels; last axis of the dataset indexes samples.
num_original_traces, num_original_labels = original_dataset.shape[0], original_labels.shape[0]
num_original_samples = original_dataset.shape[-1]

# Sanity checks: one label per trace, and the trace length expected for the selected model.
assert num_original_labels == num_original_traces, (
    f'Expected traces equal num labels.Got {num_original_traces} traces and {num_original_labels} labels'
)
assert num_original_samples == NUM_SAMPLES, (
    f'Expected {NUM_SAMPLES} in each trace. Got {num_original_samples}.'
)

print(f'Loaded {num_original_traces} traces and corresponding labels.')

Loaded 1856 traces and corresponding labels.


### Load the Synthetic Noisy Traces

In [11]:
# Files of the synthetic noisy dataset, all located directly under NOISY_DATA_PATH.
synthetic_noisy_dataset_path, synthetic_noisy_labels_path, full_noises_path, factors_path = (
    os.path.join(NOISY_DATA_PATH, filename)
    for filename in ('traces.pt', 'labels.pt', 'full_noise_traces.pt', 'factors.pt')
)

In [12]:
# Verify every input file of the synthetic noisy dataset before loading anything.
for _name, _path in (
    ('synthetic_noisy_dataset_path', synthetic_noisy_dataset_path),
    ('synthetic_noisy_labels_path', synthetic_noisy_labels_path),
    ('full_noises_path', full_noises_path),
    ('factors_path', factors_path),
):
    assert_path_exists(path_str=_path, name=_name)

In [13]:
# Load the synthetic noisy traces and labels and cast both to float32.
synthetic_noisy_dataset, synthetic_noisy_labels = load_dataset_and_labels(dataset_path=synthetic_noisy_dataset_path, labels_path=synthetic_noisy_labels_path)
synthetic_noisy_dataset, synthetic_noisy_labels = synthetic_noisy_dataset.float(), synthetic_noisy_labels.float()

# Stored under its own name so that the count of original traces (num_original_traces) is not overwritten.
num_synthetic_noisy_traces = synthetic_noisy_dataset.shape[0]

assert NUM_SAMPLES == synthetic_noisy_dataset.shape[-1], f'Expected Dataset contain {NUM_SAMPLES} samples. Got {synthetic_noisy_dataset.shape[-1]}'
# Every trace needs exactly one label, mirroring the check done for the original dataset.
assert synthetic_noisy_labels.shape[0] == num_synthetic_noisy_traces, f'Expected one label per trace. Got {num_synthetic_noisy_traces} traces and {synthetic_noisy_labels.shape[0]} labels'

print(f'The loaded dataset has {synthetic_noisy_dataset.shape[0]} traces, each has {synthetic_noisy_dataset.shape[1]} channels of {synthetic_noisy_dataset.shape[2]} samples. Each entry is of type {synthetic_noisy_dataset.dtype}')

print(f'The loaded labels have {synthetic_noisy_labels.shape[0]} labels. Each entry is of type {synthetic_noisy_labels.dtype}')

The loaded dataset has 14848 traces, each has 3 channels of 6000 samples. Each entry is of type torch.float32
The loaded labels have 14848 labels. Each entry is of type torch.float32


### Load a Pretrained Model (PhaseNet or EQTransformer)

In [14]:
# Load the pretrained weights and switch the model to evaluation mode (eval() returns the model itself).
pretrained_model = load_pretrained_model(
    model_class=SBM_CLASS,
    dataset_trained_on=dataset_origin,
).eval()
pretrained_model

Working with <class 'seisbench.models.eqtransformer.EQTransformer'> on ETHZ
Load <class 'seisbench.models.eqtransformer.EQTransformer'> pretrained weights
<class 'seisbench.models.eqtransformer.EQTransformer'> pretrained keys ['ethz', 'geofon', 'instance', 'iquique', 'lendb', 'neic', 'obs', 'original', 'original_nonconservative', 'scedc', 'stead']


EQTransformer(
  (encoder): Encoder(
    (convs): ModuleList(
      (0): Conv1d(3, 8, kernel_size=(11,), stride=(1,), padding=(5,))
      (1): Conv1d(8, 16, kernel_size=(9,), stride=(1,), padding=(4,))
      (2): Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(3,))
      (3): Conv1d(16, 32, kernel_size=(7,), stride=(1,), padding=(3,))
      (4): Conv1d(32, 32, kernel_size=(5,), stride=(1,), padding=(2,))
      (5): Conv1d(32, 64, kernel_size=(5,), stride=(1,), padding=(2,))
      (6): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    )
    (pools): ModuleList(
      (0): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1,

In [15]:
# NOISES_DATASET_PT= 'noises_dataset.pt'
# NOISES_PATH='/home/moshe/GIT/summer_2022_Seismology/notebooks/Noises'
# full_noise_traces = get_n_random_noises(num_noises=num_traces, desired_window_size=NUM_SAMPLES + NUM_SHIFTS * SAMPLE_RATE, noises_path=NOISES_PATH, filename=NOISES_DATASET_PT)
#
#
# # Take the leftmost NUM_SAMPLES samples. The rest is left for shifts
# noises=full_noise_traces[:,:NUM_SAMPLES]
#
# full_noise_traces.shape, noises.shape

### Evaluate the Loaded Datasets by The Pretrained Model

In [16]:
# Evaluation function for models that may return a tuple instead of a tensor (e.g. EQTransformer).
# For the P phase one should look at the tensor at index 1 of the tuple.
def eval_batch_tuple_return(batch, model):
    """Run ``model`` on ``batch`` without gradient tracking and return the prediction on the CPU.

    If the model returns a tuple/list of tensors, the first three entries are stacked
    along a new axis 1 in the order (index 1, index 0, index 2), so that the P-phase
    output ends up in channel 0 - the layout used for PhaseNet-style outputs in this
    notebook. A model that already returns a single tensor is passed through unchanged.

    The decision is based on the actual return value of ``model`` rather than on the
    notebook-level model-class setting, so the function works for whichever model is
    passed in.

    Args:
        batch: Input accepted by ``model``.
        model: Callable returning either a tensor or a tuple/list of at least three
            tensors of identical shape (presumably detection, P, S for EQTransformer -
            TODO confirm against the SeisBench documentation).

    Returns:
        The prediction as a CPU tensor.
    """
    with torch.no_grad():
        pred = model(batch)
        if isinstance(pred, (tuple, list)):
            # Stacking on axis 1 is equivalent to stacking on axis 0 and swapping axes 0 and 1.
            pred = torch.stack((pred[1], pred[0], pred[2]), dim=1)
        pred = pred.cpu()
    return pred

Evaluate both the original traces and the synthetic noisy traces and save the large-error traces, i.e. the traces where the model's picking error (residual) is larger than a predefined threshold (1 second by default, which is 100 samples at the configured sample rate).

In [17]:
# large_error_traces_index_list = search_large_errors_given_desired_snr(model=pretrained_model, dataset=natural_dataset, labels=natural_dataset_labels, noise_traces=noises, desired_snr=SYNTHESIZED_SNR, calc_snr=CalcSNR(SnrCalcStrategy.ENERGY_RATIO))
#
# Cast the model to float32 once and reuse it for both evaluations.
float_model = pretrained_model.float()

# Indices of the original traces whose error exceeds the threshold.
large_error_traces_index_list_original_dataset = find_large_error_traces(
    dataset=original_dataset,
    model=float_model,
    labels=original_labels,
    threshold_samples=LARGE_ERROR_THRESHOLD_SAMPLES,
)

# Same search on the synthetic noisy traces.
large_error_traces_index_list_synthetic_noisy_dataset = find_large_error_traces(
    dataset=synthetic_noisy_dataset,
    model=float_model,
    labels=synthetic_noisy_labels,
    threshold_samples=LARGE_ERROR_THRESHOLD_SAMPLES,
)

100%|██████████| 1856/1856 [00:38<00:00, 47.84it/s]
100%|██████████| 14848/14848 [05:06<00:00, 48.51it/s]


In [18]:
# Number of large-error traces found in each dataset.
dataset_len = len(large_error_traces_index_list_synthetic_noisy_dataset)
dataset_len_original = len(large_error_traces_index_list_original_dataset)

print(f'There are {dataset_len} large errors in the noisy dataset')

print(f'There are {dataset_len_original} large errors in the original dataset')

There are 456 large errors in the noisy dataset
There are 43 large errors in the original dataset


In [19]:
# Keep only the original traces (and their labels) flagged as large errors.
le_original_dataset, le_original_labels = (
    tensor[large_error_traces_index_list_original_dataset]
    for tensor in (original_dataset, original_labels)
)

In [20]:
# Output files for the large-error subset of the original dataset.
le_dataset_traces_path, le_dataset_labels_path = (
    os.path.join(DATASET_PATH, filename)
    for filename in ('le_original_dataset.pt', 'le_original_labels.pt')
)

In [21]:
# Persist the large-error subset of the original dataset (traces first, then labels).
for _tensor, _path in (
    (le_original_dataset, le_dataset_traces_path),
    (le_original_labels, le_dataset_labels_path),
):
    torch.save(_tensor, _path)

# le_original_dataset = torch.load(le_dataset_traces_path)
# le_original_labels = torch.load(le_dataset_labels_path)

In [22]:
# Display the shapes of the large-error subset of the original dataset.
tuple(tensor.shape for tensor in (le_original_dataset, le_original_labels))

(torch.Size([43, 3, 6000]), torch.Size([43]))

In [23]:
# Keep only the synthetic noisy traces (and their labels) flagged as large errors.
le_dataset, le_labels = (
    tensor[large_error_traces_index_list_synthetic_noisy_dataset]
    for tensor in (synthetic_noisy_dataset, synthetic_noisy_labels)
)
# le_noises_full = full_noise_traces[large_error_traces_index_list]

In [24]:
# Use the bare class name (e.g. 'EQTransformer') in the file names. str(SBM_CLASS) yields
# "<class '...'>", which puts angle brackets, quotes and spaces into the paths.
# NOTE: files written earlier under the old "<class ...>" names are not picked up by these paths.
le_model_tag = SBM_CLASS.__name__

le_dataset_path = os.path.join(NOISY_DATA_PATH, f'le_{le_model_tag}_dataset.pt')

le_labels_path = os.path.join(NOISY_DATA_PATH, f'le_{le_model_tag}_labels.pt')

le_full_noises_path = os.path.join(NOISY_DATA_PATH, f'le_{le_model_tag}_full_noise_traces.pt')

le_factors_path = os.path.join(NOISY_DATA_PATH, f'le_{le_model_tag}_factors.pt')

In [25]:
# Load the full noise traces and scaling factors that belong to the synthetic noisy dataset.
full_noise_traces = torch.load(full_noises_path)
factors = torch.load(factors_path)

# Restrict both to the large-error traces.
le_noises_full, le_factors = (
    tensor[large_error_traces_index_list_synthetic_noisy_dataset]
    for tensor in (full_noise_traces, factors)
)

# Persist the complete large-error subset: traces, labels, noises, factors (in that order).
for _tensor, _path in (
    (le_dataset, le_dataset_path),
    (le_labels, le_labels_path),
    (le_noises_full, le_full_noises_path),
    (le_factors, le_factors_path),
):
    torch.save(_tensor, _path)


# le_dataset = torch.load(le_dataset_path)
# le_labels = torch.load(le_labels_path)
# le_noises_full = torch.load(le_full_noises_path)
# le_factors = torch.load(le_factors_path)


In [26]:
# Display the shapes of the large-error subset of the synthetic noisy dataset.
tuple(tensor.shape for tensor in (le_dataset, le_labels, le_noises_full, le_factors))

(torch.Size([456, 3, 6000]),
 torch.Size([456]),
 torch.Size([456, 3, 6600]),
 torch.Size([456]))

## Shifting scheme for large error cases

In [27]:
num_traces_in_experiment = 3
# le_dataset[:num_traces_in_experiment].shape, le_labels[:num_traces_in_experiment].shape, le_noises_full[:num_traces_in_experiment].shape

traces_list, labels_list, full_noises_list = [], [], []
for _idx in range(num_traces_in_experiment):
    # Subtract the scaled first NUM_SAMPLES samples of the noise from the noisy trace
    # (presumably recovering the trace before noise was added - TODO confirm).
    scaled_noise = le_factors[_idx] * le_noises_full[_idx, :, :NUM_SAMPLES]
    traces_list.append(le_dataset[_idx] - scaled_noise)
    labels_list.append(le_labels[_idx])
    full_noises_list.append(le_noises_full[_idx])

In [28]:
# for trace, full_noise_trace, label in zip(traces_list, full_noises_list, labels_list):
#     shifting_experiment(trace=trace, full_noise_trace=full_noise_trace, label=int(label), \
#          model=pretrained_model.double(), num_shifts=NUM_SHIFTS, synthesized_snr=SYNTHESIZED_SNR, silent_prints=True,
#                         save_plot_to='')


## Filter the large error traces using FFT

In [29]:
# le_dataset = torch.load(le_dataset_path)
# le_labels = torch.load(le_labels_path)

# noised_traces_list = create_noisy_traces(calc_snr=CalcSNR(SnrCalcStrategy.ENERGY_RATIO), dataset=le_dataset, desired_snr=SYNTHESIZED_SNR, labels=le_labels, noise_traces=noises[large_error_traces_index_list])

# dataset_len = le_original_dataset.shape[0]
# noised_traces_list=[le_original_dataset[i,:,:].float() for i in range(dataset_len)]

# dataset_len = le_dataset.shape[0]
# noised_traces_list=[le_dataset[i,:,:].float() for i in range(dataset_len)]

In [30]:
def band_filter_experiment(lco, uco):
    """Run the FFT filter experiment on the large-error traces for one cut-off pair.

    Reads the notebook-level names ``noised_traces_list``, ``le_labels``,
    ``pretrained_model``, ``SAMPLE_RATE`` and ``LARGE_ERROR_THRESHOLD_SECONDS``.
    NOTE(review): ``noised_traces_list`` is only assigned in commented-out code in the
    preceding cell; it must be defined before this function is called.

    Args:
        lco: Lower cut-off passed to ``fft_filter_experiment``.
        uco: Upper cut-off passed to ``fft_filter_experiment``.

    Returns:
        Whatever ``fft_filter_experiment`` returns (reported below as the number of
        fixed traces).
    """
    # Count the traces actually being filtered instead of relying on a separately
    # maintained global that may refer to a different dataset.
    num_traces = len(noised_traces_list)
    print(f'lower_cut_off={lco}, upper_cut_off={uco}. Filtering {num_traces} traces')
    # Use the same residual threshold that defined the large-error traces.
    fixed = fft_filter_experiment(traces_list=noised_traces_list, label_list=le_labels, model=pretrained_model.float(),
                                  sample_rate=SAMPLE_RATE, residual_threshold_seconds=LARGE_ERROR_THRESHOLD_SECONDS,
                                  lower_cut_off=lco, upper_cut_off=uco, silent_prints=True, plot_fixed_traces=True)
    print(fixed, 'fixed, out of', num_traces)
    return fixed

def cut_off_search():
    """Grid-search the cut-off pairs and report those that fix the most traces.

    Tries every lower cut-off in ``range(20)`` with every upper cut-off in
    ``range(80, 101)``, calls ``band_filter_experiment`` for each pair and prints all
    pairs that reach the maximal number of fixed traces, together with that maximum.
    """
    max_fixed = 0
    best_lco_uco = []
    for lco in range(20):
        for uco in range(80, 101):
            fixed = band_filter_experiment(lco, uco)
            if fixed > max_fixed:
                # New best result: restart the list of winning pairs.
                max_fixed = fixed
                best_lco_uco = [(lco, uco)]
            elif fixed == max_fixed:
                # Tie with the current best. `elif` keeps a pair that just became the
                # new best from being recorded twice.
                best_lco_uco.append((lco, uco))
    print(f'cut off {best_lco_uco} fixed {max_fixed}')

In [31]:
# cut_off_search()
# band_filter_experiment(lco=12, uco=100)