In [2]:
import gpytorch
gpytorch.__version__

'1.9.1'

In [None]:
# general
import os
from tqdm import tqdm

# wandb - hyperparameter sweep and Train monitoring
import wandb
#torch - computing and machine learning libraries
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
# seisbench
import seisbench.models as sbm

#plotting
import matplotlib.pyplot as plt
# seisynth
from utils.common import load_dataset_and_labels, load_pretrained_model

In [None]:
# Possible values
# Dataset origins accepted by this notebook; `dataset_origin` is asserted against this list.
DATASETS_ORIGINS = ['ethz', 'geofon']
# SeisBench model classes accepted by this notebook; `SBM_CLASS` is asserted against this list.
SBM_CLASSES= [sbm.PhaseNet, sbm.EQTransformer]
# Sample count associated with each model class; the selected value becomes NUM_SAMPLES
# and is embedded in the dataset directory name.
# NOTE(review): presumably the trace length each model expects - confirm.
MODEL_TO_NUM_SAMPLES = {sbm.EQTransformer:6000, sbm.PhaseNet: 3001}

In [None]:
# Which dataset origin to work with; it is embedded in the dataset directory name and
# passed to load_pretrained_model as `dataset_trained_on`.
dataset_origin = 'geofon'
# Fail early on an unsupported value.
assert dataset_origin in DATASETS_ORIGINS, f'Expected dataset one of {DATASETS_ORIGINS}. Got {dataset_origin}.'

In [None]:
# Model class used for both the pretrained and the retraining model; must be listed in SBM_CLASSES.
SBM_CLASS= sbm.PhaseNet
assert SBM_CLASS in SBM_CLASSES
# Display the selection as the cell output.
SBM_CLASS

In [None]:
# Sample count for the selected model class, looked up in MODEL_TO_NUM_SAMPLES.
NUM_SAMPLES=MODEL_TO_NUM_SAMPLES[SBM_CLASS]
# Display the value as the cell output.
NUM_SAMPLES

In [None]:
# Sampling rate used to convert between seconds and samples.
# NOTE(review): presumably in Hz - confirm against the dataset.
SAMPLE_RATE = 100
# NOTE(review): not used in the visible cells; presumably the number of temporal shifts per trace - confirm.
NUM_SHIFTS = 6
# Threshold for a "large" error, expressed in seconds and then converted to samples.
LARGE_ERROR_THRESHOLD_SECONDS = 1
LARGE_ERROR_THRESHOLD_SAMPLES = SAMPLE_RATE * LARGE_ERROR_THRESHOLD_SECONDS

In [None]:
# SNR levels 1 through 10 (inclusive); one noisy dataset directory is expected per value.
SYNTHESIZED_SNR_LIST = [*range(1, 11)]
# Display the list as the cell output.
SYNTHESIZED_SNR_LIST

In [None]:
def assert_path_exists(path_str: str, name: str=''):
    """Assert that ``path_str`` exists on the filesystem.

    Args:
        path_str: File or directory path to check.
        name: Optional label prepended to the failure message.

    Raises:
        AssertionError: If ``os.path.exists(path_str)`` is false.
    """
    path_found = os.path.exists(path_str)
    assert path_found, f'{name} {path_str} does not exist'

@torch.no_grad()
def standardize(trace: torch.Tensor) -> torch.Tensor:
    """Standardize a trace to zero mean and unit standard deviation along its last axis.

    Statistics are computed independently for every row of the last axis (e.g. per
    channel), using ``Tensor.mean`` and ``Tensor.std`` with their default settings.

    Args:
        trace: Floating-point tensor. A 1-D tensor is treated as a single row and
            promoted to shape ``(1, n)``; higher-rank tensors keep their shape.

    Returns:
        Tensor with the same shape as the (possibly promoted) input. Rows whose
        standard deviation is zero or undefined (constant or single-sample rows)
        are only mean-centered instead of being divided by zero, so they come out
        as zeros rather than NaN/Inf.
    """
    if trace.dim() == 1:
        trace = trace.unsqueeze(dim=0)

    mean = trace.mean(dim=-1, keepdim=True)
    std = trace.std(dim=-1, keepdim=True)
    # `std > 0` is False for both 0 and NaN, so flat rows fall back to a divisor of 1.
    safe_std = torch.where(std > 0, std, torch.ones_like(std))
    # Broadcasting over the kept last axis replaces the explicit per-row Python loop.
    return (trace - mean) / safe_std

In [None]:
# Root directory holding the synthesized noisy datasets. It can be overridden with the
# NOISY_DATASETS_ROOT environment variable so the notebook is not tied to one machine;
# the default keeps the original location, so DATASET_PATH resolves to the same string.
NOISY_DATASETS_ROOT = os.environ.get('NOISY_DATASETS_ROOT', '/home/moshe/datasets/GFZ/noisy_datasets')
# The trailing slash of the original path is kept on purpose.
DATASET_PATH = os.path.join(
    NOISY_DATASETS_ROOT,
    f'{dataset_origin}_trainset_{NUM_SAMPLES}_sample_joachim_noises_energy_ratio_snr/',
)
assert_path_exists(path_str=DATASET_PATH, name='DATASET_PATH')
# Display the resolved path as the cell output.
DATASET_PATH

In [None]:
# One sub-directory per synthesized SNR value, named `noisy_dataset_snr_<snr>` under DATASET_PATH.
NOISY_DATA_PATH_LIST = [os.path.join(DATASET_PATH, f'noisy_dataset_snr_{synthesized_snr}') for synthesized_snr in SYNTHESIZED_SNR_LIST]
# Fail fast if any of the expected SNR directories is missing.
for p in NOISY_DATA_PATH_LIST:
    assert_path_exists(path_str=p)
# Display the list as the cell output.
NOISY_DATA_PATH_LIST

In [None]:
# Model obtained from the project helper `load_pretrained_model` for the selected model class
# and dataset origin. It is switched to eval mode in a later cell.
pretrained_model = load_pretrained_model(model_class=SBM_CLASS, dataset_trained_on=dataset_origin)

In [None]:
# reloading because I cannot torch clone. Seisbench models are not nn.Module :(
# NOTE(review): `.eval()` is called on `pretrained_model` in a later cell, so the object exposes
# at least part of the nn.Module-style API - confirm whether `copy.deepcopy(pretrained_model)`
# would work and save the second load.
retraining_model = load_pretrained_model(model_class=SBM_CLASS, dataset_trained_on=dataset_origin)

In [None]:
# Switch the reference model to evaluation mode.
pretrained_model.eval()
# NOTE(review): the call below is disabled, so `retraining_model` stays in whatever mode
# `load_pretrained_model` returned it in.
# retraining_model.train()

In [None]:
# Path of the `traces.pt` file inside every per-SNR directory.
NOISY_DATA_PATH_TRACES_LIST = [os.path.join(ndpl, 'traces.pt') for ndpl in NOISY_DATA_PATH_LIST]
# Fail fast if any traces file is missing.
for p in NOISY_DATA_PATH_TRACES_LIST:
    assert_path_exists(path_str=p)
# Display the list as the cell output.
NOISY_DATA_PATH_TRACES_LIST

In [None]:
# Path of the `labels.pt` file inside every per-SNR directory (same order as the traces list).
NOISY_DATA_PATH_LABELS_LIST = [os.path.join(ndpl, 'labels.pt') for ndpl in NOISY_DATA_PATH_LIST]
# Fail fast if any labels file is missing.
for p in NOISY_DATA_PATH_LABELS_LIST:
    assert_path_exists(path_str=p)
# Display the list as the cell output.
NOISY_DATA_PATH_LABELS_LIST

In [None]:
def load_dataset_from_tensors(traces_path_list: list[str], labels_path_list: list[str], indices_to_use: 'list[int] | None'=None):
    """Load several (traces, labels) file pairs and stack them into one dataset.

    Each pair is read with ``load_dataset_and_labels``; the results are stacked along
    the first dimension in the order the paths are given.

    Args:
        traces_path_list: Paths of the trace files, one per dataset.
        labels_path_list: Paths of the label files, aligned with ``traces_path_list``.
        indices_to_use: Optional indices selecting a subset of every dataset (the same
            indices are applied to traces and labels). ``None`` or an empty sequence
            keeps everything.

    Returns:
        Tuple ``(traces, labels)`` of stacked tensors.

    Raises:
        ValueError: If the two path lists differ in length (``zip`` would otherwise
            silently drop the unmatched files).
    """
    traces_path_list, labels_path_list = list(traces_path_list), list(labels_path_list)
    if len(traces_path_list) != len(labels_path_list):
        raise ValueError(
            f'Expected as many label files as trace files. '
            f'Got {len(traces_path_list)} trace paths and {len(labels_path_list)} label paths.'
        )

    # Explicit length test instead of truthiness, so index tensors/arrays are accepted too.
    use_subset = indices_to_use is not None and len(indices_to_use) > 0

    traces_list, labels_list = [], []
    for tp, lp in zip(traces_path_list, labels_path_list):
        traces,labels = load_dataset_and_labels(dataset_path=tp, labels_path=lp)
        if use_subset:
            traces, labels = traces[indices_to_use], labels[indices_to_use]
        traces_list.append(traces)
        # Add a column axis so the per-dataset labels can be stacked row-wise below.
        labels_list.append(labels.unsqueeze(dim=1))
        print(f'traces_list {len(traces_list)} labels_list {len(labels_list)}')


    traces = torch.vstack(traces_list)
    # NOTE: `.squeeze()` drops every size-1 dimension, so a dataset with a single
    # trace in total yields a 0-dim labels tensor.
    labels = torch.vstack(labels_list).squeeze()
    print(f'traces shape {traces.shape} labels_list {labels.shape}')
    return traces, labels

# Load every SNR level, keeping only indices 0-499 (the first 500 entries) of each dataset.
noised_dataset, noised_labels = load_dataset_from_tensors(traces_path_list=NOISY_DATA_PATH_TRACES_LIST, labels_path_list=NOISY_DATA_PATH_LABELS_LIST, indices_to_use=list(range(500)))

# Total number of loaded traces: size of the first dimension of the stacked tensor.
noised_dataset_size = noised_dataset.shape[0]


In [None]:
# Report the total number of traces across all loaded SNR levels.
print(f'Loaded {noised_dataset_size} traces')

In [3]:
import math
import tqdm
import torch
import gpytorch
from matplotlib import pyplot as plt

# Make plots inline
%matplotlib inline

In [0]:
import urllib.request
import os
from scipy.io import loadmat
from math import floor


# this is for running the notebook in our testing framework
smoke_test = ('CI' in os.environ)

# Single location of the dataset file. The existence check, the download target and
# the loader must all agree on it; otherwise a freshly downloaded file is never the
# one that gets loaded.
ELEVATORS_PATH = './elevators.mat'

if not smoke_test and not os.path.isfile(ELEVATORS_PATH):
    print('Downloading \'elevators\' UCI dataset...')
    urllib.request.urlretrieve('https://drive.google.com/uc?export=download&id=1jhWL3YUHvXIaftia4qeAyDwVxo6j1alk', ELEVATORS_PATH)


if smoke_test:  # this is for running the notebook in our testing framework
    # Small random stand-in so the notebook runs quickly without the real data.
    X, y = torch.randn(2000, 3), torch.randn(2000)
else:
    data = torch.Tensor(loadmat(ELEVATORS_PATH)['data'])
    # Every column but the last is a feature; the last column is the target.
    X = data[:, :-1]
    # Min-max scale each feature column to [-1, 1].
    X = X - X.min(0)[0]
    X = 2 * (X / X.max(0)[0]) - 1
    y = data[:, -1]


# First 80% of the rows (in file order, no shuffling) for training, the rest for testing.
train_n = int(floor(0.8 * len(X)))
train_x = X[:train_n, :].contiguous()
train_y = y[:train_n].contiguous()

test_x = X[train_n:, :].contiguous()
test_y = y[train_n:].contiguous()

# Move all splits to the GPU when one is available.
if torch.cuda.is_available():
    train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

In [18]:
# Sanity check: display the shapes of the train/test splits.
train_x.shape, train_y.shape, test_x.shape, test_y.shape

(torch.Size([13279, 18]),
 torch.Size([13279]),
 torch.Size([3320, 18]),
 torch.Size([3320]))

In [8]:
# Number of input features: size of the last axis of train_x. Read by
# LargeFeatureExtractor as the input width of its first linear layer.
data_dim = train_x.size(-1)

class LargeFeatureExtractor(torch.nn.Sequential):
    """MLP that maps the input features down to a 2-dimensional embedding.

    Architecture: in_features -> 1000 -> 500 -> 50 -> 2, with a ReLU after every
    linear layer except the last. Submodules are registered as ``linear1``..``linear4``
    and ``relu1``..``relu3``, so parameter names and order are stable.

    Args:
        in_features: Width of the input. When omitted, the module-level ``data_dim``
            is used, so ``LargeFeatureExtractor()`` works as before.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: fall back to the notebook-level global.
            in_features = data_dim

        layer_widths = (in_features, 1000, 500, 50, 2)
        last_layer = len(layer_widths) - 1
        for idx, (fan_in, fan_out) in enumerate(zip(layer_widths[:-1], layer_widths[1:]), start=1):
            self.add_module(f'linear{idx}', torch.nn.Linear(fan_in, fan_out))
            # No activation after the output layer.
            if idx < last_layer:
                self.add_module(f'relu{idx}', torch.nn.ReLU())

# Module-level instance; GPRegressionModel.__init__ assigns this very object to
# `self.feature_extractor`, so it is shared rather than copied.
feature_extractor = LargeFeatureExtractor()

In [9]:
class GPRegressionModel(gpytorch.models.ExactGP):
    """Exact GP whose kernel operates on neural-network features.

    Inputs are passed through the module-level ``feature_extractor``, rescaled with
    ``ScaleToBounds(-1., 1.)``, and then fed to a constant mean and a
    grid-interpolated, scaled RBF kernel over the 2 extracted features.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()

        # 2 matches the output width of the feature extractor's final layer.
        scaled_rbf = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(ard_num_dims=2)
        )
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            scaled_rbf, num_dims=2, grid_size=100
        )

        # Uses the shared module-level network, not a private copy.
        self.feature_extractor = feature_extractor

        # Rescales the network outputs so they are "nice" values for the kernel.
        self.scale_to_bounds = gpytorch.utils.grid.ScaleToBounds(-1., 1.)

    def forward(self, x):
        """Return the GP prior over ``x`` as a ``MultivariateNormal``."""
        # Deep net first, then rescale the features.
        embedded = self.scale_to_bounds(self.feature_extractor(x))
        prior_mean = self.mean_module(embedded)
        prior_covar = self.covar_module(embedded)
        return gpytorch.distributions.MultivariateNormal(prior_mean, prior_covar)

In [10]:
# Gaussian observation noise model and the deep-kernel GP conditioned on the training split.
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPRegressionModel(train_x, train_y, likelihood)

# Keep the model and likelihood on the same device as the data.
if torch.cuda.is_available():
    model, likelihood = model.cuda(), likelihood.cuda()

In [11]:
# Only a couple of iterations under CI; 60 for a real run.
training_iterations = 60 if not smoke_test else 2

# Training mode: the hyperparameters are about to be optimized.
model.train()
likelihood.train()

# Adam over four parameter groups (feature extractor, kernel, mean, likelihood),
# all sharing the same learning rate.
optimizer = torch.optim.Adam(
    [
        {'params': component.parameters()}
        for component in (
            model.feature_extractor,
            model.covar_module,
            model.mean_module,
            model.likelihood,
        )
    ],
    lr=0.01,
)

# "Loss" for GPs - the marginal log likelihood (negated inside `train`).
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

def train():
    """Optimize the model on the full training set for ``training_iterations`` steps.

    Every step evaluates the model on all of ``train_x``, takes the negative marginal
    log likelihood as the loss, backpropagates, and applies one step of the
    module-level ``optimizer``. The current loss is shown on the progress bar.

    Reads the module-level ``model``, ``mll``, ``optimizer``, ``train_x``, ``train_y``
    and ``training_iterations``; returns nothing.
    """
    # Import the notebook progress bar explicitly: `import tqdm` alone does not
    # guarantee that the `tqdm.notebook` submodule is loaded, and an earlier cell
    # also binds the name `tqdm` to the class via `from tqdm import tqdm`.
    from tqdm.notebook import tqdm as notebook_tqdm

    iterator = notebook_tqdm(range(training_iterations))
    for _ in iterator:
        # Zero backprop gradients
        optimizer.zero_grad()
        # Get output from model
        output = model(train_x)
        # Calc loss and backprop derivatives
        loss = -mll(output, train_y)
        loss.backward()
        iterator.set_postfix(loss=loss.item())
        optimizer.step()

# Run the training loop once and report its CPU / wall-clock time.
%time train()

  0%|          | 0/60 [00:00<?, ?it/s]

CPU times: user 13min 47s, sys: 1min 50s, total: 15min 38s
Wall time: 3min 55s


In [14]:
# Switch to evaluation mode before making predictions.
model.eval()
likelihood.eval()
with torch.no_grad(), gpytorch.settings.use_toeplitz(False), gpytorch.settings.fast_pred_var():
    # Model output on the test inputs; its mean is used for the MAE.
    preds = model(test_x)
    # Apply the likelihood to the distribution already computed above instead of
    # running the model on test_x a second time.
    observed_pred = likelihood(preds)

In [13]:
# Mean absolute error between the predictive mean and the held-out targets.
print('Test MAE: {}'.format(torch.mean(torch.abs(preds.mean - test_y))))

Test MAE: 0.073357492685318


In [17]:
# Lower/upper bounds of the confidence region of the likelihood-applied prediction, one per test point.
# NOTE(review): GPyTorch documents this region as mean +/- 2 standard deviations - confirm for the installed version.
lower, upper = observed_pred.confidence_region()
# Display both shapes as a sanity check.
lower.shape, upper.shape

(torch.Size([3320]), torch.Size([3320]))