In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import gc
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn, optim
from tqdm import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import seed_everything
import torchmetrics
from torchmetrics import Metric
from pathlib import Path
from sklearn.model_selection import train_test_split

seed_everything(42, workers=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## 1. Configurations

In [2]:
!ls /kaggle/input/waveform-inversion/train_samples

CurveFault_A  CurveVel_A  FlatFault_A  FlatVel_A  Style_A
CurveFault_B  CurveVel_B  FlatFault_B  FlatVel_B  Style_B


In [3]:
class config:
    def __init__(self):
        #The File paths
        self.train_path = '/kaggle/input/waveform-inversion/train_samples'
        self.test_path = '/kaggle/input/waveform-inversion/test'
        self.submission_path = '/kaggle/input/waveform-inversion/sample_submission.csv'
        self.model_path = '/kaggle/working/fwi_model.pt'
        self.checkpoint_dir = '/kaggle/working/checkpoint_fwi'
        self.dset = ["FlatVel_A","FlatVel_B", "Style_A", "Style_B"] #Dataset storage names used for training
        
        #Optimizer
        self.lr = 0.001
        self.weight_decay = 1e-4 #Regularization weight
        
        #The training parameters
        self.num_epoch = 50
        self.batch_size = 20

        #Learning rate scheduler
        self.step_size = 10  #To decay after every, say 10 epochs
        self.gamma = 0.5      #To reduce the learning rate by gamma (say, 1/2)

        

cfg = config()

### 1.1 Preparing the Data

In [4]:
#Pytorch Dataset for DataLoader, we set velocity initially to None in the case of loading the test set
#Note: We can also use the TensorDataset from torch.utils.data
class SeismicDataset(Dataset):
    def __init__(self, seismic, vel = None):
        self.seismic = torch.tensor(seismic, dtype = torch.float32)
        self.label = vel is not None
        if self.label:
            self.vel = torch.tensor(vel, dtype = torch.float32)

    def __len__(self):
        return len(self.seismic)

    def __getitem__(self,idx):
        if self.label:
            return self.seismic[idx], self.vel[idx]
        else:
            return self.seismic[idx]

In [5]:

def prepare_data(cfg):
    #First we extract the velocity and seismic data's for training and testing
    vel_data = []; seismic_data = []; test_data = []

    #Extracting and concatenating the training data
    for domain in cfg.dset: 
        model_path = Path(cfg.train_path) / domain / "model"
        data_path = Path(cfg.train_path) / domain / "data"
    
        # Load all .npy files in this domain and extend the master lists
        vel_data += [np.load(str(f)) for f in sorted(model_path.glob("*.npy"))]
        seismic_data += [np.load(str(f)) for f in sorted(data_path.glob("*.npy"))]
    
    # Concatenate all at once
    sample_points = sum(v.shape[0] for v in vel_data)
    vel_data = np.concatenate(vel_data, axis=0)
    seismic_data = np.concatenate(seismic_data, axis=0)
    assert ( vel_data.shape[0] == sample_points and seismic_data.shape[0] == sample_points
           ), f"Expected sample size {sample_points} but got {vel_data.shape[0]} and {seismic_data.shape[0]}"
    print(f"Training data --> Seismic: {seismic_data.shape}, Velocity: {vel_data.shape}")

    #We need to normalize (Z-score) our input before training
    s_mean = seismic_data.mean(axis=(0, 2, 3), keepdims=True); s_std = seismic_data.std(axis=(0, 2, 3), keepdims=True)
    seismic_data = (seismic_data - s_mean)/(s_std + 1e-6) #Epsilon is for stability

    #Extracting the Test data
    test_path = Path(cfg.test_path)
    test_data += [np.load(str(f)) for f in sorted(test_path.glob("*.npy"))[0:25]] #Only first few for illustration
    test_sample_points = sum(v.shape[0] for v in test_data)

    test_data = np.concatenate(test_data,axis=0)
    assert test_data.shape[0] == test_sample_points, f"Expected test size {test_sample_points} but got {test_data.shape[0]} "
    print(f"Testing data --> Seismic: {test_data.shape}")

    #Next, we take a portion of the train data for validation
    X_train, X_val, y_train,y_val = train_test_split(
        seismic_data, vel_data, test_size = 0.1, random_state = 42, shuffle = True
    )
    print(f"After split --> X_train: {X_train.shape}, y_train: {y_train.shape} -- X_val: {X_val.shape}, y_val: {y_val.shape}")
    
    #Loading the datasets into batches
    train_dataset = SeismicDataset(X_train, y_train)
    val_dataset = SeismicDataset(X_val, y_val)
    test_dataset = SeismicDataset(test_data)

    #DataLoader
    train_loader = DataLoader(train_dataset, batch_size = cfg.batch_size, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = cfg.batch_size, shuffle = False)
    test_loader = DataLoader(test_dataset, batch_size = cfg.batch_size, shuffle = False)

    return train_loader, val_loader, test_loader 


In [6]:
class DataModule(pl.LightningModule):
    def __init__(self,cfg):
        super().__init__()
        self.cfg = cfg
        self.train_loader = None
        self.val_loader = None
        self.test_loader = None

    def setup(self, stage = None):
        self.train_loader, self.val_loader, self.test_loader = prepare_data(self.cfg)
        print('DataLoaded Successfully')

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.val_loader

    def test_dataloader(self):
        return self.test_loader

data_module = DataModule(cfg)

In [7]:
data_module.setup()

Training data --> Seismic: (4000, 5, 1000, 70), Velocity: (4000, 1, 70, 70)
Testing data --> Seismic: (125, 1000, 70)
After split --> X_train: (3600, 5, 1000, 70), y_train: (3600, 1, 70, 70) -- X_val: (400, 5, 1000, 70), y_val: (400, 1, 70, 70)
DataLoaded Successfully


## 2. The U-Net Model