In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# Earthquake Prediction

*Written by Daniele Rege Cambrin*

## Introduction

The objective of this tutorial is to go through the QuakeSet dataset and cover the following topics:

* How to use TorchGeo data modules to load datasets and plot samples;
* How to use TorchGeo pre-trained model embeddings to train a classical model (i.e., Random Forest);
* How to train a new TorchGeo deep model using tasks and a trainer.

## Environment

For the environment, we will install the torchgeo, h5py, and scikit-learn packages.

In [None]:
%pip install torchgeo h5py scikit-learn

## Imports

In [None]:
import tempfile
from collections import defaultdict
from pathlib import Path

import h5py
import numpy as np
import torch
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm

from torchgeo.datamodules import QuakeSetDataModule
from torchgeo.models import ResNet50_Weights, resnet50
from torchgeo.trainers import ClassificationTask

## Dataset

We will use the QuakeSet dataset [1] (licensed under the OpenRAIL License), which contains patches from around the world acquired before and after an earthquake, along with corresponding negative examples.

The dataset uses SAR imagery from the Sentinel-1 satellite with a spatial resolution of 10 m. The task is to predict, for each pair of images, whether an earthquake occurred between them.

[1] Rege Cambrin, D., & Garza, P. (2024). QuakeSet: A Dataset and Low-Resource Models to Monitor Earthquakes through Sentinel-1. Proceedings of the International ISCRAM Conference. [doi:10.59297/n89yc374](https://doi.org/10.59297/n89yc374)

In [None]:
# TorchGeo already ships a data module for QuakeSet, so we only need to instantiate it
datamodule = QuakeSetDataModule(
    batch_size=2,
    num_workers=1,
    download=True,
)
# Calling prepare_data() is what downloads the dataset
datamodule.prepare_data()

## Batch Visualization

The QuakeSetDataModule already has a ```plot``` function implemented to show the samples.

Remember to call ```setup``` with the *fit* or *test* stage before using it (otherwise, you will get an error).

In [None]:
# Set up the data module for both stages used in this tutorial
for stage in ('fit', 'test'):
    datamodule.setup(stage)

# Take one sample from the validation dataset and draw it with the data module's plot helper
sample = datamodule.val_dataset[500]
fig = datamodule.plot(sample)

## Train ML model on Pretrained embeddings

The first approach uses a pre-trained deep-learning model to compute the embeddings to train a classical machine-learning model.

First, we set the constants that select which device to use and the fraction of the dataset to use (default 10%).

In [None]:
# Use the GPU when CUDA is available and fall back to the CPU otherwise.
# Set DEVICE = 'cpu' explicitly if you want to force CPU execution.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fraction of the samples of each split to use (0.1 = 10%)
PCT_SAMPLES = 0.1

In [None]:
# Keep only the first PCT_SAMPLES fraction of every split to speed up training.
# The slice is relative to the current length, so re-running this cell shrinks
# the datasets again.
for split_dataset in (
    datamodule.train_dataset,
    datamodule.val_dataset,
    datamodule.test_dataset,
):
    n_keep = int(len(split_dataset.data) * PCT_SAMPLES)
    split_dataset.data = split_dataset.data[:n_keep]

Now, we load a ResNet50 pre-trained on Sentinel-1 images and define the function to make inferences.

In [None]:
# Pre-trained Sentinel-1 weights; they also carry the matching input transforms
s1_weights = ResNet50_Weights.SENTINEL1_ALL_MOCO
model_transform = s1_weights.transforms
# Build the model on the selected device and switch it to evaluation mode
rn_model = resnet50(s1_weights).to(DEVICE)
rn_model = rn_model.eval()

In [None]:
def infer(batch):
    """Compute embeddings for a batch of pre/post-event image pairs.

    Args:
        batch: mapping with an ``'image'`` tensor, which is moved to ``DEVICE``,
            and a ``'label'`` entry, which is returned unchanged.

    Returns:
        A tuple ``(embeddings, labels)``. ``embeddings`` is a NumPy array with the
        model outputs for the pre-event and post-event images concatenated along
        dimension 1; ``labels`` is ``batch['label']``.
    """
    images = batch['image'].to(DEVICE)
    targets = batch['label']
    # Every image stacks 4 channels: the first two belong to the pre-event
    # acquisition and the last two to the post-event one, so each half is
    # transformed and embedded separately.
    pre_event = model_transform({'image': images[:, :2]})['image']
    post_event = model_transform({'image': images[:, 2:]})['image']
    # The model is only a feature extractor here, so gradients are not needed
    with torch.no_grad():
        pre_features = rn_model(pre_event)
        post_features = rn_model(post_event)
        pair_features = torch.concat([pre_features, post_features], dim=1)
        pair_features = pair_features.cpu().numpy()
    return pair_features, targets

Now, we run inference on the training and test sets to compute the embeddings. The model will be used as a feature extractor.

We use an HDF5 file to store the embeddings since it is fast to read and write.

In [None]:
# Compute the embeddings of the train and test splits and store them in an HDF5 file
embeddings_file = 'data/embeddings.h5'
# Make sure the output folder exists before h5py tries to create the file inside it
Path(embeddings_file).parent.mkdir(parents=True, exist_ok=True)
with h5py.File(embeddings_file, 'w') as f:
    # Same procedure for both splits; only the dataloader and the group name change
    for split, make_loader in (
        ('train', datamodule.train_dataloader),
        ('test', datamodule.test_dataloader),
    ):
        # Wrap the dataloader itself (not the enumerate object) so that tqdm can
        # read its length and display the total number of batches
        progress = tqdm(make_loader(), desc=split.capitalize())
        for i, batch in enumerate(progress):
            for j, (emb, lab) in enumerate(zip(*infer(batch))):
                # One gzip-compressed dataset per sample, named after the batch
                # index and the position inside the batch; the label is stored
                # as an attribute of that dataset
                dset = f.create_dataset(
                    f'{split}/{i}_{j}', data=emb, compression='gzip'
                )
                dset.attrs['label'] = lab

Now, we can load the embeddings from the file and fit a classical model (i.e., Random Forest) using the embeddings as features and the labels as targets.

In [None]:
# Load the stored embeddings (features) and labels (targets), one array per split
embeddings = {}
labels = {}
with h5py.File('data/embeddings.h5', 'r') as f:
    for split in ['train', 'test']:
        group = f[split]
        # Both comprehensions iterate the same group, so features and labels
        # stay aligned sample by sample
        embeddings[split] = np.stack([group[key][...] for key in group])
        labels[split] = np.array([group[key].attrs['label'] for key in group])

# Train a RandomForest classifier; a fixed random_state makes the fitted forest
# (and therefore the report below) repeatable for the same embeddings
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
clf.fit(embeddings['train'], labels['train'])
# Evaluate the classifier on the test set
preds = clf.predict(embeddings['test'])
print(classification_report(labels['test'], preds))

## Training a deep model from scratch

The second approach requires training a deep neural network from scratch. To this end, we can use TorchGeo's ```ClassificationTask``` and Lightning's ```Trainer``` to simplify the training.

Remember to set the ```in_channels``` parameter to 4 since we are concatenating two (pre- and post-event) two-channel images.

In [None]:
# Classification task wrapping a ResNet18 that takes the 4-channel (pre + post event) input
task = ClassificationTask(
    model='resnet18',
    in_channels=4,
    num_classes=2,
    loss='ce',
    lr=0.0001,
)

In [None]:
# Logs and checkpoints of this run are written below the system temporary directory
default_root_dir = Path(tempfile.gettempdir()) / 'experiments'

# CSV logger rooted at that directory, under the name 'tutorial_logs'
logger = CSVLogger(save_dir=default_root_dir, name='tutorial_logs')

# Checkpointing monitors 'val_loss', keeps the top-1 checkpoint and also the last one
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=default_root_dir,
    save_top_k=1,
    save_last=True,
)

# Map the DEVICE constant to the accelerator name passed to the Trainer
accelerator = 'gpu' if DEVICE == 'cuda' else 'cpu'

# NOTE(review): the datasets were already truncated to a PCT_SAMPLES fraction
# earlier in this notebook, so the limit_*_batches fractions below presumably
# compound with that truncation - confirm this is intended.
trainer = Trainer(
    accelerator=accelerator,
    callbacks=[checkpoint_callback],
    logger=logger,
    log_every_n_steps=2,
    max_epochs=1,
    limit_train_batches=PCT_SAMPLES,
    limit_val_batches=PCT_SAMPLES,
    limit_test_batches=PCT_SAMPLES,
)

In [None]:
# Fit the model: the trainer runs the classification task on the data module's
# dataloaders, using the logger and checkpoint callback configured on the trainer
trainer.fit(model=task, datamodule=datamodule)

In [None]:
# Evaluate on the test split, restoring the checkpoint that the callback recorded
# as the best one; the returned metrics are displayed as the cell output
best_ckpt_path = checkpoint_callback.best_model_path
trainer.test(model=task, datamodule=datamodule, ckpt_path=best_ckpt_path)