In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# Earthquake Prediction

*Written by Daniele Rege Cambrin*

## Introduction

## Environment

For the environment, we will install the torchgeo, h5py, and scikit-learn packages.

In [None]:
# torchgeo provides the dataset and models, h5py stores the embeddings on disk,
# and scikit-learn provides the classical classifier and the evaluation report
%pip install torchgeo h5py scikit-learn

## Imports

In [None]:
import tempfile
from collections import defaultdict
from itertools import chain
from pathlib import Path

import h5py
import numpy as np
import torch
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm

from torchgeo.datamodules import QuakeSetDataModule
from torchgeo.models import ResNet50_Weights, resnet50
from torchgeo.trainers import ClassificationTask

## Dataset

The dataset we will use is the QuakeSet dataset [1] (licensed under the OpenRAIL License), which contains patches from different parts of the world taken before and after an earthquake, together with corresponding negative examples (see the figure below). The dataset uses SAR imagery from the Sentinel-1 satellite with a spatial resolution of 10 m. The task is to predict, for each pair of images, whether an earthquake occurred between them.

[1] Rege Cambrin, D., & Garza, P. (2024). QuakeSet: A Dataset and Low-Resource Models to Monitor Earthquakes through Sentinel-1. Proceedings of the International ISCRAM Conference. doi:10.59297/n89yc374

In [None]:
# The datamodule is already implemented in torchgeo, so we can just use it.
# Batches hold 2 samples and are loaded by a single worker process.
datamodule = QuakeSetDataModule(batch_size=2, num_workers=1, download=True)
# Trigger the dataset download requested above with download=True
datamodule.prepare_data()

# Batch Visualization

The QuakeSetDataModule already implements a ```plot``` function to show the samples.

Remember to call ```setup``` with the *fit* or *validate* stage before using it (otherwise you will get an error).

In [None]:
# Set up the datamodule for the fit and test stages so that its datasets
# can be accessed directly in this and the following cells
datamodule.setup('fit')
datamodule.setup('test')
# Pick a single validation sample (index 500 is an arbitrary choice) and plot it
sample = datamodule.val_dataset[500]
fig = datamodule.plot(sample)

# Training

The first approach uses a pretrained model to compute embeddings, which are then used by a classical machine learning model.

In [None]:
# Use the GPU automatically when CUDA is available, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Fraction of the data to use (0.1 = 10%), kept small so the tutorial runs quickly
PCT_SAMPLE = 0.1

In [None]:
# Preprocessing transforms shipped with the pretrained Sentinel-1 MoCo weights
model_transform = ResNet50_Weights.SENTINEL1_ALL_MOCO.transforms
# num_classes=0 is forwarded to timm and removes the classification head, so the
# model returns the pooled backbone features. The self-supervised checkpoint only
# covers the backbone: with the default head, the "embeddings" would be pushed
# through an untrained, randomly initialised linear layer and would therefore
# change from one kernel restart to the next.
rn_model = (
    resnet50(ResNet50_Weights.SENTINEL1_ALL_MOCO, num_classes=0).to(DEVICE).eval()
)

In [None]:
# Take only a small sample of the dataset to speed up the training.
# The untouched sample list is stashed on each dataset the first time this cell
# runs, so re-running the cell slices the full list again instead of shrinking the
# already-reduced subset even further.
for dataset in (
    datamodule.train_dataset,
    datamodule.val_dataset,
    datamodule.test_dataset,
):
    full_data = getattr(dataset, '_full_data', dataset.data)
    dataset._full_data = full_data
    dataset.data = full_data[: int(len(full_data) * PCT_SAMPLE)]

# Make sure the output folder exists before opening the HDF5 file
embeddings_file = 'data/embeddings.h5'
Path(embeddings_file).parent.mkdir(parents=True, exist_ok=True)

# Build each dataloader only once; the summed lengths give tqdm a total so the
# progress bar can show the remaining work
split_loaders = {
    'train': datamodule.train_dataloader(),
    'test': datamodule.test_dataloader(),
}
num_batches = sum(len(loader) for loader in split_loaders.values())

# Open a HDF5 file in write mode (this overwrites the embeddings of a previous run)
with h5py.File(embeddings_file, 'w') as f, tqdm(total=num_batches) as progress:
    # Batch counter shared by both splits; it is part of the HDF5 dataset names
    i = 0
    # Iterate over the batches of the training and test dataloaders
    for split, loader in split_loaders.items():
        for batch in loader:
            # Prepare the data for inference: the first two channels feed `pre`,
            # the remaining channels feed `post`
            img = batch['image'].to(DEVICE)
            batch_labels = batch['label'].cpu().numpy()
            pre = model_transform({'image': img[:, :2]})['image']
            post = model_transform({'image': img[:, 2:]})['image']
            # Do the inference and concatenate both embeddings per sample
            with torch.no_grad():
                embs = (
                    torch.cat([rn_model(pre), rn_model(post)], dim=1).cpu().numpy()
                )
            # Save one HDF5 dataset per sample, with its label stored as an attribute
            for j, (emb, lab) in enumerate(zip(embs, batch_labels)):
                emb_dataset = f.create_dataset(
                    f'{split}/{i}_{j}', data=emb, compression='gzip'
                )
                emb_dataset.attrs['label'] = lab
            i += 1
            progress.update()

In [None]:
# Load the embeddings and labels of the train and test splits into memory
embeddings = {}
labels = {}
with h5py.File('data/embeddings.h5', 'r') as f:
    for split in ['train', 'test']:
        split_embeddings = []
        split_labels = []
        # Each HDF5 dataset in the split group holds one embedding; its label is
        # stored in the 'label' attribute
        for emb_dataset in f[split].values():
            split_embeddings.append(emb_dataset[...])
            split_labels.append(emb_dataset.attrs['label'])
        embeddings[split] = np.stack(split_embeddings)
        labels[split] = np.array(split_labels)

# Train a RandomForest classifier; the fixed random_state makes the fitted forest
# (and therefore the report below) reproducible for the same embeddings
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf.fit(embeddings['train'], labels['train'])

# Evaluate the classifier on the test embeddings
preds = clf.predict(embeddings['test'])
print(classification_report(labels['test'], preds))

The second approach requires training a network from scratch. To this end, we can use TorchGeo's ```ClassificationTask``` and Lightning's ```Trainer``` to simplify the training.

In [None]:
# Checkpoints and logs are written to an 'experiments' folder inside the system
# temporary directory
default_root_dir = Path(tempfile.gettempdir()) / 'experiments'
# Monitor 'val_loss' and keep the single best checkpoint plus the most recent one
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss', dirpath=default_root_dir, save_top_k=1, save_last=True
)
# Log the metrics as CSV files under the 'tutorial_logs' experiment name
logger = CSVLogger(save_dir=default_root_dir, name='tutorial_logs')

In [None]:
# ResNet-18 classifier with 4 input channels and 2 output classes, optimised with
# the cross-entropy loss ('ce') and a learning rate of 1e-4
task = ClassificationTask(
    model='resnet18', in_channels=4, num_classes=2, loss='ce', lr=0.0001
)

In [None]:
# Run on the GPU only when DEVICE was resolved to CUDA
accelerator = "gpu" if DEVICE == "cuda" else "cpu"
# Every stage (train, val, test) is limited to the same fraction of its batches
batch_limits = {
    f'limit_{stage}_batches': PCT_SAMPLE for stage in ('train', 'val', 'test')
}
# Single-epoch trainer that checkpoints through the callback and logs every 2 steps
trainer = Trainer(
    accelerator=accelerator,
    callbacks=[checkpoint_callback],
    log_every_n_steps=2,
    logger=logger,
    max_epochs=1,
    **batch_limits,
)

In [None]:
# Train the classification task on the QuakeSet datamodule
trainer.fit(model=task, datamodule=datamodule)

In [None]:
# Evaluate on the test split, loading the weights from the best checkpoint
# recorded by the ModelCheckpoint callback
trainer.test(
    model=task, datamodule=datamodule, ckpt_path=checkpoint_callback.best_model_path
)