# Run Presto on Darfur Dataset

**Author**: Ivan Zvonkov

**Last Modified**: Feb 19, 2024

**Description**: Runs Presto on Darfur dataset.

In [1]:
import sys
import pandas as pd

from tqdm.notebook import tqdm

sys.path.append("../..")

from datasets import datasets

## 1. Load in data

In [3]:
# Takes a minute and a half
dfs = []
for d in tqdm(datasets):
    df = d.load_df(to_np=True, disable_tqdm=True)
    df["name"] = d.name
    dfs.append(df)
df = pd.concat(dfs)
df["is_crop"] = df["class_probability"] > 0.5

  0%|          | 0/47 [00:00<?, ?it/s]

  df = d.load_df(to_np=True, disable_tqdm=True)


## 2. Setup training and evaluation sets

In [15]:
from src.bboxes import bboxes

In [19]:
df[df["name"] == "SudanGedarefDarfurAlJazirah2022"]["subset"].value_counts()

training      483
testing       375
validation    338
Name: subset, dtype: int64

In [20]:
# Fewer points because many failed on GEE
df[df["name"] == "SudanGedarefDarfurAlJazirah2023"]["subset"].value_counts()

training      277
testing       239
validation    215
Name: subset, dtype: int64

In [24]:
is_test_2022 = (df["name"] == "SudanGedarefDarfurAlJazirah2022") & (df["subset"] != "training")
is_test_2023 = (df["name"] == "SudanGedarefDarfurAlJazirah2023") & (df["subset"] != "training")

In [17]:
bbox_name = "Sudan_South"

is_local_lat = (df.lat >= bboxes[bbox_name].min_lat) & (df.lat <= bboxes[bbox_name].max_lat)
is_local_lon = (df.lon >= bboxes[bbox_name].min_lon) & (df.lon <= bboxes[bbox_name].max_lon)

In [25]:
# Have to exclude both test sets from either training sets with Presto
# because Presto uses lat lons as input, so bias would be introduced
df_train_2022 = df[is_local_lat & is_local_lon & ~is_test_2022 & ~is_test_2022]
df_train_2023 = df[is_local_lat & is_local_lon & ~is_test_2023 & ~is_test_2023]
df_test_2022 = df[is_test_2022]
df_test_2023 = df[is_test_2023]

## 3. Convert to Presto Tensor Datasets 

In [40]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from openmapflow.bands import BANDS
from src.single_file_presto_v2 import Presto, DEVICE

In [38]:
ADD_BY = np.zeros(18)
ADD_BY[0:2] = [25.0, 25.0]     # Sentinel-1 VV, VH (range from -50 to 1)
ADD_BY[13] = -272.15           # ERA5 Celcius

DIVIDE_BY = np.ones(18)
DIVIDE_BY[0:2] = [25.0, 25.0]   # Sentinel-1 VV, VH (range from -50 to 1)
DIVIDE_BY[2:13] = [10000.0] * 11 # Sentinel-2 high band values
DIVIDE_BY[13] = 35.0            # ERA5 high celcius value
DIVIDE_BY[14] = 0.03            # ERA5 high precipitation value
DIVIDE_BY[15] = 2000.0          # SRTM elevation high value
DIVIDE_BY[16] = 50.0            # Slope high value

def normalize(x):
    keep_indices = [idx for idx, val in enumerate(BANDS) if val != "B9"] # remove the b9 band
    normalized = ((x + ADD_BY) / DIVIDE_BY).astype(np.float32)
    return normalized[:, keep_indices]

In [30]:
dw_mask = (torch.ones(12) * 9).long()

class PrestoDataset(Dataset):
    def __init__(self, arg_df, start_month=1):
        xs_list = [normalize(x[start_month:start_month+12]) for x in arg_df["eo_data"].to_list()]
        self.xs_tensors = [torch.from_numpy(x).to(DEVICE).float() for x in xs_list]

        self.latlons = [np.stack([lat, lon], axis=-1) for lat, lon in zip(arg_df["eo_lat"].to_list(), arg_df["eo_lon"].to_list())]
        self.latlons_tensors = [torch.from_numpy(latlon).to(DEVICE).float() for latlon in self.latlons]
        
        self.is_crop_tensors = [torch.tensor(is_crop, dtype=torch.float32) for is_crop in arg_df["is_crop"].astype(int).to_list()]
        self.start_month = start_month
        
    def __len__(self):
        return len(self.xs_tensors)
    
    def __getitem__(self, idx):
        x = self.xs_tensors[idx]
        latlons = self.latlons_tensors[idx]
        is_crop = self.is_crop_tensors[idx]
        return x, latlons, dw_mask, self.start_month, is_crop

In [43]:
train_dataset_2022 = PrestoDataset(df_train_2022, start_month=2)
test_dataset_2022 = PrestoDataset(df_test_2022, start_month=2) 
train_dataset_2023 = PrestoDataset(df_train_2023, start_month=2)
test_dataset_2023 = PrestoDataset(df_test_2023, start_month=2) 

## 4. Generate encodings using Presto

In [45]:
encoder_decoder = Presto.load_pretrained("../../data/presto/default_model_v2.pt")
pretrained_model = encoder_decoder.encoder.eval()

In [46]:
def generate_encodings(dataset):
    dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=False)
    feature_list = []
    for (x, latlons, dw, start_month, _) in tqdm(dataloader, desc="Encodings", leave=False):
        with torch.no_grad():
            encodings = (pretrained_model(x, dynamic_world=dw, latlons=latlons, month=start_month).cpu().numpy())
            feature_list.append(encodings)
    return np.concatenate(feature_list)

X_train = generate_encodings(train_dataset)
X_test = generate_encodings(test_dataset)
y_train = train_df["is_crop"].to_list() 
y_test = test_df["is_crop"].to_list()

Encodings:   0%|          | 0/146 [00:00<?, ?it/s]

AttributeError: 'tuple' object has no attribute 'cpu'