In [1]:
import xarray as xr
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import seaborn as sns
import pandas as pd
from tqdm import tqdm
from shapely import wkt
import gc

In [2]:
# Location of the burned-areas shapefile directory; replace with your own copy.
burned_areas_path = "/hkfs/work/workspace/scratch/uyxib-pauline_gddpfa/mesogeos/auxilliary/burned_areas_shapefile"
gdf = gpd.read_file(burned_areas_path)

In [4]:
# Location of the datacube (zarr store); replace with your own copy.
# The store is opened by xarray and bound to `ds` for the cells below.
datacube_path = "/hkfs/work/workspace/scratch/uyxib-pauline_gddpfa/mesogeos/mesogeos_cube.zarr"
ds = xr.open_zarr(datacube_path)

In [None]:
# Location of the ecoregions/biomes dataset; replace with your own copy.
ecoregions_path = "/hkfs/work/workspace/scratch/uyxib-pauline_gddpfa/mesogeos/auxilliary/ecoregions"
biome = gpd.read_file(ecoregions_path)

In [None]:
# Keep only the ecoregions with biome number 12 inside the Palearctic realm.
# NOTE(review): biome 12 is presumably the Mediterranean forests biome -- confirm
# against the ecoregions dataset documentation.
is_target_biome = biome['BIOME_NUM'] == 12.0
is_palearctic = biome['REALM'] == 'Palearctic'
biome = biome[is_target_biome & is_palearctic]

# Prepare geopandas dataframe

In [None]:
# Prepare the burned-area records for sampling.
# NOTE: this cell rebinds `gdf` and is not idempotent (every run shifts the
# dates by one more day and tries to re-parse geometry_h); re-run the cell that
# loads `gdf` before running this one again.

# Keep fires ignited between 2006-04-02 and 2022-09-30 (inclusive). The bounds
# are compared as strings, which assumes IGNITION_D holds ISO-formatted
# (YYYY-MM-DD) text at this point -- TODO confirm.
in_study_period = (gdf['IGNITION_D'] >= '2006-04-02') & (gdf['IGNITION_D'] <= '2022-09-30')
gdf = gdf[in_study_period]
# Keep fires of at least 30 ha. Resetting the index here returns a fresh frame,
# so the column assignments below do not write into a filtered slice and the
# index is a contiguous 0..n-1 range.
gdf = gdf[gdf['AREA_HA'].astype(float) >= 30].reset_index(drop=True)

# Parse IGNITION_D as datetime and shift it back by one day.
# NOTE(review): presumably so that the sampled window ends the day before the
# recorded ignition -- confirm.
gdf['IGNITION_D'] = pd.to_datetime(gdf['IGNITION_D']) - pd.Timedelta(days=1)
# Year of the (shifted) ignition date.
gdf['IGNITION_YEAR'] = gdf['IGNITION_D'].dt.year

# geometry_h is stored as WKT text: parse it and make it the active geometry.
gdf['geometry_h'] = gdf['geometry_h'].apply(wkt.loads)
gdf = gdf.set_geometry('geometry_h')
# Declare (this does not reproject) the CRS of the parsed geometries, then
# reproject them to EPSG:4326.
gdf.crs = "EPSG:5643"
gdf = gdf.to_crs(epsg=4326)

In [None]:
# Keep only the fires whose active geometry intersects one of the selected
# ecoregions; the join also appends the matching ecoregion columns to each fire.
fires_in_biome = gpd.sjoin(gdf, biome, how='inner', predicate='intersects')
gdf = fires_in_biome.reset_index(drop=True)

In [None]:
# Create the save directory.
# `save_dir` is a Path (not a plain string) because the output file name is
# appended to it with the `/` operator further down in the notebook.
save_dir = Path("/hkfs/work/workspace/scratch/uyxib-pauline_gddpfa/mesogeos")
save_dir.mkdir(parents=True, exist_ok=True)

# Extract positive samples

In [1]:
# Build the positive (fire) samples.
# For every burned-area record, take the `lag`-day window of the datacube that
# ends on IGNITION_D, extract the time series of every variable at the grid
# cell nearest to the ignition geometry, and stack all accepted samples into
# one table that is written to <save_dir>/positives.csv.
lag = 30
patch_size = 125
patch_half = patch_size // 2
len_x = len(ds['x'])
len_y = len(ds['y'])

s_cl = 0   # running id of accepted samples (written to the 'sample' column)
s_seg = 0
sample_frames = []  # one DataFrame per accepted sample, concatenated once at the end
for i in tqdm(range(len(gdf))):
    ignition_date = gdf.loc[i, 'IGNITION_D']
    ignition_xy = gdf.loc[i, 'geometry_h']

    ign_date_str = ignition_date.strftime('%Y-%m-%d')
    ign_date_lag_str = (ignition_date - pd.Timedelta(days=lag - 1)).strftime('%Y-%m-%d')

    pos_sample_ds = ds.sel(time=slice(ign_date_lag_str, ign_date_str))

    # A window with fewer (or more) than `lag` time steps cannot receive the
    # 0..lag-1 `time_idx` column below, so skip it instead of crashing.
    if len(pos_sample_ds['time']) != lag:
        print('incomplete time window')
        continue

    # Integer position of the grid cell nearest to the ignition point.
    pos_sample = pos_sample_ds.sel(x=ignition_xy.x, y=ignition_xy.y, method='nearest')
    x_idx = np.where(pos_sample_ds['x'] == pos_sample['x'].values)[0].item()
    y_idx = np.where(pos_sample_ds['y'] == pos_sample['y'].values)[0].item()

    # Skip cells for which a patch_size x patch_size patch centred on them would
    # touch the border of the cube. Only the centre cell is extracted below, but
    # the test is kept unchanged so the set of selected samples stays the same.
    if ((x_idx - patch_half < 0) or (x_idx + patch_half + 1 >= len_x) or (y_idx - patch_half < 0) or (y_idx + patch_half + 1 >= len_y)):
        print('border')
        continue

    # Select the single grid cell *before* loading: only this pixel is kept, so
    # there is no need to read the whole surrounding patch into memory.
    pos_sample_ds = pos_sample_ds.isel(x=x_idx, y=y_idx)

    # 'population' and every variable whose name contains 'lc' are replaced by
    # one snapshot per sample: the <year>-01-01 time step of the year of the
    # last day of the window, or 2006-04-01 when that year is 2006.
    year = str(pos_sample_ds.time.dt.year.values[-1])
    snapshot_date = '2006-04-01' if year == '2006' else year + '-01-01'
    for var in list(pos_sample_ds.keys()):
        if var == 'population' or 'lc' in var:
            del pos_sample_ds[var]
            pos_sample_ds[var] = ds[var].sel(time=slice(snapshot_date, snapshot_date))[0].isel(x=x_idx, y=y_idx)

    del pos_sample_ds['spatial_ref']
    pos_sample_ds = pos_sample_ds.load()
    pos_sample_ds['burned_area_has'] = float(gdf.loc[i, 'AREA_HA'])

    # Keep the sample only if 't2m' is defined on the first day of the window.
    if pd.notnull(pos_sample_ds['t2m'].values[0]):
        sample_df = pos_sample_ds.to_dataframe()
        sample_df['time_idx'] = np.arange(0, lag)
        sample_df['sample'] = s_cl
        sample_frames.append(sample_df)
        s_cl += 1

if not sample_frames:
    raise RuntimeError('No positive samples were extracted; nothing to write.')

# Concatenate once instead of growing the frame inside the loop (quadratic).
df = pd.concat(sample_frames, axis=0)
# Path(...) makes this work whether save_dir is a str or already a Path.
path_df = Path(save_dir) / 'positives.csv'
df.to_csv(path_df)