In [3]:
import sys
import pathlib
import os
from skmap.catalog import DataCatalog
from skmap.overlay import SpaceOverlay, SpaceTimeOverlay
from skmap.misc import find_files, GoogleSheet, ttprint
from osgeo.gdal import BuildVRT, SetConfigOption
import random
import pandas as pd
import time
import skmap_bindings as sb
import numpy as np
from shapely.geometry import Point
from geopandas import gpd 

folder_path = '/mnt/ripley/global_soc/scikit-map/global-soil-mapping'

base_path = [f'http://192.168.49.{gaia_id}:8333' for gaia_id in range(30,47)]
GDAL_OPTS = {'GDAL_HTTP_VERSION': '1.0', 'CPL_VSIL_CURL_ALLOWED_EXTENSIONS': '.tif'}
max_ram_mb = 1000000
n_threads = 96

# version = "20250204"
version = "20250522"
# read in gsheet
gsheet_key = '/mnt/apollo/stac/gaia-319808-913d36b5fca4.json'
gsheet_url = 'https://docs.google.com/spreadsheets/d/1lNTpzdHBG5dirYj46iBDRJMk_YAV0Um2ovBc8v3dR9w/edit?gid=78425683#gid=78425683'
gsheet = GoogleSheet(gsheet_key, gsheet_url, verbose=False)

# properties

In [5]:
# read in the data to be overlaid
df = pd.read_csv(f'{folder_path}/material/soil_pnts_global_xyt_v{version}.csv', encoding='latin1', on_bad_lines='skip', dtype={2: str})

years = df['observation_year'].unique().tolist()
years = [int(ii) for ii in years]

# create catalog
# catalog = DataCatalog.create_catalog(catalog_def=gsheet.global_soil_mapping, years=years, base_path=base_path)

catalog_csv = f"catalog_global_soil_mapping_v{version}.csv"
catalog = DataCatalog.create_catalog(catalog_def=catalog_csv, years=years, base_path=base_path)

# json_out_path = 'soc_global_catalog.json'
# catalog.save_json(json_out_path)

Year 2000 not available for layer wv_mcd19a2v061_n_1km_s_YYYY0101_YYYY0131_go_epsg.4326_v20230619, propagating year 2001
Year 2000 not available for layer oswf_gwp_n_250m_s_YYYY0101_YYYY1231_go_epsg.4326_v1, propagating year 2003
Year 2000 not available for layer photosynthetic.veg_mcd43a4.fc_m_500m_s_YYYY0101_YYYY1231_go_epsg.4326_v20240616, propagating year 2001
Year 2000 not available for layer photosynthetic.veg_mcd43a4.fc_mx_500m_s_YYYY0101_YYYY1231_go_epsg.4326_v20240616, propagating year 2001
Year 2000 not available for layer photosynthetic.veg_mcd43a4.fc_std_500m_s_YYYY0101_YYYY1231_go_epsg.4326_v20240616, propagating year 2001
Year 2000 not available for layer non.photosynthetic.veg_mcd43a4.fc_std_gf_500m_s_YYYY0101_YYYY1231_go_epsg.4326_v20250116, propagating year 2001
Year 2000 not available for layer non.photosynthetic.veg_mcd43a4.fc_m_500m_s_YYYY0101_YYYY1231_go_epsg.4326_v20240616, propagating year 2001
Year 2000 not available for layer non.photosynthetic.veg_mcd43a4.fc_m

In [None]:
from shapely.geometry import Point
print('data size before overlay', df.shape)
geometry = [Point(xy) for xy in zip(df['longitude_decimal_degrees'], df['latitude_decimal_degrees'])]
df = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

start = time.time()
space_time_overlay = SpaceTimeOverlay(
        col_date='observation_year',
        points=df, 
        catalog=catalog,
        raster_tiles='ard2_final_status.gpkg',
        verbose=True,
        n_threads=n_threads,
        tile_id_col='TILE')

print(f"Extraction of overlay meta-data: {(time.time() - start):.2f} s")


data size before overlay (591794, 18)
[09:22:01] Overlay 49478 points from 2000 in 363 raster layers
[09:22:01] Reading ard2_final_status.gpkg
[09:22:05] 0 out of 321 URLs returning 404
[09:22:21] Scanning blocks of 321 layers
[09:22:21] Finding query pixels for 0eedd36ec93fd9c99387cc8f5c320801 (34 layers)
[09:22:22] Finding query pixels for 1c15fdb5dcc20e9188b97bbf7d255ad8 (1 layers)
[09:22:23] Finding query pixels for 3f280ba251c10c66de252d02ff9d0b8a (1 layers)
[09:22:34] Finding query pixels for 5cbe5afcc7253ab5c399d4c37ecc2530 (6 layers)
[09:22:37] Finding query pixels for 5e0d3cfd5a6aa8ef64ba20d46592d06b (5 layers)
[09:22:38] Finding query pixels for 5e898c4a684b1175f4b2523a6999b3da (5 layers)
[09:22:56] Finding query pixels for 6ed680f0705ae492bb83f45e130f7980 (2 layers)
[09:22:57] Finding query pixels for 70ff59a5bad044b32b373de9a7cb4344 (12 layers)
[09:22:59] Finding query pixels for 82b14127aedf4840b02b3a63f38f9d3a (16 layers)
[09:23:01] Finding query pixels for 8f12d0e827c80c

In [None]:
import pickle
with open("space_time_overlay.pkl", "wb") as f:
    pickle.dump(space_time_overlay, f)

In [None]:
# import pickle 
# with open("space_time_overlay.pkl", "rb") as f:
#     space_time_overlay = pickle.load(f)

In [34]:
f2 = '/mnt/ripley/global_soc/scikit-map/global-soil-mapping/textures/feature_importance_texture2_v20250522.csv'
f1 = '/mnt/ripley/global_soc/scikit-map/global-soil-mapping/textures/feature_importance_texture1_v20250522.csv'

df1 = pd.read_csv(f1)
df2 = pd.read_csv(f2)

feats = list(set(list(df1['feature']) + list(df2['feature'])))
feats.sort()

In [None]:
start = time.time()
ovelayed_props_data = space_time_overlay.run(gdal_opts=GDAL_OPTS, max_ram_mb=max_ram_mb, out_file_name=f'{folder_path}/material/ovelayed_props_v{version}.pq')
print(f"Reading overlayed layers: {(tzime.time() - start):.2f} s")


[12:03:59] Running the overlay for 2000
[12:03:59] Loading and sampling 34 raster layers for group 0eedd36ec93fd9c99387cc8f5c320801
[12:04:07] Loading and sampling 1 raster layers for group 1c15fdb5dcc20e9188b97bbf7d255ad8
[12:04:08] Loading and sampling 1 raster layers for group 3f280ba251c10c66de252d02ff9d0b8a
[12:04:08] Loading and sampling 6 raster layers for group 5cbe5afcc7253ab5c399d4c37ecc2530
[12:04:11] Loading and sampling 5 raster layers for group 5e0d3cfd5a6aa8ef64ba20d46592d06b
[12:04:12] Loading and sampling 5 raster layers for group 5e898c4a684b1175f4b2523a6999b3da
[12:04:36] Loading and sampling 2 raster layers for group 6ed680f0705ae492bb83f45e130f7980
[12:04:36] Loading and sampling 12 raster layers for group 70ff59a5bad044b32b373de9a7cb4344
[12:04:39] Loading and sampling 16 raster layers for group 82b14127aedf4840b02b3a63f38f9d3a
[12:04:42] Loading and sampling 7 raster layers for group 85cc6c645162184927cc7f200c2164d3
[12:04:47] Loading and sampling 5 raster layers

NameError: name 'ovelayed_data' is not defined

In [8]:
print(f'data size: ', ovelayed_props_data.shape)

data size:  (591362, 385)


In [17]:
old_df = pd.read_parquet('/mnt/ripley/global_soc/scikit-map/global-soil-mapping/material/ovelayed_props_v20250522.pq')

In [36]:
# Remove features that had isses form the previous overlay
old_df = old_df.drop(['karst.extent_bgr.whymap_p_1km_b_20000101_20221231_go_epsg4326_v20241019'], axis=1)

In [37]:
old_df.shape

(591362, 380)

In [38]:
old_df.to_parquet('/mnt/ripley/global_soc/scikit-map/global-soil-mapping/material/ovelayed_props_v20250523.pq')


# overlay soil type data

In [8]:
# read in the data to be overlaid
df = pd.read_csv(f'{folder_path}/material/taxsubgrp_pnts_global_xyt_v{version}.csv', encoding='latin1', on_bad_lines='skip')#, errors='ignore')
df['site_key'] = df['site_key'].astype('str')
df = df.drop(columns=['Unnamed: 0'])
years = df['year'].unique().tolist()
years = [int(ii) for ii in years if not np.isnan(ii)]

# create catalog
# catalog = DataCatalog.create_catalog(catalog_def=gsheet.global_soil_types, years=years, base_path=base_path)

catalog_csv = f"catalog_global_soil_types_v{version}.csv"
catalog = DataCatalog.create_catalog(catalog_def=catalog_csv, years=years, base_path=base_path)

# json_out_path = 'soc_global_catalog.json'
# catalog.save_json(json_out_path)

  df = pd.read_csv(f'{folder_path}/material/taxsubgrp_pnts_global_xyt_v{version}.csv', encoding='latin1', on_bad_lines='skip')#, errors='ignore')


In [9]:
print('data size before overlay', df.shape)
geometry = [Point(xy) for xy in zip(df['longitude_decimal_degrees'], df['latitude_decimal_degrees'])]
df = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

start = time.time()
space_overlay = SpaceOverlay(
        points=df, 
        catalog=catalog,
        raster_tiles='ard2_final_status.gpkg',
        verbose=True,
        n_threads=n_threads,
        tile_id_col='TILE')

print(f"Extraction of overlay meta-data: {(time.time() - start):.2f} s")

data size before overlay (334761, 7)
[18:04:21] Reading ard2_final_status.gpkg
[18:04:28] Scanning blocks of 165 layers
[18:04:28] Finding query pixels for 0eedd36ec93fd9c99387cc8f5c320801 (34 layers)
[18:04:29] Finding query pixels for 5cbe5afcc7253ab5c399d4c37ecc2530 (6 layers)
[18:04:33] Finding query pixels for 5e0d3cfd5a6aa8ef64ba20d46592d06b (5 layers)
[18:04:35] Finding query pixels for 5e898c4a684b1175f4b2523a6999b3da (5 layers)
[18:05:11] Finding query pixels for 6ed680f0705ae492bb83f45e130f7980 (2 layers)
[18:05:12] Finding query pixels for 70ff59a5bad044b32b373de9a7cb4344 (12 layers)
[18:05:17] Finding query pixels for 775b7cae2ab45efa24be3eca6d7e1e3d (14 layers)
[18:05:17] Retrieving block information for 3069 tiles.
[18:08:46] Finding query pixels for 85cc6c645162184927cc7f200c2164d3 (7 layers)
[18:08:49] Finding query pixels for 8f12d0e827c80c7a4d31c3a12b6d2dff (5 layers)
[18:09:05] Finding query pixels for 907f01ccbcf02de8f030abad8719de2c (12 layers)
[18:09:09] Finding q

In [10]:
import pickle
with open("space_overlay.pkl", "wb") as f:
    pickle.dump(space_overlay, f)

In [11]:
# import pickle 
# with open("space_overlay.pkl", "rb") as f:
#     space_overlay = pickle.load(f)

In [12]:
start = time.time()
ovelayed_data = space_overlay.run(gdal_opts=GDAL_OPTS, max_ram_mb=max_ram_mb, out_file_name=f'{folder_path}/material/ovelayed_soil_types_v{version}.pq')
print(f"Reading overlayed layers: {(time.time() - start):.2f} s")
print(f'data size: ', ovelayed_data.shape)

[18:12:19] Loading and sampling 34 raster layers for group 0eedd36ec93fd9c99387cc8f5c320801
[18:12:26] Loading and sampling 6 raster layers for group 5cbe5afcc7253ab5c399d4c37ecc2530
[18:12:28] Loading and sampling 5 raster layers for group 5e0d3cfd5a6aa8ef64ba20d46592d06b
[18:12:30] Loading and sampling 5 raster layers for group 5e898c4a684b1175f4b2523a6999b3da
[18:13:15] Loading and sampling 2 raster layers for group 6ed680f0705ae492bb83f45e130f7980
[18:13:16] Loading and sampling 12 raster layers for group 70ff59a5bad044b32b373de9a7cb4344
[18:13:20] Loading and sampling 14 raster layers for group 775b7cae2ab45efa24be3eca6d7e1e3d
[18:17:09] Loading and sampling 7 raster layers for group 85cc6c645162184927cc7f200c2164d3
[18:17:16] Loading and sampling 5 raster layers for group 8f12d0e827c80c7a4d31c3a12b6d2dff
[18:17:26] Loading and sampling 12 raster layers for group 907f01ccbcf02de8f030abad8719de2c
[18:17:27] Loading and sampling 4 raster layers for group a3552eafbe5e63b1d596ecab2408