<a href="https://colab.research.google.com/github/milver/Experiments/blob/main/TestBiomassStreaming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing AGBD dataset streaming from Hugging Face

## Install necessary libraries

In [None]:
!pip install datasets leafmap geopandas



## Import necessary libraries

In [None]:
from datasets import load_dataset
import numpy as np
import leafmap
import geopandas as gpd
from shapely.geometry import Point
import ipywidgets as widgets

## Functions for the streaming map

In [None]:
# Define function to compute sine and cosine for bounds
def compute_sin_cos_bounds(lat_min, lat_max, lon_min, lon_max):
    """Return the sine/cosine value ranges covered by a lat/lon window.

    Sine and cosine are not monotonic, so evaluating them at the two window
    edges does not necessarily give (min, max) in that order, and the true
    extremum may lie inside the window (e.g. cos peaks at 0 degrees). This
    function returns genuine ``(min, max)`` pairs that can be used directly
    in ``lo <= value <= hi`` comparisons.

    Parameters
    ----------
    lat_min, lat_max : float
        Latitude edges of the window, in degrees.
    lon_min, lon_max : float
        Longitude edges of the window, in degrees.

    Returns
    -------
    tuple
        ``(lat_sin, lat_cos, lon_sin, lon_cos)``, each a ``(min, max)`` pair
        of floats.
    """
    def _contains(lo, hi, angle):
        # True when angle + 360*k falls inside [lo, hi] for some integer k.
        return np.ceil((lo - angle) / 360.0) <= np.floor((hi - angle) / 360.0)

    def _value_range(func, edge_a, edge_b, peak_angle, trough_angle):
        lo, hi = min(edge_a, edge_b), max(edge_a, edge_b)
        at_lo, at_hi = func(np.radians([lo, hi]))
        smallest, largest = min(at_lo, at_hi), max(at_lo, at_hi)
        # If the window crosses the angle where func peaks (or bottoms out),
        # the extremum is +1 (or -1) rather than an edge value.
        if _contains(lo, hi, peak_angle):
            largest = 1.0
        if _contains(lo, hi, trough_angle):
            smallest = -1.0
        return (float(smallest), float(largest))

    lat_sin_bounds = _value_range(np.sin, lat_min, lat_max, 90.0, -90.0)
    lat_cos_bounds = _value_range(np.cos, lat_min, lat_max, 0.0, 180.0)
    lon_sin_bounds = _value_range(np.sin, lon_min, lon_max, 90.0, -90.0)
    lon_cos_bounds = _value_range(np.cos, lon_min, lon_max, 0.0, 180.0)
    return lat_sin_bounds, lat_cos_bounds, lon_sin_bounds, lon_cos_bounds

# Define filter function
def filter_by_bounds(example, bounds):
    """Tell whether an example's sin/cos coordinates all lie inside `bounds`.

    `example` must provide the keys 'lat_sin', 'lat_cos', 'lon_sin' and
    'lon_cos'; `bounds` maps each of those keys to a (low, high) pair.
    Both ends of every pair are inclusive.
    """
    fields = ('lat_sin', 'lat_cos', 'lon_sin', 'lon_cos')

    # Read every coordinate up front so a missing key fails immediately.
    values = {name: example[name] for name in fields}

    within = True
    for name in fields:
        within = bounds[name][0] <= values[name] <= bounds[name][1]
        if not within:
            # The first failing check decides the result.
            break
    return within

# Function to update the map based on current bounds
def update_map(m, dataset, max_examples=None):
    """Add a point layer with the dataset examples that fall in the map view.

    Reads the current bounds of `m`, converts them to sine/cosine ranges,
    keeps the examples accepted by `filter_by_bounds`, and adds them to the
    map as a 'Filtered Data' layer carrying a 'biomass' column.

    Parameters
    ----------
    m : map object exposing `bounds` and `add_gdf`
    dataset : iterable of dict-like examples
        Each example is expected to provide 'lat_sin', 'lat_cos', 'lon_sin',
        'lon_cos', 'lat_decimal', 'lon_decimal' and 'biomass'.
        NOTE(review): the output recorded in this notebook shows
        ``KeyError: 'lat_sin'`` when run against prs-eth/AGBD_15, so that
        dataset's examples do not expose these flat keys; they need to be
        derived from the example's actual fields before this function works.
    max_examples : int or None, optional
        Upper limit on how many examples are read from `dataset`. ``None``
        (the default) reads the whole iterable, which for a streaming
        dataset means downloading all of it.
    """
    bounds = m.bounds  # Get the current map bounds
    if not bounds:
        # NOTE(review): bounds appear to be empty until the map widget has
        # been rendered in the front end; confirm against ipyleaflet.
        print("Map bounds are not available yet; display the map first.")
        return
    lat_min, lon_min = bounds[0]
    lat_max, lon_max = bounds[1]

    # sin/cos are not monotonic, so order each pair explicitly as (low, high);
    # otherwise the range checks in filter_by_bounds can never succeed.
    names = ('lat_sin', 'lat_cos', 'lon_sin', 'lon_cos')
    ranges = compute_sin_cos_bounds(lat_min, lat_max, lon_min, lon_max)
    bound_dict = {name: tuple(sorted(pair)) for name, pair in zip(names, ranges)}

    geometries = []
    biomass_values = []
    for index, example in enumerate(dataset):
        if max_examples is not None and index >= max_examples:
            break
        if filter_by_bounds(example, bound_dict):
            geometries.append(Point(example['lon_decimal'], example['lat_decimal']))
            biomass_values.append(example['biomass'])

    if not geometries:
        print("No examples found inside the current map bounds.")
        return

    # Points are (lon, lat) in decimal degrees, so declare the CRS explicitly.
    gdf = gpd.GeoDataFrame({'biomass': biomass_values}, geometry=geometries, crs="EPSG:4326")
    m.add_gdf(gdf, layer_name='Filtered Data')

## Load the dataset

In [None]:
# Open the AGBD training split in streaming mode (examples are fetched lazily)
dataset = load_dataset(path="prs-eth/AGBD_15", streaming=True)["train"]

Resolving data files:   0%|          | 0/461 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/125 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/461 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/125 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

## Initialize and display the map

In [None]:
# Interactive map, centred on the area of interest
m = leafmap.Map(zoom=13, center=(39.0170445, -77.5782395))

# Clicking the button re-filters the dataset against the current map view
update_button = widgets.Button(description="Update Map")
update_button.on_click(lambda _clicked: update_map(m, dataset))

# Render the button first, then the map as the cell's output
display(update_button)
m

Button(description='Update Map', style=ButtonStyle())

Map(center=[39.0170445, -77.5782395], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_titl…

KeyError: 'lat_sin'

In [None]:
# Show which keys the first example provides. Only the keys are printed:
# the values are large nested lists that would flood the cell output.
for example in dataset.take(1):
    print(list(example.keys()))

{'input': [[[0.7394304871559143, 0.756686806678772, 0.7739430665969849, 0.791199266910553, 0.8075926899909973, 0.824849009513855, 0.839516818523407, 0.8524589538574219, 0.8654012680053711, 0.8774805665016174, 0.8904227018356323, 0.9033650159835815, 0.8990508913993835, 0.8800690770149231, 0.8602243065834045], [0.6721311211585999, 0.6893874406814575, 0.7057808637619019, 0.72303706407547, 0.7394304871559143, 0.756686806678772, 0.7739430665969849, 0.7920621037483215, 0.8093183636665344, 0.8274374604225159, 0.8455564975738525, 0.8628127574920654, 0.8645384311676025, 0.8507334589958191, 0.8369283080101013], [0.6048317551612854, 0.6212252378463745, 0.6384814977645874, 0.654874861240387, 0.6712682843208313, 0.688524603843689, 0.708369255065918, 0.7308023571968079, 0.7540982961654663, 0.7773942947387695, 0.7998273968696594, 0.8231233358383179, 0.830025851726532, 0.8222604990005493, 0.8136323690414429], [0.537532389163971, 0.5539258122444153, 0.5703192949295044, 0.586712658405304, 0.603968918323

In [None]:
# Walk through the first example and report how its 'input' entry is organised.
for example in dataset.take(1):
    # Full dump of the example, to see its overall structure
    print("Example:", example)

    # Nothing more to inspect when there is no 'input' entry
    if 'input' not in example:
        print("'input' key not found in example")
        continue

    data = example['input']
    print("Input data:", data)

    # Only a list can be walked item by item
    if not isinstance(data, list):
        print("Input is not a list:", data)
        continue

    for idx, item in enumerate(data):
        print(f"Item {idx}:", item)

        # Dictionary items additionally get their key/value pairs listed
        if isinstance(item, dict):
            for key, value in item.items():
                print(f"  {key}: {value}")


Example: {'input': [[[0.7394304871559143, 0.756686806678772, 0.7739430665969849, 0.791199266910553, 0.8075926899909973, 0.824849009513855, 0.839516818523407, 0.8524589538574219, 0.8654012680053711, 0.8774805665016174, 0.8904227018356323, 0.9033650159835815, 0.8990508913993835, 0.8800690770149231, 0.8602243065834045], [0.6721311211585999, 0.6893874406814575, 0.7057808637619019, 0.72303706407547, 0.7394304871559143, 0.756686806678772, 0.7739430665969849, 0.7920621037483215, 0.8093183636665344, 0.8274374604225159, 0.8455564975738525, 0.8628127574920654, 0.8645384311676025, 0.8507334589958191, 0.8369283080101013], [0.6048317551612854, 0.6212252378463745, 0.6384814977645874, 0.654874861240387, 0.6712682843208313, 0.688524603843689, 0.708369255065918, 0.7308023571968079, 0.7540982961654663, 0.7773942947387695, 0.7998273968696594, 0.8231233358383179, 0.830025851726532, 0.8222604990005493, 0.8136323690414429], [0.537532389163971, 0.5539258122444153, 0.5703192949295044, 0.586712658405304, 0.603

In [None]:
# Print the 13th item (index 12) of the list stored under 'input' in the
# first example returned by dataset.take(1), then view it as a NumPy array.
# NOTE(review): index 12 is presumably the latitude-cosine channel, going by
# the variable names; confirm against the dataset's channel order.
for example in dataset.take(1):
    if 'input' not in example:
        print("'input' key not found in example")
        continue

    data = example['input']
    if not isinstance(data, list):
        print("'input' is not a list:", data)
        continue
    if len(data) <= 12:
        print(f"'input' has only {len(data)} items; there is no 13th item")
        continue

    lat_cos_item = data[12]
    print(f"13th item: {lat_cos_item}")

    # Convert only after the item is known to exist, so the array is never
    # built from an undefined or stale variable.
    lat_cos_array = np.array(lat_cos_item)

    # Print the shape of the array to verify its dimensions
    print(lat_cos_array.shape)

    # Print the array to check its contents
    print(lat_cos_array)




13th item: [[0.6073664426803589, 0.6073648929595947, 0.6073634028434753, 0.6073618531227112, 0.607360303401947, 0.6073587536811829, 0.6073572635650635, 0.6073557138442993, 0.6073541641235352, 0.6073526740074158, 0.6073511242866516, 0.6073495745658875, 0.6073480248451233, 0.6073465347290039, 0.6073449850082397], [0.6073664426803589, 0.6073648929595947, 0.6073634028434753, 0.6073618531227112, 0.607360303401947, 0.6073587536811829, 0.6073572635650635, 0.6073557138442993, 0.6073541641235352, 0.6073526740074158, 0.6073511242866516, 0.6073495745658875, 0.6073480248451233, 0.6073465347290039, 0.6073449850082397], [0.6073664426803589, 0.6073648929595947, 0.6073634028434753, 0.6073618531227112, 0.607360303401947, 0.6073587536811829, 0.6073572635650635, 0.6073557138442993, 0.6073541641235352, 0.6073526740074158, 0.6073511242866516, 0.6073495745658875, 0.6073480248451233, 0.6073465347290039, 0.6073449850082397], [0.6073664426803589, 0.6073648929595947, 0.6073634028434753, 0.6073618531227112, 0.60

Sources:
https://arxiv.org/pdf/2406.04928
https://github.com/ghjuliasialelli/AGBD/tree/main
https://huggingface.co/datasets/prs-eth/AGBD_15?row=0
https://huggingface.co/docs/datasets/stream
