In [None]:
# Exapnding landcover proportions from a parquet file to individual columns

import dask.dataframe as dd
import ast
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from dask.diagnostics import ProgressBar
import os
import glob

# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
landcover_legend = {
    0: "Open Seas", 10: "Tree Cover", 20: "Shrubland", 30: "Grassland",
    40: "Cropland", 50: "Built-up", 60: "Bare/Sparse Vegetation", 70: "Snow and Ice",
    80: "Permanent Water Bodies", 90: "Herbaceous Wetland", 95: "Mangroves", 100: "Moss and Lichen"
}
lc_columns = list(landcover_legend.values())

# -----------------------------------------------------------------------------------
# Expansion Function
# -----------------------------------------------------------------------------------
def expand_landcover_props_fast(df_partition):
    parsed = df_partition['landcover_props'].apply(ast.literal_eval)

    for col in lc_columns:
        df_partition[col] = 0.0

    for code, col_name in landcover_legend.items():
        df_partition[col_name] = parsed.apply(lambda d: d.get(code, 0.0))

    df_partition = df_partition.drop(columns=["landcover_props"])

    final_columns = ["tile_id"] + lc_columns
    df_partition = df_partition[final_columns]

    return df_partition

# -----------------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------------
input_folder_pattern = "D:/tascarrd_test_outputs/grid_1200m/lc_proportions_10m_*_1200m.parquet"
output_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns/"

os.makedirs(output_folder, exist_ok=True)

# -----------------------------------------------------------------------------------
# Process Each Zone File Separately
# -----------------------------------------------------------------------------------
zone_files = sorted(glob.glob(input_folder_pattern))

print(f"Found {len(zone_files)} zone files to process.")

for file_path in zone_files:
    zone_name = os.path.basename(file_path).replace(".parquet", "")  # e.g., lc_proportions_10m_32N_1200m
    output_file = os.path.join(output_folder, f"{zone_name}.parquet")

    if os.path.exists(output_file):
        print(f"Skipping {zone_name}, already processed.")
        continue
    
    print(f"\nProcessing {zone_name}...")

    # Read the zone file lazily
    df = dd.read_parquet(
        file_path,
        engine="pyarrow",
        columns=["tile_id", "landcover_props"]
    )

    # Optional: repartition to larger partitions if needed
    df = df.repartition(partition_size="3GB")

    # Define correct meta
    meta_columns = {"tile_id": 'object'}
    meta_columns.update({col: 'float32' for col in lc_columns})

    # Expand
    df_expanded = df.map_partitions(expand_landcover_props_fast, meta=meta_columns)

    # Compute and Save
    with ProgressBar():
        df_final = df_expanded.compute()

    # Save cleanly
    table = pa.Table.from_pandas(df_final, preserve_index=False)
    pq.write_table(
        table,
        where=output_file,
        compression="snappy",
        use_dictionary=True,
        data_page_size=2097152,
        row_group_size=1000000
    )

    print(f"Saved expanded zone: {output_file}")

print("\nAll zones processed and saved successfully.")


Found 122 zone files to process.
Skipping lc_proportions_10m_10N_1200m, already processed.
Skipping lc_proportions_10m_10S_1200m, already processed.
Skipping lc_proportions_10m_11N_1200m, already processed.
Skipping lc_proportions_10m_11S_1200m, already processed.
Skipping lc_proportions_10m_12N_1200m, already processed.
Skipping lc_proportions_10m_12S_1200m, already processed.
Skipping lc_proportions_10m_13N_1200m, already processed.
Skipping lc_proportions_10m_13S_1200m, already processed.
Skipping lc_proportions_10m_14N_1200m, already processed.
Skipping lc_proportions_10m_14S_1200m, already processed.
Skipping lc_proportions_10m_15N_1200m, already processed.
Skipping lc_proportions_10m_15S_1200m, already processed.
Skipping lc_proportions_10m_16N_1200m, already processed.
Skipping lc_proportions_10m_16S_1200m, already processed.
Skipping lc_proportions_10m_17N_1200m, already processed.

Processing lc_proportions_10m_17S_1200m...
[########################################] | 100% Com

In [None]:
# Filtering non-Open Seas Landcover Proportions for the files with expanded columns
# -----------------------------------------------------------------------------------
import glob
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os


# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
landcover_legend = {
    0: "Open Seas", 10: "Tree Cover", 20: "Shrubland", 30: "Grassland",
    40: "Cropland", 50: "Built-up", 60: "Bare/Sparse Vegetation", 70: "Snow and Ice",
    80: "Permanent Water Bodies", 90: "Herbaceous Wetland", 95: "Mangroves", 100: "Moss and Lichen"
}
landcover_columns = list(landcover_legend.values())

# -----------------------------------------------------------------------------------
# Input and output paths
input_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns/"
output_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/"

os.makedirs(output_folder, exist_ok=True)

# List all parquet files
files = sorted(glob.glob(os.path.join(input_folder, "*.parquet")))

# Process each file separately
for i, f in enumerate(files):
    print(f"Processing {i+1}/{len(files)}: {os.path.basename(f)}")

    # Read one file at a time
    df = pd.read_parquet(f, engine="pyarrow")

    # Apply filter immediately
    df_filtered = df[df['Open Seas'] < 0.999].copy()

    # Convert all landcover columns to float32
    df_filtered[landcover_columns] = df_filtered[landcover_columns].astype('float32')

    
    # Save immediately
    output_path = os.path.join(output_folder, os.path.basename(f))
    df_filtered.to_parquet(output_path, index=False, engine="pyarrow", compression="snappy")

    # Clean memory manually
    del df
    del df_filtered

print("\nAll zone files filtered and saved separately.")


Processing 1/122: lc_proportions_10m_10N_1200m.parquet
Processing 2/122: lc_proportions_10m_10S_1200m.parquet
Processing 3/122: lc_proportions_10m_11N_1200m.parquet
Processing 4/122: lc_proportions_10m_11S_1200m.parquet
Processing 5/122: lc_proportions_10m_12N_1200m.parquet
Processing 6/122: lc_proportions_10m_12S_1200m.parquet
Processing 7/122: lc_proportions_10m_13N_1200m.parquet
Processing 8/122: lc_proportions_10m_13S_1200m.parquet
Processing 9/122: lc_proportions_10m_14N_1200m.parquet
Processing 10/122: lc_proportions_10m_14S_1200m.parquet
Processing 11/122: lc_proportions_10m_15N_1200m.parquet
Processing 12/122: lc_proportions_10m_15S_1200m.parquet
Processing 13/122: lc_proportions_10m_16N_1200m.parquet
Processing 14/122: lc_proportions_10m_16S_1200m.parquet
Processing 15/122: lc_proportions_10m_17N_1200m.parquet
Processing 16/122: lc_proportions_10m_17S_1200m.parquet
Processing 17/122: lc_proportions_10m_18N_1200m.parquet
Processing 18/122: lc_proportions_10m_18S_1200m.parquet
P

In [None]:
# Filtering a few Open Seas Landcover Proportions to add to the previous non-Open Seas Landcover Proportions
# -----------------------------------------------------------------------------------

import glob
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import gc

# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
landcover_legend = {
    0: "Open Seas", 10: "Tree Cover", 20: "Shrubland", 30: "Grassland",
    40: "Cropland", 50: "Built-up", 60: "Bare/Sparse Vegetation", 70: "Snow and Ice",
    80: "Permanent Water Bodies", 90: "Herbaceous Wetland", 95: "Mangroves", 100: "Moss and Lichen"
}
landcover_columns = list(landcover_legend.values())

# -----------------------------------------------------------------------------------


# Input and output paths
input_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns/"
output_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_openseas_filtered/"

os.makedirs(output_folder, exist_ok=True)

# List all parquet files
files = sorted(glob.glob(os.path.join(input_folder, "*.parquet")))

# Process each file separately
for i, f in enumerate(files):
    print(f"Processing {i+1}/{len(files)}: {os.path.basename(f)}")

    # Read one file at a time
    df = pd.read_parquet(f, engine="pyarrow")

    # Apply filter immediately
    df_filtered = df[df['Open Seas'] > 0.999].copy()
    del df
    gc.collect()

    # Convert all landcover columns to float32
    df_filtered[landcover_columns] = df_filtered[landcover_columns].astype('float32')

    # sample_factor = len(df_filtered) / 1000_000
    df_filtered = df_filtered.sample(frac=0.01, random_state=42)

    # Save immediately
    output_path = os.path.join(output_folder, os.path.basename(f))
    df_filtered.to_parquet(output_path, index=False, engine="pyarrow", compression="snappy")

    # Clean memory manually
    # del df
    del df_filtered

print("\nAll zone files filtered and saved separately.")


In [None]:
# Merging all filtered Non- Open Seas Landcover Proportions to the previous non-Open Seas Landcover Proportions
import glob
import pandas as pd
import pyarrow as pa
files = glob.glob("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/*.parquet")

dfs = []
for i, f in enumerate(files):
    print(i+1, f)
    df = pd.read_parquet(f)
    df = df[df['Open Seas'] < 0.999].copy()
    dfs.append(df)
    del df

df = pd.concat(dfs, ignore_index=True)
df.to_parquet("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/lc_props_non_openseas_all_zones_test.parquet", index=False, engine='pyarrow', compression='snappy')

# Open Seas Landcover Proportions
# -----------------------------------------------------------------------------------

import glob
import pandas as pd
import pyarrow as pa
files = glob.glob("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_openseas_filtered/*.parquet")

dfs = []
for i, f in enumerate(files):
    print(i+1, f)
    df = pd.read_parquet(f)
    df = df[df['Open Seas'] > 0.999].copy()
    dfs.append(df)
    del df

df = pd.concat(dfs, ignore_index=True)
##########################################
df = df.sample(n=1050000, random_state=42)  # sample 1 million rows
##########################################

df.to_parquet("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_openseas_filtered/lc_props_some_openseas_all_zones_test.parquet", index=False, engine='pyarrow', compression='snappy')

# Merging all filtered Non- Open Seas Landcover Proportions to the previous non-Open Seas Landcover Proportions
# -----------------------------------------------------------------------------------
df_non_openseas = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/lc_props_non_openseas_all_zones.parquet")
df_openseas = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_openseas_filtered/lc_props_some_openseas_all_zones.parquet")

df = pd.concat([df_non_openseas, df_openseas], ignore_index=True)
df.to_parquet("D:/tascarrd_test_outputs/grid_1200m/filtered_lc_props_all_zones.parquet", index=False, engine='pyarrow', compression='snappy')

1 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_10N_1200m.parquet
2 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_10S_1200m.parquet
3 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_11N_1200m.parquet
4 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_11S_1200m.parquet
5 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_12N_1200m.parquet
6 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_12S_1200m.parquet
7 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_13N_1200m.parquet
8 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_13S_1200m.parquet
9 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_14N_1200m.parquet
10 D:/tascarrd_test_outputs/grid_1200m/lc_prop


# Now, below we start on subset selections from this filtered dataset
## Above we filtered non-openseas 105 million cells, and added 1.05 million open seas cell for potential filtering later

In [13]:
df = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/filtered_lc_props_all_zones.parquet")
len(df)

107048570

In [14]:
df.head()

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,Herbaceous Wetland,Mangroves,Moss and Lichen,Majority_LC
0,G1200m_10N_X000422_Y004451,0.0,0.95146,0.0,0.00514,0.0,0.00049,0.00063,0.0,0.04229,0.0,0.0,0.0,Tree Cover
1,G1200m_10N_X000423_Y004451,0.0,0.99174,0.0,0.0,0.0,0.0,0.00028,0.0,0.00799,0.0,0.0,0.0,Tree Cover
2,G1200m_10N_X000424_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover
3,G1200m_10N_X000425_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover
4,G1200m_10N_X000426_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover


In [None]:
# Adding major landcover classes to the filtered file
# -----------------------------------------------------------------------------------

import pandas as pd

# Assuming df is your DataFrame
landcover_columns = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland", "Built-up",
    "Bare/Sparse Vegetation", "Snow and Ice", "Permanent Water Bodies",
    "Herbaceous Wetland", "Mangroves", "Moss and Lichen"
]

# Sum landcover fractions across all rows for each class
landcover_totals = df[landcover_columns].sum()

# Compute total landcover (all landcover values across all rows and classes)
total_sum = landcover_totals.sum()

# Compute global proportions
global_proportions = landcover_totals / total_sum

# Optional: convert to percentage
global_proportions_percent = global_proportions * 100


global_proportions_percent.sort_values(ascending=False).to_frame().reset_index().rename(columns={"index": "Landcover", 0: "Proportion (%)"}).to_csv("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/global_landcover_proportions.csv", index=False)
global_proportions_percent

In [None]:
df["Majority_LC"] = df[landcover_columns].idxmax(axis=1)
global_majority_lc = df["Majority_LC"].value_counts(normalize=True) * 100
# global_majority_lc = global_majority_lc.sort_index()
global_majority_lc

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,Herbaceous Wetland,Mangroves,Moss and Lichen,Majority_LC
0,G1200m_10N_X000422_Y004451,0.0,0.95146,0.0,0.00514,0.0,0.00049,0.00063,0.0,0.04229,0.0,0.0,0.0,Tree Cover
1,G1200m_10N_X000423_Y004451,0.0,0.99174,0.0,0.0,0.0,0.0,0.00028,0.0,0.00799,0.0,0.0,0.0,Tree Cover
2,G1200m_10N_X000424_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover
3,G1200m_10N_X000425_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover
4,G1200m_10N_X000426_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover


In [None]:
df.to_parquet("D:/tascarrd_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet", index=False, engine='pyarrow', compression='snappy')

## Reading 107million cells of 1200 for later subset selections
###
###
##

In [3]:
# Adding Shannon Entropy to the filtered file
# -----------------------------------------------------------------------------------
import pandas as pd
df = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet")
from scipy.stats import entropy
import numpy as np, pandas as pd
rng = np.random.default_rng(42)                               # reproducible

def compute_shannon_entropy(row):
    p = np.asarray(row, dtype="float32")
    p = p[p > 0]
    return entropy(p, base=2)

landcover_cols = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland",
    "Built-up", "Bare/Sparse Vegetation", "Snow and Ice",
    "Permanent Water Bodies", "Herbaceous Wetland",
    "Mangroves", "Moss and Lichen"
]

df[landcover_cols] = (
    df[landcover_cols]
        .apply(pd.to_numeric, errors="coerce")
        .fillna(0)
        .astype("float32")
)
print(f"Converted landcover columns to float32., calculating shannon entropy...")
if "shannon_entropy" not in df.columns:
    df["shannon_entropy"] = df[landcover_cols].apply(
        compute_shannon_entropy, axis=1)
    print("Shannon entropy computed.")


df.to_parquet("D:/tascarrd_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet")

In [1]:
import pandas as pd
df = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet")

In [2]:
df.head(7)

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,Herbaceous Wetland,Mangroves,Moss and Lichen,Majority_LC,shannon_entropy
0,G1200m_10N_X000422_Y004451,0.0,0.95146,0.0,0.00514,0.0,0.00049,0.00063,0.0,0.04229,0.0,0.0,0.0,Tree Cover,0.312475
1,G1200m_10N_X000423_Y004451,0.0,0.99174,0.0,0.0,0.0,0.0,0.00028,0.0,0.00799,0.0,0.0,0.0,Tree Cover,0.070857
2,G1200m_10N_X000424_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover,0.0
3,G1200m_10N_X000425_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover,0.0
4,G1200m_10N_X000426_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover,0.0
5,G1200m_10N_X000427_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover,0.0
6,G1200m_10N_X000428_Y004451,0.0,0.99528,0.0,0.00465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7e-05,Tree Cover,0.04379


In [1]:
import pandas as pd
df = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/grid_1200m_subsets_after_10k.parquet")
df.head(5)

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,Herbaceous Wetland,Mangroves,Moss and Lichen,Majority_LC,shannon_entropy,subset_phase,subset_10k
0,G1200m_10N_X000422_Y004451,0.0,0.95146,0.0,0.00514,0.0,0.00049,0.00063,0.0,0.04229,0.0,0.0,0.0,Tree Cover,0.312475,none,none
1,G1200m_10N_X000423_Y004451,0.0,0.99174,0.0,0.0,0.0,0.0,0.00028,0.0,0.00799,0.0,0.0,0.0,Tree Cover,0.070857,none,none
2,G1200m_10N_X000424_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover,0.0,none,none
3,G1200m_10N_X000425_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover,0.0,none,none
4,G1200m_10N_X000426_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover,0.0,none,none


In [2]:
df_250k = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/grid_1200m_subsets_after_250k.parquet")
df_250k.head()

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,...,Mangroves,Moss and Lichen,Majority_LC,shannon_entropy,subset_phase,subset_10k,subset_25k,subset_50k,subset_100k,subset_250k
844,G1200m_10N_X000465_Y004459,0.0,0.18521,0.00021,0.1284,0.0,0.0,0.51361,0.10354,0.0075,...,0.0,0.06153,Bare/Sparse Vegetation,1.966283,250k_phase2,none,none,none,none,250k
2564,G1200m_10N_X000473_Y004475,0.0,0.93646,0.00375,0.05285,0.0,0.0,0.00049,0.0,0.0,...,0.0,0.00646,Tree Cover,0.39549,250k_phase3,none,none,none,none,250k
2646,G1200m_10N_X000443_Y004476,0.0,0.86924,0.0,0.12507,0.0,0.0,0.00368,0.0,0.0,...,0.0,0.00201,Tree Cover,0.59861,250k_phase3,none,none,none,none,250k
2675,G1200m_10N_X000472_Y004476,0.0,0.41535,0.00194,0.09903,0.0,0.0,0.20493,0.0,0.0,...,0.0,0.27875,Tree Cover,1.856697,250k_phase2,none,none,none,none,250k
2890,G1200m_10N_X000462_Y004478,0.0,0.20743,0.01833,0.65,0.0,0.0,0.07451,0.0,0.0,...,0.0,0.04972,Grassland,1.474882,250k_phase2,none,none,none,none,250k


In [3]:
df_10k = df_250k[df_250k["subset_10k"]!='none']
df_25k = df_250k[df_250k["subset_25k"]!='none']
df_50k = df_250k[df_250k["subset_50k"]!='none']
df_100k = df_250k[df_250k["subset_100k"]!='none']


In [4]:
# Step 3: Ensure subset columns exist
for col in [f"subset_10k", "subset_25k",  "subset_50k", "subset_100k", "subset_250k", "subset_phase"]:
    if col not in df.columns:
        df[col] = "none"
df.head()

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,...,Mangroves,Moss and Lichen,Majority_LC,shannon_entropy,subset_phase,subset_10k,subset_25k,subset_50k,subset_100k,subset_250k
0,G1200m_10N_X000422_Y004451,0.0,0.95146,0.0,0.00514,0.0,0.00049,0.00063,0.0,0.04229,...,0.0,0.0,Tree Cover,0.312475,none,none,none,none,none,none
1,G1200m_10N_X000423_Y004451,0.0,0.99174,0.0,0.0,0.0,0.0,0.00028,0.0,0.00799,...,0.0,0.0,Tree Cover,0.070857,none,none,none,none,none,none
2,G1200m_10N_X000424_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,Tree Cover,0.0,none,none,none,none,none,none
3,G1200m_10N_X000425_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,Tree Cover,0.0,none,none,none,none,none,none
4,G1200m_10N_X000426_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,Tree Cover,0.0,none,none,none,none,none,none


In [5]:


# Step 4: Map back subset_10k and subset_phase to full df
df.loc[df_10k.index, "subset_10k"] = "10k"
df.loc[df_25k.index, "subset_25k"] = "25k"
df.loc[df_50k.index, "subset_50k"] = "50k"
df.loc[df_100k.index, "subset_100k"] = "100k"
df.loc[df_250k.index, "subset_250k"] = "250k"
df.loc[df_250k.index, "subset_phase"] = df_250k["subset_phase"]

print(f"Subset 10k rows correctly mapped back: {(df['subset_10k'] == '10k').sum():,} rows.")
print(f"Subset 10k rows correctly mapped back: {(df['subset_25k'] == '25k').sum():,} rows.")
print(f"Subset 10k rows correctly mapped back: {(df['subset_50k'] == '50k').sum():,} rows.")
print(f"Subset 10k rows correctly mapped back: {(df['subset_100k'] == '100k').sum():,} rows.")
print(f"Subset 10k rows correctly mapped back: {(df['subset_250k'] == '250k').sum():,} rows.")



# Now you can run your optimized select_hybrid_subsets() safely starting from 25k

Subset 10k rows correctly mapped back: 9,000 rows.
Subset 10k rows correctly mapped back: 22,500 rows.
Subset 10k rows correctly mapped back: 45,000 rows.
Subset 10k rows correctly mapped back: 90,000 rows.
Subset 10k rows correctly mapped back: 225,100 rows.


In [6]:
# # Choose a previously created subsets if available

# import pandas as pd
# df = pd.read_parquet("D:/tascarrd_test_outputs/grid_1200m/lc_1200m_subsets_after_100k.parquet")
# df.head()

In [None]:
# # Old Code
# from scipy.stats import entropy
# import numpy as np
# rng = np.random.default_rng(42)   # single reproducible generator
# # -----------------------------------------------------------------------------------
# def compute_shannon_entropy(row):
#     p = np.array(row)
#     p = p[p > 0]  # Remove zeros
#     return entropy(p, base=2)

# def select_hybrid_subsets(
#     df,
#     landcover_cols,
#     targets,
#     sizes,
#     class_thresholds,
#     phase_proportions=[0.4, 0.1, 0.4],
#     batch_size=10,
#     sample_pool_size=2000
# ):
#     assert sum(phase_proportions) == 0.9, "Phase proportions must sum to 0.9, leaving 0.1 for random selection."

#     df = df.copy()
#     df[landcover_cols] = df[landcover_cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype("float32")

#     # Pre-compute entropy if not already available
#     if "shannon_entropy" not in df.columns:
#         df["shannon_entropy"] = df[landcover_cols].apply(compute_shannon_entropy, axis=1)
#         print("Entropy computed.")

#     # Track cumulative selection index
#     global_selected = set()

#     for label, size in sizes.items():
#         print(f"\nSelecting subset: {label}")
#         subset_col_name = f"subset_{label}"
#         if subset_col_name not in df.columns:
#             df[subset_col_name] = "none"

#         # --- Step 1: Reuse previous subset selections (e.g., from 100k) ---
#         previous_labels = [k for k in sizes if int(k.replace("k", "")) < int(label.replace("k", ""))]
#         reused_indices = set()
#         for prev_label in previous_labels:
#             prev_col = f"subset_{prev_label}"
#             reused_indices.update(df.index[df[prev_col] != "none"])


#         # --------------------------------------------------------------
#         # ---- determine how many *new* rows to pick automatically -----
#         # --------------------------------------------------------------
#         auto_quota = int(size * 0.9) - len(reused_indices)   # 90 % for phases 1-3
#         if auto_quota <= 0:
#             print(f"Subset {label}: already satisfied by smaller subsets.")
#             df.loc[list(reused_indices), subset_col_name] = label
#             continue           # nothing more to do for this subset size

#         print(f"Reusing {len(reused_indices)} rows; need {auto_quota} new rows.")

#         # split that 90 % into the three automatic phases
#         phase1_cells = int(auto_quota * phase_proportions[0])   # 40 %
#         phase2_cells = int(auto_quota * phase_proportions[1])   # 10 %
#         phase3_cells = auto_quota - phase1_cells - phase2_cells # 40 %

#         # remaining_required = size - len(reused_indices)
#         # if remaining_required <= 0:
#         #     print(f"Subset {label}: already satisfied from smaller subsets.")
#         #     df.loc[list(reused_indices), subset_col_name] = label
#         #     continue

#         print(f"Reusing {len(reused_indices)} from smaller subsets. Need to select {remaining_required} new.")

#         selected_indices = set(reused_indices)
#         phase_col = pd.Series(["none"] * len(df), index=df.index, name="subset_phase")

#         phase1_cells = int(remaining_required * phase_proportions[0])
#         phase2_cells = int(remaining_required * phase_proportions[1])
#         phase3_cells = remaining_required - phase1_cells - phase2_cells

#         # --- PHASE 1: Pure tiles using class-specific thresholds ---
#         phase1_indices = []
#         for i, lc in enumerate(landcover_cols):
#             class_target = targets[label][i]
#             dominance_thresh = class_thresholds[i]
#             target_count = int(phase1_cells * class_target)

#             class_candidates = df[
#                 (df[lc] >= dominance_thresh) &
#                 (~df.index.isin(selected_indices))
#             ]
#             sampled = class_candidates.sample(
#                 n=min(target_count, len(class_candidates)), random_state=rng.integers(1e9)
#             )
#             phase1_indices.extend(sampled.index.tolist())

#         df.loc[phase1_indices, subset_col_name] = label
#         phase_col.loc[phase1_indices] = f"{label}_phase1"

#         selected_indices.update(phase1_indices)

#         left = phase1_cells - len(phase1_indices)
#         if left > 0:
#             extra = df.loc[
#                 (~df.index.isin(selected_indices)) &
#                 (df[landcover_cols].max(axis=1) >= 0.5)  # any reasonably pure
#             ].sample(n=left, random_state=rng.integers(1e9)).index.tolist()
#             phase1_indices.extend(extra)
#             df.loc[extra, subset_col_name] = label
#             phase_col.loc[extra] = f"{label}_phase1_extra"
#             selected_indices.update(extra)
#         # give reused rows a phase tag
#         df.loc[list(reused_indices), "subset_phase"] = f"{label}_reuse"

#         print(f"Phase 1 (Pure): Selected {len(phase1_indices)}")

#         # --- PHASE 2: High-entropy transitional tiles ---
#         entropy_candidates = df.loc[~df.index.isin(selected_indices)]
#         # entropy_sorted = entropy_candidates.sort_values("shannon_entropy", ascending=False)
#         # phase2_indices = entropy_sorted.head(phase2_cells).index.tolist()

#         entropy_sorted = entropy_candidates.sort_values("shannon_entropy", ascending=False)
#         phase2_indices = entropy_sorted.head(min(phase2_cells, len(entropy_sorted))).index.tolist()

#         df.loc[phase2_indices, subset_col_name] = label
#         phase_col.loc[phase2_indices] = f"{label}_phase2"
#         selected_indices.update(phase2_indices)
#         print(f"Phase 2 (Transitional): Selected {len(phase2_indices)}")

#         # --- PHASE 3: Distribution matching using greedy batch search ---
#         phase3_indices = []
#         cumulative = df.loc[list(selected_indices), landcover_cols].sum().values.astype("float32")
#         n_selected = len(selected_indices)

#         remaining = df.loc[~df.index.isin(selected_indices)]   # define once
#         while len(phase3_indices) < phase3_cells and not remaining.empty:
#             # remaining = df.loc[~df.index.isin(selected_indices)]
#             if remaining.empty:
#                 break

#             # candidate_pool = remaining.sample(n=min(sample_pool_size, len(remaining)), random_state=None)
#             candidate_pool = remaining.sample(n=min(sample_pool_size, len(remaining)), random_state=rng.integers(1e9))
#             X = candidate_pool[landcover_cols].values
#             tmp_distributions = (cumulative + X) / (n_selected + 1)
#             errors = np.mean((tmp_distributions - targets[label]) ** 2, axis=1)

#             top_idxs = np.argsort(errors)[:batch_size]
#             best_indices = candidate_pool.iloc[top_idxs].index.tolist()

#             phase3_indices.extend(best_indices)
#             selected_indices.update(best_indices)
#             remaining = df.loc[~df.index.isin(selected_indices)]
#             cumulative += df.loc[best_indices, landcover_cols].sum().values.astype("float32")
#             n_selected += len(best_indices)

#             if len(phase3_indices) % 1000 < batch_size:
#                 print(f"{label} Phase 3: {len(phase3_indices)} / {phase3_cells}")

#         df.loc[phase3_indices, subset_col_name] = label
#         phase_col.loc[phase3_indices] = f"{label}_phase3"
#         print(f"Finished subset: {label} | Total selected this round: {len(selected_indices) - len(reused_indices)}")

#         # Store subset_phase (only once)
#         if "subset_phase" not in df.columns:
#             df["subset_phase"] = "none"
#         df.loc[phase_col.index[phase_col != "none"], "subset_phase"] = phase_col[phase_col != "none"]

#         # Update global selected tracker
#         global_selected.update(selected_indices)

#     return df

In [2]:
# New Code
from scipy.stats import entropy
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)        # deterministic runs
# -------------------------------------------------------------
def compute_shannon_entropy(row):
    p = np.array(row, dtype="float32")
    p = p[p > 0]
    return entropy(p, base=2)
# -------------------------------------------------------------
def select_hybrid_subsets(
    df,
    landcover_cols,
    targets,
    sizes,
    class_thresholds,
    phase_proportions=[0.4, 0.1, 0.4],   # must sum to 0.9
    batch_size=10,
    sample_pool_size=2000
):
    assert abs(sum(phase_proportions) - 0.9) < 1e-6, \
        "Phase proportions must sum to 0.9 (10 % left for Phase-4)."

    df = df.copy()
    df[landcover_cols] = (
        df[landcover_cols]
        .apply(pd.to_numeric, errors="coerce")
        .fillna(0)
        .astype("float32")
    )

    if "shannon_entropy" not in df.columns:
        df["shannon_entropy"] = df[landcover_cols].apply(
            compute_shannon_entropy, axis=1
        )
        print("Entropy computed.")

    global_selected = set()            # across all subset levels

    # ---------------------------------------------------------
    for label, size in sizes.items():
        print(f"\nSelecting subset {label} (request {size:,} rows)")
        subset_col = f"subset_{label}"
        if subset_col not in df.columns:
            df[subset_col] = "none"

        # -------- reuse rows from smaller subset levels -------
        prev_labels = [
            k for k in sizes
            if int(k.replace("k", "")) < int(label.replace("k", ""))
        ]
        reused_idx = set()
        for p in prev_labels:
            reused_idx.update(df.index[df[f"subset_{p}"] != "none"])

        df.loc[list(reused_idx), subset_col] = label
        df.loc[list(reused_idx), "subset_phase"] = f"{label}_reuse"

        auto_quota = int(size * 0.9) - len(reused_idx)   # 90 % auto
        if auto_quota <= 0:
            print("Nothing new to pick – already filled by reuse.")
            continue

        print(f"Reusing {len(reused_idx):,}; need {auto_quota:,} new rows.")

        phase1_cells = int(auto_quota * phase_proportions[0])  # 40 %
        phase2_cells = int(auto_quota * phase_proportions[1])  # 10 %
        phase3_cells = auto_quota - phase1_cells - phase2_cells

        selected = set(reused_idx)
        phase_col = pd.Series("none", index=df.index, dtype="object")

        # ---------------- Phase 1 (pure/dominant) -------------
        phase1_idx = []
        for i, lc in enumerate(landcover_cols):
            need = int(phase1_cells * targets[label][i])
            if need == 0:
                continue
            cand = df[
                (df[lc] >= class_thresholds[i]) &
                (~df.index.isin(selected))
            ]
            if cand.empty:
                continue
            sample = cand.sample(
                n=min(need, len(cand)),
                random_state=rng.integers(1e9)
            ).index.tolist()
            phase1_idx.extend(sample)

        # top-up if Phase 1 short
        short = phase1_cells - len(phase1_idx)
        if short > 0:
            extra = df.loc[
                (~df.index.isin(selected)) &
                (df[landcover_cols].max(axis=1) >= 0.50)
            ].sample(n=short, random_state=rng.integers(1e9)).index.tolist()
            phase1_idx.extend(extra)

        df.loc[phase1_idx, subset_col] = label
        phase_col.loc[phase1_idx] = f"{label}_phase1"
        selected.update(phase1_idx)
        print(f"Phase 1 picked {len(phase1_idx):,}")

        # ---------------- Phase 2 (high entropy) --------------
        ent_pool = df.loc[~df.index.isin(selected)]
        phase2_idx = (
            ent_pool.sort_values("shannon_entropy", ascending=False)
                    .head(min(phase2_cells, len(ent_pool)))
                    .index.tolist()
        )
        df.loc[phase2_idx, subset_col] = label
        phase_col.loc[phase2_idx] = f"{label}_phase2"
        selected.update(phase2_idx)
        print(f"Phase 2 picked {len(phase2_idx):,}")

        # ---------------- Phase 3 (greedy matching) -----------
        phase3_idx = []
        cumulative = df.loc[list(selected), landcover_cols].sum().values
        n_sel = len(selected)
        remaining = df.loc[~df.index.isin(selected)]

        while len(phase3_idx) < phase3_cells and not remaining.empty:
            pool = remaining.sample(
                n=min(sample_pool_size, len(remaining)),
                random_state=rng.integers(1e9)
            )
            X = pool[landcover_cols].values
            new_dist = (cumulative + X) / (n_sel + 1)
            errs = ((new_dist - targets[label]) ** 2).mean(axis=1)
            best = pool.iloc[np.argsort(errs)[:batch_size]].index.tolist()

            phase3_idx.extend(best)
            selected.update(best)
            cumulative += df.loc[best, landcover_cols].sum().values
            n_sel += len(best)
            remaining = df.loc[~df.index.isin(selected)]

        df.loc[phase3_idx, subset_col] = label
        phase_col.loc[phase3_idx] = f"{label}_phase3"
        print(f"Phase 3 picked {len(phase3_idx):,}")

        # ---------------- commit phase labels ----------------
        if "subset_phase" not in df.columns:
            df["subset_phase"] = "none"
        df.loc[phase_col != "none", "subset_phase"] = phase_col[phase_col != "none"]

        print(f"Finished {label}: "
              f"{len(selected) - len(reused_idx):,} new rows (auto quota {auto_quota:,}).")

        global_selected.update(selected)

    return df


In [None]:
# Newest Batch sizes and restarts and save each level

from scipy.stats import entropy
import numpy as np, pandas as pd
rng = np.random.default_rng(42)                               # reproducible

def compute_shannon_entropy(row):
    p = np.asarray(row, dtype="float32")
    p = p[p > 0]
    return entropy(p, base=2)

# -------------------------------------------------------------------------
def select_hybrid_subsets(
    df,
    landcover_cols,
    targets,                    # dict: label -> 1-d numpy (sum=1)
    sizes,                      # dict: label -> int (# rows requested)
    class_thresholds,
    phase_proportions=[0.4, 0.1, 0.4],      # must sum to 0.9
    batch_size=25,             # int or dict
    sample_pool_size=2500,     # int or dict
    save_path_template=None    # e.g. "out_with_{label}.parquet"
):
    assert abs(sum(phase_proportions) - 0.9) < 1e-6, \
        "phase_proportions must sum to 0.9 (10 % is Phase-4 / manual)."

    df = df.copy()
    print(f"Initial size: {len(df):,} rows")
    df[landcover_cols] = (
        df[landcover_cols]
          .apply(pd.to_numeric, errors="coerce")
          .fillna(0)
          .astype("float32")
    )
    print(f"Converted landcover columns to float32., calculating shannon entropy...")
    if "shannon_entropy" not in df.columns:
        df["shannon_entropy"] = df[landcover_cols].apply(
            compute_shannon_entropy, axis=1)
        print("Shannon entropy computed.")

    if "subset_phase" not in df.columns:
        df["subset_phase"] = "none"

    # -----------------------------------------------------------------
    for label, size in sizes.items():
        print(f"\n=== Selecting subset {label} (request {size:,} rows) ===")
        subset_col = f"subset_{label}"
        if subset_col not in df.columns:
            df[subset_col] = "none"

        present = (df[subset_col] != "none").sum()
        if present >= int(size * 0.9)-1:
            print(f"[{label}] already ≥ 90 % filled ({present:,}/{size:,}). "
                  "Skipping selection and re-using those rows.")
            continue

        print(f"\n=== Building subset {label}  (need {size:,}) ===")

        # ---------- per-subset batch / pool sizes --------------------
        this_batch = batch_size[label]     if isinstance(batch_size, dict)     else batch_size
        this_pool  = sample_pool_size[label] if isinstance(sample_pool_size, dict) else sample_pool_size

        # ---------- rows reused from earlier subset levels ----------
        earlier = [k for k in sizes if int(k.replace("k", "")) < int(label.replace("k", ""))]
        reused = set()
        for k in earlier:
            reused.update(df.index[df[f"subset_{k}"] != "none"])

        df.loc[list(reused), subset_col]      = label
        df.loc[list(reused), "subset_phase"]  = f"{label}_reuse"

        auto_quota = int(size * 0.9) - len(reused)
        if auto_quota <= 0:
            print(f"[{label}] quota satisfied by reuse ({len(reused):,}).")
            if save_path_template:
                df.to_parquet(save_path_template.format(label=label))
            continue

        p1 = int(auto_quota * phase_proportions[0])
        p2 = int(auto_quota * phase_proportions[1])
        p3 = auto_quota - p1 - p2

        selected = set(reused)
        phase_tag = pd.Series("none", index=df.index, dtype="object")

        # ---------------- Phase 1  (dominant) ------------------------
        p1_idx = []
        for i, lc in enumerate(landcover_cols):
            need = int(p1 * targets[label][i])
            if need == 0: continue
            cand = df[(df[lc] >= class_thresholds[i]) & (~df.index.isin(selected))]
            if cand.empty: continue
            pick = cand.sample(
                n=min(need, len(cand)),
                random_state=rng.integers(1e9)).index.tolist()
            p1_idx.extend(pick)
        short = p1 - len(p1_idx)
        if short > 0:
            extra = df.loc[
                (~df.index.isin(selected)) &
                (df[landcover_cols].max(axis=1) >= 0.5)
            ].sample(n=short, random_state=rng.integers(1e9)).index.tolist()
            p1_idx.extend(extra)

        df.loc[p1_idx, subset_col] = label
        phase_tag.loc[p1_idx] = f"{label}_phase1"
        selected.update(p1_idx)
        print(f"Phase-1 picked {len(p1_idx):,}")

        # ---------------- Phase 2  (entropy) -------------------------
        ent_pool = df.loc[~df.index.isin(selected)]
        p2_idx = (ent_pool.sort_values("shannon_entropy", ascending=False)
                          .head(min(p2, len(ent_pool))).index.tolist())
        df.loc[p2_idx, subset_col] = label
        phase_tag.loc[p2_idx] = f"{label}_phase2"
        selected.update(p2_idx)
        print(f"Phase-2 picked {len(p2_idx):,}")

        # ---------------- Phase 3  (greedy match) -------------------
        p3_idx = []
        cum = df.loc[list(selected), landcover_cols].sum().values
        n_sel = len(selected)
        remain = df.loc[~df.index.isin(selected)]

        from tqdm import tqdm
        # safe dynamic tracking
        remaining_rows = p3 - len(p3_idx)
        steps = (remaining_rows + this_batch - 1) // this_batch
        pbar = tqdm(total=steps, desc=f"{label} phase-3", leave=False)

        while len(p3_idx) < p3 and not remain.empty:
            remain = df.loc[~df.index.isin(selected)]
            if remain.empty:
                print(f"Phase-3: no more candidates left.")
                break

            pool = remain.sample(
                n=min(this_pool, len(remain)),
                random_state=rng.integers(1e9))
            X = pool[landcover_cols].values
            errs = (( (cum + X) / (n_sel + 1) - targets[label]) ** 2).mean(axis=1)
            best = pool.iloc[np.argsort(errs)[:this_batch]].index.tolist()

            p3_idx.extend(best)
            selected.update(best)
            cum += df.loc[best, landcover_cols].sum().values
            n_sel += len(best)
            pbar.update(1)
            # remain = df.loc[~df.index.isin(selected)]
            # exit early if filled
            if len(p3_idx) >= p3:
                break

        df.loc[p3_idx, subset_col] = label
        phase_tag.loc[p3_idx] = f"{label}_phase3"
        print(f"Phase-3 picked {len(p3_idx):,}")

        # ---------- commit phase labels & save progress -------------
        df.loc[phase_tag != "none", "subset_phase"] = phase_tag[phase_tag != "none"]

        done = (df[subset_col] != "none").sum()
        print(f"Finished {label}: {done:,}/{size:,} rows labeled "
              f"(auto {len(selected) - len(reused):,}, reuse {len(reused):,}).")

        if save_path_template:
            out_path = save_path_template.format(label=label)
            df.to_parquet(out_path)
            print(f"Saved progress → {out_path}")

    return df


In [None]:
# Newest Batch sizes and restarts and save each level, plus batch wise update not individual tiles, Pick the purest in phase 1

from scipy.stats import entropy
import numpy as np, pandas as pd
rng = np.random.default_rng(42)                               # reproducible

def compute_shannon_entropy(row):
    p = np.asarray(row, dtype="float32")
    p = p[p > 0]
    return entropy(p, base=2)

# -------------------------------------------------------------------------
def select_hybrid_subsets(
    df,
    landcover_cols,
    targets,                    # dict: label -> 1-d numpy (sum=1)
    sizes,                      # dict: label -> int (# rows requested)
    class_thresholds,
    phase_proportions=[0.4, 0.1, 0.4],      # must sum to 0.9
    batch_size=25,             # int or dict
    sample_pool_size=2500,     # int or dict
    save_path_template=None    # e.g. "out_with_{label}.parquet"
):
    assert abs(sum(phase_proportions) - 0.9) < 1e-6, \
        "phase_proportions must sum to 0.9 (10 % is Phase-4 / manual)."

    df = df.copy()
    print(f"Initial size: {len(df):,} rows")
    df[landcover_cols] = (
        df[landcover_cols]
          .apply(pd.to_numeric, errors="coerce")
          .fillna(0)
          .astype("float32")
    )
    print(f"Converted landcover columns to float32., calculating shannon entropy...")
    if "shannon_entropy" not in df.columns:
        df["shannon_entropy"] = df[landcover_cols].apply(
            compute_shannon_entropy, axis=1)
        print("Shannon entropy computed.")

    if "subset_phase" not in df.columns:
        df["subset_phase"] = "none"

    # -----------------------------------------------------------------
    for label, size in sizes.items():
        print(f"\n=== Selecting subset {label} (request {size:,} rows) ===")
        subset_col = f"subset_{label}"
        if subset_col not in df.columns:
            df[subset_col] = "none"

        present = (df[subset_col] != "none").sum()
        if present >= int(size * 0.9)-1:
            print(f"[{label}] already ≥ 90 % filled ({present:,}/{size:,}). "
                  "Skipping selection and re-using those rows.")
            continue

        print(f"\n=== Building subset {label}  (need {size:,}) ===")

        # ---------- per-subset batch / pool sizes --------------------
        this_batch = batch_size[label]     if isinstance(batch_size, dict)     else batch_size
        this_pool  = sample_pool_size[label] if isinstance(sample_pool_size, dict) else sample_pool_size

        # ---------- rows reused from earlier subset levels ----------
        earlier = [k for k in sizes if int(k.replace("k", "")) < int(label.replace("k", ""))]
        reused = set()
        for k in earlier:
            reused.update(df.index[df[f"subset_{k}"] != "none"])

        df.loc[list(reused), subset_col]      = label
        df.loc[list(reused), "subset_phase"]  = f"{label}_reuse"

        auto_quota = int(size * 0.9) - len(reused)
        if auto_quota <= 0:
            print(f"[{label}] quota satisfied by reuse ({len(reused):,}).")
            if save_path_template:
                df.to_parquet(save_path_template.format(label=label))
            continue

        p1 = int(auto_quota * phase_proportions[0])
        p2 = int(auto_quota * phase_proportions[1])
        p3 = auto_quota - p1 - p2

        selected = set(reused)
        phase_tag = pd.Series("none", index=df.index, dtype="object")

        # ---------------- Phase 1  (dominant) ------------------------
        p1_idx = []
        for i, lc in enumerate(landcover_cols):
            need = int(p1 * targets[label][i])
            if need == 0: continue
            cand = df[(df[lc] >= class_thresholds[i]) & (~df.index.isin(selected))]
            if cand.empty: continue
            # pick = cand.sample(
            #     n=min(need, len(cand)),
            #     random_state=rng.integers(1e9)).index.tolist()
            
            pick = cand.sort_values(by=lc, ascending=False).head(min(need, len(cand))).index.tolist()

            p1_idx.extend(pick)
        short = p1 - len(p1_idx)
        if short > 0:
            extra = df.loc[
                (~df.index.isin(selected)) &
                (df[landcover_cols].max(axis=1) >= 0.5)
            ].sample(n=short, random_state=rng.integers(1e9)).index.tolist()
            p1_idx.extend(extra)

        df.loc[p1_idx, subset_col] = label
        phase_tag.loc[p1_idx] = f"{label}_phase1"
        selected.update(p1_idx)
        print(f"Phase-1 picked {len(p1_idx):,}")

        # ---------------- Phase 2  (entropy) -------------------------

        ent_pool = df.loc[~df.index.isin(selected)]

        # SAMPLE a smaller number of candidates to sort
        if len(ent_pool) > 6 * p2:   # if more than 5× needed
            ent_pool = ent_pool.sample(n=6*p2, random_state=rng.integers(1e9))

        p2_idx = (ent_pool.sort_values("shannon_entropy", ascending=False)
                        .head(min(p2, len(ent_pool)))
                        .index.tolist())

        # p2_idx = ent_pool.nlargest(p2, columns="shannon_entropy").index.tolist()

        # p2_idx = ent_pool.nlargest(p2, columns="shannon_entropy").index.tolist()

        # ent_pool = df.loc[~df.index.isin(selected)]
        # p2_idx = (ent_pool.sort_values("shannon_entropy", ascending=False)
        #                   .head(min(p2, len(ent_pool))).index.tolist())
        df.loc[p2_idx, subset_col] = label
        phase_tag.loc[p2_idx] = f"{label}_phase2"
        selected.update(p2_idx)
        print(f"Phase-2 picked {len(p2_idx):,}")

        # ---------------- Phase 3  (greedy match, batch-wise) -------------------

        p3_idx = []
        cum = df.loc[list(selected), landcover_cols].sum().values
        n_sel = len(selected)

        remain = df.loc[~df.index.isin(selected)]
        from tqdm import tqdm
        steps = (p3 + this_batch - 1) // this_batch
        pbar  = tqdm(total=steps, desc=f"{label} phase-3", leave=False)

        while len(p3_idx) < p3 and not remain.empty:
            remain = df.loc[~df.index.isin(selected)]
            if remain.empty:
                print(f"Phase-3: no more candidates available.")
                break

            from joblib import Parallel, delayed

            # inside your phase 3 loop...

            # sample and batchify
            pool = remain.sample(
                n=min(this_pool, len(remain)),
                random_state=rng.integers(1e9)
            )
            pool_X = pool[landcover_cols].values
            pool_idx = pool.index.to_numpy()

            n_batches = len(pool_idx) // this_batch
            if n_batches == 0:
                print("Warning: not enough candidates to form even one batch.")
                break

            # split pool into batches
            pool_X_batches = pool_X[:n_batches * this_batch].reshape(n_batches, this_batch, -1)

            def compute_error(i):
                batch_sum = pool_X_batches[i].sum(axis=0)
                error = np.mean(( (cum + batch_sum) / (n_sel + this_batch) - targets[label]) ** 2)
                return i, error

            # parallel compute errors
            batch_errors = Parallel(n_jobs=-1, backend="threading")(
                delayed(compute_error)(i) for i in range(n_batches)
            )

            batch_errors = sorted(batch_errors, key=lambda x: x[1])
            best_batch_idx = batch_errors[0][0]
            best_idxs = pool_idx[best_batch_idx*this_batch : (best_batch_idx+1)*this_batch]

            # update
            p3_idx.extend(best_idxs.tolist())
            selected.update(best_idxs.tolist())
            cum += pool_X_batches[best_batch_idx].sum(axis=0)
            n_sel += len(best_idxs)

            # clean
            del pool, pool_X, pool_idx, pool_X_batches, batch_errors
            import gc; gc.collect()


            # # -------- sample once and split into batches ----------
            # pool = remain.sample(
            #     n=min(this_pool, len(remain)),
            #     random_state=rng.integers(1e9))
            # pool_X = pool[landcover_cols].values
            # pool_idx = pool.index.to_numpy()

            # n_batches = len(pool_idx) // this_batch
            # if n_batches == 0:
            #     print("Warning: not enough candidates to form even one batch.")
            #     break

            # batch_sums = np.array([
            #     pool_X[i*this_batch : (i+1)*this_batch].sum(axis=0)
            #     for i in range(n_batches)
            # ])
            # batch_idxs = [
            #     pool_idx[i*this_batch : (i+1)*this_batch].tolist()
            #     for i in range(n_batches)
            # ]

            # errs = (( (cum + batch_sums) / (n_sel + this_batch) - targets[label]) ** 2).mean(axis=1)
            # best_batch = int(np.argmin(errs))
            # best_idxs  = batch_idxs[best_batch]

            # # -------- update state ----------
            # p3_idx.extend(best_idxs)
            # selected.update(best_idxs)
            # cum += batch_sums[best_batch]
            # n_sel += len(best_idxs)

            # # tidy up
            # del pool, pool_X, pool_idx, batch_sums, batch_idxs, errs
            # import gc; gc.collect()

            pbar.update(1)
            if len(p3_idx) >= p3:
                break

        pbar.close()

        df.loc[p3_idx, subset_col] = label
        phase_tag.loc[p3_idx]      = f"{label}_phase3"
        print(f"Phase-3 picked {len(p3_idx):,}")

        # ---------- commit phase labels & save progress -------------
        df.loc[phase_tag != "none", "subset_phase"] = phase_tag[phase_tag != "none"]

        done = (df[subset_col] != "none").sum()
        print(f"Finished {label}: {done:,}/{size:,} rows labeled "
              f"(auto {len(selected) - len(reused):,}, reuse {len(reused):,}).")

        if save_path_template:
            # save only picked rows, not all
            df.loc[df[subset_col] == label].to_parquet(save_path_template.format(label=label))
            if label in ["500k", "1000K", "1500K"]:
                # save all rows for these labels
                # df.loc[df[subset_col] == label].to_parquet(save_path_template.format(label=label))
                out_path = save_path_template.format(label=label)
                df.to_parquet(out_path)
                print(f"Saved progress → {out_path}")

    return df



In [8]:
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# 1.  Load the CSV that contains your level‐wise proportions
# ------------------------------------------------------------------
csv_path = "D:/tascarrd_test_outputs/tiered_props_all_levels.csv"
tbl = pd.read_csv(csv_path)
tbl.rename(columns={tbl.columns[0]: "Land Cover"}, inplace=True)
ordered = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland",
    "Built-up", "Bare/Sparse Vegetation", "Snow and Ice",
    "Permanent Water Bodies", "Herbaceous Wetland",
    "Mangroves", "Moss and Lichen"
]

base_1200m = (
    tbl.set_index("Land Cover")          # align by name
    #    .loc[ordered, "Global 1200m"]     # % values
        .loc[ordered, "Medium (1200m)"] 
       .values.astype("float32") / 100.0 # to fractions
)

# ------------------------------------------------------------
# 1.  importance weights  (edit if you wish)
# ------------------------------------------------------------
weights = np.array([
    0.25, 1.0, 0.9, 1.1, 1.5,
    2.0, 0.9, 1.1, 
    1.2, 1.5, 
    2.0, 1.5
], dtype="float32")

# ------------------------------------------------------------
# 2.  bias factors  –  small batch = strong oversampling
# ------------------------------------------------------------
bias = {
    "10k":   1.5,
    "25k":   1.4,
    "50k":   1.3,
    "100k":  1.2,
    "250k":  1.1,
    "500k":  1.0,
    "750k":  0.9,
    "1000k": 0.8,
    "1500k": 0.7,
}

# ------------------------------------------------------------
# 3.  availability ceiling  (fraction of majority-class tiles)
#     → use real numbers when you have them
# ------------------------------------------------------------
avail = np.minimum(base_1200m, 1.0).astype("float32")

# ------------------------------------------------------------
# 4.  progressive targets  (now bias really matters)
#     formula:  blend = (1-μ)*base + μ*biased ,
#               where μ = (b-1)/(max_bias-1)
# ------------------------------------------------------------
max_bias = max(bias.values())  # = 1.6 here
target_distributions = {}

for name, b in bias.items():
    mu   = (b - 1.0) / (max_bias - 1.0)   # 0 … 1
    biased = weights * base_1200m
    raw    = (1 - mu) * base_1200m + mu * biased
    raw    = np.minimum(raw, avail)       # respect ceiling
    target = raw / raw.sum()              # normalise to 1
    target_distributions[name] = target

# ------------------------------------------------------------------
# 5.  verify each row differs
# ------------------------------------------------------------------
pd.set_option("display.precision", 2)
print(pd.DataFrame(target_distributions, index=ordered).T * 100)


       Open Seas  Tree Cover  Shrubland  Grassland  Cropland  Built-up  \
10k         0.51       22.66       6.49      12.36     18.54     10.30   
25k         0.82       22.52       6.59      12.29     18.43     10.24   
50k         1.12       22.39       6.70      12.21     18.32     10.18   
100k        1.42       22.26       6.80      12.14     18.21     10.12   
250k        1.71       22.13       6.90      12.07     18.11     10.06   
500k        2.00       22.00       7.00      12.00     18.00     10.00   
750k        2.12       23.35       7.43      12.48     17.20      8.49   
1000k       2.26       24.89       7.92      13.03     16.29      6.79   
1500k       2.42       26.63       8.47      13.66     15.25      4.84   

       Bare/Sparse Vegetation  Snow and Ice  Permanent Water Bodies  \
10k                      6.49          4.12                    7.21   
25k                      6.59          4.10                    7.17   
50k                      6.70          4.07   

In [9]:
target_distributions

{'10k': array([0.00514933, 0.22657056, 0.06488157, 0.12358394, 0.18537591,
        0.10298662, 0.06488157, 0.04119464, 0.07209063, 0.05149331,
        0.03089598, 0.03089598], dtype=float32),
 '25k': array([0.00819001, 0.22522523, 0.06592957, 0.12285013, 0.1842752 ,
        0.10237511, 0.06592957, 0.04095004, 0.07166258, 0.05118755,
        0.03071253, 0.03071253], dtype=float32),
 '50k': array([0.01119479, 0.22389579, 0.0669652 , 0.12212498, 0.18318747,
        0.10177082, 0.0669652 , 0.04070833, 0.07123957, 0.05088541,
        0.03053124, 0.03053124], dtype=float32),
 '100k': array([0.01416431, 0.22258195, 0.06798867, 0.12140834, 0.18211251,
        0.10117362, 0.06798867, 0.04046945, 0.07082153, 0.05058681,
        0.03035208, 0.03035208], dtype=float32),
 '250k': array([0.01709918, 0.22128345, 0.06900021, 0.12070007, 0.1810501 ,
        0.10058339, 0.06900021, 0.04023336, 0.07040837, 0.05029169,
        0.03017502, 0.03017502], dtype=float32),
 '500k': array([0.02      , 0.22000001

In [10]:
# ------------------------------------------------------------------
# 7.  Subset-size dictionary
# ------------------------------------------------------------------

# Subset sizes
subset_sizes = {
    "10k": 10_000,
    "25k": 25_000,
    "50k": 50_000,
    "100k": 100_000,
    "250k": 250_000,
    "500k": 500_000,
    "750k": 750_000,
    "1000k": 1_000_000,
    "1500k": 1_500_000
}

pure_class_thresholds = np.array([
    0.80,  # Open Seas
    0.70,  # Tree Cover
    0.70,  # Shrubland
    0.65,  # Grassland
    0.60,  # Cropland
    0.20,  # Built-up
    0.60,  # Bare / Sparse Vegetation
    0.40,  # Snow and Ice
    0.50,  # Permanent Water Bodies
    0.40,  # Herbaceous Wetland
    0.05,  # Mangroves
    0.10   # Moss and Lichen
])

# Usage:
phase_proportions = [0.4, 0.1, 0.4]  # 50% pure, 10% high-entropy, 40% target matching 10% left for manual based on dataset availability



In [None]:
batch_sizes = { "10k": 20, "25k": 25, "50k": 50,
                "100k": 75, "250k": 150, "500k": 150,
                "750k": 375, "1000k": 500, "1500k": 750 }

pool_sizes  = { "10k": 2500, "25k": 2500, "50k": 3000,
                "100k": 3500, "250k": 3500, "500k": 3500,
                "750k": 3750, "1000k": 5000, "1500k": 5000 }  #{ k: v*50 for k, v in batch_sizes.items() }  # example

df = select_hybrid_subsets(
        df,
        ordered,
        targets=target_distributions,
        sizes=subset_sizes,
        class_thresholds=pure_class_thresholds,
        batch_size=batch_sizes,
        sample_pool_size=pool_sizes,
        save_path_template="D:/tascarrd_test_outputs/grid_1200m/grid_1200m_subsets_after_{label}.parquet"
)


Initial size: 107,048,570 rows
Converted landcover columns to float32., calculating shannon entropy...

=== Selecting subset 10k (request 10,000 rows) ===
[10k] already ≥ 90 % filled (9,000/10,000). Skipping selection and re-using those rows.

=== Selecting subset 25k (request 25,000 rows) ===
[25k] already ≥ 90 % filled (22,500/25,000). Skipping selection and re-using those rows.

=== Selecting subset 50k (request 50,000 rows) ===
[50k] already ≥ 90 % filled (45,000/50,000). Skipping selection and re-using those rows.

=== Selecting subset 100k (request 100,000 rows) ===
[100k] already ≥ 90 % filled (90,000/100,000). Skipping selection and re-using those rows.

=== Selecting subset 250k (request 250,000 rows) ===
[250k] already ≥ 90 % filled (225,100/250,000). Skipping selection and re-using those rows.

=== Selecting subset 500k (request 500,000 rows) ===

=== Building subset 500k  (need 500,000) ===
Phase-1 picked 89,960
Phase-2 picked 22,490


500k phase-3:   0%|          | 0/450 [00:00<?, ?it/s]

MemoryError: Unable to allocate 5.57 GiB for an array with shape (7, 106711020) and data type object

In [None]:
# 250k samples selected for 1200m levels

# Previous code using hardcoded for each subset level

In [None]:
import pandas as pd
import numpy as np

landcover_columns = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland", "Built-up",
    "Bare / Sparse Vegetation", "Snow and Ice", "Permanent Water Bodies",
    "Herbaceous Wetland", "Mangroves", "Moss and Lichen"
]

target_1200m = np.array([
    2.0,  # Open Seas
    22.0, # Tree Cover
    7.0,  # Shrubland
    12.0, # Grassland
    18.0, # Cropland
    10.0,  # Built-up
    7.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    7.0, # Permanent Water Bodies
    5.0,  # Herbaceous Wetland
    3.0,  # Mangroves
    3.0   # Moss and Lichen
]) / 100
# -----------------------------------------------------------------------------------
target_10k = np.array([
    0.5,  # Open Seas
    16.0, # Tree Cover
    6.0,  # Shrubland
    13.0, # Grassland
    15.0, # Cropland
    8.0,  # Built-up
    10.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    10.0, # Permanent Water Bodies
    6.0,  # Herbaceous Wetland
    3.0,  # Mangroves
    8.5   # Moss and Lichen
]) / 100

target_25k = np.array([
    1.0,  # Open Seas
    18.0, # Tree Cover
    6.0,  # Shrubland
    14.0, # Grassland
    15.0, # Cropland
    7.0,  # Built-up
    11.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    10.0, # Permanent Water Bodies
    6.0,  # Herbaceous Wetland
    2.5,  # Mangroves
    5.5   # Moss and Lichen
]) / 100

target_50k = np.array([
    1.2,  # Open Seas
    20.0, # Tree Cover
    6.5,  # Shrubland
    15.0, # Grassland
    14.5, # Cropland
    6.0,  # Built-up
    12.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    9.5, # Permanent Water Bodies
    5.5,  # Herbaceous Wetland
    2.0,  # Mangroves
    3.8   # Moss and Lichen
]) / 100

target_100k = np.array([
    1.5,  # Open Seas
    22.0, # Tree Cover
    7.0,  # Shrubland
    16.0, # Grassland
    13.5, # Cropland
    5.0,  # Built-up
    12.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    10.0, # Permanent Water Bodies
    5.0,  # Herbaceous Wetland
    1.5,  # Mangroves
    2.5   # Moss and Lichen
]) / 100

target_250k = np.array([
    1.8,  # Open Seas
    24.0, # Tree Cover
    7.5,  # Shrubland
    17.0, # Grassland
    12.5, # Cropland
    4.0,  # Built-up
    12, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    11, # Permanent Water Bodies
    4,  # Herbaceous Wetland
    1.0,  # Mangroves
    1.2   # Moss and Lichen
]) / 100

target_500k = np.array([
    1.8,  # Open Seas
    24.0, # Tree Cover
    7.5,  # Shrubland
    17.0, # Grassland
    12.5, # Cropland
    4.0,  # Built-up
    12, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    11, # Permanent Water Bodies
    4,  # Herbaceous Wetland
    1.0,  # Mangroves
    1.2   # Moss and Lichen
]) / 100

target_750k = np.array([
    1.8,  # Open Seas
    24.0, # Tree Cover
    7.5,  # Shrubland
    17.0, # Grassland
    12.5, # Cropland
    4.0,  # Built-up
    12, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    11, # Permanent Water Bodies
    4,  # Herbaceous Wetland
    1.0,  # Mangroves
    1.2   # Moss and Lichen
]) / 100

target_1000k = np.array([
    1.8,  # Open Seas
    24.0, # Tree Cover
    7.5,  # Shrubland
    17.0, # Grassland
    12.5, # Cropland
    4.0,  # Built-up
    12, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    11, # Permanent Water Bodies
    4,  # Herbaceous Wetland
    1.0,  # Mangroves
    1.2   # Moss and Lichen
]) / 100
print(target_10k.sum())
print(target_25k.sum())
print(target_50k.sum())
print(target_100k.sum())
print(target_250k.sum())
print(target_500k.sum())
print(target_750k.sum())    
print(target_1000k.sum())       
                



1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [None]:

target_distributions = {
    "10k": target_10k,
    "25k":target_25k,
    "50k": target_50k,    
    "100k": target_100k,
    "250k":target_250k,
    "500k": target_500k,
    "750k": target_750k,
    "1000k": target_1000k
}

# Subset sizes
subset_sizes = {
    "10k": 10_000,
    "25k": 25_000,
    "50k": 50_000,
    "100k": 100_000,
    "250k": 250_000,
    "500k": 500_000,
    "750k": 750_000,
    "1000k": 1_000_000
}

pure_class_thresholds = np.array([
    0.80,  # Open Seas
    0.70,  # Tree Cover
    0.70,  # Shrubland
    0.65,  # Grassland
    0.60,  # Cropland
    0.20,  # Built-up
    0.60,  # Bare / Sparse Vegetation
    0.40,  # Snow and Ice
    0.50,  # Permanent Water Bodies
    0.40,  # Herbaceous Wetland
    0.10,  # Mangroves
    0.10   # Moss and Lichen
])

# Usage:
phase_proportions = [0.4, 0.1, 0.4]  # 50% pure, 10% high-entropy, 40% target matching

df_labeled = select_hybrid_subsets(
    df,
    landcover_cols=landcover_columns,
    targets=target_distributions,
    sizes=subset_sizes,
    class_thresholds=pure_class_thresholds,
    phase_proportions=phase_proportions,
    batch_size=50,
    sample_pool_size=2500
)

df_labeled.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_subsets.parquet") 

SyntaxError: invalid syntax (3437613395.py, line 7)

In [None]:
df_labeled2.head()

In [None]:
import pandas as pd

# Example loading from CSV (you'd replace this with your real DataFrame)
# df = pd.read_csv('your_file.csv')
df = df_labeled2.copy()

# Assuming your DataFrame is already loaded and named `df`
# The goal is to propagate subset labels upward

subset_cols = ['subset_100k', 'subset_250k', 'subset_500k', 'subset_750k', 'subset_1000k']

# Process each column starting from the second, copying non-'none' values from lower levels
for i in range(1, len(subset_cols)):
    current_col = subset_cols[i]
    lower_cols = subset_cols[:i]
    
    # Create a mask of rows where the current column is 'none'
    mask = df[current_col] == 'none'
    
    # For those rows, check the first non-'none' value from lower levels
    for col in reversed(lower_cols):
        df.loc[mask & (df[col] != 'none'), current_col] = df[col]

# Now the higher subset columns (like 500k, 750k, etc.) also include inherited lower levels
df

In [None]:
df.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_subsetsMentioned.parquet")

# Custom 200k subset

In [None]:
filter_grid_cells = pd.read_csv("D:/2020_subset_grid.csv")
filter_grid_cells

In [None]:
df_merge = df.merge(filter_grid_cells, on='grid_cell', how='inner')
df_merge

In [None]:
df_merge_export = df_merge[["grid_cell", "subset_phase", "subset_100k", "subset_250k", "subset_500k", "subset_750k", "subset_1000k"]]
df_merge_export

In [None]:
df_merge_export.to_csv("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_cusotmSubsetsGridCells.csv", index=False)

In [None]:
df_merge_dedup.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_2020_subset_grid.parquet")