In [None]:
# Expanding landcover proportions from a parquet file into individual columns

import dask.dataframe as dd
import ast
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from dask.diagnostics import ProgressBar
import os
import glob

# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
# Integer landcover class code -> class name. The names double as the output
# column names, and the insertion order below fixes the column order.
landcover_legend = dict(
    [
        (0, "Open Seas"),
        (10, "Tree Cover"),
        (20, "Shrubland"),
        (30, "Grassland"),
        (40, "Cropland"),
        (50, "Built-up"),
        (60, "Bare/Sparse Vegetation"),
        (70, "Snow and Ice"),
        (80, "Permanent Water Bodies"),
        (90, "Herbaceous Wetland"),
        (95, "Mangroves"),
        (100, "Moss and Lichen"),
    ]
)
lc_columns = [class_name for class_name in landcover_legend.values()]

# -----------------------------------------------------------------------------------
# Expansion Function
# -----------------------------------------------------------------------------------
def expand_landcover_props_fast(df_partition, legend=None):
    """Expand the ``landcover_props`` mapping of each row into one column per class.

    Parameters
    ----------
    df_partition : pandas.DataFrame
        Must contain ``tile_id`` and ``landcover_props``. Each
        ``landcover_props`` value is either a dict or the string repr of a
        dict mapping landcover class code -> proportion.
    legend : dict, optional
        Mapping of class code -> output column name. Defaults to the
        notebook-level ``landcover_legend``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with ``tile_id`` followed by
        one float32 column per legend entry, in legend order. Classes missing
        from a row's mapping are filled with 0.0. The input index is preserved.
    """
    class_names = landcover_legend if legend is None else legend

    # Parse each row exactly once; values that are already dicts pass through.
    parsed = [
        props if isinstance(props, dict) else ast.literal_eval(props)
        for props in df_partition['landcover_props']
    ]

    # Build a fresh frame instead of adding columns to the caller's partition.
    expanded = pd.DataFrame({"tile_id": df_partition["tile_id"]})
    for code, col_name in class_names.items():
        # float32 matches the dtype declared in the dask `meta` for this function.
        expanded[col_name] = pd.Series(
            [props.get(code, 0.0) for props in parsed],
            index=df_partition.index,
            dtype="float32",
        )

    return expanded

# -----------------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------------
input_folder_pattern = "D:/NestEO_test_outputs/grid_1200m/lc_proportions_10m_*_1200m.parquet"
output_folder = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns/"

os.makedirs(output_folder, exist_ok=True)

# -----------------------------------------------------------------------------------
# Process Each Zone File Separately
# -----------------------------------------------------------------------------------
zone_files = sorted(glob.glob(input_folder_pattern))

print(f"Found {len(zone_files)} zone files to process.")

# Output schema handed to dask as `meta`; it is the same for every zone, so
# build it once instead of once per file.
meta_columns = {"tile_id": 'object'}
meta_columns.update({col: 'float32' for col in lc_columns})

for file_path in zone_files:
    # e.g., lc_proportions_10m_32N_1200m
    zone_name = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, f"{zone_name}.parquet")
    # Write to a side file first: a crash mid-write must not leave a partial
    # parquet at `output_file`, because the check below would then skip it.
    partial_file = output_file + ".tmp"

    if os.path.exists(output_file):
        print(f"Skipping {zone_name}, already processed.")
        continue

    print(f"\nProcessing {zone_name}...")

    # Read the zone file lazily
    df = dd.read_parquet(
        file_path,
        engine="pyarrow",
        columns=["tile_id", "landcover_props"]
    )

    # Optional: repartition to larger partitions if needed
    df = df.repartition(partition_size="3GB")

    # Expand
    df_expanded = df.map_partitions(expand_landcover_props_fast, meta=meta_columns)

    # Compute (materialises the whole zone in memory as a pandas frame)
    with ProgressBar():
        df_final = df_expanded.compute()

    # Save cleanly, then publish under the final name in one step
    table = pa.Table.from_pandas(df_final, preserve_index=False)
    pq.write_table(
        table,
        where=partial_file,
        compression="snappy",
        use_dictionary=True,
        data_page_size=2097152,
        row_group_size=1000000
    )
    os.replace(partial_file, output_file)

    print(f"Saved expanded zone: {output_file}")

print("\nAll zones processed and saved successfully.")


In [None]:
# Filtering non-Open Seas Landcover Proportions for the files with expanded columns
# -----------------------------------------------------------------------------------
import glob
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os


# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
# Class code -> column name; the column order follows the insertion order.
landcover_legend = {
    0: "Open Seas",
    10: "Tree Cover",
    20: "Shrubland",
    30: "Grassland",
    40: "Cropland",
    50: "Built-up",
    60: "Bare/Sparse Vegetation",
    70: "Snow and Ice",
    80: "Permanent Water Bodies",
    90: "Herbaceous Wetland",
    95: "Mangroves",
    100: "Moss and Lichen",
}
landcover_columns = list(landcover_legend.values())

# -----------------------------------------------------------------------------------
# Input and output paths
input_folder = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns/"
output_folder = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns_filtered/"

os.makedirs(output_folder, exist_ok=True)

# Every expanded zone file, in a stable (sorted) order
files = sorted(glob.glob(os.path.join(input_folder, "*.parquet")))
n_files = len(files)

# One zone at a time, so only a single zone is ever held in memory
for position, source_path in enumerate(files, start=1):
    file_name = os.path.basename(source_path)
    print(f"Processing {position}/{n_files}: {file_name}")

    zone_frame = pd.read_parquet(source_path, engine="pyarrow")

    # Keep tiles that are not (almost) entirely open sea
    not_open_sea = zone_frame['Open Seas'] < 0.999
    land_frame = zone_frame.loc[not_open_sea].copy()

    # Store the class proportions compactly as float32
    land_frame[landcover_columns] = land_frame[landcover_columns].astype('float32')

    # Same file name, filtered folder
    land_frame.to_parquet(
        os.path.join(output_folder, file_name),
        index=False,
        engine="pyarrow",
        compression="snappy",
    )

    # Release both frames before loading the next zone
    del zone_frame, land_frame

print("\nAll zone files filtered and saved separately.")


In [None]:
# Filtering a few Open Seas Landcover Proportions to add to the previous non-Open Seas Landcover Proportions
# -----------------------------------------------------------------------------------

import glob
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import gc

# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
landcover_legend = {
    0: "Open Seas", 10: "Tree Cover", 20: "Shrubland", 30: "Grassland",
    40: "Cropland", 50: "Built-up", 60: "Bare/Sparse Vegetation", 70: "Snow and Ice",
    80: "Permanent Water Bodies", 90: "Herbaceous Wetland", 95: "Mangroves", 100: "Moss and Lichen"
}
landcover_columns = list(landcover_legend.values())

# -----------------------------------------------------------------------------------


# Input and output paths
input_folder = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns/"
output_folder = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns_openseas_filtered/"

os.makedirs(output_folder, exist_ok=True)

# List all parquet files
files = sorted(glob.glob(os.path.join(input_folder, "*.parquet")))

# Process each file separately
for i, f in enumerate(files):
    print(f"Processing {i+1}/{len(files)}: {os.path.basename(f)}")

    # Read one file at a time
    df = pd.read_parquet(f, engine="pyarrow")

    # Keep only (almost) pure open-sea tiles.
    # NOTE(review): the non-open-seas filter uses `< 0.999`, so a tile with
    # exactly 0.999 ends up in neither output - confirm this is intended.
    df_filtered = df[df['Open Seas'] > 0.999]
    del df
    gc.collect()

    # Draw the 1 % sample first, then cast only the kept rows to float32.
    # The rows drawn depend on position and seed only, so the result is the
    # same as casting everything first, at a fraction of the work.
    df_filtered = df_filtered.sample(frac=0.01, random_state=42)
    df_filtered = df_filtered.astype({col: 'float32' for col in landcover_columns})

    # Save immediately
    output_path = os.path.join(output_folder, os.path.basename(f))
    df_filtered.to_parquet(output_path, index=False, engine="pyarrow", compression="snappy")

    # Clean memory manually
    del df_filtered

print("\nAll zone files filtered and saved separately.")


In [None]:
# Merge the per-zone filtered files into one non-open-seas table, one sampled
# open-seas table, and the combined table used for subset selection.
import glob
import os
import pandas as pd
import pyarrow as pa

# Per-zone files keep the basename of their source zone file. Matching on that
# pattern keeps the merged outputs written into the same folders below from
# being read back in (and duplicated) when this cell is re-run.
zone_file_pattern = "lc_proportions_10m_*_1200m.parquet"

non_openseas_folder = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns_filtered/"
openseas_folder = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns_openseas_filtered/"

# A single definition per output path, so the write and any later read agree.
non_openseas_path = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns_filtered/lc_props_non_openseas_all_zones.parquet"
openseas_path = "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns_openseas_filtered/lc_props_some_openseas_all_zones.parquet"
combined_path = "D:/NestEO_test_outputs/grid_1200m/filtered_lc_props_all_zones.parquet"

open_seas_sample_size = 1050000  # open-seas rows kept alongside the land rows


def _concat_zone_files(folder, keep_rows):
    """Read every per-zone file in `folder`, keep rows where `keep_rows(frame)` is True, and concatenate."""
    files = sorted(glob.glob(os.path.join(folder, zone_file_pattern)))
    if not files:
        raise FileNotFoundError(f"No zone files matching {zone_file_pattern} in {folder}")
    dfs = []
    for i, f in enumerate(files):
        print(i+1, f)
        zone_df = pd.read_parquet(f)
        dfs.append(zone_df[keep_rows(zone_df)].copy())
        del zone_df
    return pd.concat(dfs, ignore_index=True)


# Non-open-seas landcover proportions, all zones
# -----------------------------------------------------------------------------------
df_non_openseas = _concat_zone_files(non_openseas_folder, lambda zone_df: zone_df['Open Seas'] < 0.999)
df_non_openseas.to_parquet(non_openseas_path, index=False, engine='pyarrow', compression='snappy')

# Open-seas landcover proportions, all zones, down-sampled
# -----------------------------------------------------------------------------------
df_openseas = _concat_zone_files(openseas_folder, lambda zone_df: zone_df['Open Seas'] > 0.999)
# Files are read in sorted order, so this seeded sample is reproducible; the
# size is capped so fewer available rows do not raise.
df_openseas = df_openseas.sample(n=min(open_seas_sample_size, len(df_openseas)), random_state=42)
df_openseas.to_parquet(openseas_path, index=False, engine='pyarrow', compression='snappy')

# Combine both tables into the dataset used for subset selection
# -----------------------------------------------------------------------------------
df = pd.concat([df_non_openseas, df_openseas], ignore_index=True)
df.to_parquet(combined_path, index=False, engine='pyarrow', compression='snappy')


# Below, we start on subset selection from this filtered dataset.
## Above, we kept the ~105 million non-open-seas cells and added ~1.05 million sampled open-seas cells for potential filtering later.

In [None]:
df = pd.read_parquet("D:/NestEO_test_outputs/grid_1200m/filtered_lc_props_all_zones.parquet")
len(df)

In [None]:
df.head()

In [None]:
# Adding major landcover classes to the filtered file
# -----------------------------------------------------------------------------------

import pandas as pd

# Assuming df is your DataFrame
# Landcover class columns of `df`, in legend order
landcover_columns = [
    "Open Seas",
    "Tree Cover",
    "Shrubland",
    "Grassland",
    "Cropland",
    "Built-up",
    "Bare/Sparse Vegetation",
    "Snow and Ice",
    "Permanent Water Bodies",
    "Herbaceous Wetland",
    "Mangroves",
    "Moss and Lichen",
]

# Per-class sum of the proportions over all rows, and the grand total
landcover_totals = df[landcover_columns].sum()
total_sum = landcover_totals.sum()

# Share of each class in the grand total, as a fraction and as a percentage
global_proportions = landcover_totals / total_sum
global_proportions_percent = global_proportions * 100

# Export the percentages, largest class first, as a two-column CSV
proportions_table = (
    global_proportions_percent
    .sort_values(ascending=False)
    .rename_axis("Landcover")
    .reset_index(name="Proportion (%)")
)
proportions_table.to_csv(
    "D:/NestEO_test_outputs/grid_1200m/lc_props_with_columns_filtered/global_landcover_proportions.csv",
    index=False,
)
global_proportions_percent

In [None]:
# Label every cell with the name of the landcover column holding its largest
# proportion (idxmax returns the first such column when several tie).
# Note: this adds a column to the shared `df` in place.
df["Majority_LC"] = df[landcover_columns].idxmax(axis=1)
# Percentage of cells per majority class
global_majority_lc = df["Majority_LC"].value_counts(normalize=True) * 100
# global_majority_lc = global_majority_lc.sort_index()
global_majority_lc

In [None]:
df.to_parquet("D:/NestEO_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet", index=False, engine='pyarrow', compression='snappy')

## The 107-million-cell 1200 m table saved above is re-read below for the later subset selections
###
###
##

In [None]:
# Adding Shannon Entropy to the filtered file
# -----------------------------------------------------------------------------------
import pandas as pd
df = pd.read_parquet("D:/NestEO_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet")
from scipy.stats import entropy
import numpy as np, pandas as pd
rng = np.random.default_rng(42)                               # reproducible

def compute_shannon_entropy(row):
    """Return the Shannon entropy, in bits, of the strictly positive entries of ``row``.

    ``row`` is converted to a float32 array; entries that are not greater than
    zero are dropped before the remainder is passed to ``scipy.stats.entropy``
    with ``base=2``.
    """
    values = np.asarray(row, dtype="float32")
    positive_only = values[values > 0]
    return entropy(positive_only, base=2)

import os

landcover_cols = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland",
    "Built-up", "Bare/Sparse Vegetation", "Snow and Ice",
    "Permanent Water Bodies", "Herbaceous Wetland",
    "Mangroves", "Moss and Lichen"
]

# Coerce every class column to float32; unparseable or missing values become 0
df[landcover_cols] = (
    df[landcover_cols]
        .apply(pd.to_numeric, errors="coerce")
        .fillna(0)
        .astype("float32")
)
print(f"Converted landcover columns to float32., calculating shannon entropy...")
if "shannon_entropy" not in df.columns:
    # Same quantity as compute_shannon_entropy() applied per row (non-positive
    # entries ignored, the rest normalised to sum to 1, entropy in bits), but
    # computed with array operations in fixed-size chunks: a Python-level
    # row-wise apply is impractically slow at this row count, and chunking
    # bounds the size of the float64 temporaries.
    lc_values = df[landcover_cols].to_numpy()
    entropy_bits = np.empty(len(lc_values), dtype="float32")
    chunk_rows = 1_000_000
    for start in range(0, len(lc_values), chunk_rows):
        chunk = lc_values[start:start + chunk_rows].astype("float64")
        chunk[chunk < 0] = 0.0
        totals = chunk.sum(axis=1, keepdims=True)
        # Rows summing to 0 keep all-zero probabilities, hence entropy 0.
        probs = np.divide(chunk, totals, out=np.zeros_like(chunk), where=totals > 0)
        # log2 is evaluated only where p > 0; elsewhere the term stays 0.
        logs = np.log2(probs, out=np.zeros_like(probs), where=probs > 0)
        # "+ 0.0" turns a possible -0.0 into 0.0 for pure cells.
        entropy_bits[start:start + chunk_rows] = -(probs * logs).sum(axis=1) + 0.0
    df["shannon_entropy"] = entropy_bits
    print("Shannon entropy computed.")


# The file written here is the one `df` was read from; write a side file and
# swap it in so a failed write cannot corrupt the only copy.
entropy_table_path = "D:/NestEO_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet"
df.to_parquet(entropy_table_path + ".tmp")
os.replace(entropy_table_path + ".tmp", entropy_table_path)

#  The subset selection on the pre-filtered 1200m landcover proportions 

In [None]:
import pandas as pd
df = pd.read_parquet("D:/NestEO_test_outputs/grid_1200m/filtered_lc_props_all_zones_107million.parquet")

In [None]:
df.head(7)

In [None]:
import pandas as pd
df = pd.read_parquet("D:/NestEO_test_outputs/grid_1200m/grid_1200m_subsets_after_10k.parquet")
df.head(5)

In [None]:
df_250k = pd.read_parquet("D:/NestEO_test_outputs/grid_1200m/grid_1200m_subsets_after_250k.parquet")
df_250k.head()

In [None]:
df_10k = df_250k[df_250k["subset_10k"]!='none']
df_25k = df_250k[df_250k["subset_25k"]!='none']
df_50k = df_250k[df_250k["subset_50k"]!='none']
df_100k = df_250k[df_250k["subset_100k"]!='none']


In [None]:
# Step 3: Ensure subset columns exist
# Label columns every later step expects; create any that are absent, filled with "none"
expected_label_columns = (
    "subset_10k",
    "subset_25k",
    "subset_50k",
    "subset_100k",
    "subset_250k",
    "subset_phase",
)
absent_columns = [name for name in expected_label_columns if name not in df.columns]
for col in absent_columns:
    df[col] = "none"
df.head()

In [None]:


# Step 4: Map back subset_10k and subset_phase to full df
# Membership frames per subset level; rows are matched to `df` by index label.
# NOTE(review): this assumes every index label of these frames exists in `df`
# - confirm which parquet `df` was loaded from before running this cell.
level_members = {
    "10k": df_10k,
    "25k": df_25k,
    "50k": df_50k,
    "100k": df_100k,
    "250k": df_250k,
}

# Mark the member rows of each level in the matching subset_<level> column
for level, members in level_members.items():
    df.loc[members.index, f"subset_{level}"] = level

# Carry the recorded selection phase over for every row of the 250k frame
df.loc[df_250k.index, "subset_phase"] = df_250k["subset_phase"]

# Report each level under its own name (every message used to say "10k")
for level in level_members:
    mapped_rows = (df[f"subset_{level}"] == level).sum()
    print(f"Subset {level} rows correctly mapped back: {mapped_rows:,} rows.")

# Now you can run your optimized select_hybrid_subsets() safely starting from 25k

In [None]:
# # Choose a previously created subsets if available

# import pandas as pd
# df = pd.read_parquet("D:/NestEO_test_outputs/grid_1200m/lc_1200m_subsets_after_100k.parquet")
# df.head()

In [None]:
# Newest Batch sizes and restarts and save each level, plus batch wise update not individual tiles, Pick the purest in phase 1

from scipy.stats import entropy
import numpy as np, pandas as pd
rng = np.random.default_rng(42)                               # reproducible

def compute_shannon_entropy(row):
    """Shannon entropy (base 2) of one row of class proportions.

    The row is cast to float32 and only entries greater than zero are kept
    before calling ``scipy.stats.entropy``.
    """
    proportions = np.asarray(row, dtype="float32")
    keep = proportions > 0
    return entropy(proportions[keep], base=2)

# -------------------------------------------------------------------------
def select_hybrid_subsets(
    df,
    landcover_cols,
    targets,                    # dict: label -> 1-d numpy (sum=1)
    sizes,                      # dict: label -> int (# rows requested)
    class_thresholds,
    phase_proportions=(0.4, 0.1, 0.4),      # must sum to 0.9
    batch_size=25,             # int or dict
    sample_pool_size=2500,     # int or dict
    save_path_template=None,   # e.g. "out_with_{label}.parquet"
    random_generator=None      # numpy Generator; defaults to the notebook-level `rng`
):
    """Label nested subsets of `df` in three automatic phases per subset level.

    For every ``label -> size`` in `sizes` (labels look like ``"10k"``), rows
    already labelled at any smaller level are reused, and the remaining quota
    (90 % of `size` minus the reused rows) is filled by:

    * Phase 1 - per landcover class, the rows with the highest proportion of
      that class among those at or above the class threshold; any shortfall is
      topped up with random rows whose largest class proportion is >= 0.5.
    * Phase 2 - the highest-``shannon_entropy`` rows from a random candidate
      sample of up to ``6 * p2`` unselected rows.
    * Phase 3 - repeated batches: draw a random pool, split it into batches of
      ``batch_size`` rows and keep the batch that brings the running mean
      class mix closest (mean squared error) to ``targets[label]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is copied, never modified.
    landcover_cols : sequence of str
        Class proportion columns, in the same order as each target vector and
        as `class_thresholds`.
    targets : dict
        label -> 1-d array of target class shares.
    sizes : dict
        label -> number of rows requested for that level.
    class_thresholds : sequence of float
        Minimum proportion for a row to be a Phase-1 candidate of each class.
    phase_proportions : sequence of 3 floats
        Share of the automatic quota per phase; must sum to 0.9.
    batch_size, sample_pool_size : int or dict
        Phase-3 batch and pool sizes, either global or per label.
    save_path_template : str, optional
        Format string with ``{label}``; when given, progress is written to it.
    random_generator : numpy.random.Generator, optional
        Source of the seeds used for every random draw. Defaults to the
        notebook-level ``rng``.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with one ``subset_<label>`` column per level and a
        ``subset_phase`` column recording how each row was selected.

    Raises
    ------
    AssertionError
        If `phase_proportions` does not sum to 0.9.
    """
    assert abs(sum(phase_proportions) - 0.9) < 1e-6, \
        "phase_proportions must sum to 0.9 (10 % is Phase-4 / manual)."

    gen = rng if random_generator is None else random_generator

    # tqdm only drives the progress display; selection works without it.
    try:
        from tqdm import tqdm as _progress_bar
    except ImportError:
        _progress_bar = None

    def _level(name):
        """Numeric rank of a label such as "250k" (case-insensitive suffix)."""
        return int(str(name).lower().replace("k", ""))

    def _sample_positions(positions, n):
        """Draw `n` of `positions` without replacement, seeded from `gen`."""
        seed = int(gen.integers(1_000_000_000))
        return pd.Series(positions).sample(n=n, random_state=seed).to_numpy()

    landcover_cols = list(landcover_cols)
    thresholds = np.asarray(class_thresholds)

    df = df.copy()
    print(f"Initial size: {len(df):,} rows")
    df[landcover_cols] = (
        df[landcover_cols]
          .apply(pd.to_numeric, errors="coerce")
          .fillna(0)
          .astype("float32")
    )
    print(f"Converted landcover columns to float32., calculating shannon entropy...")
    if "shannon_entropy" not in df.columns:
        df["shannon_entropy"] = df[landcover_cols].apply(
            compute_shannon_entropy, axis=1)
        print("Shannon entropy computed.")

    if "subset_phase" not in df.columns:
        df["subset_phase"] = "none"

    # All bookkeeping below is positional (row number in `df`), so selection
    # never has to filter or copy the full frame inside a loop.
    n_rows = len(df)
    lc_matrix = df[landcover_cols].to_numpy()
    entropy_values = df["shannon_entropy"].to_numpy()

    # -----------------------------------------------------------------
    for label, size in sizes.items():
        print(f"\n=== Selecting subset {label} (request {size:,} rows) ===")
        subset_col = f"subset_{label}"
        if subset_col not in df.columns:
            df[subset_col] = "none"

        present = (df[subset_col] != "none").sum()
        if present >= int(size * 0.9)-1:
            print(f"[{label}] already ≥ 90 % filled ({present:,}/{size:,}). "
                  "Skipping selection and re-using those rows.")
            continue

        print(f"\n=== Building subset {label}  (need {size:,}) ===")

        # ---------- per-subset batch / pool sizes --------------------
        this_batch = batch_size[label]     if isinstance(batch_size, dict)     else batch_size
        this_pool  = sample_pool_size[label] if isinstance(sample_pool_size, dict) else sample_pool_size
        target = np.asarray(targets[label])

        # ---------- rows reused from earlier subset levels ----------
        # `taken` marks every row already assigned to this level.
        taken = np.zeros(n_rows, dtype=bool)
        for k in sizes:
            earlier_col = f"subset_{k}"
            if _level(k) < _level(label) and earlier_col in df.columns:
                taken |= (df[earlier_col] != "none").to_numpy()
        n_reused = int(taken.sum())

        df.loc[taken, subset_col]     = label
        df.loc[taken, "subset_phase"] = f"{label}_reuse"

        auto_quota = int(size * 0.9) - n_reused
        if auto_quota <= 0:
            print(f"[{label}] quota satisfied by reuse ({n_reused:,}).")
            if save_path_template:
                df.to_parquet(save_path_template.format(label=label))
            continue

        p1 = int(auto_quota * phase_proportions[0])
        p2 = int(auto_quota * phase_proportions[1])
        p3 = auto_quota - p1 - p2

        # ---------------- Phase 1  (dominant) ------------------------
        p1_mask = np.zeros(n_rows, dtype=bool)
        for i in range(len(landcover_cols)):
            need = int(p1 * target[i])
            if need == 0:
                continue
            class_values = lc_matrix[:, i]
            # Exclude rows picked for an earlier class as well, so a row that
            # clears two thresholds is never counted twice.
            cand_pos = np.flatnonzero(
                (class_values >= thresholds[i]) & ~taken & ~p1_mask)
            if cand_pos.size == 0:
                continue
            purest_first = np.argsort(-class_values[cand_pos], kind="stable")
            p1_mask[cand_pos[purest_first[:need]]] = True

        short = p1 - int(p1_mask.sum())
        if short > 0:
            filler_pos = np.flatnonzero(
                (lc_matrix.max(axis=1) >= 0.5) & ~taken & ~p1_mask)
            if filler_pos.size > 0:
                # Never ask for more rows than the filler pool holds.
                extra = _sample_positions(filler_pos, min(short, filler_pos.size))
                p1_mask[extra] = True

        df.loc[p1_mask, subset_col] = label
        taken |= p1_mask
        print(f"Phase-1 picked {int(p1_mask.sum()):,}")

        # ---------------- Phase 2  (entropy) -------------------------
        free_pos = np.flatnonzero(~taken)

        # SAMPLE a smaller number of candidates to sort
        if free_pos.size > 6 * p2:
            free_pos = _sample_positions(free_pos, 6 * p2)

        most_mixed_first = np.argsort(-entropy_values[free_pos], kind="stable")
        p2_mask = np.zeros(n_rows, dtype=bool)
        p2_mask[free_pos[most_mixed_first[:p2]]] = True

        df.loc[p2_mask, subset_col] = label
        taken |= p2_mask
        print(f"Phase-2 picked {int(p2_mask.sum()):,}")

        # ---------------- Phase 3  (greedy match, batch-wise) -------------------
        p3_mask = np.zeros(n_rows, dtype=bool)
        n_p3 = 0
        # Running class totals of everything selected so far, kept in float64
        # so the sums stay accurate as the selection grows.
        cum = lc_matrix[taken].sum(axis=0, dtype="float64")
        n_sel = int(taken.sum())

        steps = (p3 + this_batch - 1) // this_batch
        pbar = (_progress_bar(total=steps, desc=f"{label} phase-3", leave=False)
                if _progress_bar is not None else None)

        free_pos = np.flatnonzero(~taken)
        while n_p3 < p3:
            if free_pos.size == 0:
                print(f"Phase-3: no more candidates available.")
                break

            # sample and batchify
            pool_pos = _sample_positions(free_pos, min(this_pool, free_pos.size))
            n_batches = pool_pos.size // this_batch
            if n_batches == 0:
                print("Warning: not enough candidates to form even one batch.")
                break

            # rows of `batches` are candidate batches of row positions
            batches = pool_pos[:n_batches * this_batch].reshape(n_batches, this_batch)
            batch_sums = lc_matrix[batches].sum(axis=1, dtype="float64")

            # Mean squared distance to the target if each batch were added;
            # argmin keeps the first batch on ties.
            errors = np.mean(
                ((cum + batch_sums) / (n_sel + this_batch) - target) ** 2, axis=1)
            best = int(np.argmin(errors))
            chosen = batches[best]

            # update
            p3_mask[chosen] = True
            taken[chosen] = True
            cum += batch_sums[best]
            n_sel += this_batch
            n_p3 += this_batch
            free_pos = free_pos[~taken[free_pos]]

            if pbar is not None:
                pbar.update(1)

        if pbar is not None:
            pbar.close()

        df.loc[p3_mask, subset_col] = label
        print(f"Phase-3 picked {n_p3:,}")

        # ---------- commit phase labels & save progress -------------
        # The three masks are disjoint: each phase draws from rows not yet taken.
        for phase_mask, phase_name in ((p1_mask, "phase1"),
                                       (p2_mask, "phase2"),
                                       (p3_mask, "phase3")):
            df.loc[phase_mask, "subset_phase"] = f"{label}_{phase_name}"

        done = (df[subset_col] != "none").sum()
        print(f"Finished {label}: {done:,}/{size:,} rows labeled "
              f"(auto {int(taken.sum()) - n_reused:,}, reuse {n_reused:,}).")

        if save_path_template:
            # save only picked rows, not all
            df.loc[df[subset_col] == label].to_parquet(save_path_template.format(label=label))
            # Labels are lowercase ("1000k"); compare case-insensitively so the
            # full-frame checkpoint is actually written for the large levels.
            if str(label).lower() in ("500k", "1000k", "1500k"):
                # save all rows for these labels
                out_path = save_path_template.format(label=label)
                df.to_parquet(out_path)
                print(f"Saved progress → {out_path}")

    return df



In [None]:
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# 1.  Load the CSV that contains your level‐wise proportions
# ------------------------------------------------------------------
# Level-wise landcover proportions; the first CSV column holds the class names
csv_path = "D:/NestEO_test_outputs/tiered_props_all_levels.csv"
tbl = pd.read_csv(csv_path)
tbl = tbl.rename(columns={tbl.columns[0]: "Land Cover"})

# Class order used by every vector below (weights, targets, thresholds)
ordered = [
    "Open Seas",
    "Tree Cover",
    "Shrubland",
    "Grassland",
    "Cropland",
    "Built-up",
    "Bare/Sparse Vegetation",
    "Snow and Ice",
    "Permanent Water Bodies",
    "Herbaceous Wetland",
    "Mangroves",
    "Moss and Lichen",
]

# Baseline shares for the 1200 m level: percentages in the CSV -> fractions
medium_percent = tbl.set_index("Land Cover").loc[ordered, "Medium (1200m)"]
base_1200m = medium_percent.values.astype("float32") / 100.0

# ------------------------------------------------------------
# 1.  importance weights, one per class in `ordered`  (edit if you wish)
# ------------------------------------------------------------
weights = np.array(
    [
        0.25,  # Open Seas
        1.0,   # Tree Cover
        0.9,   # Shrubland
        1.1,   # Grassland
        1.5,   # Cropland
        2.0,   # Built-up
        0.9,   # Bare/Sparse Vegetation
        1.1,   # Snow and Ice
        1.2,   # Permanent Water Bodies
        1.5,   # Herbaceous Wetland
        2.0,   # Mangroves
        1.5,   # Moss and Lichen
    ],
    dtype="float32",
)

# ------------------------------------------------------------
# 2.  bias factors per subset level (largest for the smallest subset)
# ------------------------------------------------------------
bias = {
    "10k":   1.5,
    "25k":   1.4,
    "50k":   1.3,
    "100k":  1.2,
    "250k":  1.1,
    "500k":  1.0,
    "750k":  0.9,
    "1000k": 0.8,
    "1500k": 0.7,
}

# ------------------------------------------------------------
# 3.  availability ceiling per class
#     Placeholder: currently just the baseline share clipped at 1.0.
#     NOTE(review): with this placeholder the cap in step 4 clips every class
#     whose blended share rises above its baseline back to the baseline, so
#     only reductions survive before renormalisation - confirm that is
#     intended until real availability numbers are plugged in.
# ------------------------------------------------------------
avail = np.minimum(base_1200m, 1.0).astype("float32")

# ------------------------------------------------------------
# 4.  progressive targets
#     blend = (1-mu)*base + mu*biased ,  mu = (b-1)/(max_bias-1)
#     With the table above max_bias is 1.5, so mu runs from 1 (bias 1.5)
#     through 0 (bias 1.0) and is negative for bias factors below 1.
# ------------------------------------------------------------
max_bias = max(bias.values())
biased = weights * base_1200m  # identical for every level, so computed once


def _progressive_target(level_bias):
    """Blend baseline and weighted shares for one bias factor, cap by `avail`, renormalise to sum 1."""
    mu = (level_bias - 1.0) / (max_bias - 1.0)
    blended = (1 - mu) * base_1200m + mu * biased
    capped = np.minimum(blended, avail)
    return capped / capped.sum()


target_distributions = {
    name: _progressive_target(b) for name, b in bias.items()
}

# ------------------------------------------------------------------
# 5.  show the targets per level, in percent, to check the rows differ
# ------------------------------------------------------------------
pd.set_option("display.precision", 2)
print(pd.DataFrame(target_distributions, index=ordered).T * 100)


In [None]:
target_distributions

In [None]:
# ------------------------------------------------------------------
# 7.  Subset-size dictionary
# ------------------------------------------------------------------

# Subset sizes
# Requested number of rows per subset level; keys are the level labels used
# for the subset_<label> columns.
subset_sizes = {
    "10k": 10_000,
    "25k": 25_000,
    "50k": 50_000,
    "100k": 100_000,
    "250k": 250_000,
    "500k": 500_000,
    "750k": 750_000,
    "1000k": 1_000_000,
    "1500k": 1_500_000
}

# Minimum class proportion for a tile to count as a Phase-1 ("pure") candidate
# of that class; one entry per landcover class, in the order noted per line.
pure_class_thresholds = np.array([
    0.80,  # Open Seas
    0.70,  # Tree Cover
    0.70,  # Shrubland
    0.65,  # Grassland
    0.60,  # Cropland
    0.20,  # Built-up
    0.60,  # Bare / Sparse Vegetation
    0.40,  # Snow and Ice
    0.50,  # Permanent Water Bodies
    0.40,  # Herbaceous Wetland
    0.05,  # Mangroves
    0.10   # Moss and Lichen
])

# Usage:
phase_proportions = [0.4, 0.1, 0.4]  # 40% pure, 10% high-entropy, 40% target matching; the remaining 10% is left for manual selection based on dataset availability



In [None]:
# Phase-3 batch size per subset level (rows added per greedy step)
batch_sizes = { "10k": 20, "25k": 25, "50k": 50,
                "100k": 75, "250k": 150, "500k": 150,
                "750k": 375, "1000k": 500, "1500k": 750 }

# Phase-3 candidate pool size per subset level (rows sampled per greedy step)
pool_sizes  = { "10k": 2500, "25k": 2500, "50k": 3000,
                "100k": 3500, "250k": 3500, "500k": 3500,
                "750k": 3750, "1000k": 5000, "1500k": 5000 }  # alternative: { k: v*50 for k, v in batch_sizes.items() }

# Run the selection for every level. `phase_proportions` is not passed, so the
# function default applies. Note that `df` is rebound to the labelled copy.
df = select_hybrid_subsets(
        df,
        ordered,
        targets=target_distributions,
        sizes=subset_sizes,
        class_thresholds=pure_class_thresholds,
        batch_size=batch_sizes,
        sample_pool_size=pool_sizes,
        save_path_template="D:/NestEO_test_outputs/grid_1200m/grid_1200m_subsets_after_{label}.parquet"
)


In [None]:
# 250k samples selected for 1200m levels

# Previous code using hardcoded for each subset level

In [None]:
import pandas as pd
import numpy as np

# Landcover class columns, in the order used by every target vector below.
# The names must match the data columns exactly ("Bare/Sparse Vegetation"
# has no spaces around the slash).
landcover_columns = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland", "Built-up",
    "Bare/Sparse Vegetation", "Snow and Ice", "Permanent Water Bodies",
    "Herbaceous Wetland", "Mangroves", "Moss and Lichen"
]

# Each vector is written in percent and divided by 100 to give fractions.
target_1200m = np.array([
    2.0,  # Open Seas
    22.0, # Tree Cover
    7.0,  # Shrubland
    12.0, # Grassland
    18.0, # Cropland
    10.0,  # Built-up
    7.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    7.0, # Permanent Water Bodies
    5.0,  # Herbaceous Wetland
    3.0,  # Mangroves
    3.0   # Moss and Lichen
]) / 100
# -----------------------------------------------------------------------------------
target_10k = np.array([
    0.5,  # Open Seas
    16.0, # Tree Cover
    6.0,  # Shrubland
    13.0, # Grassland
    15.0, # Cropland
    8.0,  # Built-up
    10.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    10.0, # Permanent Water Bodies
    6.0,  # Herbaceous Wetland
    3.0,  # Mangroves
    8.5   # Moss and Lichen
]) / 100

target_25k = np.array([
    1.0,  # Open Seas
    18.0, # Tree Cover
    6.0,  # Shrubland
    14.0, # Grassland
    15.0, # Cropland
    7.0,  # Built-up
    11.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    10.0, # Permanent Water Bodies
    6.0,  # Herbaceous Wetland
    2.5,  # Mangroves
    5.5   # Moss and Lichen
]) / 100

target_50k = np.array([
    1.2,  # Open Seas
    20.0, # Tree Cover
    6.5,  # Shrubland
    15.0, # Grassland
    14.5, # Cropland
    6.0,  # Built-up
    12.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    9.5, # Permanent Water Bodies
    5.5,  # Herbaceous Wetland
    2.0,  # Mangroves
    3.8   # Moss and Lichen
]) / 100

target_100k = np.array([
    1.5,  # Open Seas
    22.0, # Tree Cover
    7.0,  # Shrubland
    16.0, # Grassland
    13.5, # Cropland
    5.0,  # Built-up
    12.0, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    10.0, # Permanent Water Bodies
    5.0,  # Herbaceous Wetland
    1.5,  # Mangroves
    2.5   # Moss and Lichen
]) / 100

target_250k = np.array([
    1.8,  # Open Seas
    24.0, # Tree Cover
    7.5,  # Shrubland
    17.0, # Grassland
    12.5, # Cropland
    4.0,  # Built-up
    12, # Bare / Sparse Vegetation
    4.0,  # Snow and Ice
    11, # Permanent Water Bodies
    4,  # Herbaceous Wetland
    1.0,  # Mangroves
    1.2   # Moss and Lichen
]) / 100

# The 500k, 750k and 1000k levels use exactly the 250k mix. Independent copies
# keep an in-place edit of one level from silently changing the others.
target_500k = target_250k.copy()
target_750k = target_250k.copy()
target_1000k = target_250k.copy()

# Print each sum (as before) and fail fast if a vector does not cover all
# twelve classes or does not sum to 1.
for _level_name, _level_target in (
    ("10k", target_10k),
    ("25k", target_25k),
    ("50k", target_50k),
    ("100k", target_100k),
    ("250k", target_250k),
    ("500k", target_500k),
    ("750k", target_750k),
    ("1000k", target_1000k),
):
    print(_level_target.sum())
    assert _level_target.shape == (len(landcover_columns),), _level_name
    assert np.isclose(_level_target.sum(), 1.0), _level_name
                



In [None]:

# Legacy configuration: hard-coded target mix per subset level.
target_distributions = {
    "10k": target_10k,
    "25k":target_25k,
    "50k": target_50k,    
    "100k": target_100k,
    "250k":target_250k,
    "500k": target_500k,
    "750k": target_750k,
    "1000k": target_1000k
}

# Subset sizes
subset_sizes = {
    "10k": 10_000,
    "25k": 25_000,
    "50k": 50_000,
    "100k": 100_000,
    "250k": 250_000,
    "500k": 500_000,
    "750k": 750_000,
    "1000k": 1_000_000
}

# Minimum class proportion for a Phase-1 ("pure") candidate, per class
pure_class_thresholds = np.array([
    0.80,  # Open Seas
    0.70,  # Tree Cover
    0.70,  # Shrubland
    0.65,  # Grassland
    0.60,  # Cropland
    0.20,  # Built-up
    0.60,  # Bare / Sparse Vegetation
    0.40,  # Snow and Ice
    0.50,  # Permanent Water Bodies
    0.40,  # Herbaceous Wetland
    0.10,  # Mangroves
    0.10   # Moss and Lichen
])

# Usage:
phase_proportions = [0.4, 0.1, 0.4]  # 40% pure, 10% high-entropy, 40% target matching (must sum to 0.9)

# No save_path_template is passed, so the result is only persisted by the
# explicit to_parquet call below.
df_labeled = select_hybrid_subsets(
    df,
    landcover_cols=landcover_columns,
    targets=target_distributions,
    sizes=subset_sizes,
    class_thresholds=pure_class_thresholds,
    phase_proportions=phase_proportions,
    batch_size=50,
    sample_pool_size=2500
)

# NOTE(review): the result is bound to `df_labeled`, while the following cells
# read `df_labeled2` - confirm where that name is meant to come from.
df_labeled.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_subsets.parquet") 

In [None]:
df_labeled2.head()

In [None]:
import pandas as pd

# Example loading from CSV (you'd replace this with your real DataFrame)
# df = pd.read_csv('your_file.csv')
# Work on a copy so the labelled frame itself is left untouched.
# NOTE(review): `df_labeled2` is not assigned in the cells shown here -
# confirm where it is defined.
df = df_labeled2.copy()

# Assuming your DataFrame is already loaded and named `df`
# The goal is to propagate subset labels upward: a row labelled at a lower
# level should also carry a label in every higher-level column

subset_cols = ['subset_100k', 'subset_250k', 'subset_500k', 'subset_750k', 'subset_1000k']

# Process each column starting from the second, copying non-'none' values from lower levels
for i in range(1, len(subset_cols)):
    current_col = subset_cols[i]
    lower_cols = subset_cols[:i]
    
    # Rows where the current column is still 'none'; computed once per target
    # column and not narrowed inside the inner loop
    mask = df[current_col] == 'none'
    
    # Copy the label from each lower level that has one. The loop runs from the
    # nearest lower level down to the lowest, and because `mask` is not
    # updated, a later (lower) level overwrites what an earlier one wrote: the
    # surviving value is that of the lowest level holding a non-'none' label.
    # NOTE(review): confirm that the lowest level, not the nearest, should win.
    for col in reversed(lower_cols):
        df.loc[mask & (df[col] != 'none'), current_col] = df[col]

# Now the higher subset columns (like 500k, 750k, etc.) also include inherited lower levels
df

In [None]:
df.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_subsetsMentioned.parquet")

# Custom 200k subset

In [None]:
filter_grid_cells = pd.read_csv("D:/2020_subset_grid.csv")
filter_grid_cells

In [None]:
df_merge = df.merge(filter_grid_cells, on='grid_cell', how='inner')
df_merge

In [None]:
df_merge_export = df_merge[["grid_cell", "subset_phase", "subset_100k", "subset_250k", "subset_500k", "subset_750k", "subset_1000k"]]
df_merge_export

In [None]:
df_merge_export.to_csv("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_cusotmSubsetsGridCells.csv", index=False)

In [None]:
# NOTE(review): `df_merge_dedup` is not assigned in the cells shown here (only
# `df_merge` and `df_merge_export` are) - confirm the de-duplication step that
# creates it before relying on this export.
df_merge_dedup.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_2020_subset_grid.parquet")