In [None]:
import dask.dataframe as dd
import ast
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from dask.diagnostics import ProgressBar
import os
import glob

# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
# Landcover class code -> output column name. Dict order defines the column order.
landcover_legend = {
    0: "Open Seas", 10: "Tree Cover", 20: "Shrubland", 30: "Grassland",
    40: "Cropland", 50: "Built-up", 60: "Bare/Sparse Vegetation", 70: "Snow and Ice",
    80: "Permanent Water Bodies", 90: "Herbaceous Wetland", 95: "Mangroves", 100: "Moss and Lichen"
}
lc_columns = list(landcover_legend.values())

# -----------------------------------------------------------------------------------
# Expansion Function
# -----------------------------------------------------------------------------------
def expand_landcover_props_fast(df_partition):
    """Expand the ``landcover_props`` column into one float32 column per class.

    Each ``landcover_props`` value is parsed with ``ast.literal_eval`` and is
    expected to evaluate to a dict keyed by the class codes of
    ``landcover_legend``; a code missing from a row's dict yields 0.0.

    Parameters
    ----------
    df_partition : pandas.DataFrame
        Must contain ``tile_id`` and ``landcover_props``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``["tile_id"] + lc_columns``; the class columns
        are float32, matching the ``meta`` passed to ``map_partitions``.
    """
    parsed = df_partition["landcover_props"].apply(ast.literal_eval)

    # Build the result in a fresh frame so the caller's partition is left untouched.
    expanded = df_partition[["tile_id"]].copy()
    for code, col_name in landcover_legend.items():
        # `code=code` binds the current key explicitly instead of relying on closure timing.
        expanded[col_name] = parsed.map(
            lambda props, code=code: props.get(code, 0.0)
        ).astype("float32")

    return expanded

# -----------------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------------
input_folder_pattern = "D:/tascarrd_test_outputs/grid_1200m/lc_proportions_10m_*_1200m.parquet"
output_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns/"

os.makedirs(output_folder, exist_ok=True)

# -----------------------------------------------------------------------------------
# Process Each Zone File Separately
# -----------------------------------------------------------------------------------
zone_files = sorted(glob.glob(input_folder_pattern))

print(f"Found {len(zone_files)} zone files to process.")

# Output schema handed to Dask; it is the same for every zone, so build it once.
meta_columns = {"tile_id": 'object'}
meta_columns.update({col: 'float32' for col in lc_columns})

for file_path in zone_files:
    zone_name = os.path.basename(file_path).replace(".parquet", "")  # e.g., lc_proportions_10m_32N_1200m
    output_file = os.path.join(output_folder, f"{zone_name}.parquet")
    # The table is written here first and renamed once complete. Without this, an
    # interrupted run leaves a truncated file at `output_file` that the existence
    # check below would treat as finished. The ".tmp" suffix keeps it out of any
    # "*.parquet" glob over the output folder.
    partial_file = output_file + ".tmp"

    if os.path.exists(output_file):
        print(f"Skipping {zone_name}, already processed.")
        continue

    print(f"\nProcessing {zone_name}...")

    # Read the zone file lazily
    df = dd.read_parquet(
        file_path,
        engine="pyarrow",
        columns=["tile_id", "landcover_props"]
    )

    # Optional: repartition to larger partitions if needed
    df = df.repartition(partition_size="3GB")

    # Expand
    df_expanded = df.map_partitions(expand_landcover_props_fast, meta=meta_columns)

    # Compute and Save (the whole zone is materialised in memory here)
    with ProgressBar():
        df_final = df_expanded.compute()

    # Save cleanly
    table = pa.Table.from_pandas(df_final, preserve_index=False)
    pq.write_table(
        table,
        where=partial_file,
        compression="snappy",
        use_dictionary=True,
        data_page_size=2097152,
        row_group_size=1000000
    )
    # Publish the finished file under its final name in one step.
    os.replace(partial_file, output_file)

    print(f"Saved expanded zone: {output_file}")

print("\nAll zones processed and saved successfully.")


Found 122 zone files to process.
Skipping lc_proportions_10m_10N_1200m, already processed.
Skipping lc_proportions_10m_10S_1200m, already processed.
Skipping lc_proportions_10m_11N_1200m, already processed.
Skipping lc_proportions_10m_11S_1200m, already processed.
Skipping lc_proportions_10m_12N_1200m, already processed.
Skipping lc_proportions_10m_12S_1200m, already processed.
Skipping lc_proportions_10m_13N_1200m, already processed.
Skipping lc_proportions_10m_13S_1200m, already processed.
Skipping lc_proportions_10m_14N_1200m, already processed.
Skipping lc_proportions_10m_14S_1200m, already processed.
Skipping lc_proportions_10m_15N_1200m, already processed.
Skipping lc_proportions_10m_15S_1200m, already processed.
Skipping lc_proportions_10m_16N_1200m, already processed.
Skipping lc_proportions_10m_16S_1200m, already processed.
Skipping lc_proportions_10m_17N_1200m, already processed.

Processing lc_proportions_10m_17S_1200m...
[########################################] | 100% Com

In [1]:
import glob
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os


# -----------------------------------------------------------------------------------
# Landcover Setup
# -----------------------------------------------------------------------------------
# Landcover class code -> column name; dict order is the column order used below.
landcover_legend = {
    0: "Open Seas",
    10: "Tree Cover",
    20: "Shrubland",
    30: "Grassland",
    40: "Cropland",
    50: "Built-up",
    60: "Bare/Sparse Vegetation",
    70: "Snow and Ice",
    80: "Permanent Water Bodies",
    90: "Herbaceous Wetland",
    95: "Mangroves",
    100: "Moss and Lichen",
}
landcover_columns = [*landcover_legend.values()]

# -----------------------------------------------------------------------------------


# Input and output paths
input_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns/"
output_folder = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/"

os.makedirs(output_folder, exist_ok=True)

# Every expanded zone file, in name order
files = sorted(glob.glob(os.path.join(input_folder, "*.parquet")))

# One file at a time so that only a single zone is held in memory
for i, f in enumerate(files):
    file_name = os.path.basename(f)
    print(f"Processing {i+1}/{len(files)}: {file_name}")

    df = pd.read_parquet(f, engine="pyarrow")

    # Keep only tiles whose 'Open Seas' share is below 0.999
    keep_rows = df['Open Seas'] < 0.999
    df_filtered = df.loc[keep_rows].copy()

    # Store every landcover column as float32
    df_filtered = df_filtered.astype({col: 'float32' for col in landcover_columns})

    # Write under the same file name in the filtered folder
    output_path = os.path.join(output_folder, file_name)
    df_filtered.to_parquet(output_path, index=False, engine="pyarrow", compression="snappy")

    # Release both frames before the next file is read
    del df, df_filtered

print("\nAll zone files filtered and saved separately.")


Processing 1/120: lc_proportions_10m_10N_1200m.parquet
Processing 2/120: lc_proportions_10m_10S_1200m.parquet
Processing 3/120: lc_proportions_10m_11N_1200m.parquet
Processing 4/120: lc_proportions_10m_11S_1200m.parquet
Processing 5/120: lc_proportions_10m_12N_1200m.parquet
Processing 6/120: lc_proportions_10m_12S_1200m.parquet
Processing 7/120: lc_proportions_10m_13N_1200m.parquet
Processing 8/120: lc_proportions_10m_13S_1200m.parquet
Processing 9/120: lc_proportions_10m_14N_1200m.parquet
Processing 10/120: lc_proportions_10m_14S_1200m.parquet
Processing 11/120: lc_proportions_10m_15N_1200m.parquet
Processing 12/120: lc_proportions_10m_15S_1200m.parquet
Processing 13/120: lc_proportions_10m_16N_1200m.parquet
Processing 14/120: lc_proportions_10m_16S_1200m.parquet
Processing 15/120: lc_proportions_10m_17N_1200m.parquet
Processing 16/120: lc_proportions_10m_17S_1200m.parquet
Processing 17/120: lc_proportions_10m_18N_1200m.parquet
Processing 18/120: lc_proportions_10m_18S_1200m.parquet
P

In [2]:
import glob
import pandas as pd
import pyarrow as pa
import os  # used for basename comparison; this cell's own imports do not include it

combined_output = "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/lc_props_non_openseas_all_zones.parquet"

# The combined file is written into the same folder that is globbed here, so it must
# be excluded: otherwise re-running this cell reads the previous combined output as
# if it were a zone file and duplicates every row. Sorting makes the row order of the
# concatenated frame independent of the filesystem's listing order.
files = sorted(
    f for f in glob.glob("D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/*.parquet")
    if os.path.basename(f) != os.path.basename(combined_output)
)

dfs = []
for i, f in enumerate(files):
    print(i+1, f)
    df = pd.read_parquet(f)
    # Same threshold as the per-zone filter; a no-op on already-filtered files.
    df = df[df['Open Seas'] < 0.999]
    dfs.append(df)

# `df` is the frame the following cells work on.
df = pd.concat(dfs, ignore_index=True)
df.to_parquet(combined_output, index=False, engine='pyarrow', compression='snappy')

1 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_10N_1200m.parquet
2 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_10S_1200m.parquet
3 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_11N_1200m.parquet
4 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_11S_1200m.parquet
5 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_12N_1200m.parquet
6 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_12S_1200m.parquet
7 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_13N_1200m.parquet
8 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_13S_1200m.parquet
9 D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered\lc_proportions_10m_14N_1200m.parquet
10 D:/tascarrd_test_outputs/grid_1200m/lc_prop

In [3]:
# Preview the first rows of the concatenated frame.
df.head()

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,Herbaceous Wetland,Mangroves,Moss and Lichen
0,G1200m_10N_X000422_Y004451,0.0,0.95146,0.0,0.00514,0.0,0.00049,0.00063,0.0,0.04229,0.0,0.0,0.0
1,G1200m_10N_X000423_Y004451,0.0,0.99174,0.0,0.0,0.0,0.0,0.00028,0.0,0.00799,0.0,0.0,0.0
2,G1200m_10N_X000424_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,G1200m_10N_X000425_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,G1200m_10N_X000426_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Row count of df, expressed in millions.
len(df)/1_000_000

105.99857

In [None]:
import pandas as pd

# Assuming df is your DataFrame
landcover_columns = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland", "Built-up",
    "Bare/Sparse Vegetation", "Snow and Ice", "Permanent Water Bodies",
    "Herbaceous Wetland", "Mangroves", "Moss and Lichen"
]

# Sum landcover fractions across all rows for each class
landcover_totals = df[landcover_columns].sum()

# Compute total landcover (all landcover values across all rows and classes)
total_sum = landcover_totals.sum()

# Compute global proportions
global_proportions = landcover_totals / total_sum

# Optional: convert to percentage
global_proportions_percent = global_proportions * 100




Open Seas                  0.244829
Tree Cover                29.145586
Shrubland                  6.668302
Grassland                 21.226473
Cropland                   8.220444
Built-up                   0.651806
Bare/Sparse Vegetation    14.767981
Snow and Ice               1.628042
Permanent Water Bodies    13.788013
Herbaceous Wetland         1.441016
Mangroves                  0.107942
Moss and Lichen            2.109562
dtype: float32


In [15]:
# Write the class percentages, largest first, as a two-column CSV.
(
    global_proportions_percent
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
    .rename(columns={"index": "Landcover", 0: "Proportion (%)"})
    .to_csv(
        "D:/tascarrd_test_outputs/grid_1200m/lc_props_with_columns_filtered/global_landcover_proportions.csv",
        index=False,
    )
)

In [17]:
# Display the per-class share of the summed landcover fractions, in percent.
global_proportions_percent

Open Seas                  0.244829
Tree Cover                29.145586
Shrubland                  6.668302
Grassland                 21.226473
Cropland                   8.220444
Built-up                   0.651806
Bare/Sparse Vegetation    14.767981
Snow and Ice               1.628042
Permanent Water Bodies    13.788013
Herbaceous Wetland         1.441016
Mangroves                  0.107942
Moss and Lichen            2.109562
dtype: float32

In [10]:
# Label each tile with the landcover column holding its largest fraction.
# idxmax returns the first column on ties, so the order of landcover_columns
# decides tied rows. This adds a column to `df` in place.
df["Majority_LC"] = df[landcover_columns].idxmax(axis=1)
df.head()

Unnamed: 0,tile_id,Open Seas,Tree Cover,Shrubland,Grassland,Cropland,Built-up,Bare/Sparse Vegetation,Snow and Ice,Permanent Water Bodies,Herbaceous Wetland,Mangroves,Moss and Lichen,Majority_LC
0,G1200m_10N_X000422_Y004451,0.0,0.95146,0.0,0.00514,0.0,0.00049,0.00063,0.0,0.04229,0.0,0.0,0.0,Tree Cover
1,G1200m_10N_X000423_Y004451,0.0,0.99174,0.0,0.0,0.0,0.0,0.00028,0.0,0.00799,0.0,0.0,0.0,Tree Cover
2,G1200m_10N_X000424_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover
3,G1200m_10N_X000425_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover
4,G1200m_10N_X000426_Y004451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tree Cover


In [12]:
# Percentage of tiles per majority class; value_counts orders by frequency, descending.
global_majority_lc = df["Majority_LC"].value_counts(normalize=True) * 100
# global_majority_lc = global_majority_lc.sort_index()
global_majority_lc

Majority_LC
Tree Cover                30.818458
Grassland                 20.481359
Bare/Sparse Vegetation    14.575383
Permanent Water Bodies    13.403137
Cropland                   8.865737
Shrubland                  6.069284
Moss and Lichen            2.054686
Snow and Ice               1.625774
Herbaceous Wetland         1.362833
Built-up                   0.389651
Open Seas                  0.244828
Mangroves                  0.108869
Name: proportion, dtype: float64

In [7]:
from scipy.stats import entropy
def compute_shannon_entropy(row):
    """Return the base-2 Shannon entropy of the strictly positive entries of ``row``.

    Entries that are zero or negative are discarded before the values are
    passed to ``scipy.stats.entropy``.
    """
    values = np.asarray(row)
    positive = values[values > 0]
    return entropy(positive, base=2)

def _rows_mask(positions, n_rows):
    """Return a boolean array of length ``n_rows`` that is True at ``positions``."""
    mask = np.zeros(n_rows, dtype=bool)
    mask[np.asarray(positions, dtype=np.intp)] = True
    return mask


def select_hybrid_subsets(
    df,
    landcover_cols,
    targets,
    sizes,
    class_thresholds,
    phase_proportions=[0.5, 0.1, 0.4],
    batch_size=10,
    sample_pool_size=2000
):
    """Label nested tile subsets using a three-phase selection per subset size.

    For every ``label, size`` in ``sizes`` (processed in dict order) the rows
    already labelled in smaller subsets are reused, and the remaining quota is
    split according to ``phase_proportions``:

    1. Pure tiles: for each class ``i``, sample (seed 42) up to
       ``int(phase1_quota * targets[label][i])`` unselected rows whose fraction
       in ``landcover_cols[i]`` is at least ``class_thresholds[i]``.
    2. Transitional tiles: the unselected rows with the highest
       ``shannon_entropy``.
    3. Distribution matching: repeatedly draw a random pool of unselected rows
       and keep the ``batch_size`` rows that individually bring the running
       mean of the selected fractions closest (mean squared error) to
       ``targets[label]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input tiles; copied, never modified.
    landcover_cols : list of str
        Fraction columns; coerced to numeric float32 with NaN replaced by 0.
    targets : mapping of label -> array-like
        Target class distribution per subset label, ordered like ``landcover_cols``.
    sizes : mapping of label -> int
        Subset sizes. Labels must look like ``"100k"`` (the ``k`` is stripped
        and the rest parsed as an int to find the smaller subsets). A smaller
        subset must appear before any larger one, because its column is read
        when the larger one is processed.
    class_thresholds : array-like
        Minimum fraction per class for a row to count as "pure" in phase 1.
    phase_proportions : sequence of three floats
        Shares of the newly selected rows given to phases 1, 2 and 3.
    batch_size : int
        Rows accepted per phase-3 iteration.
    sample_pool_size : int
        Candidate rows drawn per phase-3 iteration.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``shannon_entropy`` (computed if absent), one
        ``subset_<label>`` column per size (``label`` on the rows newly added
        for that size, ``"none"`` elsewhere) and ``subset_phase`` holding
        ``"<label>_phase<n>"`` for selected rows.

    Raises
    ------
    AssertionError
        If ``phase_proportions`` does not sum to 1 (within float tolerance).
    ValueError
        If ``batch_size`` or ``sample_pool_size`` is below 1.
    """
    # Tolerant check: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in floating point.
    assert np.isclose(sum(phase_proportions), 1.0), "Phase proportions must sum to 1.0"
    # With either value below 1 the phase-3 loop would never make progress.
    if batch_size < 1 or sample_pool_size < 1:
        raise ValueError("batch_size and sample_pool_size must be at least 1")

    df = df.copy()
    df[landcover_cols] = df[landcover_cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype("float32")

    # Pre-compute entropy if not already available
    if "shannon_entropy" not in df.columns:
        df["shannon_entropy"] = df[landcover_cols].apply(compute_shannon_entropy, axis=1)
        print("Entropy computed.")

    n_rows = len(df)

    for label, size in sizes.items():
        print(f"\nSelecting subset: {label}")
        subset_col_name = f"subset_{label}"
        if subset_col_name not in df.columns:
            df[subset_col_name] = "none"

        # --- Step 1: Reuse previous subset selections (e.g., from 100k) ---
        previous_labels = [k for k in sizes if int(k.replace("k", "")) < int(label.replace("k", ""))]
        reused = np.zeros(n_rows, dtype=bool)
        for prev_label in previous_labels:
            reused |= (df[f"subset_{prev_label}"] != "none").to_numpy()
        n_reused = int(reused.sum())

        remaining_required = size - n_reused
        if remaining_required <= 0:
            print(f"Subset {label}: already satisfied from smaller subsets.")
            df.loc[reused, subset_col_name] = label
            continue

        print(f"Reusing {n_reused} from smaller subsets. Need to select {remaining_required} new.")

        # Selection state is a positional boolean mask (True = still selectable).
        # It is updated after every pick, so no phase can choose a row twice and
        # the full frame never has to be re-filtered or copied.
        available = ~reused

        phase1_cells = int(remaining_required * phase_proportions[0])
        phase2_cells = int(remaining_required * phase_proportions[1])
        phase3_cells = remaining_required - phase1_cells - phase2_cells

        # --- PHASE 1: Pure tiles using class-specific thresholds ---
        phase1_positions = []
        for i, lc in enumerate(landcover_cols):
            class_target = targets[label][i]
            dominance_thresh = class_thresholds[i]
            target_count = int(phase1_cells * class_target)

            candidates = np.flatnonzero((df[lc] >= dominance_thresh).to_numpy() & available)
            picked = pd.Series(candidates).sample(
                n=min(target_count, len(candidates)), random_state=42
            ).to_numpy()
            # Remove the picks immediately: a tile above the threshold of several
            # classes must not be drawn again for a later class.
            available[picked] = False
            phase1_positions.extend(picked.tolist())
        print(f"Phase 1 (Pure): Selected {len(phase1_positions)}")

        # --- PHASE 2: High-entropy transitional tiles ---
        remaining = np.flatnonzero(available)
        entropy_remaining = pd.Series(df["shannon_entropy"].to_numpy()[remaining], index=remaining)
        phase2_positions = (
            entropy_remaining.sort_values(ascending=False).head(phase2_cells).index.to_numpy()
        )
        available[phase2_positions] = False
        print(f"Phase 2 (Transitional): Selected {len(phase2_positions)}")

        # --- PHASE 3: Distribution matching using greedy batch search ---
        phase3_positions = []
        chosen = ~available  # reused rows plus phase 1 and phase 2 picks
        cumulative = df.loc[chosen, landcover_cols].sum().values.astype("float32")
        n_selected = int(chosen.sum())

        while len(phase3_positions) < phase3_cells:
            remaining = np.flatnonzero(available)
            if remaining.size == 0:
                break

            pool = pd.Series(remaining).sample(
                n=min(sample_pool_size, remaining.size), random_state=None
            ).to_numpy()
            X = df.iloc[pool][landcover_cols].values
            # Mean distribution if each candidate alone were added to the selection.
            tmp_distributions = (cumulative + X) / (n_selected + 1)
            errors = np.mean((tmp_distributions - targets[label]) ** 2, axis=1)

            # Cap the final batch so the phase never exceeds its quota.
            take = min(batch_size, phase3_cells - len(phase3_positions))
            best = pool[np.argsort(errors)[:take]]

            phase3_positions.extend(best.tolist())
            available[best] = False
            cumulative += df.iloc[best][landcover_cols].sum().values.astype("float32")
            n_selected += len(best)

            if len(phase3_positions) % 1000 < batch_size:
                print(f"{label} Phase 3: {len(phase3_positions)} / {phase3_cells}")

        newly_selected = len(phase1_positions) + len(phase2_positions) + len(phase3_positions)
        print(f"Finished subset: {label} | Total selected this round: {newly_selected}")

        # Store subset_phase (column is created only once)
        if "subset_phase" not in df.columns:
            df["subset_phase"] = "none"

        phase_picks = (phase1_positions, phase2_positions, phase3_positions)
        for phase_no, positions in enumerate(phase_picks, start=1):
            rows = _rows_mask(positions, n_rows)
            df.loc[rows, subset_col_name] = label
            df.loc[rows, "subset_phase"] = f"{label}_phase{phase_no}"

    return df


In [None]:
import pandas as pd
import numpy as np

# Must match the DataFrame's column names exactly: these names are used to index
# the frame. The bare/sparse class is spelled "Bare/Sparse Vegetation" (no spaces
# around the slash) in the expanded parquet files.
landcover_columns = [
    "Open Seas", "Tree Cover", "Shrubland", "Grassland", "Cropland", "Built-up",
    "Bare/Sparse Vegetation", "Snow and Ice", "Permanent Water Bodies",
    "Herbaceous Wetland", "Mangroves", "Moss and Lichen"
]

# Target class mix per subset size, written in percent and scaled to fractions.
# Every row lists the classes in this order:
#   Open Seas, Tree Cover, Shrubland, Grassland, Cropland, Built-up,
#   Bare / Sparse Vegetation, Snow and Ice, Permanent Water Bodies,
#   Herbaceous Wetland, Mangroves, Moss and Lichen
target_10k = np.array([0.5, 16.0, 6.0, 13.0, 15.0, 8.0, 10.0, 4.0, 10.0, 6.0, 3.0, 8.5]) / 100

target_25k = np.array([1.0, 18.0, 6.0, 14.0, 15.0, 7.0, 11.0, 4.0, 10.0, 6.0, 2.5, 5.5]) / 100

target_50k = np.array([1.2, 20.0, 6.5, 15.0, 14.5, 6.0, 12.0, 4.0, 9.5, 5.5, 2.0, 3.8]) / 100

target_100k = np.array([1.5, 22.0, 7.0, 16.0, 13.5, 5.0, 12.0, 4.0, 10.0, 5.0, 1.5, 2.5]) / 100

target_250k = np.array([1.8, 24.0, 7.5, 17.0, 12.5, 4.0, 12, 4.0, 11, 4, 1.0, 1.2]) / 100



# Sanity check: print the sum of each target distribution.
# NOTE(review): target_500k, target_750k and target_1000k are not assigned in any
# visible cell, so the last three prints raise NameError on a fresh kernel. The
# commented-out arrays below look like earlier drafts; confirm the intended values
# and define the arrays before this point.
# target_100k = np.array([1, 25, 6, 14, 18, 8, 
#                         8, 3, 7, 
#                         4, 2, 4])
print(target_100k.sum())
# target_100k = target_100k / target_100k.sum()

# target_250k = np.array([1, 25, 6, 14, 18, 8, 
#                         8, 3, 7, 
#                         4, 2, 4])
print(target_250k.sum())
# target_250k = target_250k / target_250k.sum()

# target_500k = np.array([2, 25, 6, 14, 18, 6,
#                         10, 3, 7, 
#                         4, 1, 4  ])
print(target_500k.sum())
# target_500k = target_500k / target_500k.sum()


# target_750k = np.array([2, 25, 6, 14, 18, 6,
#                         10, 3, 7, 
#                         4, 1, 4 ])
print(target_750k.sum())
# target_750k = target_750k / target_750k.sum()       

# target_1M = np.array([2, 25, 6, 14, 18, 6,
#                         10, 3, 7, 
#                         4, 1, 4 ])
print(target_1000k.sum())       
# target_1M = target_1M / target_1M.sum()                 
                



1.0
1.0
1.0
1.0
1.0


In [None]:

# Target class distribution per subset label (keys must match subset_sizes).
# NOTE(review): target_500k, target_750k and target_1000k must be defined before
# this cell runs; no visible cell assigns them.
target_distributions = {
    "100k": target_100k,
    "250k": target_250k,
    "500k": target_500k,
    "750k": target_750k,
    "1000k": target_1000k
}

# Subset sizes, smallest first: each subset reuses the rows of the smaller ones
subset_sizes = {
    "100k": 100_000,
    "250k": 250_000,
    "500k": 500_000,
    "750k": 750_000,
    "1000k": 1_000_000
}

# Dominance threshold for "pure" tiles
# (not passed to select_hybrid_subsets below; the per-class thresholds are used)
pure_threshold = 0.70

# Per-class minimum fraction for a tile to count as "pure" in phase 1
class_thresholds = np.array([
    0.80,  # Open Seas
    0.70,  # Tree Cover
    0.70,  # Shrubland
    0.65,  # Grassland
    0.60,  # Cropland
    0.20,  # Built-up
    0.60,  # Bare / Sparse Vegetation
    0.40,  # Snow and Ice
    0.50,  # Permanent Water Bodies
    0.40,  # Herbaceous Wetland
    0.10,  # Mangroves
    0.10   # Moss and Lichen
])

# Usage:
phase_proportions = [0.5, 0.1, 0.4]  # 50% pure, 10% high-entropy, 40% target matching

df_labeled2 = select_hybrid_subsets(
    df,
    landcover_cols=landcover_columns,
    targets=target_distributions,
    sizes=subset_sizes,
    class_thresholds=class_thresholds,
    phase_proportions=phase_proportions,
    batch_size=50,
    sample_pool_size=2500
)

df_labeled2.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_subsets.parquet")

In [None]:
# Preview the first rows of the frame returned by select_hybrid_subsets.
df_labeled2.head()

In [None]:
import pandas as pd

# Work on a copy so that df_labeled2 keeps the raw selection output.
# (Alternative source: df = pd.read_csv('your_file.csv'))
df = df_labeled2.copy()

# Subset columns ordered from the smallest to the largest subset
subset_cols = ['subset_100k', 'subset_250k', 'subset_500k', 'subset_750k', 'subset_1000k']

# Propagate labels upward: a row that is still 'none' in a larger subset's column
# takes over the label it carries in a smaller subset's column.
for level in range(1, len(subset_cols)):
    current_col = subset_cols[level]

    # Rows unlabelled in the current column; evaluated once, before any inheritance
    unset_rows = df[current_col].eq('none')

    # Visit the smaller subsets from nearest to smallest; a later write overrides an earlier one
    for lower_col in reversed(subset_cols[:level]):
        inherits = unset_rows & df[lower_col].ne('none')
        df.loc[inherits, current_col] = df[lower_col]

# The larger subset columns now also carry the labels inherited from smaller subsets
df

In [None]:
# Persist the frame with the propagated subset labels.
df.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_subsetsMentioned.parquet")

# Custom 200k subset

In [None]:
# Load the grid-cell list used to restrict the labelled frame in the next cell.
# presumably this CSV has a 'grid_cell' column (the merge below joins on it) — verify
filter_grid_cells = pd.read_csv("D:/2020_subset_grid.csv")
filter_grid_cells

In [None]:
# Inner join: keep only rows whose grid_cell occurs in both frames.
# NOTE(review): if grid_cell repeats in either frame, the merge multiplies the
# matching rows; a later cell refers to a `df_merge_dedup`, which suggests
# duplicates were expected — confirm.
df_merge = df.merge(filter_grid_cells, on='grid_cell', how='inner')
df_merge

In [None]:
# Keep only the grid cell id, the selection phase and the subset label columns.
df_merge_export = df_merge[["grid_cell", "subset_phase", "subset_100k", "subset_250k", "subset_500k", "subset_750k", "subset_1000k"]]
df_merge_export

In [None]:
# Write the reduced table as CSV without the index.
# NOTE(review): the file name spells "cusotm"; left unchanged because other code may read this path.
df_merge_export.to_csv("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_cusotmSubsetsGridCells.csv", index=False)

In [None]:
# NOTE(review): df_merge_dedup is not assigned in any visible cell, so this line
# raises NameError on a fresh kernel. Presumably it is df_merge with duplicate rows
# removed — restore the cell that builds it, or confirm which key it was deduplicated on.
df_merge_dedup.to_parquet("D:/MajorTOMExpand/0. DatasetsFolder/13.Major-TOM/MajorTOM_metadata_with_computed_attributes_2020_subset_grid.parquet")