In [1]:
# Ensemble XGBoost Pipeline for Geospatial SOC Topsoil Uncertainty Quantification Under Current Climate
import xgboost as xgb
import pandas as pd
import numpy as np
import glob
import os
import rioxarray as rxr
import xarray as xr
import rasterio
from pathlib import Path
import gc
from tqdm import tqdm
import psutil

# ====== Configuration ======
# All three pool-specific paths below currently target 'passive_top'; change
# the pool name consistently in each of them to run another target.
cv_model_paths = glob.glob('E:/minmin/cv_models/passive_top/*.json') # replace with 'active_top','SOC_top','passive_top'
# Folder scanned (non-recursively) for the input *.tif feature rasters.
input_tif_folder = 'E:/cleaned_tifs_no_extremes_iqr'
output_prediction_path = 'E:/minmin/passive_top_prediction_current.tif'# replace with 'active_top','SOC_top','passive_top'
output_spread_path = 'E:/minmin/passive_top_uncertainty_spread_current.tif'# replace with 'active_top','SOC_top','passive_top'
CHUNK_SIZE = 2000  # number of raster rows processed per chunk
MAX_FEATURES_IN_MEMORY = 30  # NOTE(review): not referenced anywhere in the visible code

def check_memory_usage():
    """Print current system memory usage and return the used percentage."""
    stats = psutil.virtual_memory()
    used_gb = stats.used / 1e9
    total_gb = stats.total / 1e9
    print(f"  Memory: {used_gb:.1f}GB / {total_gb:.1f}GB ({stats.percent}%)")
    return stats.percent

# ====== Load Models ======
print("Loading models...")
cv_models = []
for model_number, model_path in enumerate(cv_model_paths, start=1):
    try:
        booster = xgb.Booster()
        booster.load_model(model_path)
        cv_models.append(booster)
        print(f"  Model {model_number}: Loaded from {os.path.basename(model_path)}")
    except Exception as load_error:
        # Best effort: report the failure and carry on with the other models.
        print(f"  Model {model_number}: Failed to load - {load_error}")

if len(cv_models) == 0:
    raise ValueError("No models loaded successfully!")

# Feature names are taken from the first successfully loaded model.
model_feature_names = cv_models[0].feature_names
print(f"\nModel feature names ({len(model_feature_names)}):")
# Preview at most the first 30 names, ten per printed line.
preview_stop = min(30, len(model_feature_names))
for offset in range(0, preview_stop, 10):
    print(f"  {model_feature_names[offset:offset+10]}")

# Classify features: a name containing any marker substring is "derived",
# everything else is a "base" feature.
derived_markers = ('_log', '_interaction', '_squared', '_boost', '_bins')
base_features = []
derived_features = []
for feature_name in model_feature_names:
    is_derived = any(marker in feature_name for marker in derived_markers)
    (derived_features if is_derived else base_features).append(feature_name)

print(f"\nFeature analysis:")
print(f"  Base features: {len(base_features)}")
print(f"  Derived features: {len(derived_features)}")

# ====== Key mapping table ======
# TIF file stem -> model feature name. Insertion order matters: it fixes the
# order in which candidate file stems are tried for each model feature.
simple_tif_to_model = {
    'Landuse_type': 'LUtype',
    'LUtype': 'LUtype',
    'LU_type': 'LUtype',
    'Recovery_mode': 'Recovmode',
    'Recovmode': 'Recovmode',
    'BD': 't_bd',
    't_bd': 't_bd',
    'pH': 't_ph',
    't_ph': 't_ph',
    'Sand': 't_sand',
    't_sand': 't_sand',
    'Silt': 't_silt',
    't_silt': 't_silt',
    'Clay': 't_clay',
    't_clay': 't_clay',
    'Vege_type': 'Vegetype',
    'Vegetype': 'Vegetype',
    'TC': 't_oc',
    't_oc': 't_oc',
    'TN': 'TN13',
    'TN13': 'TN13',
    'TK': 'TK13',
    'TK13': 'TK13',
    'Altitude': 'Altitude',
    'elevation': 'Altitude',
    'ForestAge_TC000': 'Age',
    'Age': 'Age',
    'Lon': 'x',
    'Lat': 'y',
}

# Reverse mapping: model feature name -> list of candidate TIF file stems.
model_to_tif = {}
for tif_stem, mapped_feature in simple_tif_to_model.items():
    model_to_tif.setdefault(mapped_feature, []).append(tif_stem)

# ====== Reference raster info ======
def get_reference_info(tif_folder):
    """Read the grid geometry of a reference raster found in ``tif_folder``.

    The first existing file among Recovery_mode.tif, Recovmode.tif,
    Landuse_type.tif and Altitude.tif is used; when none exists, the first
    ``*.tif`` returned by the folder glob is used instead. A summary (shape,
    dtype, CRS, memory estimate, chunk sizing) is printed.

    Returns:
        Tuple ``(height, width, transform, crs)`` of the reference raster.

    Raises:
        ValueError: If the folder contains no ``*.tif`` files.
    """
    print(f"\n=== Getting reference raster info ===")

    tif_paths = list(Path(tif_folder).glob("*.tif"))
    if len(tif_paths) == 0:
        raise ValueError(f"No TIF files found in {tif_folder}")

    # Preferred reference rasters, checked lazily in priority order.
    preferred_stems = ['Recovery_mode', 'Recovmode', 'Landuse_type', 'Altitude']
    preferred_paths = (
        os.path.join(tif_folder, f"{stem}.tif") for stem in preferred_stems
    )
    ref_path = next((p for p in preferred_paths if os.path.exists(p)), None)
    if ref_path is None:
        ref_path = str(tif_paths[0])

    print(f"Using reference raster: {os.path.basename(ref_path)}")

    with rasterio.open(ref_path) as ref:
        height = ref.height
        width = ref.width
        transform = ref.transform
        crs = ref.crs
        dtype = ref.dtypes[0]

    # Rough footprint of holding every base feature fully in memory as
    # 4-byte (float32) values.
    total_pixels = height * width
    estimated_memory = len(base_features) * total_pixels * 4 / 1e9

    summary_lines = [
        f"  Shape: {height} x {width} = {total_pixels:,} pixels",
        f"  Data type: {dtype}",
        f"  CRS: {crs}",
        f"  Estimated memory for base features: {estimated_memory:.2f} GB",
        f"  Chunk size: {CHUNK_SIZE} rows",
        f"  Pixels per chunk: {CHUNK_SIZE * width:,}",
    ]
    for line in summary_lines:
        print(line)

    return height, width, transform, crs

# ====== Preload feature metadata ======
def preload_feature_metadata(tif_folder, model_features):
    """Locate the GeoTIFF backing each base feature and record its metadata.

    Only the file path, dtype and nodata value are stored; no pixel data is
    read. Features that cannot be resolved are reported instead of being
    dropped silently.

    Args:
        tif_folder: Directory searched (non-recursively) for ``*.tif`` files.
        model_features: Kept for interface compatibility; the lookup iterates
            the module-level ``base_features`` list.

    Returns:
        Dict mapping model feature name to ``{'path', 'dtype', 'nodata'}``.
        Features with no matching or unreadable raster are absent.
    """
    print(f"\n=== Preloading feature metadata ===")

    # A set gives constant-time membership checks for the candidate lookup.
    tif_names = {f.stem for f in Path(tif_folder).glob("*.tif")}

    feature_metadata = {}
    unresolved = []

    for model_feat in base_features:  # only base features are raster-backed
        possible_tif_names = model_to_tif.get(model_feat, [model_feat])
        tif_name = next((n for n in possible_tif_names if n in tif_names), None)

        if tif_name is None:
            # 'x' and 'y' are generated from the geotransform at load time,
            # so a missing raster for them is not a problem.
            if model_feat not in ('x', 'y'):
                unresolved.append(model_feat)
            continue

        tif_path = os.path.join(tif_folder, f"{tif_name}.tif")
        try:
            with rasterio.open(tif_path) as src:
                # Store metadata only; the data is read chunk by chunk later.
                feature_metadata[model_feat] = {
                    'path': tif_path,
                    'dtype': src.dtypes[0],
                    'nodata': src.nodata
                }
        except Exception as e:
            # Still best effort, but the failure is now visible.
            print(f"  ✗ {model_feat}: could not open {os.path.basename(tif_path)} - {e}")
            unresolved.append(model_feat)
            continue

        print(f"  ✓ {model_feat} -> {os.path.basename(tif_path)}")

    print(f"  Found metadata for {len(feature_metadata)} base features")
    if unresolved:
        print(f"  Warning: no usable raster for {len(unresolved)} base feature(s): {unresolved}")
    return feature_metadata

# ====== Load chunk data ======
def load_chunk_data_efficient(feature_metadata, row_start, row_end, width, transform):
    """Read one horizontal strip of every base-feature raster as float32.

    Args:
        feature_metadata: Dict of feature name -> ``{'path', 'nodata', ...}``.
        row_start: First raster row of the strip (inclusive).
        row_end: Last raster row of the strip (exclusive).
        width: Number of columns to read, starting at column 0.
        transform: Affine geotransform; elements 0/2 and 4/5 are used as
            pixel size and origin for x and y (rotation terms are ignored).

    Returns:
        Dict of feature name -> ``(row_end - row_start, width)`` float32
        array with nodata replaced by NaN. A raster that fails to open or
        read is replaced by an all-NaN array. ``'x'``/``'y'`` pixel-centre
        coordinate grids are added when listed in the module-level
        ``base_features``. Empty dict when the strip has no rows.

    Raises:
        ValueError: If a raster returns a window of unexpected shape, which
            indicates it is not on the same grid as the reference raster.
    """
    chunk_data = {}
    chunk_height = row_end - row_start

    if chunk_height <= 0:
        return chunk_data

    expected_shape = (chunk_height, width)
    window = ((row_start, row_end), (0, width))

    for model_feat, meta in feature_metadata.items():
        try:
            with rasterio.open(meta['path']) as src:
                raw = src.read(1, window=window)
        except Exception as e:
            print(f"    Error loading {model_feat}: {e}")
            # All-NaN placeholder keeps the feature set complete.
            chunk_data[model_feat] = np.full(expected_shape, np.nan, dtype=np.float32)
            continue

        # Fail here with a clear message rather than later inside np.stack.
        if raw.shape != expected_shape:
            raise ValueError(
                f"{model_feat}: read window of shape {raw.shape} from "
                f"{meta['path']}, expected {expected_shape}; raster is not "
                f"aligned with the reference grid"
            )

        # Compare against nodata on the native dtype: after a float32 cast,
        # large integers can round onto the nodata value.
        nodata = meta['nodata']
        nodata_mask = None
        if nodata is not None and not np.isnan(nodata):
            nodata_mask = raw == nodata

        data = raw.astype(np.float32)
        if nodata_mask is not None:
            data[nodata_mask] = np.nan

        chunk_data[model_feat] = data

    # Coordinate features: pixel-centre coordinates from the geotransform.
    if 'x' in base_features:
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        chunk_data['x'] = np.tile(x_coords, (chunk_height, 1)).astype(np.float32)

    if 'y' in base_features:
        y_coords = np.arange(row_start, row_end) * transform[4] + transform[5] + transform[4] / 2
        chunk_data['y'] = np.tile(y_coords.reshape(-1, 1), (1, width)).astype(np.float32)

    return chunk_data

# ====== Derived features ======
def create_derived_features_batch(chunk_data, model_features):
    """Build the engineered features the models expect from base arrays.

    Args:
        chunk_data: Dict of base feature name -> 2-D array for one chunk.
            Not modified; a shallow copy is extended and returned.
        model_features: Feature names required by the model.

    Returns:
        Dict containing every input array plus:
          * ``*_log``: ``log(value + 1e-8)`` of the source, NaN preserved.
            ``BD_log``/``pH_log`` use ``t_bd``/``t_ph`` when present,
            otherwise ``s_bd``/``s_ph``.
          * ``LUtype_boost_<k>``: LUtype (NaN -> 0) times ``k`` when ``k > 1``,
            otherwise LUtype itself.
          * ``LUtype_squared`` and ``LUtype_<feat>_interaction``.
          * ``Altitude_bins``: Altitude with NaN -> 0. No binning is applied;
            NOTE(review): confirm this matches how the models were trained.
          * Any remaining model feature, filled with zeros (a
            ``RuntimeWarning`` names them).
        An empty input returns an empty dict.
    """
    import warnings

    processed = chunk_data.copy()

    if not processed:
        return processed

    chunk_rows, chunk_cols = next(iter(processed.values())).shape
    wanted = set(model_features)

    def first_available(candidates):
        """Return the first candidate name present in ``processed``, or None."""
        return next((name for name in candidates if name in processed), None)

    # 1. Log transforms. Soil inputs accept either depth prefix so the same
    #    code serves topsoil (t_) and subsoil (s_) feature sets.
    log_transforms = [
        (('x',), 'Lon_log'), (('y',), 'Lat_log'), (('Age',), 'Age_log'),
        (('t_bd', 's_bd'), 'BD_log'), (('t_ph', 's_ph'), 'pH_log'),
    ]

    for sources, log_feat in log_transforms:
        base_feat = first_available(sources)
        if log_feat in wanted and base_feat is not None:
            data = processed[base_feat].copy()
            mask = ~np.isnan(data)
            if np.any(mask):
                data[mask] = np.log(data[mask] + 1e-8)
            processed[log_feat] = data

    # 2. LUtype-based features (NaN treated as 0).
    if 'LUtype' in processed:
        lu_data = processed['LUtype']
        lu_filled = np.where(np.isnan(lu_data), 0, lu_data)

        for boost_feat in (f for f in model_features if f.startswith('LUtype_boost_')):
            # The multiplier is the trailing integer of the name; a
            # non-numeric suffix falls back to the unscaled LUtype.
            try:
                boost_num = int(boost_feat.split('_')[-1])
            except ValueError:
                boost_num = 1
            processed[boost_feat] = lu_filled * boost_num if boost_num > 1 else lu_filled

        if 'LUtype_squared' in wanted:
            processed['LUtype_squared'] = lu_filled ** 2

        for other_feat in ['t_ph', 's_ph', 't_bd', 's_bd', 'Age', 'Altitude']:
            interaction_name = f'LUtype_{other_feat}_interaction'
            if interaction_name in wanted and other_feat in processed:
                other_data = processed[other_feat]
                other_filled = np.where(np.isnan(other_data), 0, other_data)
                processed[interaction_name] = lu_filled * other_filled

    # 3. Altitude_bins
    if 'Altitude_bins' in wanted and 'Altitude' in processed:
        alt_data = processed['Altitude']
        processed['Altitude_bins'] = np.where(np.isnan(alt_data), 0, alt_data)

    # 4. Zero-fill anything still missing, but say so: a zero-filled feature
    #    silently distorts predictions.
    missing = [f for f in model_features if f not in processed]
    if missing:
        warnings.warn(
            f"Zero-filling {len(missing)} model feature(s) with no source data: {missing}",
            RuntimeWarning,
            stacklevel=2,
        )
        for feat in missing:
            processed[feat] = np.zeros((chunk_rows, chunk_cols), dtype=np.float32)

    return processed

# ====== Batch prediction ======
def predict_chunk_batch(chunk_data, models):
    """Predict one chunk with every model and return the ensemble mean/spread.

    Args:
        chunk_data: Dict of base feature name -> 2-D array for the chunk.
        models: Loaded ``xgb.Booster`` objects; the feature names of the
            first one define the column order for all of them.

    Returns:
        ``(mean, std)``: two float32 arrays shaped like the chunk, NaN where
        the pixel is masked out. ``(None, None)`` when there are no models,
        no data, or no valid pixel in the chunk.
    """
    if not models or not chunk_data:
        return None, None

    model_features = models[0].feature_names

    # Base arrays plus every engineered feature the model expects.
    all_features = create_derived_features_batch(chunk_data, model_features)

    chunk_rows, chunk_cols = next(iter(all_features.values())).shape

    # A pixel is valid when the first available core feature is not NaN.
    core_features = ('Recovmode', 'Recovery_mode', 'LUtype', 'Altitude')
    core_feature = next((cf for cf in core_features if cf in all_features), None)

    if core_feature is not None:
        valid_mask = ~np.isnan(all_features[core_feature])
    else:
        valid_mask = np.ones((chunk_rows, chunk_cols), dtype=bool)

    # Bail out before assembling any feature matrix for an empty chunk.
    n_valid = int(valid_mask.sum())
    if n_valid == 0:
        return None, None

    # Gather only the valid pixels of each feature (row-major order), which
    # avoids materialising the full rows x cols x features cube.
    columns = [
        all_features[feat][valid_mask] if feat in all_features
        else np.zeros(n_valid, dtype=np.float32)
        for feat in model_features
    ]
    X_valid = np.stack(columns, axis=1)

    # The input matrix is the same for every model, so build it once.
    dmatrix = xgb.DMatrix(X_valid, feature_names=model_features)
    preds_array = np.array([model.predict(dmatrix) for model in models])

    # Ensemble mean and population standard deviation across models.
    mean_pred = np.mean(preds_array, axis=0)
    std_pred = np.std(preds_array, axis=0)

    # Scatter the predictions back onto the chunk grid.
    mean_full = np.full((chunk_rows, chunk_cols), np.nan, dtype=np.float32)
    std_full = np.full((chunk_rows, chunk_cols), np.nan, dtype=np.float32)
    mean_full[valid_mask] = mean_pred
    std_full[valid_mask] = std_pred

    return mean_full, std_full

# ====== Main program ======
if __name__ == "__main__":

    def _write_geotiff(array, path, layer_name, x_coords, y_coords, crs, transform):
        """Write a 2-D array as a tiled, LZW-compressed float32 GeoTIFF.

        NaN is registered as nodata with ``rio.write_nodata``: a ``nodata``
        keyword passed to ``rio.to_raster`` is ignored by rioxarray.
        """
        da = xr.DataArray(
            array,
            coords=[('y', y_coords), ('x', x_coords)],
            dims=('y', 'x'),
            name=layer_name
        )
        da.rio.write_crs(crs, inplace=True)
        da.rio.write_transform(transform, inplace=True)
        da.rio.write_nodata(np.nan, inplace=True)
        da.rio.to_raster(
            path,
            driver='GTiff',
            dtype=np.float32,
            compress='LZW',
            tiled=True,
            blockxsize=256,
            blockysize=256,
            BIGTIFF='IF_SAFER'  # switch to BigTIFF when the file may be large
        )

    print("=" * 70)
    # NOTE(review): banner text is hard-coded and does not follow the pool
    # selected in the configuration paths.
    print("SOC Active Layer Prediction - Optimized for Large Memory Systems")
    print("=" * 70)

    # Initial memory check
    print(f"\nInitial system check:")
    check_memory_usage()

    # Step 1: models were already loaded at module level; report only.
    print(f"\n1. Loading {len(cv_model_paths)} models...")

    # Step 2: reference grid geometry
    print(f"\n2. Analyzing input data...")
    height, width, transform, crs = get_reference_info(input_tif_folder)

    # Step 3: locate the raster behind each base feature
    print(f"\n3. Preloading feature metadata...")
    feature_metadata = preload_feature_metadata(input_tif_folder, model_feature_names)

    if not feature_metadata:
        raise ValueError("No base features found! Check your TIF files.")

    # Step 4: chunked prediction
    print(f"\n4. Starting chunked prediction...")

    # Full-size output arrays, NaN until a prediction is written.
    mean_result = np.full((height, width), np.nan, dtype=np.float32)
    std_result = np.full((height, width), np.nan, dtype=np.float32)

    total_chunks = int(np.ceil(height / CHUNK_SIZE))
    total_valid_pixels = 0

    print(f"\nProgress:")

    for chunk_idx in range(total_chunks):
        chunk_start = chunk_idx * CHUNK_SIZE
        chunk_end = min(chunk_start + CHUNK_SIZE, height)

        print(f"\n  Chunk {chunk_idx+1}/{total_chunks}: Rows {chunk_start:,}-{chunk_end:,}")
        check_memory_usage()

        chunk_data = load_chunk_data_efficient(
            feature_metadata, chunk_start, chunk_end, width, transform
        )

        if not chunk_data:
            print(f"    No data loaded, skipping...")
            continue

        print(f"    Loaded {len(chunk_data)} features for this chunk")

        mean_chunk, std_chunk = predict_chunk_batch(chunk_data, cv_models)

        if mean_chunk is not None:
            mean_result[chunk_start:chunk_end, :] = mean_chunk
            std_result[chunk_start:chunk_end, :] = std_chunk

            valid_in_chunk = np.sum(~np.isnan(mean_chunk))
            total_valid_pixels += valid_in_chunk

            chunk_min = np.nanmin(mean_chunk)
            chunk_max = np.nanmax(mean_chunk)
            print(f"    Predicted: {valid_in_chunk:,} pixels")
            print(f"    Range: [{chunk_min:.3f}, {chunk_max:.3f}]")

        # Progress is the share of rows processed; the share of valid pixels
        # never reaches 100% on a masked raster.
        print(f"    Progress: {chunk_end/height*100:.1f}%")

        # Release the chunk before the next one is loaded.
        del chunk_data, mean_chunk, std_chunk
        gc.collect()

    # Step 5: save results
    print(f"\n5. Saving results...")

    try:
        # Pixel-centre coordinates derived from the geotransform.
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        y_coords = np.arange(height) * transform[4] + transform[5] + transform[4] / 2

        print(f"  Saving prediction to: {output_prediction_path}")
        _write_geotiff(mean_result, output_prediction_path, 'SOC_prediction',
                       x_coords, y_coords, crs, transform)
        print(f"  ✓ Prediction saved successfully")

        print(f"  Saving uncertainty to: {output_spread_path}")
        _write_geotiff(std_result, output_spread_path, 'SOC_uncertainty',
                       x_coords, y_coords, crs, transform)
        print(f"  ✓ Uncertainty saved successfully")

    except Exception as e:
        print(f"  Error saving results: {e}")
        raise

    # Final statistics
    print(f"\n6. Final statistics:")
    total_pixels = height * width

    print(f"  Total pixels: {total_pixels:,}")
    print(f"  Predicted pixels: {total_valid_pixels:,} ({total_valid_pixels/total_pixels*100:.1f}%)")
    print(f"  Missing pixels: {total_pixels-total_valid_pixels:,} ({(total_pixels-total_valid_pixels)/total_pixels*100:.1f}%)")

    if total_valid_pixels > 0:
        final_min = np.nanmin(mean_result)
        final_max = np.nanmax(mean_result)
        final_mean = np.nanmean(mean_result)
        uncert_mean = np.nanmean(std_result)

        print(f"  Prediction range: [{final_min:.4f}, {final_max:.4f}]")
        print(f"  Prediction mean: {final_mean:.4f}")
        print(f"  Uncertainty mean: {uncert_mean:.4f}")
        print(f"  Uncertainty range: [{np.nanmin(std_result):.4f}, {np.nanmax(std_result):.4f}]")

    print(f"\n" + "=" * 70)
    print("✓ Analysis completed successfully!")
    print("=" * 70)

    # Final memory check
    print(f"\nFinal memory usage:")
    check_memory_usage()

In [None]:
# Ensemble XGBoost Pipeline for Geospatial SOC Subsoil Uncertainty Quantification Under Current Climate
import xgboost as xgb
import pandas as pd
import numpy as np
import glob
import os
import rioxarray as rxr
import xarray as xr
import rasterio
from pathlib import Path
import gc
from tqdm import tqdm
import psutil

# ====== Configuration ======
# All three pool-specific paths below currently target 'passive_sub'; change
# the pool name consistently in each of them to run another target.
cv_model_paths = glob.glob('E:/minmin/cv_models/passive_sub/*.json') # replace with 'active_sub','SOC_sub','passive_sub'
# Folder scanned (non-recursively) for the input *.tif feature rasters.
input_tif_folder = 'E:/cleaned_tifs_no_extremes_iqr'
output_prediction_path = 'E:/minmin/passive_sub_prediction_current.tif'# replace with 'active_sub','SOC_sub','passive_sub'
output_spread_path = 'E:/minmin/passive_sub_uncertainty_spread_current.tif'# replace with 'active_sub','SOC_sub','passive_sub'
CHUNK_SIZE = 2000  # number of raster rows processed per chunk
MAX_FEATURES_IN_MEMORY = 30  # NOTE(review): not referenced anywhere in the visible code

# ====== Memory monitoring ======
def check_memory_usage():
    """Print current system memory usage and return the used percentage."""
    stats = psutil.virtual_memory()
    used_gb = stats.used / 1e9
    total_gb = stats.total / 1e9
    print(f"  Memory: {used_gb:.1f}GB / {total_gb:.1f}GB ({stats.percent}%)")
    return stats.percent

# ====== Load Models ======
print("Loading models...")
cv_models = []
for model_number, model_path in enumerate(cv_model_paths, start=1):
    try:
        booster = xgb.Booster()
        booster.load_model(model_path)
        cv_models.append(booster)
        print(f"  Model {model_number}: Loaded from {os.path.basename(model_path)}")
    except Exception as load_error:
        # Best effort: report the failure and carry on with the other models.
        print(f"  Model {model_number}: Failed to load - {load_error}")

if len(cv_models) == 0:
    raise ValueError("No models loaded successfully!")

# Feature names are taken from the first successfully loaded model.
model_feature_names = cv_models[0].feature_names
print(f"\nModel feature names ({len(model_feature_names)}):")
# Preview at most the first 30 names, ten per printed line.
preview_stop = min(30, len(model_feature_names))
for offset in range(0, preview_stop, 10):
    print(f"  {model_feature_names[offset:offset+10]}")

# Classify features: a name containing any marker substring is "derived",
# everything else is a "base" feature.
derived_markers = ('_log', '_interaction', '_squared', '_boost', '_bins')
base_features = []
derived_features = []
for feature_name in model_feature_names:
    is_derived = any(marker in feature_name for marker in derived_markers)
    (derived_features if is_derived else base_features).append(feature_name)

print(f"\nFeature analysis:")
print(f"  Base features: {len(base_features)}")
print(f"  Derived features: {len(derived_features)}")

# ====== Key mapping table ======
# TIF file stem -> model feature name. Insertion order matters: it fixes the
# order in which candidate file stems are tried for each model feature.
simple_tif_to_model = {
    'Landuse_type': 'LUtype',
    'LUtype': 'LUtype',
    'LU_type': 'LUtype',
    'Recovery_mode': 'Recovmode',
    'Recovmode': 'Recovmode',
    'BD': 's_bd',
    's_bd': 's_bd',
    'pH': 's_ph',
    's_ph': 's_ph',
    'Sand': 's_sand',
    's_sand': 's_sand',
    'Silt': 's_silt',
    's_silt': 's_silt',
    'Clay': 's_clay',
    's_clay': 's_clay',
    'Vege_type': 'Vegetype',
    'Vegetype': 'Vegetype',
    'TC': 's_oc',
    's_oc': 's_oc',
    'TN': 'TN46',
    'TN46': 'TN46',
    'TK': 'TK46',
    'TK46': 'TK46',
    'Altitude': 'Altitude',
    'elevation': 'Altitude',
    'ForestAge_TC000': 'Age',
    'Age': 'Age',
    'Lon': 'x',
    'Lat': 'y',
}

# Reverse mapping: model feature name -> list of candidate TIF file stems.
model_to_tif = {}
for tif_stem, mapped_feature in simple_tif_to_model.items():
    model_to_tif.setdefault(mapped_feature, []).append(tif_stem)

# ====== Reference raster info ======
def get_reference_info(tif_folder):
    """Read the grid geometry of a reference raster found in ``tif_folder``.

    The first existing file among Recovery_mode.tif, Recovmode.tif,
    Landuse_type.tif and Altitude.tif is used; when none exists, the first
    ``*.tif`` returned by the folder glob is used instead. A summary (shape,
    dtype, CRS, memory estimate, chunk sizing) is printed.

    Returns:
        Tuple ``(height, width, transform, crs)`` of the reference raster.

    Raises:
        ValueError: If the folder contains no ``*.tif`` files.
    """
    print(f"\n=== Getting reference raster info ===")

    tif_paths = list(Path(tif_folder).glob("*.tif"))
    if len(tif_paths) == 0:
        raise ValueError(f"No TIF files found in {tif_folder}")

    # Preferred reference rasters, checked lazily in priority order.
    preferred_stems = ['Recovery_mode', 'Recovmode', 'Landuse_type', 'Altitude']
    preferred_paths = (
        os.path.join(tif_folder, f"{stem}.tif") for stem in preferred_stems
    )
    ref_path = next((p for p in preferred_paths if os.path.exists(p)), None)
    if ref_path is None:
        ref_path = str(tif_paths[0])

    print(f"Using reference raster: {os.path.basename(ref_path)}")

    with rasterio.open(ref_path) as ref:
        height = ref.height
        width = ref.width
        transform = ref.transform
        crs = ref.crs
        dtype = ref.dtypes[0]

    # Rough footprint of holding every base feature fully in memory as
    # 4-byte (float32) values.
    total_pixels = height * width
    estimated_memory = len(base_features) * total_pixels * 4 / 1e9

    summary_lines = [
        f"  Shape: {height} x {width} = {total_pixels:,} pixels",
        f"  Data type: {dtype}",
        f"  CRS: {crs}",
        f"  Estimated memory for base features: {estimated_memory:.2f} GB",
        f"  Chunk size: {CHUNK_SIZE} rows",
        f"  Pixels per chunk: {CHUNK_SIZE * width:,}",
    ]
    for line in summary_lines:
        print(line)

    return height, width, transform, crs

# ====== Preload feature metadata ======
def preload_feature_metadata(tif_folder, model_features):
    """Locate the GeoTIFF backing each base feature and record its metadata.

    Only the file path, dtype and nodata value are stored; no pixel data is
    read. Features that cannot be resolved are reported instead of being
    dropped silently.

    Args:
        tif_folder: Directory searched (non-recursively) for ``*.tif`` files.
        model_features: Kept for interface compatibility; the lookup iterates
            the module-level ``base_features`` list.

    Returns:
        Dict mapping model feature name to ``{'path', 'dtype', 'nodata'}``.
        Features with no matching or unreadable raster are absent.
    """
    print(f"\n=== Preloading feature metadata ===")

    # A set gives constant-time membership checks for the candidate lookup.
    tif_names = {f.stem for f in Path(tif_folder).glob("*.tif")}

    feature_metadata = {}
    unresolved = []

    for model_feat in base_features:  # only base features are raster-backed
        possible_tif_names = model_to_tif.get(model_feat, [model_feat])
        tif_name = next((n for n in possible_tif_names if n in tif_names), None)

        if tif_name is None:
            # 'x' and 'y' are generated from the geotransform at load time,
            # so a missing raster for them is not a problem.
            if model_feat not in ('x', 'y'):
                unresolved.append(model_feat)
            continue

        tif_path = os.path.join(tif_folder, f"{tif_name}.tif")
        try:
            with rasterio.open(tif_path) as src:
                # Store metadata only; the data is read chunk by chunk later.
                feature_metadata[model_feat] = {
                    'path': tif_path,
                    'dtype': src.dtypes[0],
                    'nodata': src.nodata
                }
        except Exception as e:
            # Still best effort, but the failure is now visible.
            print(f"  ✗ {model_feat}: could not open {os.path.basename(tif_path)} - {e}")
            unresolved.append(model_feat)
            continue

        print(f"  ✓ {model_feat} -> {os.path.basename(tif_path)}")

    print(f"  Found metadata for {len(feature_metadata)} base features")
    if unresolved:
        print(f"  Warning: no usable raster for {len(unresolved)} base feature(s): {unresolved}")
    return feature_metadata

# ====== Load chunk data ======
def load_chunk_data_efficient(feature_metadata, row_start, row_end, width, transform):
    """Read one horizontal strip of every base-feature raster as float32.

    Args:
        feature_metadata: Dict of feature name -> ``{'path', 'nodata', ...}``.
        row_start: First raster row of the strip (inclusive).
        row_end: Last raster row of the strip (exclusive).
        width: Number of columns to read, starting at column 0.
        transform: Affine geotransform; elements 0/2 and 4/5 are used as
            pixel size and origin for x and y (rotation terms are ignored).

    Returns:
        Dict of feature name -> ``(row_end - row_start, width)`` float32
        array with nodata replaced by NaN. A raster that fails to open or
        read is replaced by an all-NaN array. ``'x'``/``'y'`` pixel-centre
        coordinate grids are added when listed in the module-level
        ``base_features``. Empty dict when the strip has no rows.

    Raises:
        ValueError: If a raster returns a window of unexpected shape, which
            indicates it is not on the same grid as the reference raster.
    """
    chunk_data = {}
    chunk_height = row_end - row_start

    if chunk_height <= 0:
        return chunk_data

    expected_shape = (chunk_height, width)
    window = ((row_start, row_end), (0, width))

    for model_feat, meta in feature_metadata.items():
        try:
            with rasterio.open(meta['path']) as src:
                raw = src.read(1, window=window)
        except Exception as e:
            print(f"    Error loading {model_feat}: {e}")
            # All-NaN placeholder keeps the feature set complete.
            chunk_data[model_feat] = np.full(expected_shape, np.nan, dtype=np.float32)
            continue

        # Fail here with a clear message rather than later inside np.stack.
        if raw.shape != expected_shape:
            raise ValueError(
                f"{model_feat}: read window of shape {raw.shape} from "
                f"{meta['path']}, expected {expected_shape}; raster is not "
                f"aligned with the reference grid"
            )

        # Compare against nodata on the native dtype: after a float32 cast,
        # large integers can round onto the nodata value.
        nodata = meta['nodata']
        nodata_mask = None
        if nodata is not None and not np.isnan(nodata):
            nodata_mask = raw == nodata

        data = raw.astype(np.float32)
        if nodata_mask is not None:
            data[nodata_mask] = np.nan

        chunk_data[model_feat] = data

    # Coordinate features: pixel-centre coordinates from the geotransform.
    if 'x' in base_features:
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        chunk_data['x'] = np.tile(x_coords, (chunk_height, 1)).astype(np.float32)

    if 'y' in base_features:
        y_coords = np.arange(row_start, row_end) * transform[4] + transform[5] + transform[4] / 2
        chunk_data['y'] = np.tile(y_coords.reshape(-1, 1), (1, width)).astype(np.float32)

    return chunk_data

# ====== Derived features ======
def create_derived_features_batch(chunk_data, model_features):
    """Build the engineered features the models expect from base arrays.

    Args:
        chunk_data: Dict of base feature name -> 2-D array for one chunk.
            Not modified; a shallow copy is extended and returned.
        model_features: Feature names required by the model.

    Returns:
        Dict containing every input array plus:
          * ``*_log``: ``log(value + 1e-8)`` of the source, NaN preserved.
            ``BD_log``/``pH_log`` use ``t_bd``/``t_ph`` when present,
            otherwise ``s_bd``/``s_ph`` (the names this subsoil
            configuration maps its rasters to).
          * ``LUtype_boost_<k>``: LUtype (NaN -> 0) times ``k`` when ``k > 1``,
            otherwise LUtype itself.
          * ``LUtype_squared`` and ``LUtype_<feat>_interaction``.
          * ``Altitude_bins``: Altitude with NaN -> 0. No binning is applied;
            NOTE(review): confirm this matches how the models were trained.
          * Any remaining model feature, filled with zeros (a
            ``RuntimeWarning`` names them).
        An empty input returns an empty dict.
    """
    import warnings

    processed = chunk_data.copy()

    if not processed:
        return processed

    chunk_rows, chunk_cols = next(iter(processed.values())).shape
    wanted = set(model_features)

    def first_available(candidates):
        """Return the first candidate name present in ``processed``, or None."""
        return next((name for name in candidates if name in processed), None)

    # 1. Log transforms. Looking only for t_bd/t_ph here left BD_log and
    #    pH_log uncomputed (then zero-filled) for subsoil feature sets.
    log_transforms = [
        (('x',), 'Lon_log'), (('y',), 'Lat_log'), (('Age',), 'Age_log'),
        (('t_bd', 's_bd'), 'BD_log'), (('t_ph', 's_ph'), 'pH_log'),
    ]

    for sources, log_feat in log_transforms:
        base_feat = first_available(sources)
        if log_feat in wanted and base_feat is not None:
            data = processed[base_feat].copy()
            mask = ~np.isnan(data)
            if np.any(mask):
                data[mask] = np.log(data[mask] + 1e-8)
            processed[log_feat] = data

    # 2. LUtype-based features (NaN treated as 0).
    if 'LUtype' in processed:
        lu_data = processed['LUtype']
        lu_filled = np.where(np.isnan(lu_data), 0, lu_data)

        for boost_feat in (f for f in model_features if f.startswith('LUtype_boost_')):
            # The multiplier is the trailing integer of the name; a
            # non-numeric suffix falls back to the unscaled LUtype.
            try:
                boost_num = int(boost_feat.split('_')[-1])
            except ValueError:
                boost_num = 1
            processed[boost_feat] = lu_filled * boost_num if boost_num > 1 else lu_filled

        if 'LUtype_squared' in wanted:
            processed['LUtype_squared'] = lu_filled ** 2

        for other_feat in ['t_ph', 's_ph', 't_bd', 's_bd', 'Age', 'Altitude']:
            interaction_name = f'LUtype_{other_feat}_interaction'
            if interaction_name in wanted and other_feat in processed:
                other_data = processed[other_feat]
                other_filled = np.where(np.isnan(other_data), 0, other_data)
                processed[interaction_name] = lu_filled * other_filled

    # 3. Altitude_bins
    if 'Altitude_bins' in wanted and 'Altitude' in processed:
        alt_data = processed['Altitude']
        processed['Altitude_bins'] = np.where(np.isnan(alt_data), 0, alt_data)

    # 4. Zero-fill anything still missing, but say so: a zero-filled feature
    #    silently distorts predictions.
    missing = [f for f in model_features if f not in processed]
    if missing:
        warnings.warn(
            f"Zero-filling {len(missing)} model feature(s) with no source data: {missing}",
            RuntimeWarning,
            stacklevel=2,
        )
        for feat in missing:
            processed[feat] = np.zeros((chunk_rows, chunk_cols), dtype=np.float32)

    return processed

# ====== Batch prediction ======
def predict_chunk_batch(chunk_data, models):
    """Predict one chunk with every model and return the ensemble mean/spread.

    Args:
        chunk_data: Dict of base feature name -> 2-D array for the chunk.
        models: Loaded ``xgb.Booster`` objects; the feature names of the
            first one define the column order for all of them.

    Returns:
        ``(mean, std)``: two float32 arrays shaped like the chunk, NaN where
        the pixel is masked out. ``(None, None)`` when there are no models,
        no data, or no valid pixel in the chunk.
    """
    if not models or not chunk_data:
        return None, None

    model_features = models[0].feature_names

    # Base arrays plus every engineered feature the model expects.
    all_features = create_derived_features_batch(chunk_data, model_features)

    chunk_rows, chunk_cols = next(iter(all_features.values())).shape

    # A pixel is valid when the first available core feature is not NaN.
    core_features = ('Recovmode', 'Recovery_mode', 'LUtype', 'Altitude')
    core_feature = next((cf for cf in core_features if cf in all_features), None)

    if core_feature is not None:
        valid_mask = ~np.isnan(all_features[core_feature])
    else:
        valid_mask = np.ones((chunk_rows, chunk_cols), dtype=bool)

    # Bail out before assembling any feature matrix for an empty chunk.
    n_valid = int(valid_mask.sum())
    if n_valid == 0:
        return None, None

    # Gather only the valid pixels of each feature (row-major order), which
    # avoids materialising the full rows x cols x features cube.
    columns = [
        all_features[feat][valid_mask] if feat in all_features
        else np.zeros(n_valid, dtype=np.float32)
        for feat in model_features
    ]
    X_valid = np.stack(columns, axis=1)

    # The input matrix is the same for every model, so build it once.
    dmatrix = xgb.DMatrix(X_valid, feature_names=model_features)
    preds_array = np.array([model.predict(dmatrix) for model in models])

    # Ensemble mean and population standard deviation across models.
    mean_pred = np.mean(preds_array, axis=0)
    std_pred = np.std(preds_array, axis=0)

    # Scatter the predictions back onto the chunk grid.
    mean_full = np.full((chunk_rows, chunk_cols), np.nan, dtype=np.float32)
    std_full = np.full((chunk_rows, chunk_cols), np.nan, dtype=np.float32)
    mean_full[valid_mask] = mean_pred
    std_full[valid_mask] = std_pred

    return mean_full, std_full

# ====== Main program ======
if __name__ == "__main__":

    def _write_geotiff(array, path, layer_name, x_coords, y_coords, crs, transform):
        """Write a 2-D array as a tiled, LZW-compressed float32 GeoTIFF.

        NaN is registered as nodata with ``rio.write_nodata``: a ``nodata``
        keyword passed to ``rio.to_raster`` is ignored by rioxarray.
        """
        da = xr.DataArray(
            array,
            coords=[('y', y_coords), ('x', x_coords)],
            dims=('y', 'x'),
            name=layer_name
        )
        da.rio.write_crs(crs, inplace=True)
        da.rio.write_transform(transform, inplace=True)
        da.rio.write_nodata(np.nan, inplace=True)
        da.rio.to_raster(
            path,
            driver='GTiff',
            dtype=np.float32,
            compress='LZW',
            tiled=True,
            blockxsize=256,
            blockysize=256,
            BIGTIFF='IF_SAFER'  # switch to BigTIFF when the file may be large
        )

    print("=" * 70)
    # NOTE(review): banner text is hard-coded and does not follow the pool
    # selected in the configuration paths.
    print("SOC Active Layer Prediction - Optimized for Large Memory Systems")
    print("=" * 70)

    # Initial memory check
    print(f"\nInitial system check:")
    check_memory_usage()

    # Step 1: models were already loaded at module level; report only.
    print(f"\n1. Loading {len(cv_model_paths)} models...")

    # Step 2: reference grid geometry
    print(f"\n2. Analyzing input data...")
    height, width, transform, crs = get_reference_info(input_tif_folder)

    # Step 3: locate the raster behind each base feature
    print(f"\n3. Preloading feature metadata...")
    feature_metadata = preload_feature_metadata(input_tif_folder, model_feature_names)

    if not feature_metadata:
        raise ValueError("No base features found! Check your TIF files.")

    # Step 4: chunked prediction
    print(f"\n4. Starting chunked prediction...")

    # Full-size output arrays, NaN until a prediction is written.
    mean_result = np.full((height, width), np.nan, dtype=np.float32)
    std_result = np.full((height, width), np.nan, dtype=np.float32)

    total_chunks = int(np.ceil(height / CHUNK_SIZE))
    total_valid_pixels = 0

    print(f"\nProgress:")

    for chunk_idx in range(total_chunks):
        chunk_start = chunk_idx * CHUNK_SIZE
        chunk_end = min(chunk_start + CHUNK_SIZE, height)

        print(f"\n  Chunk {chunk_idx+1}/{total_chunks}: Rows {chunk_start:,}-{chunk_end:,}")
        check_memory_usage()

        chunk_data = load_chunk_data_efficient(
            feature_metadata, chunk_start, chunk_end, width, transform
        )

        if not chunk_data:
            print(f"    No data loaded, skipping...")
            continue

        print(f"    Loaded {len(chunk_data)} features for this chunk")

        mean_chunk, std_chunk = predict_chunk_batch(chunk_data, cv_models)

        if mean_chunk is not None:
            mean_result[chunk_start:chunk_end, :] = mean_chunk
            std_result[chunk_start:chunk_end, :] = std_chunk

            valid_in_chunk = np.sum(~np.isnan(mean_chunk))
            total_valid_pixels += valid_in_chunk

            chunk_min = np.nanmin(mean_chunk)
            chunk_max = np.nanmax(mean_chunk)
            print(f"    Predicted: {valid_in_chunk:,} pixels")
            print(f"    Range: [{chunk_min:.3f}, {chunk_max:.3f}]")

        # Progress is the share of rows processed; the share of valid pixels
        # never reaches 100% on a masked raster.
        print(f"    Progress: {chunk_end/height*100:.1f}%")

        # Release the chunk before the next one is loaded.
        del chunk_data, mean_chunk, std_chunk
        gc.collect()

    # Step 5: save results
    print(f"\n5. Saving results...")

    try:
        # Pixel-centre coordinates derived from the geotransform.
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        y_coords = np.arange(height) * transform[4] + transform[5] + transform[4] / 2

        print(f"  Saving prediction to: {output_prediction_path}")
        _write_geotiff(mean_result, output_prediction_path, 'SOC_prediction',
                       x_coords, y_coords, crs, transform)
        print(f"  ✓ Prediction saved successfully")

        print(f"  Saving uncertainty to: {output_spread_path}")
        _write_geotiff(std_result, output_spread_path, 'SOC_uncertainty',
                       x_coords, y_coords, crs, transform)
        print(f"  ✓ Uncertainty saved successfully")

    except Exception as e:
        print(f"  Error saving results: {e}")
        raise

    # Final statistics
    print(f"\n6. Final statistics:")
    total_pixels = height * width

    print(f"  Total pixels: {total_pixels:,}")
    print(f"  Predicted pixels: {total_valid_pixels:,} ({total_valid_pixels/total_pixels*100:.1f}%)")
    print(f"  Missing pixels: {total_pixels-total_valid_pixels:,} ({(total_pixels-total_valid_pixels)/total_pixels*100:.1f}%)")

    if total_valid_pixels > 0:
        final_min = np.nanmin(mean_result)
        final_max = np.nanmax(mean_result)
        final_mean = np.nanmean(mean_result)
        uncert_mean = np.nanmean(std_result)

        print(f"  Prediction range: [{final_min:.4f}, {final_max:.4f}]")
        print(f"  Prediction mean: {final_mean:.4f}")
        print(f"  Uncertainty mean: {uncert_mean:.4f}")
        print(f"  Uncertainty range: [{np.nanmin(std_result):.4f}, {np.nanmax(std_result):.4f}]")

    print(f"\n" + "=" * 70)
    print("✓ Analysis completed successfully!")
    print("=" * 70)

    # Final memory check
    print(f"\nFinal memory usage:")
    check_memory_usage()

In [None]:
# Ensemble XGBoost Pipeline for Geospatial SOC Subsoil Uncertainty Quantification Under Future Climate
import xgboost as xgb
import pandas as pd
import numpy as np
import glob
import os
import rioxarray as rxr
import xarray as xr
import rasterio
from pathlib import Path
import gc
from tqdm import tqdm
import psutil

# ====== Configuration ======
# JSON files of the cross-validation XGBoost models that form the ensemble.
cv_model_paths = glob.glob('E:/minmin/cv_models/SOC_top/*.json') 
# Folder that holds the predictor rasters (*.tif).
input_tif_folder = 'E:/cleaned_tifs_no_extremes_iqr'
# GeoTIFF receiving the per-pixel mean over all ensemble members.
output_prediction_path = 'E:/minmin/SOC_top_prediction.tif'
# GeoTIFF receiving the per-pixel standard deviation over all ensemble members.
output_spread_path = 'E:/minmin/SOC_top_uncertainty_spread.tif'


# Number of raster rows read and predicted per chunk.
CHUNK_SIZE = 2000 
# NOTE(review): not referenced anywhere in the visible part of this cell.
MAX_FEATURES_IN_MEMORY = 30 

def check_memory_usage():
    """Print used / total system memory in GB and return the used percentage."""
    vm = psutil.virtual_memory()
    used_gb = vm.used / 1e9
    total_gb = vm.total / 1e9
    print(f"  Memory: {used_gb:.1f}GB / {total_gb:.1f}GB ({vm.percent}%)")
    return vm.percent

# ====== Load Models ======
# Every model file that loads becomes one ensemble member; files that fail to
# load are reported and skipped.
print("Loading models...")
cv_models = []
for model_no, model_path in enumerate(cv_model_paths, start=1):
    try:
        booster = xgb.Booster()
        booster.load_model(model_path)
    except Exception as load_error:
        print(f"  Model {model_no}: Failed to load - {load_error}")
    else:
        cv_models.append(booster)
        print(f"  Model {model_no}: Loaded from {os.path.basename(model_path)}")

if len(cv_models) == 0:
    raise ValueError("No models loaded successfully!")

# The first loaded model defines the feature names used for the whole ensemble.
model_feature_names = cv_models[0].feature_names
print(f"\nModel feature names ({len(model_feature_names)}):")
# Preview at most the first 30 names, ten per printed line.
preview_stop = min(30, len(model_feature_names))
for offset in range(0, preview_stop, 10):
    print(f"  {model_feature_names[offset:offset+10]}")

# A feature counts as "derived" when its name contains one of these markers;
# everything else is a base feature.
_derived_markers = ('_log', '_interaction', '_squared', '_boost', '_bins')
base_features = []
derived_features = []
for feat in model_feature_names:
    if any(marker in feat for marker in _derived_markers):
        derived_features.append(feat)
    else:
        base_features.append(feat)

print(f"\nFeature analysis:")
print(f"  Base features: {len(base_features)}")
print(f"  Derived features: {len(derived_features)}")

# Raster file stem -> model feature name. Several stems may point at the same
# feature; insertion order is kept because it decides the order of the
# candidate stems collected in `model_to_tif` below.
simple_tif_to_model = {
    'Landuse_type': 'LUtype',
    'LUtype': 'LUtype',
    'LU_type': 'LUtype',
    'Recovery_mode': 'Recovmode',
    'Recovmode': 'Recovmode',
    'BD': 't_bd',
    't_bd': 't_bd',
    'pH': 't_ph',
    't_ph': 't_ph',
    'Sand': 't_sand',
    't_sand': 't_sand',
    'Silt': 't_silt',
    't_silt': 't_silt',
    'Clay': 't_clay',
    't_clay': 't_clay',
    'Vege_type': 'Vegetype',
    'Vegetype': 'Vegetype',
    'TC': 't_oc',
    't_oc': 't_oc',
    'TN': 'TN13',
    'TN13': 'TN13',
    'TK': 'TK13',
    'TK13': 'TK13',
    'Altitude': 'Altitude',
    'elevation': 'Altitude',
    'ForestAge_TC000': 'Age_plus100',
    'Age': 'Age_plus100',
    'Lon': 'x',
    'Lat': 'y',
}

# The 19 `bio*` entries follow one naming pattern, except that numbers 1 and
# 12 use the file stems MAT / MAP and carry that suffix in the feature name.
_bio_special = {1: ('MAT', 'wc_BIO1_MAT'), 12: ('MAP', 'wc_BIO12_MAP')}
for _bio_no in range(1, 20):
    _stem, _feature = _bio_special.get(
        _bio_no, (f'bio{_bio_no}', f'wc_BIO{_bio_no}')
    )
    simple_tif_to_model[_stem] = _feature

# Inverse lookup: model feature name -> list of raster stems that may provide it.
model_to_tif = {}
for tif_name, model_name in simple_tif_to_model.items():
    model_to_tif.setdefault(model_name, []).append(tif_name)

def get_reference_info(tif_folder):
    """Pick a reference raster and report the grid it defines.

    The reference is the first existing file among ``Recovery_mode.tif``,
    ``Recovmode.tif``, ``Landuse_type.tif`` and ``Altitude.tif``; if none of
    them exists, the first ``*.tif`` found in the folder is used.

    Args:
        tif_folder: Directory that holds the predictor rasters.

    Returns:
        ``(height, width, transform, crs)`` read from the reference raster.

    Raises:
        ValueError: If ``tif_folder`` contains no ``*.tif`` file.
    """
    print(f"\n=== Getting reference raster info ===")

    tif_files = list(Path(tif_folder).glob("*.tif"))
    if len(tif_files) == 0:
        raise ValueError(f"No TIF files found in {tif_folder}")

    preferred_paths = (
        os.path.join(tif_folder, f"{stem}.tif")
        for stem in ('Recovery_mode', 'Recovmode', 'Landuse_type', 'Altitude')
    )
    ref_path = next(
        (path for path in preferred_paths if os.path.exists(path)),
        str(tif_files[0]),
    )

    print(f"Using reference raster: {os.path.basename(ref_path)}")

    with rasterio.open(ref_path) as ref:
        n_rows, n_cols = ref.height, ref.width
        affine = ref.transform
        ref_crs = ref.crs
        ref_dtype = ref.dtypes[0]

    n_pixels = n_rows * n_cols
    # 4 bytes per value: the estimate assumes float32 storage.
    base_feature_gb = len(base_features) * n_pixels * 4 / 1e9

    report = (
        f"  Shape: {n_rows} x {n_cols} = {n_pixels:,} pixels",
        f"  Data type: {ref_dtype}",
        f"  CRS: {ref_crs}",
        f"  Estimated memory for base features: {base_feature_gb:.2f} GB",
        f"  Chunk size: {CHUNK_SIZE} rows",
        f"  Pixels per chunk: {CHUNK_SIZE * n_cols:,}",
    )
    for line in report:
        print(line)

    return n_rows, n_cols, affine, ref_crs

def preload_feature_metadata(tif_folder, model_features):
    """Locate the raster behind each base feature and cache its metadata.

    For every name in the module-level ``base_features`` the candidate file
    stems from ``model_to_tif`` (or the feature name itself) are tried in
    order; the first stem that exists in ``tif_folder`` is used.

    Args:
        tif_folder: Directory that holds the predictor rasters.
        model_features: Accepted for interface compatibility; the lookup is
            driven by the module-level ``base_features`` list.

    Returns:
        Dict mapping feature name to ``{'path', 'dtype', 'nodata'}``.
        Features without a matching or readable raster are left out.
    """
    print(f"\n=== Preloading feature metadata ===")

    # A set makes the per-candidate membership test O(1).
    tif_names = {tif_file.stem for tif_file in Path(tif_folder).glob("*.tif")}

    feature_metadata = {}

    for model_feat in base_features:
        candidates = model_to_tif.get(model_feat, [model_feat])
        tif_name = next((name for name in candidates if name in tif_names), None)
        if tif_name is None:
            continue

        tif_path = os.path.join(tif_folder, f"{tif_name}.tif")
        if not os.path.exists(tif_path):
            continue

        try:
            with rasterio.open(tif_path) as src:
                feature_metadata[model_feat] = {
                    'path': tif_path,
                    'dtype': src.dtypes[0],
                    'nodata': src.nodata
                }
        except Exception as e:
            # Previously swallowed silently; an unreadable raster means the
            # feature is missing downstream, so make that visible.
            print(f"  ✗ {model_feat}: could not read {os.path.basename(tif_path)} - {e}")
            continue

        print(f"  ✓ {model_feat} -> {os.path.basename(tif_path)}")

    print(f"  Found metadata for {len(feature_metadata)} base features")
    return feature_metadata

def load_chunk_data_efficient(feature_metadata, row_start, row_end, width, transform):
    """Read one horizontal strip of every feature raster into float32 arrays.

    Args:
        feature_metadata: Dict of feature name -> ``{'path', 'nodata', ...}``
            as produced by ``preload_feature_metadata``.
        row_start: First raster row of the strip (inclusive).
        row_end: Row after the last one of the strip (exclusive).
        width: Number of columns of the reference grid.
        transform: Affine transform of the reference grid, indexed as
            ``(a, b, c, d, e, f)``; only the scale and offset terms are used.

    Returns:
        Dict of feature name -> ``(row_end - row_start, width)`` float32 array
        with nodata replaced by NaN. A feature that cannot be read, or whose
        read does not match the strip shape, is returned as all-NaN. The dict
        is empty when the strip has no rows.
    """
    chunk_data = {}
    chunk_height = row_end - row_start

    if chunk_height <= 0:
        return chunk_data

    expected_shape = (chunk_height, width)
    window = ((row_start, row_end), (0, width))

    for model_feat, meta in feature_metadata.items():
        try:
            with rasterio.open(meta['path']) as src:
                data = src.read(1, window=window)

            # Guard against a raster whose grid does not match the reference:
            # a short read would otherwise only surface later as an opaque
            # shape error when the features are stacked.
            if data.shape != expected_shape:
                raise ValueError(
                    f"read shape {data.shape} does not match expected {expected_shape}"
                )

            data = data.astype(np.float32)
            if meta['nodata'] is not None:
                data[data == meta['nodata']] = np.nan

            chunk_data[model_feat] = data

        except Exception as e:
            print(f"    Error loading {model_feat}: {e}")
            chunk_data[model_feat] = np.full(expected_shape, np.nan, dtype=np.float32)

    # Coordinate features are computed from the grid (cell centres) and
    # replace any raster that was loaded under the same name.
    if 'x' in base_features:
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        chunk_data['x'] = np.tile(x_coords, (chunk_height, 1)).astype(np.float32)

    if 'y' in base_features:
        y_coords = np.arange(row_start, row_end) * transform[4] + transform[5] + transform[4] / 2
        chunk_data['y'] = np.tile(y_coords.reshape(-1, 1), (1, width)).astype(np.float32)

    return chunk_data

def create_derived_features_batch(chunk_data, model_features):
    """Add the engineered features a model expects to one chunk's feature dict.

    Derived features are only created when their name occurs in
    ``model_features``:

    * ``*_log``: ``log(value + 1e-8)`` of the base feature, NaN kept as NaN.
    * ``LUtype_boost_<n>``: NaN-filled ``LUtype`` times ``n`` (unscaled when
      ``n`` is not an integer greater than 1).
    * ``LUtype_squared`` and ``LUtype_<feat>_interaction``: products of the
      NaN-filled (NaN -> 0) inputs.
    * ``Altitude_bins``: NaN-filled ``Altitude``; no binning is applied.

    Bulk density and pH are looked up under the subsoil names (``s_bd``,
    ``s_ph``) first and under the topsoil names (``t_bd``, ``t_ph``)
    otherwise, because this cell maps the BD/pH rasters to the ``t_*`` names.
    Any model feature still absent afterwards is filled with zeros.

    Args:
        chunk_data: Dict of feature name -> 2-D array, all of the same shape.
        model_features: Feature names expected by the model.

    Returns:
        New dict holding the input arrays plus all derived / zero-filled
        features. The input dict and its arrays are not modified.
    """
    processed = chunk_data.copy()

    if not processed:
        return processed

    chunk_rows, chunk_cols = next(iter(processed.values())).shape
    wanted = set(model_features)

    # (candidate base features in priority order, derived feature name)
    log_transforms = [
        (('x',), 'Lon_log'), (('y',), 'Lat_log'),
        (('Age',), 'Age_log'),
        (('s_bd', 't_bd'), 'BD_log'), (('s_ph', 't_ph'), 'pH_log'),
    ]

    for candidates, log_feat in log_transforms:
        if log_feat not in wanted:
            continue
        base_feat = next((name for name in candidates if name in processed), None)
        if base_feat is None:
            continue
        data = processed[base_feat].copy()
        mask = ~np.isnan(data)
        if np.any(mask):
            data[mask] = np.log(data[mask] + 1e-8)
        processed[log_feat] = data

    if 'LUtype' in processed:
        lu_data = processed['LUtype']
        lu_filled = np.where(np.isnan(lu_data), 0, lu_data)

        for boost_feat in (f for f in model_features if f.startswith('LUtype_boost_')):
            try:
                boost_num = int(boost_feat.split('_')[-1])
            except ValueError:
                # Non-numeric suffix: fall back to the unscaled land-use code.
                boost_num = 1
            processed[boost_feat] = lu_filled * boost_num if boost_num > 1 else lu_filled

        if 'LUtype_squared' in wanted:
            processed['LUtype_squared'] = lu_filled ** 2

        # The t_* names cover the topsoil models of this cell.
        for other_feat in ['s_ph', 's_bd', 't_ph', 't_bd', 'Age', 'Altitude']:
            interaction_name = f'LUtype_{other_feat}_interaction'
            if interaction_name in wanted and other_feat in processed:
                other_data = processed[other_feat]
                other_filled = np.where(np.isnan(other_data), 0, other_data)
                processed[interaction_name] = lu_filled * other_filled

    if 'Altitude_bins' in wanted and 'Altitude' in processed:
        alt_data = processed['Altitude']
        processed['Altitude_bins'] = np.where(np.isnan(alt_data), 0, alt_data)

    # Whatever the model expects but could not be built is zero-filled.
    for feat in model_features:
        if feat not in processed:
            processed[feat] = np.zeros((chunk_rows, chunk_cols), dtype=np.float32)

    return processed

def predict_chunk_batch(chunk_data, models):
    """Predict one chunk with every ensemble member and summarise per pixel.

    Args:
        chunk_data: Dict of base feature name -> 2-D array for the chunk.
        models: Loaded boosters; the feature names of ``models[0]`` define
            the column order used for all of them.

    Returns:
        ``(mean, std)``: two float32 arrays of the chunk's shape holding the
        mean and the standard deviation over the models' predictions, NaN
        where no prediction was made. ``(None, None)`` when there are no
        models, no data, or no valid pixel in the chunk.
    """
    if not models or not chunk_data:
        return None, None

    model_features = models[0].feature_names

    all_features = create_derived_features_batch(chunk_data, model_features)

    chunk_rows, chunk_cols = next(iter(all_features.values())).shape

    # Predict only where the first available core feature has data; without
    # any core feature every pixel is predicted.
    core_feature = next(
        (cf for cf in ('Recovmode', 'Recovery_mode', 'LUtype', 'Altitude')
         if cf in all_features),
        None,
    )
    if core_feature is not None:
        valid_mask = ~np.isnan(all_features[core_feature])
    else:
        valid_mask = np.ones((chunk_rows, chunk_cols), dtype=bool)

    # One column per model feature, in model order; zeros as a last resort.
    feature_arrays = [
        all_features[feat] if feat in all_features
        else np.zeros((chunk_rows, chunk_cols), dtype=np.float32)
        for feat in model_features
    ]

    X_flat = np.stack(feature_arrays, axis=-1).reshape(-1, len(model_features))

    valid_indices = valid_mask.flatten()
    X_valid = X_flat[valid_indices]

    if len(X_valid) == 0:
        return None, None

    # The input matrix is the same for every model, so build it once instead
    # of once per ensemble member.
    dmatrix = xgb.DMatrix(X_valid, feature_names=model_features)
    preds_array = np.array([model.predict(dmatrix) for model in models])

    mean_pred = np.mean(preds_array, axis=0)
    std_pred = np.std(preds_array, axis=0)

    mean_full = np.full(chunk_rows * chunk_cols, np.nan, dtype=np.float32)
    std_full = np.full(chunk_rows * chunk_cols, np.nan, dtype=np.float32)

    mean_full[valid_indices] = mean_pred
    std_full[valid_indices] = std_pred

    return mean_full.reshape(chunk_rows, chunk_cols), std_full.reshape(chunk_rows, chunk_cols)

if __name__ == "__main__":
    rule = "=" * 70
    print(rule)
    print("SOC Active Layer Prediction - Optimized for Large Memory Systems")
    print(rule)

    print(f"\nInitial system check:")
    check_memory_usage()

    # The ensemble itself is loaded by the module-level code above; this line
    # only reports how many model files were found.
    print(f"\n1. Loading {len(cv_model_paths)} models...")

    print(f"\n2. Analyzing input data...")
    height, width, transform, crs = get_reference_info(input_tif_folder)

    print(f"\n3. Preloading feature metadata...")
    feature_metadata = preload_feature_metadata(input_tif_folder, model_feature_names)
    if not feature_metadata:
        raise ValueError("No base features found! Check your TIF files.")

    print(f"\n4. Starting chunked prediction...")

    # Full-grid outputs; they stay NaN wherever no prediction is written.
    grid_shape = (height, width)
    mean_result = np.full(grid_shape, np.nan, dtype=np.float32)
    std_result = np.full(grid_shape, np.nan, dtype=np.float32)

    total_chunks = int(np.ceil(height / CHUNK_SIZE))
    total_valid_pixels = 0

    print(f"\nProgress:")

    # Walk the grid in strips of CHUNK_SIZE rows.
    for chunk_idx, chunk_start in enumerate(range(0, height, CHUNK_SIZE)):
        chunk_end = min(chunk_start + CHUNK_SIZE, height)
        chunk_height = chunk_end - chunk_start

        print(f"\n  Chunk {chunk_idx+1}/{total_chunks}: Rows {chunk_start:,}-{chunk_end:,}")
        check_memory_usage()

        chunk_data = load_chunk_data_efficient(
            feature_metadata, chunk_start, chunk_end, width, transform
        )
        if not chunk_data:
            print(f"    No data loaded, skipping...")
            continue

        print(f"    Loaded {len(chunk_data)} features for this chunk")

        mean_chunk, std_chunk = predict_chunk_batch(chunk_data, cv_models)

        if mean_chunk is not None:
            strip = slice(chunk_start, chunk_end)
            mean_result[strip, :] = mean_chunk
            std_result[strip, :] = std_chunk

            valid_in_chunk = (~np.isnan(mean_chunk)).sum()
            total_valid_pixels += valid_in_chunk

            chunk_min, chunk_max = np.nanmin(mean_chunk), np.nanmax(mean_chunk)
            print(f"    Predicted: {valid_in_chunk:,} pixels")
            print(f"    Range: [{chunk_min:.3f}, {chunk_max:.3f}]")
            print(f"    Progress: {total_valid_pixels/(height*width)*100:.1f}%")

        # Release the strip before the next one is loaded.
        del chunk_data, mean_chunk, std_chunk
        gc.collect()

    print(f"\n5. Saving results...")

    # Identical GeoTIFF options for both output rasters.
    raster_options = dict(
        driver='GTiff',
        dtype=np.float32,
        compress='LZW',
        nodata=np.nan,
        tiled=True,
        blockxsize=256,
        blockysize=256,
        BIGTIFF='IF_SAFER',
    )

    try:
        # Cell-centre coordinates derived from the reference transform.
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        y_coords = np.arange(height) * transform[4] + transform[5] + transform[4] / 2

        print(f"  Saving prediction to: {output_prediction_path}")
        da_mean = xr.DataArray(
            mean_result,
            coords=[('y', y_coords), ('x', x_coords)],
            dims=('y', 'x'),
            name='SOC_prediction'
        )
        da_mean.rio.write_crs(crs, inplace=True)
        da_mean.rio.write_transform(transform, inplace=True)
        da_mean.rio.to_raster(output_prediction_path, **raster_options)
        print(f"  ✓ Prediction saved successfully")

        print(f"  Saving uncertainty to: {output_spread_path}")
        da_std = xr.DataArray(
            std_result,
            coords=[('y', y_coords), ('x', x_coords)],
            dims=('y', 'x'),
            name='SOC_uncertainty'
        )
        da_std.rio.write_crs(crs, inplace=True)
        da_std.rio.write_transform(transform, inplace=True)
        da_std.rio.to_raster(output_spread_path, **raster_options)
        print(f"  ✓ Uncertainty saved successfully")

    except Exception as e:
        print(f"  Error saving results: {e}")
        raise

    print(f"\n6. Final statistics:")
    total_pixels = height * width

    print(f"  Total pixels: {total_pixels:,}")
    print(f"  Predicted pixels: {total_valid_pixels:,} ({total_valid_pixels/total_pixels*100:.1f}%)")
    print(f"  Missing pixels: {total_pixels-total_valid_pixels:,} ({(total_pixels-total_valid_pixels)/total_pixels*100:.1f}%)")

    if total_valid_pixels > 0:
        final_min = np.nanmin(mean_result)
        final_max = np.nanmax(mean_result)
        final_mean = np.nanmean(mean_result)
        uncert_mean = np.nanmean(std_result)

        summary = (
            f"  Prediction range: [{final_min:.4f}, {final_max:.4f}]",
            f"  Prediction mean: {final_mean:.4f}",
            f"  Uncertainty mean: {uncert_mean:.4f}",
            f"  Uncertainty range: [{np.nanmin(std_result):.4f}, {np.nanmax(std_result):.4f}]",
        )
        for line in summary:
            print(line)

    print(f"\n" + rule)
    print("✓ Analysis completed successfully!")
    print(rule)

    # Final memory check
    print(f"\nFinal memory usage:")
    check_memory_usage()

In [None]:
# Ensemble XGBoost Pipeline for Geospatial SOC Subsoil Uncertainty Quantification Under Future Climate
import xgboost as xgb
import pandas as pd
import numpy as np
import glob
import os
import rioxarray as rxr
import xarray as xr
import rasterio
from pathlib import Path
import gc
from tqdm import tqdm
import psutil

# ====== Configuration ======
# JSON files of the cross-validation XGBoost models that form the ensemble.
cv_model_paths = glob.glob('E:/minmin/cv_models/SOC_sub/*.json') 
# Folder that holds the predictor rasters (*.tif).
input_tif_folder = 'E:/cleaned_tifs_no_extremes_iqr'
# GeoTIFF receiving the per-pixel mean over all ensemble members.
output_prediction_path = 'E:/minmin/SOC_sub_prediction.tif'
# GeoTIFF receiving the per-pixel standard deviation over all ensemble members.
output_spread_path = 'E:/minmin/SOC_sub_uncertainty_spread.tif'


# Number of raster rows read and predicted per chunk.
CHUNK_SIZE = 2000 
# NOTE(review): not referenced anywhere in the visible part of this cell.
MAX_FEATURES_IN_MEMORY = 30 

def check_memory_usage():
    """Print used / total system memory in GB and return the used percentage."""
    vm = psutil.virtual_memory()
    used_gb = vm.used / 1e9
    total_gb = vm.total / 1e9
    print(f"  Memory: {used_gb:.1f}GB / {total_gb:.1f}GB ({vm.percent}%)")
    return vm.percent

# ====== Load Models ======
# Every model file that loads becomes one ensemble member; files that fail to
# load are reported and skipped.
print("Loading models...")
cv_models = []
for model_no, model_path in enumerate(cv_model_paths, start=1):
    try:
        booster = xgb.Booster()
        booster.load_model(model_path)
    except Exception as load_error:
        print(f"  Model {model_no}: Failed to load - {load_error}")
    else:
        cv_models.append(booster)
        print(f"  Model {model_no}: Loaded from {os.path.basename(model_path)}")

if len(cv_models) == 0:
    raise ValueError("No models loaded successfully!")

# The first loaded model defines the feature names used for the whole ensemble.
model_feature_names = cv_models[0].feature_names
print(f"\nModel feature names ({len(model_feature_names)}):")
# Preview at most the first 30 names, ten per printed line.
preview_stop = min(30, len(model_feature_names))
for offset in range(0, preview_stop, 10):
    print(f"  {model_feature_names[offset:offset+10]}")

# A feature counts as "derived" when its name contains one of these markers;
# everything else is a base feature.
_derived_markers = ('_log', '_interaction', '_squared', '_boost', '_bins')
base_features = []
derived_features = []
for feat in model_feature_names:
    if any(marker in feat for marker in _derived_markers):
        derived_features.append(feat)
    else:
        base_features.append(feat)

print(f"\nFeature analysis:")
print(f"  Base features: {len(base_features)}")
print(f"  Derived features: {len(derived_features)}")

# Raster file stem -> model feature name. Several stems may point at the same
# feature; insertion order is kept because it decides the order of the
# candidate stems collected in `model_to_tif` below.
simple_tif_to_model = {
    'Landuse_type': 'LUtype',
    'LUtype': 'LUtype',
    'LU_type': 'LUtype',
    'Recovery_mode': 'Recovmode',
    'Recovmode': 'Recovmode',
    'BD': 's_bd',
    's_bd': 's_bd',
    'pH': 's_ph',
    's_ph': 's_ph',
    'Sand': 's_sand',
    's_sand': 's_sand',
    'Silt': 's_silt',
    's_silt': 's_silt',
    'Clay': 's_clay',
    's_clay': 's_clay',
    'Vege_type': 'Vegetype',
    'Vegetype': 'Vegetype',
    'TC': 's_oc',
    's_oc': 's_oc',
    'TN': 'TN46',
    'TN46': 'TN46',
    'TK': 'TK46',
    'TK46': 'TK46',
    'Altitude': 'Altitude',
    'elevation': 'Altitude',
    'ForestAge_TC000': 'Age_plus100',
    'Age': 'Age_plus100',
    'Lon': 'x',
    'Lat': 'y',
}

# The 19 `bio*` entries follow one naming pattern, except that numbers 1 and
# 12 use the file stems MAT / MAP and carry that suffix in the feature name.
_bio_special = {1: ('MAT', 'wc_BIO1_MAT'), 12: ('MAP', 'wc_BIO12_MAP')}
for _bio_no in range(1, 20):
    _stem, _feature = _bio_special.get(
        _bio_no, (f'bio{_bio_no}', f'wc_BIO{_bio_no}')
    )
    simple_tif_to_model[_stem] = _feature

# Inverse lookup: model feature name -> list of raster stems that may provide it.
model_to_tif = {}
for tif_name, model_name in simple_tif_to_model.items():
    model_to_tif.setdefault(model_name, []).append(tif_name)

def get_reference_info(tif_folder):
    """Pick a reference raster and report the grid it defines.

    The reference is the first existing file among ``Recovery_mode.tif``,
    ``Recovmode.tif``, ``Landuse_type.tif`` and ``Altitude.tif``; if none of
    them exists, the first ``*.tif`` found in the folder is used.

    Args:
        tif_folder: Directory that holds the predictor rasters.

    Returns:
        ``(height, width, transform, crs)`` read from the reference raster.

    Raises:
        ValueError: If ``tif_folder`` contains no ``*.tif`` file.
    """
    print(f"\n=== Getting reference raster info ===")

    tif_files = list(Path(tif_folder).glob("*.tif"))
    if len(tif_files) == 0:
        raise ValueError(f"No TIF files found in {tif_folder}")

    preferred_paths = (
        os.path.join(tif_folder, f"{stem}.tif")
        for stem in ('Recovery_mode', 'Recovmode', 'Landuse_type', 'Altitude')
    )
    ref_path = next(
        (path for path in preferred_paths if os.path.exists(path)),
        str(tif_files[0]),
    )

    print(f"Using reference raster: {os.path.basename(ref_path)}")

    with rasterio.open(ref_path) as ref:
        n_rows, n_cols = ref.height, ref.width
        affine = ref.transform
        ref_crs = ref.crs
        ref_dtype = ref.dtypes[0]

    n_pixels = n_rows * n_cols
    # 4 bytes per value: the estimate assumes float32 storage.
    base_feature_gb = len(base_features) * n_pixels * 4 / 1e9

    report = (
        f"  Shape: {n_rows} x {n_cols} = {n_pixels:,} pixels",
        f"  Data type: {ref_dtype}",
        f"  CRS: {ref_crs}",
        f"  Estimated memory for base features: {base_feature_gb:.2f} GB",
        f"  Chunk size: {CHUNK_SIZE} rows",
        f"  Pixels per chunk: {CHUNK_SIZE * n_cols:,}",
    )
    for line in report:
        print(line)

    return n_rows, n_cols, affine, ref_crs

def preload_feature_metadata(tif_folder, model_features):
    """Locate the raster behind each base feature and cache its metadata.

    For every name in the module-level ``base_features`` the candidate file
    stems from ``model_to_tif`` (or the feature name itself) are tried in
    order; the first stem that exists in ``tif_folder`` is used.

    Args:
        tif_folder: Directory that holds the predictor rasters.
        model_features: Accepted for interface compatibility; the lookup is
            driven by the module-level ``base_features`` list.

    Returns:
        Dict mapping feature name to ``{'path', 'dtype', 'nodata'}``.
        Features without a matching or readable raster are left out.
    """
    print(f"\n=== Preloading feature metadata ===")

    # A set makes the per-candidate membership test O(1).
    tif_names = {tif_file.stem for tif_file in Path(tif_folder).glob("*.tif")}

    feature_metadata = {}

    for model_feat in base_features:
        candidates = model_to_tif.get(model_feat, [model_feat])
        tif_name = next((name for name in candidates if name in tif_names), None)
        if tif_name is None:
            continue

        tif_path = os.path.join(tif_folder, f"{tif_name}.tif")
        if not os.path.exists(tif_path):
            continue

        try:
            with rasterio.open(tif_path) as src:
                feature_metadata[model_feat] = {
                    'path': tif_path,
                    'dtype': src.dtypes[0],
                    'nodata': src.nodata
                }
        except Exception as e:
            # Previously swallowed silently; an unreadable raster means the
            # feature is missing downstream, so make that visible.
            print(f"  ✗ {model_feat}: could not read {os.path.basename(tif_path)} - {e}")
            continue

        print(f"  ✓ {model_feat} -> {os.path.basename(tif_path)}")

    print(f"  Found metadata for {len(feature_metadata)} base features")
    return feature_metadata

def load_chunk_data_efficient(feature_metadata, row_start, row_end, width, transform):
    """Read one horizontal strip of every feature raster into float32 arrays.

    Args:
        feature_metadata: Dict of feature name -> ``{'path', 'nodata', ...}``
            as produced by ``preload_feature_metadata``.
        row_start: First raster row of the strip (inclusive).
        row_end: Row after the last one of the strip (exclusive).
        width: Number of columns of the reference grid.
        transform: Affine transform of the reference grid, indexed as
            ``(a, b, c, d, e, f)``; only the scale and offset terms are used.

    Returns:
        Dict of feature name -> ``(row_end - row_start, width)`` float32 array
        with nodata replaced by NaN. A feature that cannot be read, or whose
        read does not match the strip shape, is returned as all-NaN. The dict
        is empty when the strip has no rows.
    """
    chunk_data = {}
    chunk_height = row_end - row_start

    if chunk_height <= 0:
        return chunk_data

    expected_shape = (chunk_height, width)
    window = ((row_start, row_end), (0, width))

    for model_feat, meta in feature_metadata.items():
        try:
            with rasterio.open(meta['path']) as src:
                data = src.read(1, window=window)

            # Guard against a raster whose grid does not match the reference:
            # a short read would otherwise only surface later as an opaque
            # shape error when the features are stacked.
            if data.shape != expected_shape:
                raise ValueError(
                    f"read shape {data.shape} does not match expected {expected_shape}"
                )

            data = data.astype(np.float32)
            if meta['nodata'] is not None:
                data[data == meta['nodata']] = np.nan

            chunk_data[model_feat] = data

        except Exception as e:
            print(f"    Error loading {model_feat}: {e}")
            chunk_data[model_feat] = np.full(expected_shape, np.nan, dtype=np.float32)

    # Coordinate features are computed from the grid (cell centres) and
    # replace any raster that was loaded under the same name.
    if 'x' in base_features:
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        chunk_data['x'] = np.tile(x_coords, (chunk_height, 1)).astype(np.float32)

    if 'y' in base_features:
        y_coords = np.arange(row_start, row_end) * transform[4] + transform[5] + transform[4] / 2
        chunk_data['y'] = np.tile(y_coords.reshape(-1, 1), (1, width)).astype(np.float32)

    return chunk_data

def create_derived_features_batch(chunk_data, model_features):
    """Add the engineered features a model expects to one chunk's feature dict.

    Derived features are only created when their name occurs in
    ``model_features``:

    * ``*_log``: ``log(value + 1e-8)`` of the base feature, NaN kept as NaN.
    * ``LUtype_boost_<n>``: NaN-filled ``LUtype`` times ``n`` (unscaled when
      ``n`` is not an integer greater than 1).
    * ``LUtype_squared`` and ``LUtype_<feat>_interaction``: products of the
      NaN-filled (NaN -> 0) inputs.
    * ``Altitude_bins``: NaN-filled ``Altitude``; no binning is applied.

    Any model feature still absent afterwards is filled with zeros.

    Args:
        chunk_data: Dict of feature name -> 2-D array, all of the same shape.
        model_features: Feature names expected by the model.

    Returns:
        New dict holding the input arrays plus all derived / zero-filled
        features. The input dict and its arrays are not modified.
    """
    processed = chunk_data.copy()

    if not processed:
        return processed

    chunk_rows, chunk_cols = next(iter(processed.values())).shape
    wanted = set(model_features)

    # (base feature, derived feature name)
    log_transforms = [
        ('x', 'Lon_log'), ('y', 'Lat_log'),
        ('Age', 'Age_log'), ('s_bd', 'BD_log'), ('s_ph', 'pH_log')
    ]

    for base_feat, log_feat in log_transforms:
        if log_feat not in wanted or base_feat not in processed:
            continue
        data = processed[base_feat].copy()
        mask = ~np.isnan(data)
        if np.any(mask):
            data[mask] = np.log(data[mask] + 1e-8)
        processed[log_feat] = data

    if 'LUtype' in processed:
        lu_data = processed['LUtype']
        lu_filled = np.where(np.isnan(lu_data), 0, lu_data)

        for boost_feat in (f for f in model_features if f.startswith('LUtype_boost_')):
            try:
                boost_num = int(boost_feat.split('_')[-1])
            except ValueError:
                # Non-numeric suffix: fall back to the unscaled land-use code.
                boost_num = 1
            processed[boost_feat] = lu_filled * boost_num if boost_num > 1 else lu_filled

        if 'LUtype_squared' in wanted:
            processed['LUtype_squared'] = lu_filled ** 2

        for other_feat in ['s_ph', 's_bd', 'Age', 'Altitude']:
            interaction_name = f'LUtype_{other_feat}_interaction'
            if interaction_name in wanted and other_feat in processed:
                other_data = processed[other_feat]
                other_filled = np.where(np.isnan(other_data), 0, other_data)
                processed[interaction_name] = lu_filled * other_filled

    if 'Altitude_bins' in wanted and 'Altitude' in processed:
        alt_data = processed['Altitude']
        processed['Altitude_bins'] = np.where(np.isnan(alt_data), 0, alt_data)

    # Whatever the model expects but could not be built is zero-filled.
    for feat in model_features:
        if feat not in processed:
            processed[feat] = np.zeros((chunk_rows, chunk_cols), dtype=np.float32)

    return processed

def predict_chunk_batch(chunk_data, models):
    """Predict one chunk with every ensemble member and summarise per pixel.

    Args:
        chunk_data: Dict of base feature name -> 2-D array for the chunk.
        models: Loaded boosters; the feature names of ``models[0]`` define
            the column order used for all of them.

    Returns:
        ``(mean, std)``: two float32 arrays of the chunk's shape holding the
        mean and the standard deviation over the models' predictions, NaN
        where no prediction was made. ``(None, None)`` when there are no
        models, no data, or no valid pixel in the chunk.
    """
    if not models or not chunk_data:
        return None, None

    model_features = models[0].feature_names

    all_features = create_derived_features_batch(chunk_data, model_features)

    chunk_rows, chunk_cols = next(iter(all_features.values())).shape

    # Predict only where the first available core feature has data; without
    # any core feature every pixel is predicted.
    core_feature = next(
        (cf for cf in ('Recovmode', 'Recovery_mode', 'LUtype', 'Altitude')
         if cf in all_features),
        None,
    )
    if core_feature is not None:
        valid_mask = ~np.isnan(all_features[core_feature])
    else:
        valid_mask = np.ones((chunk_rows, chunk_cols), dtype=bool)

    # One column per model feature, in model order; zeros as a last resort.
    feature_arrays = [
        all_features[feat] if feat in all_features
        else np.zeros((chunk_rows, chunk_cols), dtype=np.float32)
        for feat in model_features
    ]

    X_flat = np.stack(feature_arrays, axis=-1).reshape(-1, len(model_features))

    valid_indices = valid_mask.flatten()
    X_valid = X_flat[valid_indices]

    if len(X_valid) == 0:
        return None, None

    # The input matrix is the same for every model, so build it once instead
    # of once per ensemble member.
    dmatrix = xgb.DMatrix(X_valid, feature_names=model_features)
    preds_array = np.array([model.predict(dmatrix) for model in models])

    mean_pred = np.mean(preds_array, axis=0)
    std_pred = np.std(preds_array, axis=0)

    mean_full = np.full(chunk_rows * chunk_cols, np.nan, dtype=np.float32)
    std_full = np.full(chunk_rows * chunk_cols, np.nan, dtype=np.float32)

    mean_full[valid_indices] = mean_pred
    std_full[valid_indices] = std_pred

    return mean_full.reshape(chunk_rows, chunk_cols), std_full.reshape(chunk_rows, chunk_cols)

if __name__ == "__main__":
    rule = "=" * 70
    print(rule)
    print("SOC Active Layer Prediction - Optimized for Large Memory Systems")
    print(rule)

    print(f"\nInitial system check:")
    check_memory_usage()

    # The ensemble itself is loaded by the module-level code above; this line
    # only reports how many model files were found.
    print(f"\n1. Loading {len(cv_model_paths)} models...")

    print(f"\n2. Analyzing input data...")
    height, width, transform, crs = get_reference_info(input_tif_folder)

    print(f"\n3. Preloading feature metadata...")
    feature_metadata = preload_feature_metadata(input_tif_folder, model_feature_names)
    if not feature_metadata:
        raise ValueError("No base features found! Check your TIF files.")

    print(f"\n4. Starting chunked prediction...")

    # Full-grid outputs; they stay NaN wherever no prediction is written.
    grid_shape = (height, width)
    mean_result = np.full(grid_shape, np.nan, dtype=np.float32)
    std_result = np.full(grid_shape, np.nan, dtype=np.float32)

    total_chunks = int(np.ceil(height / CHUNK_SIZE))
    total_valid_pixels = 0

    print(f"\nProgress:")

    # Walk the grid in strips of CHUNK_SIZE rows.
    for chunk_idx, chunk_start in enumerate(range(0, height, CHUNK_SIZE)):
        chunk_end = min(chunk_start + CHUNK_SIZE, height)
        chunk_height = chunk_end - chunk_start

        print(f"\n  Chunk {chunk_idx+1}/{total_chunks}: Rows {chunk_start:,}-{chunk_end:,}")
        check_memory_usage()

        chunk_data = load_chunk_data_efficient(
            feature_metadata, chunk_start, chunk_end, width, transform
        )
        if not chunk_data:
            print(f"    No data loaded, skipping...")
            continue

        print(f"    Loaded {len(chunk_data)} features for this chunk")

        mean_chunk, std_chunk = predict_chunk_batch(chunk_data, cv_models)

        if mean_chunk is not None:
            strip = slice(chunk_start, chunk_end)
            mean_result[strip, :] = mean_chunk
            std_result[strip, :] = std_chunk

            valid_in_chunk = (~np.isnan(mean_chunk)).sum()
            total_valid_pixels += valid_in_chunk

            chunk_min, chunk_max = np.nanmin(mean_chunk), np.nanmax(mean_chunk)
            print(f"    Predicted: {valid_in_chunk:,} pixels")
            print(f"    Range: [{chunk_min:.3f}, {chunk_max:.3f}]")
            print(f"    Progress: {total_valid_pixels/(height*width)*100:.1f}%")

        # Release the strip before the next one is loaded.
        del chunk_data, mean_chunk, std_chunk
        gc.collect()

    print(f"\n5. Saving results...")

    # Identical GeoTIFF options for both output rasters.
    raster_options = dict(
        driver='GTiff',
        dtype=np.float32,
        compress='LZW',
        nodata=np.nan,
        tiled=True,
        blockxsize=256,
        blockysize=256,
        BIGTIFF='IF_SAFER',
    )

    try:
        # Cell-centre coordinates derived from the reference transform.
        x_coords = np.arange(width) * transform[0] + transform[2] + transform[0] / 2
        y_coords = np.arange(height) * transform[4] + transform[5] + transform[4] / 2

        print(f"  Saving prediction to: {output_prediction_path}")
        da_mean = xr.DataArray(
            mean_result,
            coords=[('y', y_coords), ('x', x_coords)],
            dims=('y', 'x'),
            name='SOC_prediction'
        )
        da_mean.rio.write_crs(crs, inplace=True)
        da_mean.rio.write_transform(transform, inplace=True)
        da_mean.rio.to_raster(output_prediction_path, **raster_options)
        print(f"  ✓ Prediction saved successfully")

        print(f"  Saving uncertainty to: {output_spread_path}")
        da_std = xr.DataArray(
            std_result,
            coords=[('y', y_coords), ('x', x_coords)],
            dims=('y', 'x'),
            name='SOC_uncertainty'
        )
        da_std.rio.write_crs(crs, inplace=True)
        da_std.rio.write_transform(transform, inplace=True)
        da_std.rio.to_raster(output_spread_path, **raster_options)
        print(f"  ✓ Uncertainty saved successfully")

    except Exception as e:
        print(f"  Error saving results: {e}")
        raise

    print(f"\n6. Final statistics:")
    total_pixels = height * width

    print(f"  Total pixels: {total_pixels:,}")
    print(f"  Predicted pixels: {total_valid_pixels:,} ({total_valid_pixels/total_pixels*100:.1f}%)")
    print(f"  Missing pixels: {total_pixels-total_valid_pixels:,} ({(total_pixels-total_valid_pixels)/total_pixels*100:.1f}%)")

    if total_valid_pixels > 0:
        final_min = np.nanmin(mean_result)
        final_max = np.nanmax(mean_result)
        final_mean = np.nanmean(mean_result)
        uncert_mean = np.nanmean(std_result)

        summary = (
            f"  Prediction range: [{final_min:.4f}, {final_max:.4f}]",
            f"  Prediction mean: {final_mean:.4f}",
            f"  Uncertainty mean: {uncert_mean:.4f}",
            f"  Uncertainty range: [{np.nanmin(std_result):.4f}, {np.nanmax(std_result):.4f}]",
        )
        for line in summary:
            print(line)

    print(f"\n" + rule)
    print("✓ Analysis completed successfully!")
    print(rule)

    # Final memory check
    print(f"\nFinal memory usage:")
    check_memory_usage()

In [None]:
#Raster-Based Coefficient of Variation (CV) Generator for Uncertainty Analysis
import rasterio
import numpy as np
import os
from rasterio.plot import show
import matplotlib.pyplot as plt

def calculate_cv(mean_path, sd_path, output_path=None):
    """
    Compute a coefficient-of-variation raster: CV = |SD / Mean| × 100 (%).

    Band 1 of each raster is read in full. A pixel is written as NaN when it
    is masked/nodata in either input, is non-finite in either input, or has a
    mean of exactly 0 (division by zero).

    Args:
        mean_path: Path to the mean raster.
        sd_path: Path to the standard-deviation raster. Band 1 must have the
            same shape as band 1 of the mean raster.
        output_path: Optional destination path. When given, the CV array is
            written as a single-band float32 raster using the mean raster's
            profile with nodata set to NaN.

    Returns:
        Tuple ``(cv_data, profile)``: the float32 CV array and the profile
        used for (or suitable for) writing it.

    Raises:
        ValueError: If the two rasters do not have the same band-1 shape.
    """

    with rasterio.open(mean_path) as mean_src, \
         rasterio.open(sd_path) as sd_src:

        # masked=True lets rasterio apply each dataset's own nodata/mask, so
        # sentinel values such as -9999 become NaN instead of being divided.
        mean_data = mean_src.read(1, masked=True).astype(float).filled(np.nan)
        sd_data = sd_src.read(1, masked=True).astype(float).filled(np.nan)

        if mean_data.shape != sd_data.shape:
            raise ValueError(
                f"Mean raster shape {mean_data.shape} does not match "
                f"SD raster shape {sd_data.shape}"
            )

        profile = mean_src.profile.copy()
        # Invalid pixels are stored as NaN below, so the output must declare
        # NaN as its nodata value rather than inherit the source's.
        profile.update(dtype=rasterio.float32, count=1, nodata=np.nan)

        # isfinite rejects NaN and +/-inf in a single test.
        valid_mask = np.isfinite(mean_data) & np.isfinite(sd_data) & (mean_data != 0)

        cv_data = np.full(mean_data.shape, np.nan, dtype=np.float32)

        # Absolute value keeps CV non-negative where the mean is negative.
        cv_data[valid_mask] = np.abs(sd_data[valid_mask] / mean_data[valid_mask]) * 100

        if output_path:
            with rasterio.open(output_path, 'w', **profile) as dst:
                dst.write(cv_data, 1)
            print(f"CV raster saved to: {output_path}")

        return cv_data, profile


def main():
    """
    Build the CV raster for one SOC pool and print summary statistics.

    The input and output paths are hard-coded below. If either input raster
    is missing, an error line is printed and the function returns without
    computing anything. Statistics are printed only when the CV array has at
    least one non-NaN pixel.
    """
    # To process another pool, replace the "passive_top" prefix in the three
    # file names with active_top, SOC_top, active_sub, SOC_sub or passive_sub.
    base_path = "E:/minmin/spreadingmapping/future"
    mean_file = os.path.join(base_path, "passive_top_prediction.tif")
    sd_file = os.path.join(base_path, "passive_top_uncertainty_spread.tif")
    output_file = os.path.join(base_path, "passive_top_CV.tif")

    # Check the inputs in order (mean first, then SD); stop at the first miss.
    required_inputs = (
        (mean_file, f"Error: Mean file not found at {mean_file}"),
        (sd_file, f"Error: SD file not found at {sd_file}"),
    )
    for raster_path, missing_message in required_inputs:
        if not os.path.exists(raster_path):
            print(missing_message)
            return

    print("\nCalculating CV...")
    cv_data, _ = calculate_cv(mean_file, sd_file, output_file)

    # Keep only the pixels for which a CV value exists.
    valid_cv = cv_data[~np.isnan(cv_data)]
    if len(valid_cv) == 0:
        return

    report_lines = [
        "\nCV Statistics:",
        f"  Min CV: {valid_cv.min():.2f}%",
        f"  Max CV: {valid_cv.max():.2f}%",
        f"  Mean CV: {valid_cv.mean():.2f}%",
        f"  Median CV: {np.median(valid_cv):.2f}%",
        f"  Valid pixels: {len(valid_cv)}",
        f"  Invalid pixels: {np.sum(np.isnan(cv_data) | np.isinf(cv_data))}",
    ]
    for line in report_lines:
        print(line)


if __name__ == "__main__":
    main()


In [None]:
# Raster-based coefficient of variation (CV) for the Lability Index (LI)
import rasterio
import numpy as np

# Combine the active-pool and passive-pool rasters in quadrature,
# out = sqrt(active**2 + passive**2), and write the result block by block.
# NOTE(review): the inputs are "uncertainty_spread" rasters while the output
# is named "*_CV"; a quadrature sum yields a relative uncertainty only if the
# inputs are themselves relative (CV) rasters — confirm the intended inputs.
active_path = r"E:/minmin/spreadingmapping/future/active_sub_future_uncertainty_spread.tif"  # alternatively replace with active_top
passive_path = r"E:/minmin/spreadingmapping/future/passive_sub_future_uncertainty_spread.tif"  # alternatively replace with passive_top
output_path = r"E:/minmin/spreadingmapping/future/LI_future_sub_CV.tif"

with rasterio.open(active_path) as src_a, rasterio.open(passive_path) as src_p:

    # Windows taken from src_a are reused to read src_p, so both rasters must
    # cover the same pixel grid.
    if (src_a.width, src_a.height) != (src_p.width, src_p.height):
        raise ValueError(
            f"Raster sizes differ: active is {src_a.width}x{src_a.height}, "
            f"passive is {src_p.width}x{src_p.height}"
        )

    meta = src_a.meta.copy()
    # Only band 1 is written, and masked pixels are stored as NaN, so the
    # output declares a single band with NaN as its nodata value.
    meta.update(dtype="float32", count=1, nodata=np.nan)

    with rasterio.open(output_path, "w", **meta) as dst:

        # Process one block at a time to keep memory usage low.
        for _, window in src_a.block_windows(1):

            # masked=True applies each dataset's nodata/mask so sentinel
            # values (e.g. -9999) become NaN instead of being squared.
            a = src_a.read(1, window=window, masked=True).astype("float64").filled(np.nan)
            p = src_p.read(1, window=window, masked=True).astype("float64").filled(np.nan)

            # NaN in either input propagates to the output pixel.
            out = np.sqrt(a**2 + p**2)

            dst.write(out.astype("float32"), 1, window=window)

print("Done.")
