In [None]:
def crop_to_patches(image, patch_size=(64, 64), overlap=0.2):
    """
    将图像裁剪成固定大小的块
    Args:
        image: 输入图像 [H, W, C]
        patch_size: 块大小 (height, width)
        overlap: 重叠比例 (0-1)
    Returns:
        patches: 裁剪后的图像块列表
        positions: 每个块的位置信息 [(y, x), ...]
    """
    h, w = image.shape[:2]
    stride_h = int(patch_size[0] * (1 - overlap))
    stride_w = int(patch_size[1] * (1 - overlap))
    
    patches = []
    positions = []
    
    # 计算非零区域的掩码
    if len(image.shape) == 3:
        mask = ~(image == 0).all(axis=2)
    else:
        mask = ~(image == 0)
    
    for y in range(0, h - patch_size[0] + 1, stride_h):
        for x in range(0, w - patch_size[1] + 1, stride_w):
            patch = image[y:y + patch_size[0], x:x + patch_size[1]]
            patch_mask = mask[y:y + patch_size[0], x:x + patch_size[1]]
            
            # 检查patch中非零像素的比例
            valid_ratio = np.sum(patch_mask) / (patch_size[0] * patch_size[1])
            
            # 如果非零像素比例大于阈值，保存该patch
            if valid_ratio > 0.7:  # 可以调整这个阈值
                patches.append(patch)
                positions.append((y, x))
    
    return patches, positions

def preprocess_image(image_path, patch_size=(128, 128), overlap=0.2):
    """
    预处理图像：移除空白区域并分割成小块
    """
    with rasterio.open(image_path) as src:
        image = src.read()
        nodata = src.nodata
        transform = src.transform
        crs = src.crs
        
        # 转换为[H, W, C]格式
        image = np.moveaxis(image, 0, -1)
        
        # 获取patches
        patches, positions = crop_to_patches(image, patch_size, overlap)
        
        return patches, positions, transform, crs

def batch_preprocess_images(input_dir, output_dir, patch_size=(128, 128), overlap=0.2):
    """
    批量处理图像
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    for image_path in input_dir.rglob('*.tif'):
        try:
            # 处理图像
            patches, positions, transform, crs = preprocess_image(
                str(image_path), patch_size, overlap
            )
            
            # 为每个patch创建文件名
            base_name = image_path.stem
            
            # 保存patches
            for idx, (patch, pos) in enumerate(zip(patches, positions)):
                output_name = f"{base_name}_patch{idx:03d}_y{pos[0]}_x{pos[1]}.tif"
                output_path = output_dir / output_name
                
                # 计算patch的地理变换矩阵
                patch_transform = rasterio.Affine(
                    transform.a, transform.b, transform.c + pos[1] * transform.a,
                    transform.d, transform.e, transform.f + pos[0] * transform.e
                )
                
                # 保存patch
                with rasterio.open(
                    str(output_path),
                    'w',
                    driver='GTiff',
                    height=patch_size[0],
                    width=patch_size[1],
                    count=patch.shape[2],
                    dtype=patch.dtype,
                    crs=crs,
                    transform=patch_transform,
                ) as dst:
                    for i in range(patch.shape[2]):
                        dst.write(patch[:, :, i], i + 1)
            
            print(f"已处理: {image_path.name}, 生成 {len(patches)} 个patches")
            
        except Exception as e:
            print(f"处理 {image_path.name} 时出错: {str(e)}")

if __name__ == "__main__":
    input_dir = r"D:\work\DATA\DATA_TS4GPC\processed\clip_CXZ_WN_2024"
    output_dir = r"D:\work\DATA\DATA_TS4GPC\processed\clip_CXZ_WN_2024_patches"
    # batch_preprocess_images(input_dir, output_dir, patch_size=(128, 128), overlap=0.2)

In [6]:
from get_image_paths import get_tif_files, get_list_time_band

input_dir = r"D:\work\DATA\DATA_TS4GPC\processed\clip_CXZ_WN_2024"
output_dir = r"D:\work\DATA\DATA_TS4GPC\processed\clip_CXZ_WN_2024_patches"
patch_size=(128, 128) 
overlap=0.2

tif_files = get_tif_files(input_dir)

image_path = tif_files[0]

In [3]:
import cv2
import numpy as np
from pathlib import Path
import rasterio

with rasterio.open(iamge_path) as src:
    image = src.read()
    nodata = src.nodata
    transform = src.transform
    crs = src.crs
    
    # 转换为[H, W, C]格式
    image = np.moveaxis(image, 0, -1)

image

array([[[-10000.],
        [-10000.],
        [-10000.],
        ...,
        [-10000.],
        [-10000.],
        [-10000.]],

       [[-10000.],
        [-10000.],
        [-10000.],
        ...,
        [-10000.],
        [-10000.],
        [-10000.]],

       [[-10000.],
        [-10000.],
        [-10000.],
        ...,
        [-10000.],
        [-10000.],
        [-10000.]],

       ...,

       [[-10000.],
        [-10000.],
        [-10000.],
        ...,
        [-10000.],
        [-10000.],
        [-10000.]],

       [[-10000.],
        [-10000.],
        [-10000.],
        ...,
        [-10000.],
        [-10000.],
        [-10000.]],

       [[-10000.],
        [-10000.],
        [-10000.],
        ...,
        [-10000.],
        [-10000.],
        [-10000.]]], shape=(585, 162, 1), dtype=float32)

In [4]:
def crop_to_patches(image, patch_size=(64, 64), overlap=0.2):
    """
    将图像裁剪成固定大小的块
    Args:
        image: 输入图像 [H, W, C]
        patch_size: 块大小 (height, width)
        overlap: 重叠比例 (0-1)
    Returns:
        patches: 裁剪后的图像块列表
        positions: 每个块的位置信息 [(y, x), ...]
    """
    h, w = image.shape[:2]
    stride_h = int(patch_size[0] * (1 - overlap))
    stride_w = int(patch_size[1] * (1 - overlap))
    
    patches = []
    positions = []
    
    # 计算非零区域的掩码
    if len(image.shape) == 3:
        mask = ~(image == 0).all(axis=2)
    else:
        mask = ~(image == 0)
    
    for y in range(0, h - patch_size[0] + 1, stride_h):
        for x in range(0, w - patch_size[1] + 1, stride_w):
            patch = image[y:y + patch_size[0], x:x + patch_size[1]]
            patch_mask = mask[y:y + patch_size[0], x:x + patch_size[1]]
            
            # 检查patch中非零像素的比例
            valid_ratio = np.sum(patch_mask) / (patch_size[0] * patch_size[1])
            
            # 如果非零像素比例大于阈值，保存该patch
            if valid_ratio > 0.7:  # 可以调整这个阈值
                patches.append(patch)
                positions.append((y, x))
    
    return patches, positions

patches, positions = crop_to_patches(image, patch_size=(64, 64), overlap=0.2)
patches, positions

([array([[[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04]],
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04]],
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04]],
  
         ...,
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [ 3.5470046e-02],
          [ 3.7978251e-02],
          [ 3.5846300e-02]],
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [ 3.6968179e-02],
          [ 4.1913848e-02],
          [ 4.0638614e-02]],
  
         [[-1.0000000e+04],
          [-1.000000

In [None]:
try:
    # 处理图像
    patches, positions, transform, crs = preprocess_image(
        str(image_path), patch_size, overlap
    )
    
    # 为每个patch创建文件名
    base_name = image_path.stem
    
    # 保存patches
    for idx, (patch, pos) in enumerate(zip(patches, positions)):
        output_name = f"{base_name}_patch{idx:03d}_y{pos[0]}_x{pos[1]}.tif"
        output_path = output_dir / output_name
        
        # 计算patch的地理变换矩阵
        patch_transform = rasterio.Affine(
            transform.a, transform.b, transform.c + pos[1] * transform.a,
            transform.d, transform.e, transform.f + pos[0] * transform.e
        )
        
        # 保存patch
        with rasterio.open(
            str(output_path),
            'w',
            driver='GTiff',
            height=patch_size[0],
            width=patch_size[1],
            count=patch.shape[2],
            dtype=patch.dtype,
            crs=crs,
            transform=patch_transform,
        ) as dst:
            for i in range(patch.shape[2]):
                dst.write(patch[:, :, i], i + 1)
    
    print(f"已处理: {image_path.name}, 生成 {len(patches)} 个patches")
    
except Exception as e:
    print(f"处理 {image_path.name} 时出错: {str(e)}")

In [7]:
# 处理图像
patches, positions, transform, crs = preprocess_image(
    str(image_path), patch_size, overlap
)

In [8]:
patches, positions, transform, crs

([array([[[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04]],
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04]],
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04]],
  
         ...,
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [ 4.4337947e-02],
          [ 4.1424181e-02],
          [ 4.1115895e-02]],
  
         [[-1.0000000e+04],
          [-1.0000000e+04],
          [-1.0000000e+04],
          ...,
          [ 4.4310339e-02],
          [ 4.4667747e-02],
          [ 4.5832194e-02]],
  
         [[-1.0000000e+04],
          [-1.000000

In [10]:
# 为每个patch创建文件名
# 将字符串路径转换为 Path 对象
image_path = Path(image_path)
base_name = image_path.stem
base_name

'CXZ-WN-240130-Blue-000'

In [12]:
input_dir = Path(input_dir)
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

In [13]:




# 保存patches
for idx, (patch, pos) in enumerate(zip(patches, positions)):
    output_name = f"{base_name}_patch{idx:03d}_y{pos[0]}_x{pos[1]}.tif"
    output_path = output_dir / output_name
    
    # 计算patch的地理变换矩阵
    patch_transform = rasterio.Affine(
        transform.a, transform.b, transform.c + pos[1] * transform.a,
        transform.d, transform.e, transform.f + pos[0] * transform.e
    )
    
    # 保存patch
    with rasterio.open(
        str(output_path),
        'w',
        driver='GTiff',
        height=patch_size[0],
        width=patch_size[1],
        count=patch.shape[2],
        dtype=patch.dtype,
        crs=crs,
        transform=patch_transform,
    ) as dst:
        for i in range(patch.shape[2]):
            dst.write(patch[:, :, i], i + 1)

print(f"已处理: {image_path.name}, 生成 {len(patches)} 个patches")

已处理: CXZ-WN-240130-Blue-000.tif, 生成 5 个patches


In [None]:
import cv2
import os
from pathlib import Path
import rasterio
import numpy as np

def normalize_resolution(image, target_resolution=(580, 160)):
    """
    将图像降采样到目标分辨率
    Args:
        image: 输入图像 [H, W, C]
        target_resolution: 目标分辨率 (height, width)
    Returns:
        normalized_image: 降采样后的图像
    """
    current_h, current_w = image.shape[:2]
    
    # 计算缩放比例
    scale_h = target_resolution[0] / current_h
    scale_w = target_resolution[1] / current_w
    scale = min(scale_h, scale_w)
    
    # 计算新的尺寸
    new_h = int(current_h * scale)
    new_w = int(current_w * scale)
    
    # 使用双线性插值进行降采样
    normalized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    
    return normalized_image

def batch_normalize_images(input_dir, output_dir, target_resolution=(580, 160)):
    """
    批量处理图像并保存
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    for image_path in input_dir.rglob('*.tif'):
        try:
            # 读取tif文件
            with rasterio.open(str(image_path)) as src:
                # 获取图像信息
                count = src.count
                if count == 0:
                    print(f"跳过空图像: {image_path.name}")
                    continue
                
                # 读取图像数据
                image = src.read()
                transform = src.transform
                crs = src.crs
                
                # 处理图像维度
                if len(image.shape) == 4:
                    # 如果是(C, H, W, 1)或(C, H, W, 1, 1)格式，去掉多余的维度
                    image = image.squeeze()
                
                # 处理单通道图像
                if len(image.shape) == 2:
                    image = np.expand_dims(image, axis=0)
                
                # 转换为[H, W, C]格式
                image = np.moveaxis(image, 0, -1)
                
                # 归一化分辨率
                normalized_image = normalize_resolution(image, target_resolution)
                
                # 转回[C, H, W]格式
                normalized_image = np.moveaxis(normalized_image, -1, 0)
                
                # 构建输出路径
                output_path = output_dir / image_path.name
                
                # 更新变换矩阵
                scale_h = normalized_image.shape[1] / src.height
                scale_w = normalized_image.shape[2] / src.width
                new_transform = rasterio.Affine(
                    transform.a / scale_w, transform.b, transform.c,
                    transform.d, transform.e / scale_h, transform.f
                )
                
                # 保存归一化后的图像
                with rasterio.open(
                    str(output_path),
                    'w',
                    driver='GTiff',
                    height=normalized_image.shape[1],
                    width=normalized_image.shape[2],
                    count=normalized_image.shape[0],
                    dtype=normalized_image.dtype,
                    crs=crs,
                    transform=new_transform,
                ) as dst:
                    dst.write(normalized_image)
                
            print(f"已处理: {image_path.name}, shape={normalized_image.shape}")
            
        except Exception as e:
            print(f"处理 {image_path.name} 时出错: {str(e)}")
            import traceback
            print(traceback.format_exc())

if __name__ == "__main__":
    input_dir = r"D:\work\DATA\DATA_TS4GPC\processed\clip_CXZ_WN_2024"
    output_dir = r"D:\work\DATA\DATA_TS4GPC\processed\clip_CXZ_WN_2024_normalized"
    batch_normalize_images(input_dir, output_dir)

In [2]:
import pandas as pd
from pathlib import Path
import json

image_dir = r"D:\work\DATA\DATA_TS4GPC\processed\clip_CXZ_WN_2024_patches_merged"
gpc_file = r'D:\work\DATA\DATA_TS4GPC\processed\gpc_data.csv'
output_file = r"train_doc.json"

gpc_data = pd.read_csv(gpc_file)
gpc_data

Unnamed: 0,品质_蛋白
0,9.500000
1,9.800000
2,9.100000
3,10.050000
4,9.400000
...,...
235,13.666667
236,14.500000
237,12.800000
238,14.900000


In [3]:
# 获取所有图像文件
image_dir = Path(image_dir)
image_files = list(image_dir.glob('CX-WN-2024_*_patch*_merged.tif'))

image_files

[WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_000_patch000_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_000_patch001_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_000_patch002_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_000_patch003_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_001_patch000_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_001_patch001_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_001_patch002_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed/clip_CXZ_WN_2024_patches_merged/CX-WN-2024_001_patch003_merged.tif'),
 WindowsPath('D:/work/DATA/DATA_TS4GPC/processed

In [9]:
# 创建训练文档
training_data = []
for img_path in image_files:
    try:
        # 解析文件名获取plot_id
        # 文件名格式: CX-WN-2024_000_patch000_merged.tif
        parts = img_path.stem.split('_')  # ['CX-WN-2024', '000', 'patch000', 'merged']
        plot_id = int(parts[1])  # 获取plot_id (000)

        print(f"处理文件 {img_path.name}")
        print(f"plot_id: {plot_id}")
        
        # 检查plot_id是否在GPC数据中
        if plot_id < len(gpc_data):
            gpc_value = gpc_data.iloc[plot_id]['品质_蛋白']
            print(f"GPC值: {gpc_value}")
            
            # 创建数据项
            data_item = {
                'image_path': str(img_path),
                'plot_id': plot_id,
                'patch_id': int(parts[2].replace('patch', '')),  # 获取patch编号
                'gpc_value': float(gpc_value)
            }
            training_data.append(data_item)
    except Exception as e:
        print(f"处理文件 {img_path.name} 时出错: {str(e)}")
        continue

处理文件 CX-WN-2024_000_patch000_merged.tif
plot_id: 0
GPC值: 9.5
处理文件 CX-WN-2024_000_patch001_merged.tif
plot_id: 0
GPC值: 9.5
处理文件 CX-WN-2024_000_patch002_merged.tif
plot_id: 0
GPC值: 9.5
处理文件 CX-WN-2024_000_patch003_merged.tif
plot_id: 0
GPC值: 9.5
处理文件 CX-WN-2024_001_patch000_merged.tif
plot_id: 1
GPC值: 9.8
处理文件 CX-WN-2024_001_patch001_merged.tif
plot_id: 1
GPC值: 9.8
处理文件 CX-WN-2024_001_patch002_merged.tif
plot_id: 1
GPC值: 9.8
处理文件 CX-WN-2024_001_patch003_merged.tif
plot_id: 1
GPC值: 9.8
处理文件 CX-WN-2024_002_patch000_merged.tif
plot_id: 2
GPC值: 9.1
处理文件 CX-WN-2024_002_patch001_merged.tif
plot_id: 2
GPC值: 9.1
处理文件 CX-WN-2024_002_patch002_merged.tif
plot_id: 2
GPC值: 9.1
处理文件 CX-WN-2024_002_patch003_merged.tif
plot_id: 2
GPC值: 9.1
处理文件 CX-WN-2024_003_patch000_merged.tif
plot_id: 3
GPC值: 10.05
处理文件 CX-WN-2024_003_patch001_merged.tif
plot_id: 3
GPC值: 10.05
处理文件 CX-WN-2024_003_patch002_merged.tif
plot_id: 3
GPC值: 10.05
处理文件 CX-WN-2024_003_patch003_merged.tif
plot_id: 3
GPC值: 10.05
处理文件 CX-WN-2024_

In [8]:
gpc_value = gpc_data.iloc[plot_id]['GPC']

KeyError: 'GPC'

In [7]:
plot_id < len(gpc_data)

True

In [5]:
len(gpc_data)

240