In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from sklearn.preprocessing import StandardScaler
from torchvision import transforms

# ===== CPU 加速优化设置 =====
# 设置 PyTorch 使用多线程（充分利用 CPU 核心）
import multiprocessing
torch.set_num_threads(multiprocessing.cpu_count())  # 使用所有 CPU 核心
torch.set_num_interop_threads(multiprocessing.cpu_count())  # 设置线程间操作线程数
print(f"PyTorch CPU 线程数: {torch.get_num_threads()}")
print(f"系统 CPU 核心数: {multiprocessing.cpu_count()}")
# ============================

# --- 0. Re-run the data preparation steps ---
# (so that this notebook can be executed on its own)

# Make sure these paths are correct.
DATA_DIR = './csiro-biomass'  # folder that holds the data files
IMAGE_DIR = DATA_DIR  # folder that image paths are joined onto

df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

# Pivot from long format (one row per image/target_name pair) to wide format
# (one row per image, one column per target_name), averaging duplicate entries.
_index_cols = ['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
df_wide = (
    df.pivot_table(
        index=_index_cols,
        columns='target_name',
        values='target',
        aggfunc='mean',
    )
    .reset_index()
    .rename_axis(None, axis=1)  # drop the leftover 'target_name' label on the column axis
)
df_wide['Sampling_Date'] = pd.to_datetime(df_wide['Sampling_Date'])


# --- 1. Image preprocessing / augmentation ---
# Validation images only get the basic resize, tensor conversion and normalisation.
# Training images additionally get a random horizontal flip as augmentation.
def _build_transform(*augmentations):
    """Return the shared pipeline with ``augmentations`` inserted after the resize."""
    return transforms.Compose([
        transforms.Resize((224, 224)),
        *augmentations,
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # ImageNet mean / std
    ])


data_transforms = {
    'train': _build_transform(transforms.RandomHorizontalFlip()),
    'val': _build_transform(),
}


# --- 2. 自定义 PyTorch 数据集类 ---
class BiomassDataset(Dataset):
    """Image-only dataset yielding ``(image, log1p(targets))`` pairs.

    Image paths and targets are both snapshotted at construction time, so a
    sample's image and its target row cannot drift apart if the source
    DataFrame is modified afterwards.
    """

    def __init__(self, dataframe, image_dir, target_cols, transform=None):
        """
        Args:
            dataframe (pd.DataFrame): Wide-format frame with an ``image_path``
                column and one column per entry of ``target_cols``.
            image_dir (str): Directory that ``image_path`` values are joined onto.
            target_cols (list): Names of the target columns.
            transform (callable, optional): Transform applied to each PIL image.

        Raises:
            KeyError: If ``image_path`` or any of ``target_cols`` is missing.
        """
        self.df = dataframe
        self.image_dir = image_dir
        self.transform = transform

        # Plain Python list: avoids building a pandas row object on every
        # __getitem__ call and keeps paths aligned with the target snapshot.
        self.image_paths = self.df['image_path'].tolist()

        # Targets are stored in log1p space; callers invert with expm1.
        self.targets = np.log1p(self.df[target_cols].values.astype(np.float32))

    def __len__(self):
        """Return the number of samples captured at construction time."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            tuple: ``(image, target_row)`` where ``image`` is the (optionally
            transformed) RGB image and ``target_row`` is a float tensor holding
            the log1p-transformed targets.
        """
        # 1. Load the image; the context manager closes the file handle
        #    deterministically, and convert() returns an independent image.
        img_path = os.path.join(self.image_dir, self.image_paths[idx])
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # 2. Apply the image transform, if one was supplied.
        if self.transform is not None:
            image = self.transform(image)

        # 3. Fetch the matching targets (vision-only model, no tabular features).
        target_row = torch.tensor(self.targets[idx], dtype=torch.float)

        return image, target_row

# Report that the environment and the dataset class are ready.
for _message in (
    "--- PyTorch 环境和 Dataset 类定义完成 ---",
    f"PyTorch 版本: {torch.__version__}",
    "BiomassDataset 类已准备就绪。",
):
    print(_message)


PyTorch CPU 线程数: 12
系统 CPU 核心数: 12
--- PyTorch 环境和 Dataset 类定义完成 ---
PyTorch 版本: 2.9.1+cpu
BiomassDataset 类已准备就绪。


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class BiModalModel(nn.Module):
    """Two-branch regressor that fuses ResNet18 image features with tabular features."""

    def __init__(self, num_tabular_features, num_targets=5, pretrained=True):
        """
        Args:
            num_tabular_features (int): Number of input tabular features.
            num_targets (int): Number of values to predict (5 by default).
            pretrained (bool): Whether to load ImageNet weights into the CNN.
        """
        super().__init__()

        # --- 1. Image branch ---
        if pretrained:
            backbone_weights = models.ResNet18_Weights.IMAGENET1K_V1
        else:
            backbone_weights = None
        self.cnn = models.resnet18(weights=backbone_weights)

        # Remember the width of the features feeding the original classifier,
        # then swap the classifier for Identity so the CNN only extracts features.
        image_dim = self.cnn.fc.in_features
        self.cnn.fc = nn.Identity()

        # --- 2. Tabular branch ---
        tabular_dim = 64
        tabular_layers = [
            nn.Linear(num_tabular_features, 128),
            nn.BatchNorm1d(128),  # BatchNorm helps keep training stable
            nn.ReLU(),
            nn.Dropout(0.3),  # Dropout against overfitting
            nn.Linear(128, tabular_dim),
            nn.BatchNorm1d(tabular_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        self.tabular_mlp = nn.Sequential(*tabular_layers)

        # --- 3. Fusion head ---
        # Input width is the concatenation of both branches' features.
        fusion_layers = [
            nn.Linear(image_dim + tabular_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_targets),  # final predictions
        ]
        self.fusion_head = nn.Sequential(*fusion_layers)

    def forward(self, image, tabular):
        """Run both branches, concatenate along dim 1 and apply the fusion head."""
        branch_outputs = (self.cnn(image), self.tabular_mlp(tabular))
        fused = torch.cat(branch_outputs, dim=1)
        return self.fusion_head(fused)

print("--- 多模态模型 BiModalModel 定义完成 ---")
# Build one instance to check that the architecture is wired up correctly.
# (The count of 21 tabular features is an assumption carried over from stage two.)
# Only the structure is inspected here, so skip the pretrained weights: this
# avoids a network download that the structural check does not need.
test_model = BiModalModel(num_tabular_features=21, pretrained=False)
print("模型结构:")
print(test_model)



--- 多模态模型 BiModalModel 定义完成 ---
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\ExceptedGoat/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:35<00:00, 1.33MB/s]

模型结构:
BiModalModel(
  (cnn): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import torch.optim as optim

class VisualModel(nn.Module):
    """Vision-only regressor: a ResNet18 whose classifier is replaced by a small MLP head."""

    def __init__(self, num_targets=5, pretrained=True):
        """
        Args:
            num_targets (int): Number of values to predict (5 by default).
            pretrained (bool): Whether to load ImageNet weights into the ResNet18.
        """
        super().__init__()

        # ResNet18 backbone, optionally initialised from ImageNet weights.
        if pretrained:
            backbone_weights = models.ResNet18_Weights.IMAGENET1K_V1
        else:
            backbone_weights = None
        backbone = models.resnet18(weights=backbone_weights)

        # Swap the stock classifier for a regression head that maps the
        # backbone features straight to the targets.
        head_layers = [
            nn.Linear(backbone.fc.in_features, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_targets),
        ]
        backbone.fc = nn.Sequential(*head_layers)
        self.cnn = backbone

    def forward(self, image):
        """Image in, predictions out: the whole model is the modified CNN."""
        return self.cnn(image)
    
# --- 1. Final data preparation (simplified) ---
train_df, val_df = train_test_split(df_wide, test_size=0.2, random_state=42)

# --- 2. Build the Datasets and DataLoaders (no tabular features needed) ---
target_cols = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'GDM_g', 'Dry_Total_g']
# BiomassDataset takes the frame, the image directory, the target columns and a transform.
train_dataset = BiomassDataset(
    train_df.reset_index(drop=True),
    IMAGE_DIR,
    target_cols,
    transform=data_transforms['train'],
)
val_dataset = BiomassDataset(
    val_df.reset_index(drop=True),
    IMAGE_DIR,
    target_cols,
    transform=data_transforms['val'],
)

# ===== CPU performance: DataLoader settings =====
import multiprocessing
# Larger batches make better use of the CPU; lower this (e.g. 16 or 24) if memory runs out.
BATCH_SIZE = 32
# Worker processes frequently fail on Windows, so load data in the main process.
# On Linux/Mac a value of 2-8 can speed loading up.
NUM_WORKERS = 0

# Options shared by both loaders; only `shuffle` differs between them.
_loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=False,  # left off for CPU training
    persistent_workers=NUM_WORKERS > 0,  # keep workers alive between epochs when there are any
)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)
print(f"DataLoader 优化: batch_size={BATCH_SIZE}, num_workers={NUM_WORKERS}")
# ============================================

# --- 3. Set up the training components ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Seed PyTorch's global RNG before the model is built so that the new head's
# initial weights are reproducible from run to run. Anything else that draws
# from the same generator afterwards (e.g. DataLoader shuffling) follows suit.
# 42 matches the random_state used for the train/validation split.
SEED = 42
torch.manual_seed(SEED)

# ===== CPU performance: torch.backends settings =====
if device.type == 'cpu':
    # Make sure the oneDNN (MKL-DNN) backend is switched on.
    torch.backends.mkldnn.enabled = True
    print("CPU 优化已启用")
# ================================================

model = VisualModel().to(device)  # vision-only model
criterion = nn.MSELoss()
# All model parameters are handed to the optimiser, i.e. the whole network is fine-tuned.
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

# --- 4. Training and validation loop (simplified) ---
NUM_EPOCHS = 20
best_rmse = float('inf')

for epoch in range(NUM_EPOCHS):
    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [训练]")
    for images, targets in pbar:
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(images)  # the vision-only model takes images alone
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Weight each batch's loss by its size so the epoch figure below is a
        # per-sample average even when the last batch is smaller.
        running_loss += loss.item() * images.size(0)

    epoch_train_loss = running_loss / len(train_dataset)

    # ---- Validation pass ----
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        pbar_val = tqdm(val_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [验证]")
        for images, targets in pbar_val:
            images = images.to(device)
            targets = targets.to(device)
            outputs = model(images)

            # Undo the log1p applied to the targets so RMSE is in original units.
            preds_orig = np.expm1(outputs.cpu().numpy())
            targets_orig = np.expm1(targets.cpu().numpy())
            all_preds.append(preds_orig)
            all_targets.append(targets_orig)

    val_mse = mean_squared_error(np.concatenate(all_targets), np.concatenate(all_preds))
    val_rmse = np.sqrt(val_mse)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} -> 训练损失: {epoch_train_loss:.4f} | 验证 RMSE: {val_rmse:.4f}")

    # Checkpoint whenever the validation RMSE improves.
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        torch.save(model.state_dict(), 'best_visual_model.pth')
        print(f"  -> 新的最佳模型已保存，RMSE: {best_rmse:.4f}")

print(f"\n--- 训练完成 ---\n最好的纯视觉模型 RMSE 是: {best_rmse:.4f}")


DataLoader 优化: batch_size=32, num_workers=2
使用设备: cpu
CPU 优化已启用


Epoch 1/20 [训练]:   0%|          | 0/9 [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid(s) 21044, 25212) exited unexpectedly