In [None]:
# DEPRECATED: this script contains a logic error and is kept for archival purposes only.
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from sklearn.preprocessing import StandardScaler
from torchvision import transforms
from tqdm.notebook import tqdm

# =============================================================================
# Step 0: environment setup and class definitions
# (these definitions must match the ones used at training time)
# =============================================================================

print("--- 步骤 0: 环境设置与类定义 ---")

# --- Model architecture (copied from the training script) ---
class BiModalModel(nn.Module):
    """Two-branch regressor fusing ResNet-18 image features with tabular features.

    The submodule attribute names (``cnn``, ``tabular_mlp``, ``fusion_head``)
    and the order of layers inside each ``nn.Sequential`` define the
    ``state_dict`` keys, so they must not change if saved checkpoints are to
    keep loading.

    Args:
        num_tabular_features: Width of the tabular input vector.
        num_targets: Number of regression outputs produced per sample.
        pretrained: When true, the ResNet-18 backbone is created with the
            ``IMAGENET1K_V1`` weights; otherwise it is created without weights.
    """

    def __init__(self, num_tabular_features, num_targets=5, pretrained=True):
        super().__init__()

        # Image branch: ResNet-18 whose final classifier is replaced by a
        # pass-through, so the branch emits the pooled feature vector.
        backbone_weights = None
        if pretrained:
            backbone_weights = models.ResNet18_Weights.IMAGENET1K_V1
        self.cnn = models.resnet18(weights=backbone_weights)
        num_cnn_features = self.cnn.fc.in_features
        self.cnn.fc = nn.Identity()

        # Tabular branch: two Linear -> BatchNorm -> ReLU -> Dropout stages.
        tabular_layers = [
            nn.Linear(num_tabular_features, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        self.tabular_mlp = nn.Sequential(*tabular_layers)

        # Fusion head: operates on the concatenation of both branch outputs.
        total_features = num_cnn_features + 64
        fusion_layers = [
            nn.Linear(total_features, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_targets),
        ]
        self.fusion_head = nn.Sequential(*fusion_layers)

    def forward(self, image, tabular):
        """Return the fusion head's output for a batch of images and tabular rows."""
        # Concatenate along dim=1, image features first, then tabular features.
        fused = torch.cat((self.cnn(image), self.tabular_mlp(tabular)), dim=1)
        return self.fusion_head(fused)

# --- Inference-time dataset (yields no target) ---
class InferenceDataset(Dataset):
    """Dataset yielding ``(image, tabular_row)`` pairs for prediction.

    Args:
        dataframe: Table with an ``image_path`` column; row ``i`` names the
            image file of sample ``i``.
        image_dir: Directory each ``image_path`` value is joined onto.
        tabular_features: DataFrame of model input features, row-aligned with
            ``dataframe``. It is converted once to a ``float32`` array.
        transform: Optional callable applied to the loaded RGB image.

    Raises:
        ValueError: If ``dataframe`` and ``tabular_features`` have different
            numbers of rows (the two are paired purely by position).
    """

    def __init__(self, dataframe, image_dir, tabular_features, transform=None):
        # Rows are matched by position only, so a length mismatch would either
        # fail late with an IndexError or silently pair the wrong rows.
        if len(dataframe) != len(tabular_features):
            raise ValueError(
                "dataframe and tabular_features must have the same number of rows "
                f"(got {len(dataframe)} and {len(tabular_features)})"
            )
        self.df = dataframe
        self.image_dir = image_dir
        self.transform = transform
        self.tabular_data = tabular_features.values.astype(np.float32)

    def __len__(self):
        """Return the number of samples (rows of ``dataframe``)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, tabular_row)``."""
        # Select the column first so no full row Series is built per item.
        img_name = self.df['image_path'].iloc[idx]
        img_path = os.path.join(self.image_dir, img_name)
        # convert() returns a new, fully loaded image, so the file handle can
        # be closed deterministically as soon as the conversion is done.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        tabular_row = torch.tensor(self.tabular_data[idx], dtype=torch.float)
        return image, tabular_row

# --- Image preprocessing (the validation-time transform) ---
# Per-channel mean and standard deviation handed to Normalize.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Applied in order: resize to 224x224, convert to a tensor, then normalise.
_transform_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
data_transforms = transforms.Compose(_transform_steps)

print("环境设置完毕。\n")

# =============================================================================
# Step 1: load the trained model
# =============================================================================

print("--- 步骤 1: 加载模型 ---")

# Checkpoint path and compute device
MODEL_PATH = 'best_model.pth'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The training table is rebuilt here to recover the tabular feature columns
# (and is reused later to fit the scaler). This keeps the script standalone;
# in a real project the fitted scaler should be saved and loaded separately.
train_full_df = pd.read_csv('train.csv')

# train.csv holds one row per (image, target_name); pivot to one row per image
# with one column per target.
pivot_index = ['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
train_df_wide = pd.pivot_table(
    train_full_df,
    index=pivot_index,
    columns='target_name',
    values='target',
    aggfunc='mean',
).reset_index()

# Cyclical month encoding, so that month 12 and month 1 end up adjacent.
train_df_wide['Sampling_Date'] = pd.to_datetime(train_df_wide['Sampling_Date'])
train_df_wide['Month'] = train_df_wide['Sampling_Date'].dt.month
train_df_wide['Month_sin'] = np.sin(2 * np.pi * train_df_wide['Month'] / 12)
train_df_wide['Month_cos'] = np.cos(2 * np.pi * train_df_wide['Month'] / 12)
train_df_wide = pd.get_dummies(train_df_wide, columns=['State', 'Species'], drop_first=True)

target_cols = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'GDM_g', 'Dry_Total_g']
# Everything that is neither a target nor an identifier/raw date column is a feature.
non_feature_cols = set(target_cols) | {'image_path', 'Sampling_Date', 'Month'}
feature_cols = [col for col in train_df_wide.columns if col not in non_feature_cols]

# Build the network without ImageNet weights: every parameter and buffer is
# overwritten by the checkpoint below, so fetching pretrained weights would
# only add a needless download (and a network dependency).
inference_model = BiModalModel(num_tabular_features=len(feature_cols), pretrained=False).to(device)

# weights_only=True restricts unpickling to tensors and plain containers and
# avoids the torch.load FutureWarning about the unsafe default.
# NOTE(review): assumes best_model.pth was written with
# torch.save(model.state_dict(), ...) - confirm against the training script.
state_dict = torch.load(MODEL_PATH, map_location=device, weights_only=True)
inference_model.load_state_dict(state_dict)

# Evaluation mode is required so BatchNorm uses its running statistics and
# Dropout is disabled.
inference_model.eval()

print(f"模型 '{MODEL_PATH}' 加载成功，并已设置为评估模式。\n")


# =============================================================================
# Step 2: prepare the test data and preprocessing
# =============================================================================

print("--- 步骤 2: 准备测试数据 ---")

# Load the test-set metadata
test_df = pd.read_csv('test.csv')

# The tabular branch needs the same raw columns the training features were
# built from. Fail early with an explicit message rather than with a bare
# KeyError from the first column access.
required_test_cols = ['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
missing_test_cols = [col for col in required_test_cols if col not in test_df.columns]
if missing_test_cols:
    raise KeyError(
        f"test.csv is missing columns required to build the tabular features: {missing_test_cols}"
    )

# One prediction row per image. If test.csv repeats an image_path (for example
# one row per target, like train.csv), keep only the first occurrence;
# otherwise every image would be predicted several times and the submission
# would contain duplicate sample ids.
# NOTE(review): assumes repeated rows of an image carry identical metadata - confirm.
test_df = test_df.drop_duplicates(subset='image_path').reset_index(drop=True)

# Same feature engineering as for the training table
test_df['Sampling_Date'] = pd.to_datetime(test_df['Sampling_Date'])
test_df['Month'] = test_df['Sampling_Date'].dt.month
test_df['Month_sin'] = np.sin(2 * np.pi * test_df['Month'] / 12)
test_df['Month_cos'] = np.cos(2 * np.pi * test_df['Month'] / 12)

# Encode ALL test categories and then align to the training columns.
# Using drop_first=True here would drop whichever category sorts first in the
# *test* data, which need not be the category dropped for the training table,
# and would silently encode that category as the training baseline.
test_df = pd.get_dummies(test_df, columns=['State', 'Species'])

# reindex keeps the training column order, fills dummy columns absent from the
# test set with 0, and discards columns the model never saw (the training
# baseline category and any category unseen in training both become all-zero).
test_df = test_df.reindex(columns=feature_cols + ['image_path'], fill_value=0)

# Scale with statistics estimated on the *training* table, never on the test set.
# NOTE(review): the scaler is refit here on all training rows; if training fit
# it on a subset, the statistics differ slightly - confirm or load the saved scaler.
scaler = StandardScaler()
numerical_cols = ['Pre_GSHH_NDVI', 'Height_Ave_cm', 'Month_sin', 'Month_cos']
scaler.fit(train_df_wide[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])


# Build the test dataset and loader (shuffle=False keeps predictions in row order)
IMAGE_DIR = './'
test_dataset = InferenceDataset(test_df, IMAGE_DIR, test_df[feature_cols], transform=data_transforms)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0)

print("测试数据准备完毕。\n")


# =============================================================================
# Step 3: run prediction
# =============================================================================

print("--- 步骤 3: 开始预测 ---")

all_predictions = []
progress = tqdm(test_loader, desc="预测中")
with torch.no_grad():  # no gradients are needed for inference
    for batch_images, batch_tabular in progress:
        batch_images = batch_images.to(device)
        batch_tabular = batch_tabular.to(device)

        # The raw model output is mapped back with expm1, i.e. it is treated
        # as a log1p-scaled prediction.
        log_preds = inference_model(batch_images, batch_tabular)
        all_predictions.append(np.expm1(log_preds.cpu().numpy()))

# Stack the per-batch results into a single array along the sample axis
predictions_array = np.concatenate(all_predictions, axis=0)
print("预测完成。\n")


# =============================================================================
# Step 4: write the submission file (submission.csv)
# =============================================================================

print("--- 步骤 4: 创建提交文件 ---")

# Wide table: one row per test row, one column per target
pred_df = pd.DataFrame(predictions_array, columns=target_cols)

# Attach image_path by position (index reset so both frames align on 0..n-1)
image_path_frame = test_df[['image_path']].reset_index(drop=True)
wide_submission = pd.concat([image_path_frame, pred_df], axis=1)

# Wide -> long: one row per (image_path, target_name)
submission_df = wide_submission.melt(
    id_vars=['image_path'],
    value_vars=target_cols,
    var_name='target_name',
    value_name='target',
)

# sample_id is "<image_path>_<target_name>"
id_prefix = submission_df['image_path'] + '_'
submission_df['sample_id'] = id_prefix + submission_df['target_name']

# Keep only the two submission columns and save
submission_columns = ['sample_id', 'target']
final_submission = submission_df[submission_columns]
final_submission.to_csv('submission.csv', index=False)

separator = "=" * 50
print(separator)
print("提交文件 'submission.csv' 已成功生成！")
print("文件预览:")
print(final_submission.head())
print(separator)



--- 步骤 0: 环境设置与类定义 ---
环境设置完毕。

--- 步骤 1: 加载模型 ---


  inference_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))


模型 'best_model.pth' 加载成功，并已设置为评估模式。

--- 步骤 2: 准备测试数据 ---


KeyError: 'Sampling_Date'