In [1]:
# =======================================================
# 这个单元格包含了从第一阶段移植过来的数据清洗和重塑代码
# =======================================================
import pandas as pd
import numpy as np

# 1. 读取原始数据
# (请确保这里的路径是正确的)
DATA_DIR = './' # 或者你的数据所在路径
TRAIN_CSV = f'{DATA_DIR}/train.csv'
df = pd.read_csv(TRAIN_CSV)

# 2. 使用 pivot_table 将长格式转为宽格式
df_wide = pd.pivot_table(df, 
                         index=['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm'], 
                         columns='target_name', 
                         values='target',
                         aggfunc='mean').reset_index()

df_wide = df_wide.rename_axis(None, axis=1)

# 3. 转换日期类型
df_wide['Sampling_Date'] = pd.to_datetime(df_wide['Sampling_Date'])

print("df_wide 已成功创建！")
display(df_wide.head())


df_wide 已成功创建！


Unnamed: 0,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,Dry_Clover_g,Dry_Dead_g,Dry_Green_g,Dry_Total_g,GDM_g
0,train/ID1011485656.jpg,2015-09-04,Tas,Ryegrass_Clover,0.62,4.6667,0.0,31.9984,16.2751,48.2735,16.275
1,train/ID1012260530.jpg,2015-04-01,NSW,Lucerne,0.55,16.0,0.0,0.0,7.6,7.6,7.6
2,train/ID1025234388.jpg,2015-09-01,WA,SubcloverDalkeith,0.38,1.0,6.05,0.0,0.0,6.05,6.05
3,train/ID1028611175.jpg,2015-05-18,Tas,Ryegrass,0.66,5.0,0.0,30.9703,24.2376,55.2079,24.2376
4,train/ID1035947949.jpg,2015-09-11,Tas,Ryegrass,0.54,3.5,0.4343,23.2239,10.5261,34.1844,10.9605


In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# --- 1. 特征工程 (Feature Engineering) ---

# 复制一份数据，避免修改原始的 df_wide
df_model = df_wide.copy()

# a) 处理日期特征: 提取月份，并进行周期性编码 (cyclical encoding)
df_model['Month'] = df_model['Sampling_Date'].dt.month
df_model['Month_sin'] = np.sin(2 * np.pi * df_model['Month'] / 12)
df_model['Month_cos'] = np.cos(2 * np.pi * df_model['Month'] / 12)

# b) 处理类别特征: 使用 One-Hot Encoding
# pd.get_dummies 是一个实现 One-Hot Encoding 的简单方法
df_model = pd.get_dummies(df_model, columns=['State', 'Species'], drop_first=True)

# c) 定义特征列 (X) 和目标列 (y)
target_cols = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'GDM_g', 'Dry_Total_g']

# 从 df_model 中移除原始目标、日期和图片路径，剩下的就是特征
feature_cols = [col for col in df_model.columns if col not in target_cols + ['image_path', 'Sampling_Date', 'Month']]

X = df_model[feature_cols]
# !! 关键：使用我们之前确认的 log 变换 !!
y_log = np.log1p(df_model[target_cols])

print("--- 特征工程完成 ---")
print(f"使用的特征数量: {len(feature_cols)}")
print("特征列:", feature_cols)
print("\n")


# --- 2. 交叉验证与模型训练 ---

# 设置 5-Fold 交叉验证
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 存储每个目标在交叉验证中的平均 RMSE
oof_rmse_scores = {}

# 循环为每个目标训练一个模型
for target in target_cols:
    print(f"--- 正在训练目标: {target} ---")
    
    fold_scores = []
    
    for fold, (train_index, val_index) in enumerate(kf.split(X, y_log)):
        # 划分训练集和验证集
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train_log, y_val_log = y_log[target].iloc[train_index], y_log[target].iloc[val_index]
        
        # 初始化 LightGBM 模型
        model = lgb.LGBMRegressor(random_state=42)
        
        # 训练模型
        model.fit(X_train, y_train_log)
        
        # 在验证集上进行预测 (得到的是 log 变换后的值)
        val_preds_log = model.predict(X_val)
        
        # !! 关键：将预测结果和真实值都还原到原始尺度 !!
        val_preds = np.expm1(val_preds_log)
        y_val_orig = np.expm1(y_val_log)
        
        # 计算 RMSE
        rmse = np.sqrt(mean_squared_error(y_val_orig, val_preds))
        fold_scores.append(rmse)
        print(f"  Fold {fold+1} RMSE: {rmse:.4f}")
    
    # 计算该目标下 5-Fold 的平均 RMSE
    mean_rmse = np.mean(fold_scores)
    oof_rmse_scores[target] = mean_rmse
    print(f"-> {target} 平均 OOF RMSE: {mean_rmse:.4f}\n")


# --- 3. 总结最终得分 ---
print("--- Baseline 模型交叉验证结果总结 ---")
for target, score in oof_rmse_scores.items():
    print(f"{target}: {score:.4f}")

# 竞赛的最终得分是所有目标 RMSE 的平均值
final_score = np.mean(list(oof_rmse_scores.values()))
print(f"\n======================================")
print(f"平均 Baseline 得分 (Mean RMSE): {final_score:.4f}")
print("======================================")



--- 特征工程完成 ---
使用的特征数量: 21
特征列: ['Pre_GSHH_NDVI', 'Height_Ave_cm', 'Month_sin', 'Month_cos', 'State_Tas', 'State_Vic', 'State_WA', 'Species_Fescue', 'Species_Fescue_CrumbWeed', 'Species_Lucerne', 'Species_Mixed', 'Species_Phalaris', 'Species_Phalaris_BarleyGrass_SilverGrass_SpearGrass_Clover_Capeweed', 'Species_Phalaris_Clover', 'Species_Phalaris_Clover_Ryegrass_Barleygrass_Bromegrass', 'Species_Phalaris_Ryegrass_Clover', 'Species_Ryegrass', 'Species_Ryegrass_Clover', 'Species_SubcloverDalkeith', 'Species_SubcloverLosa', 'Species_WhiteClover']


--- 正在训练目标: Dry_Clover_g ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 285, number of used features: 11
[LightGBM] [Info] Start training from score 1.133399
  Fold 1 RMSE: 9.8432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, 

# 项目第二阶段总结：构建基准模型 (Baseline)

本阶段的目标是利用清洗后的**表格数据**，快速构建一个性能可靠的基准模型，为后续引入图像特征的复杂模型提供一个明确的评估标准。

### 1. 模型与策略

*   **模型**: 我们选择了 **LightGBM**，一个高效的梯度提升决策树模型，非常适合处理表格数据。
*   **特征工程**:
    *   **时间特征**: 从 `Sampling_Date` 中提取 `Month`，并进行 `sin/cos` 周期性编码，以捕捉季节性变化。
    *   **类别特征**: 对 `State` 和 `Species` 列进行 One-Hot 编码，将其转换为模型可用的数值格式。
*   **验证策略**: 采用 **5-Fold 交叉验证 (Cross-Validation)**，确保模型性能评估的稳定性和可靠性。
*   **目标处理**: 遵循第一阶段的结论，对所有 5 个目标变量进行 `np.log1p` 变换后进行训练，并在评估时用 `np.expm1` 还原。

### 2. 基准性能评估

我们为每个目标单独训练了一个模型，并计算了其在 5-Fold 交叉验证中的平均均方根误差 (RMSE)。

| 目标 (Target) | 平均 OOF RMSE |
| :--- | :--- |
| `Dry_Clover_g` | 3.2384 |
| `Dry_Dead_g` | 13.9782 |
| `Dry_Green_g` | 15.3411 |
| `GDM_g` | 14.8872 |
| `Dry_Total_g` | 11.0841 |

---

**最终基准分数 (竞赛指标)**:

### **平均 RMSE: 11.7058**

这个分数是我们的“黄金标准”。下一阶段的多模态模型必须超越这个表现，才能证明图像信息带来了额外的价值。
