In [1]:
import pandas as pd
import numpy as np

# --- 1. 加载所有数据文件 ---
print("⚙️ 步骤 1: 正在加载数据...")
base_path = 'E:\Kaggle\Store Sales - Time Series Forecasting/'
train_df = pd.read_csv(f'{base_path}train.csv', parse_dates=['date'])
test_df = pd.read_csv(f'{base_path}test.csv', parse_dates=['date'])
stores_df = pd.read_csv(f'{base_path}stores.csv')
oil_df = pd.read_csv(f'{base_path}oil.csv', parse_dates=['date'])
holidays_df = pd.read_csv(f'{base_path}holidays_events.csv', parse_dates=['date'])
transactions_df = pd.read_csv(f'{base_path}transactions.csv', parse_dates=['date'])
print("✅ 数据加载成功！")

# --- 2. 合并数据 ---
print("\n⚙️ 步骤 2: 正在合并数据...")
# 合并训练集
df_train_full = train_df.merge(stores_df, on='store_nbr', how='left')
df_train_full = df_train_full.merge(transactions_df, on=['date', 'store_nbr'], how='left')
df_train_full = df_train_full.merge(oil_df, on='date', how='left')

# 合并测试集
df_test_full = test_df.merge(stores_df, on='store_nbr', how='left')
df_test_full = df_test_full.merge(transactions_df, on=['date', 'store_nbr'], how='left')
df_test_full = df_test_full.merge(oil_df, on='date', how='left')
print("✅ 数据合并成功！")

# --- 3. 填充缺失值 ---
print("\n⚙️ 步骤 3: 正在填充缺失值...")
for df in [df_train_full, df_test_full]:
    df['dcoilwtico'] = df['dcoilwtico'].ffill().bfill()
    df['transactions'] = df['transactions'].fillna(0)
print("✅ 缺失值填充成功！")

⚙️ 步骤 1: 正在加载数据...
✅ 数据加载成功！

⚙️ 步骤 2: 正在合并数据...
✅ 数据合并成功！

⚙️ 步骤 3: 正在填充缺失值...
✅ 缺失值填充成功！


In [2]:
import pandas as pd
import numpy as np

# --- 4. Advanced Feature Engineering (Inspired by R script) ---
print("\n⚙️ 步骤 4: 正在进行高级特征工程...")

# 合并训练和测试集，以便统一处理。'sales' 列被保留用于创建滞后和编码特征。
df_combined = pd.concat([df_train_full, df_test_full], sort=False, ignore_index=True)
df_combined = df_combined.sort_values(by=['store_nbr', 'family', 'date'])


# --- Part A: Date-based & Holiday Features ---
print("    - 创建日期、趋势、傅里叶和节假日特征...")
# 创建一个包含所有日期的骨架，以计算不依赖于分组的特征
all_dates = pd.DataFrame({'date': df_combined['date'].unique()}).sort_values('date')

# 1. 趋势特征
all_dates['trend'] = np.arange(len(all_dates))

# 2. 傅里叶特征 (Fourier Features) for Seasonality
K = 2 # 使用2个谐波
for k in range(1, K + 1):
    # Yearly seasonality
    all_dates[f'fourier_sin_y{k}'] = np.sin(2 * np.pi * k * all_dates['date'].dt.dayofyear / 365.25)
    all_dates[f'fourier_cos_y{k}'] = np.cos(2 * np.pi * k * all_dates['date'].dt.dayofyear / 365.25)
    # Weekly seasonality
    all_dates[f'fourier_sin_w{k}'] = np.sin(2 * np.pi * k * all_dates['date'].dt.dayofweek / 7)
    all_dates[f'fourier_cos_w{k}'] = np.cos(2 * np.pi * k * all_dates['date'].dt.dayofweek / 7)

# 3. 高级节假日特征
h_df = holidays_df[holidays_df['transferred'] == False].copy()
h_df_national = h_df[h_df['locale'] == 'National'].rename(columns={'description': 'holiday_national'})[['date', 'holiday_national']]
h_df_regional = h_df[h_df['locale'] == 'Regional'].rename(columns={'description': 'holiday_regional'})[['date', 'holiday_regional']]
h_df_local = h_df[h_df['locale'] == 'Local'].rename(columns={'description': 'holiday_local'})[['date', 'holiday_local']]

# 创建标志位
h_df_national['is_national_holiday'] = 1
h_df_regional['is_regional_holiday'] = 1
h_df_local['is_local_holiday'] = 1

# 合并节假日标志
all_dates = all_dates.merge(h_df_national.groupby('date').first(), on='date', how='left')
all_dates = all_dates.merge(h_df_regional.groupby('date').first(), on='date', how='left')
all_dates = all_dates.merge(h_df_local.groupby('date').first(), on='date', how='left')
holiday_cols = ['is_national_holiday', 'is_regional_holiday', 'is_local_holiday']
all_dates[holiday_cols] = all_dates[holiday_cols].fillna(0)

# 4. 合并所有基于日期的特征到主数据框
df_featured = df_combined.merge(all_dates.drop(['holiday_national', 'holiday_regional', 'holiday_local'], axis=1, errors='ignore'), on='date', how='left')

# 5. 其他常规日期特征
df_featured['dayofweek'] = df_featured['date'].dt.dayofweek
df_featured['month'] = df_featured['date'].dt.month
df_featured['is_payday'] = ((df_featured['date'].dt.day == 15) | (df_featured['date'].dt.is_month_end)).astype(int)


# --- Part B: Target Encoding ---
print("    - 创建目标编码特征...")
# 使用2017-07-01之前的数据进行编码，防止数据泄露
encoding_cutoff = "2017-07-01"
temp_df = df_featured[df_featured['date'] < encoding_cutoff]

# 计算均值
family_mean = temp_df.groupby('family')['sales'].mean().to_dict()
store_family_mean = temp_df.groupby(['store_nbr', 'family'])['sales'].mean().to_dict()

# 映射到特征
df_featured['enc_family_mean'] = df_featured['family'].map(family_mean)
df_featured['enc_store_family_mean'] = df_featured.set_index(['store_nbr', 'family']).index.map(store_family_mean)

# 填充可能因新组合产生的NaN
df_featured['enc_family_mean'].fillna(np.mean(list(family_mean.values())), inplace=True)
df_featured['enc_store_family_mean'].fillna(df_featured['enc_family_mean'], inplace=True)


# --- Part C: Lag & Rolling Features ---
print("    - 创建滞后和滚动特征...")
# 重新排序以确保分组计算正确
df_featured = df_featured.sort_values(by=['store_nbr', 'family', 'date'])

# 定义要创建特征的列和参数
lag_features = {
    'sales': [7, 14, 28],
    'onpromotion': [1, 7, 14],
}
rolling_features = {
    'sales': [7, 14, 28],
    'onpromotion': [7, 14],
    'transactions': [7, 14]
}

# 创建滞后特征
for col, lags in lag_features.items():
    for lag in lags:
        df_featured[f'{col}_lag_{lag}'] = df_featured.groupby(['store_nbr', 'family'])[col].shift(lag)

# 创建滚动特征
for col, windows in rolling_features.items():
    for window in windows:
        # 使用shift(1)来防止在计算中使用当天的数据
        shifted_data = df_featured.groupby(['store_nbr', 'family'])[col].shift(1)
        df_featured[f'{col}_rolling_mean_{window}'] = shifted_data.rolling(window).mean()
        df_featured[f'{col}_rolling_std_{window}'] = shifted_data.rolling(window).std()

# --- Part D: Final Imputation & Split ---
print("    - 进行最终填充和数据分割...")
# 填充因滞后和滚动产生的NaN值
# 首先用前向填充处理组内的NaN，然后用0填充剩余的（通常是每个组的开头）
df_featured.fillna(method='ffill', inplace=True)
df_featured.fillna(0, inplace=True)

# 分离回最终的训练集和测试集
df_train_final = df_featured[df_featured['id'].isin(train_df['id'])].copy()
df_test_final = df_featured[df_featured['id'].isin(test_df['id'])].copy()

print("✅ 高级特征工程完成！")
print("最终训练集形状:", df_train_final.shape)
print("最终测试集形状:", df_test_final.shape)


⚙️ 步骤 4: 正在进行高级特征工程...
    - 创建日期、趋势、傅里叶和节假日特征...
    - 创建目标编码特征...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_featured['enc_family_mean'].fillna(np.mean(list(family_mean.values())), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_featured['enc_store_family_mean'].fillna(df_featured['enc_family_mean'], inplace=True)


    - 创建滞后和滚动特征...
    - 进行最终填充和数据分割...


  df_featured.fillna(method='ffill', inplace=True)


✅ 高级特征工程完成！
最终训练集形状: (3000888, 49)
最终测试集形状: (28512, 49)


In [10]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import optuna

# 确保 Optuna 的日志级别设置为 WARNING，避免过多输出
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- 1. 准备最终的特征和目标 ---
print("⚙️ 从最终数据中分离目标变量和特征...")
features = [col for col in df_train_final.columns if col not in ['id', 'date', 'sales']]
test_ids = df_test_final['id']
X_train = df_train_final[features].copy()
X_test = df_test_final[features].copy()

# --- 2. 为 LightGBM 准备数据 ---
print("⚙️ 准备 LightGBM 数据格式...")
categorical_features = [
    "store_nbr", "family", "city", "state", "type", "dayofweek", "month",
    "is_national_holiday", "is_regional_holiday", "is_local_holiday"
]
categorical_features = [f for f in categorical_features if f in features]

print("    - 转换分类特征的数据类型为 'category'...")
for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# --- 3. 创建基于时间的验证集 ---
print("⚙️ 创建验证集...")
val_start_date = '2017-07-26'
val_mask = df_train_final['date'] >= val_start_date

# ===================================================================
#  模型 1: 分类器 (使用 Optuna 进行贝叶斯优化)
# ===================================================================
print("\n--- 开始优化模型 1: 分类器 ---")
y_classifier = (df_train_final['sales'] > 0).astype(int)
X_train_clf, X_val_clf = X_train[~val_mask], X_train[val_mask]
y_train_clf, y_val_clf = y_classifier[~val_mask], y_classifier[val_mask]

# *** 加速优化 1: 为 Optuna 创建一个数据子样本 ***
# 在耗时的调优阶段使用部分数据，可以极大提升速度
print("    - 为加速优化，创建数据子样本...")
N_OPTUNA_SAMPLES = 750000
optuna_clf_indices = np.random.choice(X_train_clf.index, N_OPTUNA_SAMPLES, replace=False)
X_train_clf_opt = X_train_clf.loc[optuna_clf_indices]
y_train_clf_opt = y_train_clf.loc[optuna_clf_indices]


def objective_classifier(trial):
    """Optuna 的目标函数，用于寻找分类器的最佳参数"""
    # LightGBM 默认使用高效的直方图算法
    params = {
        'objective': 'binary',
        'metric': 'logloss',
        'boosting_type': 'gbdt',
        'seed': 42,
        'verbose': -1,
        'n_jobs': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    model = lgb.LGBMClassifier(**params, n_estimators=1000)
    # 使用子样本进行训练
    model.fit(X_train_clf_opt, y_train_clf_opt,
              eval_set=[(X_val_clf, y_val_clf)],
              eval_metric='logloss',
              callbacks=[lgb.early_stopping(15, verbose=False)])

    return model.best_score_['valid_0']['binary_logloss']

# *** 加速优化 2: 减少试验次数 ***
N_TRIALS = 10
study_clf = optuna.create_study(direction='minimize')
study_clf.optimize(objective_classifier, n_trials=N_TRIALS)
best_params_clf = study_clf.best_params
print(f"✅ 分类器优化完成！最佳 Logloss: {study_clf.best_value:.5f}")

# ===================================================================
#  模型 2: 回归器 (使用 Optuna 进行贝叶斯优化)
# ===================================================================
print("\n--- 开始优化模型 2: 回归器 ---")
train_positive_mask = df_train_final['sales'] > 0
df_train_positive = df_train_final[train_positive_mask]
val_mask_positive = df_train_positive['date'] >= val_start_date
X_train_reg_full = df_train_positive[~val_mask_positive][features]
y_train_reg_full = np.log1p(df_train_positive[~val_mask_positive]['sales'])
X_val_reg = df_train_positive[val_mask_positive][features]
y_val_reg = np.log1p(df_train_positive[val_mask_positive]['sales'])

# 为回归器创建优化子样本
optuna_reg_indices = np.random.choice(X_train_reg_full.index, min(N_OPTUNA_SAMPLES, len(X_train_reg_full)), replace=False)
X_train_reg_opt = X_train_reg_full.loc[optuna_reg_indices]
y_train_reg_opt = y_train_reg_full.loc[optuna_reg_indices]

for col in categorical_features:
    X_train_reg_opt[col] = X_train_reg_opt[col].astype('category')
    X_val_reg[col] = X_val_reg[col].astype('category')


def objective_regressor(trial):
    """Optuna 的目标函数，用于寻找回归器的最佳参数"""
    params = {
        'objective': 'regression_l1',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'seed': 42,
        'verbose': -1,
        'n_jobs': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    model = lgb.LGBMRegressor(**params, n_estimators=2000)
    model.fit(X_train_reg_opt, y_train_reg_opt,
              eval_set=[(X_val_reg, y_val_reg)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(15, verbose=False)])

    return model.best_score_['valid_0']['rmse']

study_reg = optuna.create_study(direction='minimize')
study_reg.optimize(objective_regressor, n_trials=N_TRIALS)
best_params_reg = study_reg.best_params
print(f"✅ 回归器优化完成！最佳 RMSE: {study_reg.best_value:.5f}")


# ===================================================================
#  使用找到的最佳参数在【完整数据】上训练最终模型
# ===================================================================
print("\n--- 使用最佳参数在【完整数据】上训练最终模型 ---")

final_clf_params = {**best_params_clf, 'objective': 'binary', 'metric': 'logloss', 'seed': 42, 'verbose': -1, 'n_jobs': -1}
final_reg_params = {**best_params_reg, 'objective': 'regression_l1', 'metric': 'rmse', 'seed': 42, 'verbose': -1, 'n_jobs': -1}

# 训练最终分类器 (使用完整训练集)
final_classifier = lgb.LGBMClassifier(**final_clf_params, n_estimators=1000)
final_classifier.fit(X_train_clf, y_train_clf,
                     eval_set=[(X_val_clf, y_val_clf)],
                     eval_metric='logloss',
                     callbacks=[lgb.early_stopping(15, verbose=False)])

# 训练最终回归器 (使用完整正销量训练集)
for col in categorical_features:
    X_train_reg_full[col] = X_train_reg_full[col].astype('category')

final_regressor = lgb.LGBMRegressor(**final_reg_params, n_estimators=2000)
final_regressor.fit(X_train_reg_full, y_train_reg_full,
                    eval_set=[(X_val_reg, y_val_reg)],
                    eval_metric='rmse',
                    callbacks=[lgb.early_stopping(15, verbose=False)])

# ===================================================================
#  最终预测: 合并两个模型的结果 (已修正)
# ===================================================================
print("\n⚙️ 开始生成并合并预测...")

# *** 已修正：采用更稳健的预测逻辑 ***

# 1. 直接用回归器预测对数转换后的销售额
#    这个预测结果本身就是为了优化 RMSLE
log_amount_pred = final_regressor.predict(X_test)

# 2. 将预测值转换回原始尺度
final_predictions = np.expm1(log_amount_pred)

# 3. 将所有负数预测修正为 0
final_predictions[final_predictions < 0] = 0

# (可选但推荐) 使用分类器进行修正：
# 对于分类器预测概率极低的项，直接将其销量置为0，这可以修正回归器的一些极端预测
# prob_of_sale = final_classifier.predict_proba(X_test)[:, 1]
# final_predictions[prob_of_sale < 0.01] = 0 # 阈值可以调整

submission_df = pd.DataFrame({'id': test_ids, 'sales': final_predictions})
submission_df.to_csv('submission.csv', index=False)

print("\n✅ 提交文件 'submission.csv' 已成功生成！")
print("提交文件预览:")
print(submission_df.head())


⚙️ 从最终数据中分离目标变量和特征...
⚙️ 准备 LightGBM 数据格式...
    - 转换分类特征的数据类型为 'category'...
⚙️ 创建验证集...

--- 开始优化模型 1: 分类器 ---
    - 为加速优化，创建数据子样本...
✅ 分类器优化完成！最佳 Logloss: 0.09656

--- 开始优化模型 2: 回归器 ---
✅ 回归器优化完成！最佳 RMSE: 0.35250

--- 使用最佳参数在【完整数据】上训练最终模型 ---

⚙️ 开始生成并合并预测...

✅ 提交文件 'submission.csv' 已成功生成！
提交文件预览:
           id     sales
1684  3000888  2.875436
1685  3002670  2.911248
1686  3004452  3.030787
1687  3006234  3.793828
1688  3008016  3.230670


In [12]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import optuna

# ===================================================================
#  Part 1: 增强特征工程
# ===================================================================
print("\n⚙️ 步骤 1: 正在进行增强特征工程...")

# 合并训练和测试集，以便统一处理。'sales' 列被保留用于创建滞后和编码特征。
df_combined = pd.concat([df_train_full, df_test_full], sort=False, ignore_index=True)
df_combined = df_combined.sort_values(by=['store_nbr', 'family', 'date'])


# --- Part A: Date, Trend, and Fourier Features ---
print("    - 创建日期、趋势和傅里叶特征...")
all_dates = pd.DataFrame({'date': df_combined['date'].unique()}).sort_values('date')
all_dates['trend'] = np.arange(len(all_dates))
K = 2
for k in range(1, K + 1):
    all_dates[f'fourier_sin_y{k}'] = np.sin(2 * np.pi * k * all_dates['date'].dt.dayofyear / 365.25)
    all_dates[f'fourier_cos_y{k}'] = np.cos(2 * np.pi * k * all_dates['date'].dt.dayofyear / 365.25)
df_featured = df_combined.merge(all_dates, on='date', how='left')
df_featured['dayofweek'] = df_featured['date'].dt.dayofweek
df_featured['month'] = df_featured['date'].dt.month
df_featured['is_payday'] = ((df_featured['date'].dt.day == 15) | (df_featured['date'].dt.is_month_end)).astype(int)


# --- Part B: Holiday Feature Engineering (新功能：基于相关性的筛选和独热编码) ---
print("    - 创建基于相关性的假日特征...")
# 1. 准备用于计算相关性的数据
holidays_filtered = holidays_df[holidays_df['transferred'] == False].copy()
hol_sales = df_featured.merge(holidays_filtered[['date', 'description']], on='date', how='left')
hol_sales = hol_sales[hol_sales['description'].notna()]

# 2. 计算每个假日与销量的相关性
correlations = {}
for holiday in hol_sales['description'].unique():
    holiday_indicator = (hol_sales['description'] == holiday)
    # 使用 .corr() 方法，更安全
    correlation = hol_sales['sales'].corr(holiday_indicator)
    correlations[holiday] = correlation

corr_df = pd.DataFrame(list(correlations.items()), columns=['holiday', 'sales_correlation']).fillna(0)
# 3. 筛选出相关性强的假日
selected_hols_df = corr_df[abs(corr_df['sales_correlation']) > 0.1]
selected_holidays_list = selected_hols_df['holiday'].tolist()
print(f"    - 已筛选出 {len(selected_holidays_list)} 个重要假日。")

# 4. 对筛选出的假日进行独热编码
hols_to_encode = holidays_filtered[holidays_filtered['description'].isin(selected_holidays_list)]
holiday_dummies = pd.get_dummies(hols_to_encode[['date', 'description']], columns=['description'], prefix='hol')
# 清理列名
holiday_dummies.columns = [col.replace(" ", "_").replace(",", "").replace("+", "_plus_") for col in holiday_dummies.columns]
# 按日期分组，处理同一天有多个假日的情况
holiday_dummies = holiday_dummies.groupby('date').sum().reset_index()

# 5. 将独热编码后的假日特征合并到主数据框
df_featured = df_featured.merge(holiday_dummies, on='date', how='left')
# 填充非假日产生的NaN为0
dummy_cols = [col for col in holiday_dummies.columns if col != 'date']
df_featured[dummy_cols] = df_featured[dummy_cols].fillna(0)


# --- Part C: Lag, Rolling, and EWM Features ---
print("    - 创建滞后、滚动和EWM特征...")
df_featured = df_featured.sort_values(by=['store_nbr', 'family', 'date'])
lag_features = {'sales': [7, 14, 28, 35]} # 增加更多滞后
for col, lags in lag_features.items():
    for lag in lags:
        df_featured[f'{col}_lag_{lag}'] = df_featured.groupby(['store_nbr', 'family'])[col].shift(lag)

rolling_windows = [7, 14, 28, 60]
for window in rolling_windows:
    shifted_sales = df_featured.groupby(['store_nbr', 'family'])['sales'].shift(1)
    df_featured[f'sales_rolling_mean_{window}'] = shifted_sales.rolling(window, min_periods=1).mean()
    df_featured[f'sales_rolling_std_{window}'] = shifted_sales.rolling(window, min_periods=1).std()

alphas = [0.95, 0.9, 0.8, 0.7]
for alpha in alphas:
    shifted_sales = df_featured.groupby(['store_nbr', 'family'])['sales'].shift(1)
    df_featured[f'sales_ewm_alpha_{str(alpha).replace(".", "")}'] = shifted_sales.ewm(alpha=alpha).mean()


# --- Part D: Final Imputation & Split ---
print("    - 进行最终填充和数据分割...")
df_featured.fillna(0, inplace=True)
df_train_final = df_featured[df_featured['id'].isin(train_df['id'])].copy()
df_test_final = df_featured[df_featured['id'].isin(test_df['id'])].copy()
print("✅ 特征工程完成！")


# ===================================================================
#  Part 2: 单回归模型优化与训练
# ===================================================================
print("\n⚙️ 步骤 2: 准备模型训练...")
features = [col for col in df_train_final.columns if col not in ['id', 'date', 'sales']]
test_ids = df_test_final['id']
X_train = df_train_final[features].copy()
X_test = df_test_final[features].copy()
y_train = np.log1p(df_train_final['sales'])

# 更新分类特征列表 (不再需要旧的假日标志)
categorical_features = ["store_nbr", "family", "city", "state", "type", "dayofweek", "month"]
categorical_features = [f for f in categorical_features if f in features]
for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# --- 创建验证集 ---
val_start_date = '2017-07-26'
val_mask = df_train_final['date'] >= val_start_date
X_train_split, X_val_split = X_train[~val_mask], X_train[val_mask]
y_train_split, y_val_split = y_train[~val_mask], y_train[val_mask]

# --- Optuna 优化 ---
print("\n--- 开始优化回归模型 ---")
def objective_regressor(trial):
    params = {
        'objective': 'regression_l1', 'metric': 'rmse', 'boosting_type': 'gbdt',
        'seed': 42, 'verbose': -1, 'n_jobs': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }
    model = lgb.LGBMRegressor(**params, n_estimators=1500)
    model.fit(X_train_split, y_train_split,
              eval_set=[(X_val_split, y_val_split)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(20, verbose=False)])
    return model.best_score_['valid_0']['rmse']

N_TRIALS = 15
study = optuna.create_study(direction='minimize')
study.optimize(objective_regressor, n_trials=N_TRIALS)
best_params = study.best_params
print(f"✅ 回归器优化完成！最佳 RMSE: {study.best_value:.5f}")

# --- 使用最佳参数在完整数据上训练最终模型 ---
print("\n--- 使用最佳参数在【完整数据】上训练最终模型 ---")
final_params = {**best_params, 'objective': 'regression_l1', 'metric': 'rmse', 'seed': 42, 'verbose': -1, 'n_jobs': -1}
final_model = lgb.LGBMRegressor(**final_params, n_estimators=2500) # 增加最终训练的树数量
final_model.fit(X_train, y_train,
                eval_set=[(X_val_split, y_val_split)],
                eval_metric='rmse',
                callbacks=[lgb.early_stopping(25, verbose=False)])

# --- 生成最终预测 ---
print("\n⚙️ 开始生成最终预测...")
log_amount_pred = final_model.predict(X_test)
final_predictions = np.expm1(log_amount_pred)
final_predictions[final_predictions < 0] = 0

submission_df = pd.DataFrame({'id': test_ids, 'sales': final_predictions})
submission_df.to_csv('submission.csv', index=False)

print("\n✅ 提交文件 'submission.csv' 已成功生成！")
print("提交文件预览:")
print(submission_df.head())



⚙️ 步骤 1: 正在进行增强特征工程...
    - 创建日期、趋势和傅里叶特征...
    - 创建基于相关性的假日特征...
    - 已筛选出 0 个重要假日。
    - 创建滞后、滚动和EWM特征...
    - 进行最终填充和数据分割...
✅ 特征工程完成！

⚙️ 步骤 2: 准备模型训练...

--- 开始优化回归模型 ---
✅ 回归器优化完成！最佳 RMSE: 0.37762

--- 使用最佳参数在【完整数据】上训练最终模型 ---

⚙️ 开始生成最终预测...

✅ 提交文件 'submission.csv' 已成功生成！
提交文件预览:
           id     sales
1684  3000888  0.405771
1685  3002670  0.742122
1686  3004452  0.543227
1687  3006234  1.062653
1688  3008016  0.679269
