## 导入库

In [1]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_absolute_error
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import VotingRegressor
import lightgbm as lgb
import optuna

## MonthlyKFold类的定义

In [2]:
# Custom cross-validation splitter: folds are built from calendar months instead of random rows.
class MonthlyKFold:
    """Expanding-window splitter over (year, month) periods.

    Each of the last ``n_splits`` distinct months found in ``X`` is used once as
    the validation fold; the matching training fold is every row whose month is
    strictly earlier. Indices are positional (0..len(X)-1).
    """

    def __init__(self, n_splits=3):
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` pairs of positional index arrays.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain integer-like ``year`` and ``month`` columns.
        y, groups : ignored
            Accepted only so the signature matches what scikit-learn passes.
        """
        # Encode (year, month) as a single monotonically increasing integer.
        period = (12 * X["year"] + X["month"]).to_numpy()
        # np.unique returns the distinct periods already sorted ascending.
        months = np.unique(period)

        # Work on the period array directly instead of copying the whole
        # feature frame with reset_index() just to obtain row positions.
        for t in months[-self.n_splits:]:
            idx_train = np.flatnonzero(period < t)
            idx_test = np.flatnonzero(period == t)
            yield idx_train, idx_test

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds (arguments are ignored)."""
        return self.n_splits

## 特征工程

In [3]:
# feature_eng: merge all auxiliary tables onto the base rows and derive the model features.
def feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target):
    """Join client, price, weather and lagged-target tables onto ``df_data``.

    All arguments are polars DataFrames. The steps, in order:

    * add a ``date`` column to ``df_data`` and shift the auxiliary tables in
      time (client +2 days, gas +1 day, electricity +1 day, historical weather
      +37 hours) before joining -- presumably so each row only sees data that
      is available at prediction time (TODO confirm against the data spec);
    * map weather grid points to counties through ``df_location`` and average
      the weather per timestamp (``_fd`` / ``_hd``) and per county and
      timestamp (``_fl`` / ``_hl``), plus the same four tables shifted by
      7 days (``_fdw`` / ``_flw`` / ``_hdw`` / ``_hlw``);
    * join the target lagged by 2, 3, 4, 5, 6, 7 and 14 days as
      ``target_1`` .. ``target_7``;
    * add calendar columns, the ``category_1`` segment key and sine/cosine
      encodings of day-of-year and hour, cast Float64 columns to Float32, and
      drop ``date``, ``datetime``, ``hour`` and ``dayofyear``.

    Returns the enriched polars DataFrame.
    """
    f32 = pl.datatypes.Float32
    segment_time_keys = ["county", "is_business", "product_type", "is_consumption", "datetime"]

    # ---- time alignment of the small tables ----
    df_data = df_data.with_columns(pl.col("datetime").cast(pl.Date).alias("date"))
    df_client = df_client.with_columns((pl.col("date") + pl.duration(days=2)).cast(pl.Date))

    df_gas = df_gas.rename({"forecast_date": "date"})
    df_gas = df_gas.with_columns((pl.col("date") + pl.duration(days=1)).cast(pl.Date))

    df_electricity = df_electricity.rename({"forecast_date": "datetime"})
    df_electricity = df_electricity.with_columns(pl.col("datetime") + pl.duration(days=1))

    # ---- weather: cast coordinates so they match df_location, then attach the county ----
    df_location = df_location.with_columns(
        pl.col("latitude").cast(f32),
        pl.col("longitude").cast(f32),
    )

    df_forecast = df_forecast.rename({"forecast_datetime": "datetime"})
    df_forecast = df_forecast.with_columns(
        pl.col("latitude").cast(f32),
        pl.col("longitude").cast(f32),
        # Convert to Europe/Bucharest wall-clock time, then drop the timezone.
        pl.col('datetime').dt.convert_time_zone("Europe/Bucharest").dt.replace_time_zone(None).cast(pl.Datetime("us")),
    )
    df_forecast = df_forecast.join(df_location, how="left", on=["longitude", "latitude"])
    df_forecast = df_forecast.drop("longitude", "latitude")

    df_historical = df_historical.with_columns(
        pl.col("latitude").cast(f32),
        pl.col("longitude").cast(f32),
        pl.col("datetime") + pl.duration(hours=37),
    )
    df_historical = df_historical.join(df_location, how="left", on=["longitude", "latitude"])
    df_historical = df_historical.drop("longitude", "latitude")

    # ---- weather aggregates: overall mean per timestamp, and mean per county/timestamp ----
    df_forecast_date = df_forecast.group_by("datetime").mean().drop("county")
    df_forecast_local = df_forecast.filter(pl.col("county").is_not_null()).group_by("county", "datetime").mean()
    df_historical_date = df_historical.group_by("datetime").mean().drop("county")
    df_historical_local = df_historical.filter(pl.col("county").is_not_null()).group_by("county", "datetime").mean()

    # ---- joins (order matters: it fixes the output column order) ----
    merged = df_data.join(df_gas, on="date", how="left")
    merged = merged.join(df_client, on=["county", "is_business", "product_type", "date"], how="left")
    merged = merged.join(df_electricity, on="datetime", how="left")

    weather_tables = [
        (df_forecast_date, "datetime", "_fd"),
        (df_forecast_local, ["county", "datetime"], "_fl"),
        (df_historical_date, "datetime", "_hd"),
        (df_historical_local, ["county", "datetime"], "_hl"),
    ]
    # Same-timestamp weather first ...
    for table, keys, suffix in weather_tables:
        merged = merged.join(table, on=keys, how="left", suffix=suffix)
    # ... then the same tables shifted forward by one week (suffix gains a trailing "w").
    for table, keys, suffix in weather_tables:
        week_shifted = table.with_columns(pl.col("datetime") + pl.duration(days=7))
        merged = merged.join(week_shifted, on=keys, how="left", suffix=suffix + "w")

    # Lagged targets: target_1..target_6 are lags of 2..7 days, target_7 is a 14-day lag.
    for lag_no, lag_days in enumerate((2, 3, 4, 5, 6, 7, 14), start=1):
        lagged_target = (
            df_target
            .with_columns(pl.col("datetime") + pl.duration(days=lag_days))
            .rename({"target": f"target_{lag_no}"})
        )
        merged = merged.join(lagged_target, on=segment_time_keys, how="left")

    # ---- calendar, segment key and cyclical encodings ----
    merged = merged.with_columns(
        pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
        pl.col("datetime").dt.hour().alias("hour"),
        pl.col("datetime").dt.day().alias("day"),
        pl.col("datetime").dt.weekday().alias("weekday"),
        pl.col("datetime").dt.month().alias("month"),
        pl.col("datetime").dt.year().alias("year"),
    )
    merged = merged.with_columns(
        pl.concat_str("county", "is_business", "product_type", "is_consumption", separator="_").alias("category_1"),
    )
    merged = merged.with_columns(
        (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
        (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
        (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
        (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
    )
    merged = merged.with_columns(pl.col(pl.Float64).cast(pl.Float32))
    df_data = merged.drop("date", "datetime", "hour", "dayofyear")

    # Return the transformed frame.
    return df_data

## 转换为Pandas数据框架函数

In [4]:
# to_pandas: convert the engineered frame(s) to a pandas DataFrame ready for modelling.
def to_pandas(X, y=None):
    """Convert ``X`` (and optionally ``y``) to one pandas DataFrame indexed by ``row_id``.

    ``X`` and ``y`` must expose ``.to_pandas()``. When ``y`` is given its
    columns are appended to the right of ``X``'s columns (positional concat).
    The segment columns are cast to ``category`` and three summary features of
    the lagged targets are added:

    * ``target_mean`` / ``target_std`` -- row-wise mean and standard deviation
      of ``target_1`` .. ``target_6``;
    * ``target_ratio`` -- ``target_6 / (target_7 + 1e-3)``.
    """
    categorical = ["county", "is_business", "product_type", "is_consumption", "category_1"]
    lag_columns = [f"target_{i}" for i in range(1, 7)]

    # Works for a single frame (X) as well as a feature/label pair (X and y).
    if y is None:
        df = X.to_pandas()
    else:
        df = pd.concat([X.to_pandas(), y.to_pandas()], axis=1)

    # Use row_id as the index.
    df = df.set_index("row_id")
    df[categorical] = df[categorical].astype("category")

    # Summary statistics over the lagged targets.
    df["target_mean"] = df[lag_columns].mean(axis=1)
    df["target_std"] = df[lag_columns].std(axis=1)
    # The small constant keeps the ratio finite when target_7 is zero.
    df["target_ratio"] = df["target_6"] / (df["target_7"] + 1e-3)

    return df

## 新特征生成函数

In [5]:
# new_features: add per-segment first differences of selected weather columns.
def new_features(df, columns=None, windows=(1,)):
    """Add ``{col}_diff_{window}`` columns holding within-segment differences.

    Rows are grouped by (county, is_consumption, product_type, is_business)
    and, inside each group, every selected column is differenced against the
    row ``window`` positions earlier (in the frame's current row order).

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and also returned.
    columns : iterable of str, optional
        Columns to difference. Defaults to the temperature, dewpoint and wind
        component columns listed below.
    windows : iterable of int, optional
        Lags to difference over. Defaults to ``(1,)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns appended.
    """
    if columns is None:
        columns = [
            'temperature', 'dewpoint',
            '10_metre_u_wind_component', '10_metre_v_wind_component',
            '10_metre_u_wind_component_fl', '10_metre_v_wind_component_fl',
            '10_metre_u_wind_component_fdw', '10_metre_v_wind_component_fdw',
            '10_metre_u_wind_component_flw', '10_metre_v_wind_component_flw',
        ]
    columns = list(columns)
    group_keys = ["county", 'is_consumption', 'product_type', 'is_business']

    # Build the grouping once instead of once per column. observed=True keeps
    # pandas from enumerating unused combinations of categorical keys; diff is
    # computed per row, so the result is unaffected.
    grouped = df.groupby(group_keys, observed=True)[columns]

    # Compute every diff before touching df so the grouping sees a stable frame.
    diffs_by_window = {window: grouped.diff(window) for window in windows}

    for window, diffs in diffs_by_window.items():
        for col in columns:
            df[f"{col}_diff_{window}"] = diffs[col]

    return df

## 超参数优化

In [6]:
# lgb_objective: Optuna objective for tuning the LightGBM regressor.
def lgb_objective(trial):
    """Score one Optuna trial and return the value to minimise.

    Samples a LightGBM configuration from ``trial``, evaluates it on the
    global ``df_train`` frame with ``MonthlyKFold(1)`` (a single fold whose
    validation set is the last month in the data), and returns the mean
    absolute error averaged over the folds.
    """
    # Settings held constant across trials.
    fixed = {
        'n_iter'       : 1000,
        'verbose'      : -1,
        'random_state' : 42,
        'objective'    : 'l2',
    }
    # Search space: learning rate, column sampling, regularisation and tree shape.
    sampled = {
        'learning_rate'    : trial.suggest_float('learning_rate', 0.005, 0.1),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'lambda_l1'        : trial.suggest_float('lambda_l1', 1e-4, 10.0),
        'lambda_l2'        : trial.suggest_float('lambda_l2', 1e-4, 10.0),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 4, 256),
        'max_depth'        : trial.suggest_int('max_depth', 5, 16),
        'max_bin'          : trial.suggest_int('max_bin', 32, 1024),
    }

    regressor = lgb.LGBMRegressor(**fixed, **sampled)
    features = df_train.drop(columns=["target"])
    labels = df_train["target"]

    # Month-based validation; the scorer returns negated MAE, so flip the sign back.
    fold_scores = cross_val_score(
        regressor,
        features,
        labels,
        cv=MonthlyKFold(1),
        scoring='neg_mean_absolute_error',
    )
    # Lower is better: Optuna minimises the mean absolute error.
    return -np.mean(fold_scores)

## 数据加载和数据透视表创建

In [7]:
# Load the raw training CSV with pandas.
train = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/train.csv")

# Reshape to wide form: one row per timestamp, one column per
# (county, product_type, is_business, is_consumption) combination, each cell
# holding the mean target for that combination at that time.
pivot_train = train.pivot_table(
    values='target',
    index='datetime',
    columns=['county', 'product_type', 'is_business', 'is_consumption'],
    aggfunc='mean',
)

# Flatten the four-level column index into readable single-level names.
flat_names = []
for level_values in pivot_train.columns.tolist():
    flat_names.append('county{}_productType{}_isBusiness{}_isConsumption{}'.format(*level_values))
pivot_train.columns = flat_names

# Parse the index into real datetimes.
pivot_train.index = pd.to_datetime(pivot_train.index)
pivot_train

Unnamed: 0_level_0,county0_productType0_isBusiness1_isConsumption0,county0_productType0_isBusiness1_isConsumption1,county0_productType1_isBusiness0_isConsumption0,county0_productType1_isBusiness0_isConsumption1,county0_productType1_isBusiness1_isConsumption0,county0_productType1_isBusiness1_isConsumption1,county0_productType2_isBusiness0_isConsumption0,county0_productType2_isBusiness0_isConsumption1,county0_productType2_isBusiness1_isConsumption0,county0_productType2_isBusiness1_isConsumption1,...,county15_productType0_isBusiness1_isConsumption0,county15_productType0_isBusiness1_isConsumption1,county15_productType1_isBusiness0_isConsumption0,county15_productType1_isBusiness0_isConsumption1,county15_productType1_isBusiness1_isConsumption0,county15_productType1_isBusiness1_isConsumption1,county15_productType3_isBusiness0_isConsumption0,county15_productType3_isBusiness0_isConsumption1,county15_productType3_isBusiness1_isConsumption0,county15_productType3_isBusiness1_isConsumption1
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-01 00:00:00,0.000,59.000,0.713,96.590,0.000,501.760,0.000,17.314,,,...,,,0.000,5.524,0.100,17.703,0.000,59.366,0.000,289.159
2021-09-01 01:00:00,0.000,61.600,1.132,77.691,0.000,486.297,0.000,15.872,,,...,,,0.000,4.632,0.100,20.416,0.000,56.357,0.000,280.657
2021-09-01 02:00:00,0.000,63.100,0.490,91.594,0.000,458.562,0.000,16.510,,,...,,,0.000,5.461,0.100,27.440,0.000,51.038,0.000,295.537
2021-09-01 03:00:00,0.000,64.000,0.496,87.955,0.000,511.794,0.000,14.271,,,...,,,0.000,4.617,0.000,19.050,0.000,48.482,0.000,287.724
2021-09-01 04:00:00,0.000,60.500,0.149,88.184,0.000,520.318,0.000,18.225,,,...,,,0.000,4.578,0.100,21.228,0.000,57.115,0.000,286.074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-31 19:00:00,45.516,592.897,439.610,300.096,126.947,761.060,3.346,2.978,5.709,11.249,...,20.147,530.777,26.074,27.369,59.005,13.353,124.516,65.457,111.403,171.252
2023-05-31 20:00:00,12.259,586.746,120.856,443.340,27.563,827.122,0.792,2.730,1.546,14.582,...,8.649,537.154,19.584,35.140,25.563,13.756,43.101,85.693,43.751,232.759
2023-05-31 21:00:00,1.984,544.978,22.428,569.301,5.091,808.137,0.136,3.997,0.304,19.392,...,1.260,323.350,1.804,52.427,2.840,24.250,6.982,124.146,7.030,265.328
2023-05-31 22:00:00,0.001,516.906,5.086,577.499,0.028,870.630,0.000,4.329,0.002,21.783,...,0.000,207.244,0.005,63.068,0.000,36.083,2.545,131.833,0.001,274.569


## 数据列和文件路径设置
这一步设置对于确保只从每个文件读取相关的列、优化内存使用和处理时间至关重要

In [8]:
# Root directory that holds the competition data files.
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Columns to read from each file. Only these are loaded, which keeps memory
# use and parsing time down.
data_cols = [
    'target', 'county', 'is_business', 'product_type', 'is_consumption',
    'datetime', 'row_id',
]
client_cols = [
    'product_type', 'county', 'eic_count', 'installed_capacity',
    'is_business', 'date',
]
gas_cols = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_cols = ['forecast_date', 'euros_per_mwh']
forecast_cols = [
    'latitude', 'longitude', 'hours_ahead',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'forecast_datetime',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
]
historical_cols = [
    'datetime',
    'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure',
    'cloudcover_total', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_high',
    'windspeed_10m', 'winddirection_10m',
    'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
    'latitude', 'longitude',
]
location_cols = ['longitude', 'latitude', 'county']
target_cols = [
    'target', 'county', 'is_business', 'product_type', 'is_consumption',
    'datetime',
]

# Optional pickle locations for the trained models; None disables saving / loading.
save_path = None
load_path = None

## 数据读取和模式定义
这一步对于为特征工程和建模准备数据是必不可少的

In [9]:
# Read each CSV into a polars DataFrame, keeping only the columns selected above.
def _read_table(file_name, columns):
    """Read ``root/file_name`` with polars, restricted to ``columns``, parsing dates."""
    return pl.read_csv(os.path.join(root, file_name), columns=columns, try_parse_dates=True)


df_data = _read_table("train.csv", data_cols)
df_client = _read_table("client.csv", client_cols)
df_gas = _read_table("gas_prices.csv", gas_cols)
df_electricity = _read_table("electricity_prices.csv", electricity_cols)
df_forecast = _read_table("forecast_weather.csv", forecast_cols)
df_historical = _read_table("historical_weather.csv", historical_cols)
df_location = _read_table("weather_station_to_county_mapping.csv", location_cols)
# The lag features are built from the same training rows, restricted to the target columns.
df_target = df_data.select(target_cols)

# Keep each table's schema so frames built later can be given the same dtypes.
schema_data = df_data.schema
schema_client = df_client.schema
schema_gas = df_gas.schema
schema_electricity = df_electricity.schema
schema_forecast = df_forecast.schema
schema_historical = df_historical.schema
schema_target = df_target.schema

## 为建模准备数据

In [10]:
# Separate the predictors (X) from the label column (y).
X = df_data.drop("target")
y = df_data.select("target")

# Enrich X with the client, price, weather and lagged-target tables.
X = feature_eng(X, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target)

# Convert to pandas (attaching the label), then add the per-segment diff features.
df_train = new_features(to_pandas(X, y))

## 训练的数据过滤

In [11]:
# Keep only rows that have a target value and whose year is later than 2021.
has_target = df_train["target"].notnull()
after_2021 = df_train["year"].gt(2021)
df_train = df_train[has_target & after_2021]

In [12]:
# Print the index range, every column with its dtype, and memory usage of the filtered training frame.
df_train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1651902 entries, 366048 to 2018351
Data columns (total 147 columns):
 #    Column                                 Dtype   
---   ------                                 -----   
 0    county                                 category
 1    is_business                            category
 2    product_type                           category
 3    is_consumption                         category
 4    lowest_price_per_mwh                   float32 
 5    highest_price_per_mwh                  float32 
 6    eic_count                              float64 
 7    installed_capacity                     float32 
 8    euros_per_mwh                          float32 
 9    hours_ahead                            float32 
 10   temperature                            float32 
 11   dewpoint                               float32 
 12   cloudcover_high                        float32 
 13   cloudcover_low                         float32 
 14   cloudcover_mid  

### HyperParam Optimization
为LightGBM模型定义了多组超参数(“p1”到“p9”)。这些参数包括学习率、列采样率、正则化项等。每一组参数表示将在集成建模中使用的一种配置。
为接下来的投票回归准备

In [13]:
# Settings shared by every parameter set: 2000 boosting iterations, silent
# logging, L2 training objective, MAE as the reported metric.
_common = {'n_iter': 2000, 'verbose': -1, 'objective': 'l2', 'metric': 'mae'}

# Each pN is a complete LightGBM configuration; only the tuned values differ.
p1 = {
    **_common,
    'learning_rate': 0.05073909898961407,
    'colsample_bytree': 0.726023996436955,
    'colsample_bynode': 0.5803681307354022,
    'lambda_l1': 8.562963348932286,
    'lambda_l2': 4.893256185259296,
    'min_data_in_leaf': 115,
    'max_depth': 23,
    'max_bin': 898,
}
p2 = {
    **_common,
    'learning_rate': 0.05670084478292278,
    'colsample_bytree': 0.6440444070196796,
    'colsample_bynode': 0.637635804565811,
    'lambda_l1': 6.29090474401462,
    'lambda_l2': 6.775341543233317,
    'min_data_in_leaf': 95,
    'max_depth': 9,
    'max_bin': 630,
}
p3 = {
    **_common,
    'learning_rate': 0.0632167263149817,
    'colsample_bytree': 0.6958033941948067,
    'colsample_bynode': 0.6030801666196094,
    'lambda_l1': 7.137580620471935,
    'lambda_l2': 9.348169401713742,
    'min_data_in_leaf': 74,
    'max_depth': 11,
    'max_bin': 530,
}
p4 = {
    **_common,
    'learning_rate': 0.061236402165228264,
    'colsample_bytree': 0.81427095118471,
    'colsample_bynode': 0.6097376843527067,
    'lambda_l1': 6.360490880385201,
    'lambda_l2': 9.954136008333839,
    'min_data_in_leaf': 238,
    'max_depth': 16,
    'max_bin': 649,
}
p5 = {
    **_common,
    'learning_rate': 0.08753282378023663,
    'colsample_bytree': 0.7508715107428325,
    'colsample_bynode': 0.6831819500325418,
    'lambda_l1': 8.679353563755722,
    'lambda_l2': 6.105008696961338,
    'min_data_in_leaf': 198,
    'max_depth': 15,
    'max_bin': 835,
}
p6 = {
    **_common,
    'learning_rate': 0.05929380742257108,
    'colsample_bytree': 0.6101576947777211,
    'colsample_bynode': 0.6052639518604396,
    'lambda_l1': 8.087311995794915,
    'lambda_l2': 6.067361158677095,
    'min_data_in_leaf': 122,
    'max_depth': 9,
    'max_bin': 797,
}
p7 = {
    **_common,
    'learning_rate': 0.05689066836106983,
    'colsample_bytree': 0.8915976762048253,
    'colsample_bynode': 0.5942203285139224,
    'lambda_l1': 7.6277555139102864,
    'lambda_l2': 6.6591278779517808,
    'min_data_in_leaf': 156,
    'max_depth': 11,
    'max_bin': 813,
}
p8 = {
    **_common,
    'learning_rate': 0.06210133914728566,
    'colsample_bytree': 0.9394149364406023,
    'colsample_bynode': 0.6136449922460668,
    'lambda_l1': 6.8170120783290963,
    'lambda_l2': 6.9413925098162625,
    'min_data_in_leaf': 100,
    'max_depth': 12,
    'max_bin': 749,
}
p9 = {
    **_common,
    'learning_rate': 0.05352743463192712,
    'colsample_bytree': 0.7677968926673415,
    'colsample_bynode': 0.619885528937525,
    'lambda_l1': 6.352422005269177,
    'lambda_l2': 7.56567996461831,
    'min_data_in_leaf': 61,
    'max_depth': 25,
    'max_bin': 615,
}

## 模型集成,使用投票回归器训练
条件加载以及模型训练。

定义了两个VotingRegressor模型，每个模型由几个具有不同超参数(前一个单元中定义的)的LightGBM回归器组成。然后将这些模型拟合到训练数据上。

In [14]:
from sklearn.metrics import mean_absolute_error as MAE


def _build_voting_ensemble(name_offset=0):
    """Build an unfitted weighted VotingRegressor of eight LightGBM regressors.

    The members use parameter sets ``p1`` .. ``p8`` (all with
    ``random_state=42``) and are named ``lgb_{name_offset + 1}`` ..
    ``lgb_{name_offset + 8}``.
    """
    member_params = [p1, p2, p3, p4, p5, p6, p7, p8]
    blend_weights = [0.16, 0.13, 0.12, 0.11, 0.12, 0.11, 0.14, 0.11]
    members = [
        (f"lgb_{name_offset + i}", lgb.LGBMRegressor(**params, random_state=42))
        for i, params in enumerate(member_params, start=1)
    ]
    return VotingRegressor(members, weights=blend_weights)


if load_path is not None:
    # Reuse previously trained models instead of retraining.
    # SECURITY: pickle.load can execute arbitrary code -- only point load_path
    # at files you created yourself.
    with open(load_path, "rb") as f:
        saved = pickle.load(f)
    # Both models are needed at inference time, so they are stored together.
    # A file holding a single estimator cannot supply both.
    if not (isinstance(saved, dict) and {"model", "model_solar"} <= set(saved)):
        raise ValueError(
            f"{load_path!r} does not contain both 'model' and 'model_solar'; "
            "retrain and save again to produce a compatible file."
        )
    model = saved["model"]
    model_solar = saved["model_solar"]
else:
    # Otherwise train from scratch: one ensemble on every row, and a second
    # one on the rows where is_consumption == 0 only.
    model = _build_voting_ensemble()
    model_solar = _build_voting_ensemble(name_offset=10)

    model.fit(
        X=df_train.drop(columns=["target"]),
        y=df_train["target"]
    )

    production_rows = df_train['is_consumption'] == 0
    model_solar.fit(
        X=df_train[production_rows].drop(columns=["target"]),
        y=df_train.loc[production_rows, "target"]
    )

# Optionally persist the models. Both go into ONE pickle payload: opening the
# same path with "wb" a second time would truncate the file and discard the
# first model.
if save_path is not None:
    with open(save_path, "wb") as f:
        pickle.dump({"model": model, "model_solar": model_solar}, f)







## 设置环境

In [15]:
# Competition-provided module; NOTE(review): presumably only importable inside the Kaggle competition environment -- confirm.
import enefit

# Create the competition environment and the iterator that yields the test batches consumed by the loop below.
env = enefit.make_env()
iter_test = env.iter_test()

In [16]:
# Predict batch by batch: each iteration delivers the rows to predict plus the
# newly available auxiliary data.
for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    # Use the same timestamp column name as the training data.
    test = test.rename(columns={"prediction_datetime": "datetime"})

    # Convert the pandas batches to polars, reusing the dtypes recorded from the
    # training files. data_cols[1:] skips 'target', which the test rows lack.
    df_test           = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
    df_client         = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
    df_gas            = pl.from_pandas(gas_prices[gas_cols], schema_overrides=schema_gas)
    df_electricity    = pl.from_pandas(electricity_prices[electricity_cols], schema_overrides=schema_electricity)
    df_new_forecast   = pl.from_pandas(forecast_weather[forecast_cols], schema_overrides=schema_forecast)
    df_new_historical = pl.from_pandas(historical_weather[historical_cols], schema_overrides=schema_historical)
    df_new_target     = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)

    # Append the new weather / target rows to the accumulated history and drop
    # exact duplicates, so the lag and weekly-shift joins can look back in time.
    df_forecast       = pl.concat([df_forecast, df_new_forecast]).unique()
    df_historical     = pl.concat([df_historical, df_new_historical]).unique()
    df_target         = pl.concat([df_target, df_new_target]).unique()

    # Same feature pipeline as for training.
    X_test = feature_eng(df_test, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target)
    X_test = to_pandas(X_test)
    X_test = new_features(X_test)

    # X_test is indexed by row_id. Attach predictions to `test` through row_id
    # rather than by position, so the result stays correct even if the joins in
    # feature_eng do not return rows in their input order.
    pred_all = pd.Series(model.predict(X_test).clip(0), index=X_test.index)
    pred_solar = pd.Series(model_solar.predict(X_test).clip(0), index=X_test.index)
    test['target'] = test['row_id'].map(pred_all)
    test['target_solar'] = test['row_id'].map(pred_solar)

    # Rows with is_consumption == 0 take the prediction of the model trained on
    # those rows only.
    production_rows = test['is_consumption'] == 0
    test.loc[production_rows, "target"] = test.loc[production_rows, "target_solar"]

    # NOTE(review): this assignment aligns on the pandas index, so it assumes
    # sample_prediction and test share the same index -- confirm.
    sample_prediction["target"] = test['target']

    # Hand the predictions for this batch back to the environment.
    env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
