# Predict Prosumer Energy Patterns
This notebook is modified from the existing notebook [Enefit PEBOP: LGBM ensemble](https://www.kaggle.com/code/siddhvr/enefit-pebop-lgbm-ensemble)

This notebook predicts electricity production and consumption for Estonian solar customers using weather, price, and PV data. It trains an ensemble of LightGBM models, which use decision-tree-based learning algorithms.

**[Change logs]**
- [version 13] (score=66.84) Include two classes to process and transform the training and testing data



In [1]:
import enefit, torch, pickle, gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date, timedelta

# Scikit-learn and LightGBM packages
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
DEVICE = "gpu" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cpu


## **Load data**

In [2]:
# Competition tables: targets, prices, client metadata and the two weather feeds.
train = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/train.csv')
gas_df = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/gas_prices.csv')
electricity_df = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/electricity_prices.csv')
client_df = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/client.csv')
fw_df = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/forecast_weather.csv')
hw_df = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/historical_weather.csv')
# Lookup table joined on (longitude, latitude) to attach a county to each weather point.
locations = pd.read_csv('/kaggle/input/locations/county_lon_lats.csv')

# Data Processing and Transformation

In [3]:
class DataProcessing():
    """Feature engineering for the training frame.

    Methods are stateless (none of them reads ``self``):

    * ``feat_eng_train`` joins client, price and weather tables onto the targets.
    * ``create_revealed_targets_train`` adds lagged target columns.
    * ``get_agg_target_lag`` adds row-wise statistics over those lag columns.
    """

    def feat_eng_train(self, data, client, hist_weather,forecast_weather, electricity, gas, locations):
        """Build the training feature frame.

        Parameters
        ----------
        data : DataFrame with at least ``target``, ``datetime``, ``data_block_id``,
            ``county``, ``is_business``, ``product_type`` and ``row_id``.
        client, electricity, gas : DataFrames merged on ``data_block_id``
            (plus client keys / hour, see the merge calls below).
        hist_weather, forecast_weather : weather DataFrames with ``latitude`` /
            ``longitude`` columns. NOTE: the rounded coordinates are written back
            into the frames passed in (the rounding is idempotent).
        locations : DataFrame mapping (longitude, latitude) to ``county``; must
            carry an ``'Unnamed: 0'`` column, which is dropped.

        Returns
        -------
        DataFrame with the engineered features; ``row_id`` and ``data_block_id``
        are removed. ``data`` and ``client`` are not modified.
        """

        #Dropping (target) nan values | copy so the assignments below never write into a slice of the caller's frame
        data= data[data['target'].notnull()].copy()
        #Converting (datetime) column to datetime
        data['datetime'] = pd.to_datetime(data['datetime'], utc=True)
        #Renaming (forecast_date) to (datetime) for merging with the train data later
        electricity = electricity.rename(columns= {'forecast_date' : 'datetime'})
        #Converting (datetime) column to datetime
        electricity['datetime'] = pd.to_datetime(electricity['datetime'], utc= True)
        #Decreasing (data_block_id) in client data because it's 2 steps ahead from train's data (data_block_id)
        #Done on a new frame: shifting the caller's frame in place would shift it again on every re-run of the cell
        client = client.assign(data_block_id= client['data_block_id'] - 2)

        locations = locations.drop('Unnamed: 0', axis= 1) 

        #Rounding the (latitude) and (longitude) for 1 decimal fraction
        forecast_weather[['latitude', 'longitude']] = forecast_weather[['latitude',
                                                                        'longitude']].astype(float).round(1)
        #Merging counties in locations data with the coordinations in the forecast_weather data
        forecast_weather= forecast_weather.merge(locations, how='left',
                                                 on=['longitude','latitude'])
        #dropping nan values
        forecast_weather.dropna(axis= 0, inplace= True)    
        #Converting (county) column to integer
        forecast_weather['county'] = forecast_weather['county'].astype('int64')
        #Dropping the columns we won't need | We will use the (forecast_datetime) column instead of the (origin_datetime)
        forecast_weather.drop(['origin_datetime', 'latitude','longitude', 'hours_ahead', 'data_block_id'], axis=1, inplace= True)
        #Renaming (forecast_datetime) to (datetime) for merging with the train data later
        forecast_weather.rename(columns={'forecast_datetime': 'datetime'}, inplace= True)
        #Converting (datetime) column to datetime
        forecast_weather['datetime']= pd.to_datetime(forecast_weather['datetime'], utc= True)

        #Hourly mean of every forecast column over all locations
        forecast_weather_datetime= forecast_weather.groupby([forecast_weather['datetime'].dt.to_period('h')])[list(forecast_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()
        #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
        forecast_weather_datetime['datetime']= pd.to_datetime(forecast_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

        #Hourly mean of every forecast column per county
        forecast_weather_datetime_county= forecast_weather.groupby(['county',forecast_weather['datetime'].dt.to_period('h')])[list(forecast_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()

        #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
        forecast_weather_datetime_county['datetime']= pd.to_datetime(
            forecast_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)

        #Rounding the (latitude) and (longitude) for 1 decimal fraction           
        hist_weather[['latitude', 'longitude']] = hist_weather[['latitude', 'longitude']].astype(float).round(1)

        #Merging counties in locations data with the coordinations in the historical_weather data
        hist_weather= hist_weather.merge(locations, how='left', on=['longitude','latitude'])    
        #Dropping nan values
        hist_weather.dropna(axis= 0, inplace= True)
        #Dropping the columns we won't need
        hist_weather.drop(['latitude', 'longitude'], axis=1, inplace= True)
        #Converting (county) to integer
        hist_weather['county'] = hist_weather['county'].astype('int64')
        #Converting (datetime) column to datetime
        hist_weather['datetime']= pd.to_datetime(hist_weather['datetime'], utc= True)

        #Distinct (datetime, data_block_id) pairs | de-duplicated up front so the merges below
        #do not multiply every aggregated row by the number of weather points sharing a timestamp
        hist_block_ids= hist_weather[['datetime', 'data_block_id']].drop_duplicates()

        #Hourly mean of every historical column over all locations
        hist_weather_datetime= hist_weather.groupby([hist_weather['datetime'].dt.to_period('h')])[list(hist_weather.drop(['county','datetime','data_block_id'], axis= 1).columns)].mean().reset_index()    

        #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
        hist_weather_datetime['datetime']= pd.to_datetime(hist_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

        #Merging (data_block_id) back after dropping it in the last step | (data_block_id will be used to merge with train data)
        hist_weather_datetime= hist_weather_datetime.merge(hist_block_ids, how='left', on='datetime')

        #Hourly mean of every historical column per county
        hist_weather_datetime_county= hist_weather.groupby(['county',hist_weather['datetime'].dt.to_period('h')])[list(hist_weather.drop(['county','datetime', 'data_block_id'], axis= 1).columns)].mean().reset_index() 
        #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
        hist_weather_datetime_county['datetime']= pd.to_datetime(
            hist_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)
        #Merging (data_block_id) back after dropping it in the last step
        hist_weather_datetime_county= hist_weather_datetime_county.merge(hist_block_ids, how='left', on='datetime')
        #Adding year column in train data
        data['year'] = data['datetime'].dt.year
        #Adding month column in train data
        data['month'] = data['datetime'].dt.month
        #Adding day column in train data
        data['day'] = data['datetime'].dt.day
        #Adding hour column in train data
        data['hour'] = data['datetime'].dt.hour
        #Adding dayofweek column in train data
        data['dayofweek'] = data['datetime'].dt.dayofweek
        #Adding dayofyear column in train data
        data['dayofyear']= data['datetime'].dt.dayofyear
        #Adding hour column to electricity used to merge with the train data
        electricity['hour'] = electricity['datetime'].dt.hour
        #Merging train data with client data
        data= data.merge(client.drop(columns = ['date']), how='left', on=['data_block_id', 'county', 'is_business', 'product_type'])

        #Merging train data with gas prices
        data= data.merge(gas[['data_block_id', 'lowest_price_per_mwh', 'highest_price_per_mwh']], how='left', on='data_block_id')

        #Merging train data with electricity prices (matched on hour of day within the block)
        data= data.merge(electricity[['euros_per_mwh', 'hour', 'data_block_id']], how='left', on=['hour', 'data_block_id'])

        #Merging the all-location forecast means
        data= data.merge(forecast_weather_datetime, how='left', on=['datetime'])

        #Merging the per-county forecast means | the suffixes tell the two sets of forecast columns apart
        data= data.merge(forecast_weather_datetime_county, how='left', on=['datetime', 'county'],
                         suffixes= ('_fcast_mean','_fcast_mean_by_county'))

        hist_weather_datetime['hour']= hist_weather_datetime['datetime'].dt.hour
        hist_weather_datetime_county['hour']= hist_weather_datetime_county['datetime'].dt.hour

        hist_weather_datetime.drop_duplicates(inplace=True)
        hist_weather_datetime_county.drop_duplicates(inplace=True)
        hist_weather_datetime.drop('datetime', axis= 1, inplace= True)
        hist_weather_datetime_county.drop('datetime', axis= 1, inplace= True)

        #Merging hist_weather_datetime with train data
        data= data.merge(hist_weather_datetime, how='left', on=['data_block_id', 'hour'])

        #Merging the per-county historical means
        data= data.merge(hist_weather_datetime_county, how='left', on=['data_block_id', 'county', 'hour'],suffixes= ('_hist_mean','_hist_mean_by_county'))

        #Filling gaps forward then backward inside each (year, day, hour) group
        data= data.groupby(['year', 'day', 'hour'], as_index=False).apply(lambda x: x.ffill().bfill()).reset_index()

        #Dropping uneeded data | level_0 / level_1 are the index levels produced by the groupby-apply above
        data.drop(['level_0', 'level_1', 'row_id', 'data_block_id'], axis= 1, inplace= True)
        return data

    def create_revealed_targets_train(self, data, N_day_lags):
        """Add ``target_{k}_days_ago`` columns for k = 2 .. N_day_lags.

        Each lag column is obtained by shifting the timestamps of the known
        targets forward by k days and left-joining them back on
        (datetime, prediction_unit_id, is_consumption). Rows without a match
        get NaN. On return ``datetime`` is converted to ``int64``.
        """

        original_datetime = data['datetime']
        revealed_targets = data[['datetime', 'prediction_unit_id', 'is_consumption', 'target']].copy()

        #Create revealed targets for n days lags
        for day_lag in range(2, N_day_lags+1):
            #DateOffset(n) without a keyword moves the timestamps by n days
            revealed_targets['datetime'] = original_datetime + pd.DateOffset(day_lag)
            data = data.merge(revealed_targets, 
                              how='left', 
                              on = ['datetime', 'prediction_unit_id', 'is_consumption'],
                              suffixes = ('', f'_{day_lag}_days_ago')
                             )
        data['datetime'] = data['datetime'].astype('int64')
        return data

    def get_agg_target_lag(self, df):
        """Add row-wise mean/std/var/median/max/min over the ``*_days_ago`` columns.

        The lag column list is fixed before the loop, so the new ``target_<stat>``
        columns never feed into each other. ``df`` is modified in place and returned.
        """

        tgt_lag_columns = [c for c in df.columns if '_days_ago' in c]

        for m in ['mean', 'std', 'var', 'median', 'max', 'min']:
            df[f'target_{m}'] = df[tgt_lag_columns].agg(m, axis=1)

        return df
    
    

## Data transformation

In [4]:
#Columns with outliers that receive a log-transformed copy by default
_DEFAULT_LOG_COLUMNS = (
    'installed_capacity', 'euros_per_mwh', 'temperature_fcast_mean', 'dewpoint_fcast_mean',
    'cloudcover_high_fcast_mean', 'cloudcover_low_fcast_mean', 'cloudcover_mid_fcast_mean', 'cloudcover_total_fcast_mean',
    '10_metre_u_wind_component_fcast_mean', '10_metre_v_wind_component_fcast_mean', 'direct_solar_radiation_fcast_mean',
    'snowfall_fcast_mean', 'total_precipitation_fcast_mean', 'temperature_fcast_mean_by_county', 'dewpoint_fcast_mean_by_county',
    'cloudcover_high_fcast_mean_by_county', 'cloudcover_low_fcast_mean_by_county', 'cloudcover_mid_fcast_mean_by_county',
    'cloudcover_total_fcast_mean_by_county', '10_metre_u_wind_component_fcast_mean_by_county', '10_metre_v_wind_component_fcast_mean_by_county',
    'surface_solar_radiation_downwards_fcast_mean_by_county', 'snowfall_fcast_mean_by_county', 'total_precipitation_fcast_mean_by_county',
    'rain_hist_mean', 'snowfall_hist_mean', 'windspeed_10m_hist_mean_by_county', 'target_2_days_ago', 'target_3_days_ago',
    'target_4_days_ago', 'target_5_days_ago', 'target_6_days_ago', 'target_7_days_ago', 'target_mean', 'target_std',
)


def make_logs(df, columns=None):
    """Add a ``log_<col>`` column for every requested column.

    Parameters
    ----------
    df : DataFrame, modified in place and also returned.
    columns : iterable of column names, optional. Defaults to the fixed list of
        outlier-heavy columns used by the original pipeline.

    Returns
    -------
    The same ``df`` object with the new columns appended in the order given.

    Raises
    ------
    KeyError if any requested column is absent; raised before ``df`` is touched.

    Notes
    -----
    A value of exactly 0 maps to 0. Negative values produce NaN (log of a
    negative number); this is kept as-is so features stay compatible with
    models that were already trained on them.
    """
    to_log = list(_DEFAULT_LOG_COLUMNS if columns is None else columns)

    missing = [name for name in to_log if name not in df.columns]
    if missing:
        raise KeyError(f"make_logs: columns missing from frame: {missing}")

    #np.log is evaluated on every value before np.where selects, so zeros and
    #negatives would otherwise emit divide/invalid floating-point warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        for name in to_log:
            df[f"log_{name}"]= np.where((df[name])!= 0, np.log(df[name]),0)
    return df

In [5]:
# NOTE(review): this value is never read; LOADED is reassigned in the
# "Training models" cell before the first `if LOADED:` check.
LOADED = False
#Number of day lags: lag columns target_2_days_ago .. target_{N_day_lags}_days_ago are created
N_day_lags = 7

#Data transformation
# NOTE: the order of the steps below fixes the column order of `train`, and the
# models are fed `train.values`-style positional arrays, so do not reorder them.
dp = DataProcessing()
#Applying the Train function and storing our train data in the (train) variable
train = dp.feat_eng_train(train, client_df, hw_df, fw_df, electricity_df, gas_df, locations)
train = dp.create_revealed_targets_train(train, N_day_lags)
# NOTE(review): these apply sin/cos to the raw hour / day-of-year value and then
# scale by pi/12 (pi/183); this is not the usual cyclical encoding
# sin(2*pi*hour/24). The submission loop computes the features with the same
# formula, so both places must change together, and models retrained.
train['sin_hour']= (np.pi * np.sin(train['hour']) / 12)
train['cos_hour']= (np.pi * np.cos(train['hour']) / 12)
train['sin_dayofyear']= (np.pi * np.sin(train['dayofyear']) / 183)
train['cos_dayofyear']= (np.pi * np.cos(train['dayofyear']) / 183)
# Row-wise statistics over the lag columns (must run before make_logs, which
# log-transforms target_mean and target_std).
train = dp.get_agg_target_lag(train)
#Log columns with outliers
train = make_logs(train)
# Keep only rows whose year is 2022 or later
train=train[train.year >= 2022]
# Persist the engineered frame as train.csv in the current working directory
train.to_csv("train.csv")
    
    
print(f"Training the number of columns = {len(train.columns.tolist())} \n {sorted(train.columns.tolist())}")

Training the number of columns = 121 
 ['10_metre_u_wind_component_fcast_mean', '10_metre_u_wind_component_fcast_mean_by_county', '10_metre_v_wind_component_fcast_mean', '10_metre_v_wind_component_fcast_mean_by_county', 'cloudcover_high_fcast_mean', 'cloudcover_high_fcast_mean_by_county', 'cloudcover_high_hist_mean', 'cloudcover_high_hist_mean_by_county', 'cloudcover_low_fcast_mean', 'cloudcover_low_fcast_mean_by_county', 'cloudcover_low_hist_mean', 'cloudcover_low_hist_mean_by_county', 'cloudcover_mid_fcast_mean', 'cloudcover_mid_fcast_mean_by_county', 'cloudcover_mid_hist_mean', 'cloudcover_mid_hist_mean_by_county', 'cloudcover_total_fcast_mean', 'cloudcover_total_fcast_mean_by_county', 'cloudcover_total_hist_mean', 'cloudcover_total_hist_mean_by_county', 'cos_dayofyear', 'cos_hour', 'county', 'datetime', 'day', 'dayofweek', 'dayofyear', 'dewpoint_fcast_mean', 'dewpoint_fcast_mean_by_county', 'dewpoint_hist_mean', 'dewpoint_hist_mean_by_county', 'diffuse_radiation_hist_mean', 'diffus

## Training columns

In [6]:
#Rows whose is_consumption flag is non-zero feed the first model family; the rest feed the second
consumption_mask = train['is_consumption'] != 0

#Features as a numpy array (every column except the target), targets as a Series
X = train.loc[consumption_mask].drop(columns='target').values
y = train.loc[consumption_mask, 'target']

#Same split for the is_consumption == 0 rows | trained as a separate model family
X2 = train.loc[~consumption_mask].drop(columns='target').values
y2 = train.loc[~consumption_mask, 'target']

## Training models

In [7]:
# LOADED switches the training cells between two modes:
#   False -> create and train the models, then save them
#   True  -> load the already-trained models from disk
LOADED = True

# Hold out 25% of each data set for validation.
# NOTE(review): shuffle=True draws validation rows at random from all dates.
SEED = 73
split_options = dict(test_size=0.25, random_state=SEED, shuffle=True)
Xtrain, Xval, Ytrain, Yval = train_test_split(X, y, **split_options)
Xtrain2, Xval2, Ytrain2, Yval2 = train_test_split(X2, y2, **split_options)

In [8]:
DEVICE = "gpu" if torch.cuda.is_available() else "cpu"
# Settings shared by every model of the first ensemble.
common_params = {
    'n_iter':4000,'verbose': -1,'random_state':SEED,'objective':'tweedie',
    'device':DEVICE, 'n_jobs':4
}
# One parameter set per ensemble member. Every entry must spell the L1 term
# 'reg_alpha': a misspelled key is not a recognised LightGBM parameter, so the
# intended regularisation would silently not be applied when retraining.
m_model_params = [
    {**common_params,'learning_rate': 0.004811751415536496, 'colsample_bytree': 0.8841841689852410, 'colsample_bynode': 0.4305836942635745, 'reg_alpha': 3.11984157361821, 'reg_lambda': 1.088469732297296, 'min_data_in_leaf': 162, 'max_depth': 16, 'num_leaves': 435},
    {**common_params,'learning_rate': 0.007922862526647507, 'colsample_bytree': 0.9052952790963521, 'colsample_bynode': 0.4416947152746856, 'reg_alpha': 3.31471952672932, 'reg_lambda': 1.349570843308307, 'min_data_in_leaf': 185, 'max_depth': 18, 'num_leaves': 445},
    {**common_params,'learning_rate': 0.010339736147758608, 'colsample_bytree': 0.9263063801074632, 'colsample_bynode': 0.4527058263857967, 'reg_alpha': 3.62802063709343, 'reg_lambda': 1.650681954419419, 'min_data_in_leaf': 201, 'max_depth': 20, 'num_leaves': 455},
    {**common_params,'learning_rate': 0.013090718804096083, 'colsample_bytree': 0.9499770953943448, 'colsample_bynode': 0.4670163857441046, 'reg_alpha': 3.96946065556807, 'reg_lambda': 1.925712107567988, 'min_data_in_leaf': 223, 'max_depth': 22, 'num_leaves': 465},
    {**common_params,'learning_rate': 0.015559490612977255, 'colsample_bytree': 0.9682791614810814, 'colsample_bynode': 0.4722023075509447, 'reg_alpha': 4.15624585398345, 'reg_lambda': 2.265053303366992, 'min_data_in_leaf': 254, 'max_depth': 24, 'num_leaves': 475},
    {**common_params,'learning_rate': 0.018908744594789185, 'colsample_bytree': 0.9864875442500248, 'colsample_bynode': 0.4832525869590394, 'reg_alpha': 4.35845913192557, 'reg_lambda': 2.355521088983217, 'min_data_in_leaf': 289, 'max_depth': 26, 'num_leaves': 485},
    {**common_params,'learning_rate': 0.021819855605890296, 'colsample_bytree': 0.9995986553611359, 'colsample_bynode': 0.4953634970601405, 'reg_alpha': 4.58956024201648, 'reg_lambda': 2.616432197094328, 'min_data_in_leaf': 309, 'max_depth': 28, 'num_leaves': 495}
]

In [9]:
## First ensemble: one LightGBM model per entry of m_model_params
m_models = []
for idx, params in enumerate(m_model_params):
    model_file = f'/kaggle/input/energe-models/lgbm_models/m_model{idx}.txt'
    if not LOADED:
        # Train from scratch on the first data set, stopping early on the validation split
        lgbm_model = LGBMRegressor(**params)
        print('_______________________________________________________')
        print('Start')
        training_callbacks = [
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ]
        lgbm_model.fit(Xtrain, Ytrain, eval_set=[(Xval, Yval)], callbacks=training_callbacks)
        # Persist the underlying booster so a later run can take the LOADED path
        # NOTE(review): the target lies under /kaggle/input, presumably read-only on Kaggle; verify
        lgbm_model.booster_.save_model(model_file)
        print(f'Trained LGB model {idx} was saved!')
    else:
        # Reuse the booster saved by an earlier run
        lgbm_model = lgb.Booster(model_file=model_file)
        print(f'Trained LGB model {idx} was Loaded!')
    m_models.append(lgbm_model)

Trained LGB model 0 was Loaded!
Trained LGB model 1 was Loaded!
Trained LGB model 2 was Loaded!
Trained LGB model 3 was Loaded!
Trained LGB model 4 was Loaded!
Trained LGB model 5 was Loaded!
Trained LGB model 6 was Loaded!


In [10]:
# Run a garbage-collection pass before building the second ensemble
gc.collect()

0

In [11]:
## another 7 lgbm models
# Settings shared by every model of the second ensemble.
common_params = {
    'n_iter':4000,'verbose': -1,'random_state':SEED,'objective':'tweedie',
    'device':DEVICE, 'n_jobs':4
}
# One parameter set per ensemble member. The per-category minimum is spelled
# 'min_data_per_group' (singular), which is the name LightGBM recognises.
n_model_params =[
    {**common_params,'learning_rate': 0.006311751415536496, 'colsample_bytree': 0.8441841689852410, 'colsample_bynode': 0.4305836942635745, 'lambda_l1': 3.21984157361821, 'lambda_l2': 1.108469732297296, 'min_data_in_leaf': 53, 'max_depth': 10,  'min_data_per_group': 39,'num_leaves': 435},
    {**common_params,'learning_rate': 0.008228625036647597, 'colsample_bytree': 0.8652952790963521, 'colsample_bynode': 0.4416947152746856, 'lambda_l1': 3.41471952672932, 'lambda_l2': 1.349570843308307, 'min_data_in_leaf': 58, 'max_depth': 11,  'min_data_per_group': 49,'num_leaves': 445},
    {**common_params,'learning_rate': 0.010339736147758608, 'colsample_bytree': 0.8893063801074632, 'colsample_bynode': 0.4527058263857967, 'lambda_l1': 3.62802063709343, 'lambda_l2': 1.650681954419419, 'min_data_in_leaf': 63, 'max_depth': 13,  'min_data_per_group': 59,'num_leaves': 455},
    {**common_params,'learning_rate': 0.012090718804096083, 'colsample_bytree': 0.9099770953943448, 'colsample_bynode': 0.4670163857441046, 'lambda_l1': 3.86946065556807, 'lambda_l2': 1.925712107567988, 'min_data_in_leaf': 68, 'max_depth': 15,  'min_data_per_group': 69,'num_leaves': 465},
    {**common_params,'learning_rate': 0.014559490612977255, 'colsample_bytree': 0.9282791614810814, 'colsample_bynode': 0.4722023075509447, 'lambda_l1': 4.05624585398343, 'lambda_l2': 2.265053303366992, 'min_data_in_leaf': 73, 'max_depth': 17,  'min_data_per_group': 79,'num_leaves': 475},
    {**common_params,'learning_rate': 0.016908744594789185, 'colsample_bytree': 0.9534875442500248, 'colsample_bynode': 0.4832525869590394, 'lambda_l1': 4.25845913192557, 'lambda_l2': 2.555521088983217, 'min_data_in_leaf': 78, 'max_depth': 19,  'min_data_per_group': 89,'num_leaves': 485},
    {**common_params,'learning_rate': 0.018819855605890296, 'colsample_bytree': 0.9715986553611359, 'colsample_bynode': 0.4953634970601405, 'lambda_l1': 4.58956024201648, 'lambda_l2': 2.816432197094328, 'min_data_in_leaf': 83, 'max_depth': 21,  'min_data_per_group': 99,'num_leaves': 495}
]
print(n_model_params[0])

{'n_iter': 4000, 'verbose': -1, 'random_state': 73, 'objective': 'tweedie', 'device': 'cpu', 'n_jobs': 4, 'learning_rate': 0.006311751415536496, 'colsample_bytree': 0.844184168985241, 'colsample_bynode': 0.4305836942635745, 'lambda_l1': 3.21984157361821, 'lambda_l2': 1.108469732297296, 'min_data_in_leaf': 53, 'max_depth': 10, 'min_data_per_groups': 39, 'num_leaves': 435}


In [12]:
# Second ensemble: one LightGBM model per entry of n_model_params
n_models = []
for idx, params in enumerate(n_model_params):
    model_file = f'/kaggle/input/energe-models/lgbm_models/n_model{idx}.txt'
    if not LOADED:
        # Train from scratch on the second data set, stopping early on the validation split
        lgbm_model = LGBMRegressor(**params)
        print('_______________________________________________________')
        print('Start')
        training_callbacks = [
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ]
        lgbm_model.fit(Xtrain2, Ytrain2, eval_set=[(Xval2, Yval2)], callbacks=training_callbacks)
        # Persist the underlying booster so a later run can take the LOADED path
        # NOTE(review): the target lies under /kaggle/input, presumably read-only on Kaggle; verify
        lgbm_model.booster_.save_model(model_file)
        print(f'Trained LGB model {idx} was saved!')
    else:
        # Reuse the booster saved by an earlier run
        lgbm_model = lgb.Booster(model_file=model_file)
        print(f'Trained LGB model {idx} was Loaded!')
    n_models.append(lgbm_model)

Trained LGB model 0 was Loaded!
Trained LGB model 1 was Loaded!
Trained LGB model 2 was Loaded!
Trained LGB model 3 was Loaded!
Trained LGB model 4 was Loaded!
Trained LGB model 5 was Loaded!
Trained LGB model 6 was Loaded!


In [13]:
# Run a garbage-collection pass before the test-time processing starts
gc.collect()

0

## **Test function**

In [14]:
class TestDataProcessing():
    """Feature engineering for one test batch, mirroring the training pipeline.

    Methods are stateless (none of them reads ``self``):

    * ``feat_eng_test`` joins client, price and weather tables onto the batch.
    * ``create_revealed_targets_test`` attaches lagged targets from earlier batches.
    * ``get_agg_target_lag`` adds row-wise lag statistics and a NaN ``target`` column.
    """

    def feat_eng_test(self, data, client, hist_weather, forecast_weather, electricity, gas, locations):
        """Build the feature frame for one test batch.

        Every input is expected to carry a ``data_block_id`` column to merge on.
        ``locations`` must carry an ``'Unnamed: 0'`` column, which is dropped.
        NOTE: the rounded latitude/longitude values are written back into the
        ``hist_weather`` and ``forecast_weather`` frames passed in.

        Returns a DataFrame with ``row_id`` and ``data_block_id`` removed.
        """

        data= data.rename(columns={'prediction_datetime' : 'datetime'})
        data['datetime'] = pd.to_datetime(data['datetime'], utc=True)
        electricity = electricity.rename(columns= {'forecast_date' : 'datetime'})
        electricity['datetime'] = pd.to_datetime(electricity['datetime'], utc= True)
        locations = locations.drop('Unnamed: 0', axis= 1) 
        #Round coordinates to 1 decimal so they can be matched against the locations table
        forecast_weather[['latitude', 'longitude']] = forecast_weather[['latitude', 'longitude']].astype(float).round(1)
        forecast_weather= forecast_weather.merge(locations, how='left', on=['longitude','latitude'])
        forecast_weather.dropna(axis= 0, inplace= True)    
        forecast_weather['county'] = forecast_weather['county'].astype('int64')
        forecast_weather.drop(['origin_datetime', 'latitude', 'longitude', 'hours_ahead', 'data_block_id'], axis=1, inplace= True)
        forecast_weather.rename(columns={'forecast_datetime': 'datetime'}, inplace= True)
        forecast_weather['datetime']= pd.to_datetime(forecast_weather['datetime'], utc= True)
        #Hourly mean of every forecast column over all locations
        forecast_weather_datetime= forecast_weather.groupby([forecast_weather['datetime'].
                                                dt.to_period('h')])[list(forecast_weather.drop(['county',
                                                                                                'datetime'], axis= 1)
                                                                         .columns)].mean().reset_index()
        forecast_weather_datetime['datetime']= pd.to_datetime(
            forecast_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

        #Hourly mean of every forecast column per county
        forecast_weather_datetime_county= forecast_weather.groupby(['county',forecast_weather['datetime'].
                                  dt.to_period('h')])[list(forecast_weather.drop(['county',
                                                                                  'datetime'], axis= 1)
                                                           .columns)].mean().reset_index()
        forecast_weather_datetime_county['datetime']= pd.to_datetime(
            forecast_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)


        hist_weather[['latitude', 'longitude']] = hist_weather[['latitude', 'longitude']].astype(float).round(1)

        hist_weather= hist_weather.merge(locations, how='left', on=['longitude','latitude'])    

        hist_weather.dropna(axis= 0, inplace= True)

        hist_weather.drop(['latitude', 'longitude'], axis=1, inplace= True)

        hist_weather['county'] = hist_weather['county'].astype('int64')

        hist_weather['datetime']= pd.to_datetime(hist_weather['datetime'], utc= True)

        #Distinct (datetime, data_block_id) pairs | de-duplicated up front so the merges below
        #do not multiply every aggregated row by the number of weather points sharing a timestamp
        hist_block_ids= hist_weather[['datetime', 'data_block_id']].drop_duplicates()

        #Hourly mean of every historical column over all locations
        hist_weather_datetime= hist_weather.groupby([hist_weather['datetime'].dt.to_period('h')])
        
        hist_weather_datetime = hist_weather_datetime[list(hist_weather.drop(['county',
                                                                              'datetime',
                                                                              'data_block_id'], axis= 1).columns)
                                                                             ].mean().reset_index()    

        hist_weather_datetime['datetime']= pd.to_datetime(hist_weather_datetime['datetime'].dt.to_timestamp(), utc=True)
        hist_weather_datetime= hist_weather_datetime.merge(hist_block_ids, 
                                                           how='left', on='datetime')


        #Hourly mean of every historical column per county
        hist_weather_datetime_county = hist_weather.groupby(['county',
                                                            hist_weather['datetime'].dt.to_period('h')])
        hist_weather_datetime_county = hist_weather_datetime_county[list(hist_weather.drop(['county',
                                                                                            'datetime',
                                                                                            'data_block_id'], axis= 1).columns)].mean().reset_index() 
        hist_weather_datetime_county['datetime']= pd.to_datetime(hist_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)
        hist_weather_datetime_county= hist_weather_datetime_county.merge(hist_block_ids, how='left', on='datetime')

        #Calendar features
        data['year'] = data['datetime'].dt.year
        data['month'] = data['datetime'].dt.month
        data['day'] = data['datetime'].dt.day
        data['hour'] = data['datetime'].dt.hour
        data['dayofweek']= data['datetime'].dt.dayofweek
        data['dayofyear']= data['datetime'].dt.dayofyear

        electricity['hour'] = electricity['datetime'].dt.hour

        data= data.merge(client.drop(columns = ['date']), how='left', on=['data_block_id', 'county', 'is_business', 'product_type'])
        data= data.merge(gas[['data_block_id', 'lowest_price_per_mwh', 'highest_price_per_mwh']], how='left', on='data_block_id')
        data= data.merge(electricity[['euros_per_mwh', 'hour', 'data_block_id']], how='left', on=['hour', 'data_block_id'])
        data= data.merge(forecast_weather_datetime, how='left', on=['datetime'])
        data= data.merge(forecast_weather_datetime_county, how='left', on=['datetime', 'county'],
                         suffixes= ('_fcast_mean','_fcast_mean_by_county'))

        hist_weather_datetime['hour']= hist_weather_datetime['datetime'].dt.hour
        hist_weather_datetime_county['hour']= hist_weather_datetime_county['datetime'].dt.hour

        hist_weather_datetime.drop_duplicates(inplace=True)
        hist_weather_datetime_county.drop_duplicates(inplace=True)
        hist_weather_datetime.drop('datetime', axis= 1, inplace= True)
        hist_weather_datetime_county.drop('datetime', axis= 1, inplace= True)


        data= data.merge(hist_weather_datetime, how='left', on=['data_block_id', 'hour'])

        data= data.merge(hist_weather_datetime_county, how='left', on=['data_block_id', 'county', 'hour'],
                         suffixes= ('_hist_mean','_hist_mean_by_county'))

        #Filling gaps forward then backward inside each (year, day, hour) group
        data= data.groupby(['year', 'day', 'hour'], as_index=False).apply(lambda x: x.ffill().bfill()).reset_index()

        #level_0 / level_1 are the index levels produced by the groupby-apply above
        data.drop(['level_0', 'level_1', 'row_id', 'data_block_id'], axis= 1, inplace= True)

        return data

    def create_revealed_targets_test(self, data, previous_revealed_targets, N_day_lags):
        """Attach ``target_{k}_days_ago`` columns for k = 2 .. N_day_lags.

        Parameters
        ----------
        data : feature frame with ``hour``, ``prediction_unit_id`` and ``is_consumption``.
        previous_revealed_targets : list of revealed-target frames, most recent
            first; the frame at position ``i`` becomes lag ``i + 2``. The frames
            are not modified.
        N_day_lags : largest lag to expose.

        Returns
        -------
        ``data`` with the lag columns joined on (hour, prediction_unit_id,
        is_consumption). Lags that have no frame yet are added as all-NaN
        columns in ascending lag order, so the column order does not depend on
        set iteration order.
        """
        for count, revealed_targets in enumerate(previous_revealed_targets) :
            day_lag = count + 2
            # Get hour | assign() builds a new frame so the cached frame is left untouched
            revealed_targets = revealed_targets.assign(
                hour= pd.to_datetime(revealed_targets['datetime'], utc= True).dt.hour)
            # Select columns and rename target
            revealed_targets = revealed_targets[['hour', 'prediction_unit_id', 'is_consumption', 'target']]
            revealed_targets = revealed_targets.rename(columns = {"target" : f"target_{day_lag}_days_ago"})
            # Add past revealed targets
            data = pd.merge(data,
                            revealed_targets,
                            how = 'left',
                            on = ['hour', 'prediction_unit_id', 'is_consumption'],
                           )

        # If a revealed target column is not available yet, add it filled with nan
        for day_lag in range(2, N_day_lags+1):
            lag_column = f"target_{day_lag}_days_ago"
            if lag_column not in data.columns:
                data[lag_column] = np.nan
        
        return data

    def get_agg_target_lag(self, df):
        """Add row-wise mean/std/var/median/max/min over the ``*_days_ago`` columns.

        Also adds a ``target`` column filled with NaN. ``df`` is modified in
        place and returned.
        """

        tgt_lag_columns = [c for c in df.columns if '_days_ago' in c]

        for m in ['mean', 'std', 'var', 'median', 'max', 'min']:
            df[f'target_{m}'] = df[tgt_lag_columns].agg(m, axis=1)
            
        df['target'] = np.nan

        return df

## **Submission**

In [15]:
# Create the enefit environment and the iterator that yields the test batches
# consumed by the submission loop below.
env = enefit.make_env()
iter_test = env.iter_test()

In [16]:
def _weighted_ensemble_prediction(models, features, weights):
    """Return the weighted sum of the models' predictions, each clipped at 0.

    Raises ValueError when the number of weights and models differ, instead of
    silently ignoring the surplus.
    """
    if len(models) != len(weights):
        raise ValueError(f"expected {len(weights)} models, got {len(models)}")
    member_preds = []
    for model in models:
        pred = model.predict(features).clip(0)
        print(f"pred = {pred}")
        member_preds.append(pred)
    return sum(member_pred * weight for member_pred, weight in zip(member_preds, weights))


tdp = TestDataProcessing()
previous_revealed_targets = []
N_day_lags = 7
# Ensemble weights, one per model; the same weights are used for both model lists
model_weights = [0.12, 0.13, 0.17, 0.16, 0.15, 0.14, 0.13]
# Column every table gets so the training-style merges on data_block_id work
id_column = 'data_block_id'

for (test, revealed_targets, client_test, historical_weather_test,
     forecast_weather_test, electricity_test, gas_test, sample_prediction) in iter_test:
    
    # Rename test set to make consistent with train
    test = test.rename(columns = {'prediction_datetime': 'datetime'})
    
    # Initiate column data_block_id with default value to merge the data on
    test[id_column] = 0
    gas_test[id_column] = 0
    electricity_test[id_column] = 0
    historical_weather_test[id_column] = 0
    forecast_weather_test[id_column] = 0
    client_test[id_column] = 0
    revealed_targets[id_column] = 0
    
    data_test = tdp.feat_eng_test(test, client_test, historical_weather_test,
                                  forecast_weather_test, electricity_test, gas_test, locations)
    
    data_test['datetime']= pd.to_datetime(data_test['datetime'], utc= True).astype('int64')
    
    # Store revealed_targets, most recent first; keep at most N_day_lags - 1 frames
    # (position i in the list is used as lag i + 2)
    previous_revealed_targets.insert(0, revealed_targets)
    if len(previous_revealed_targets) == N_day_lags:
        previous_revealed_targets.pop()
    
    # Add previous revealed targets
    df_test = tdp.create_revealed_targets_test(data=data_test.copy(),
                                               previous_revealed_targets=previous_revealed_targets.copy(),
                                               N_day_lags=N_day_lags)
    #Data Transformation
    # NOTE(review): same formula as the training cell (sin/cos of the raw value
    # scaled by pi/12 or pi/183); keep the two in sync if it is ever changed.
    df_test['sin_hour']= (np.pi * np.sin(df_test['hour']) / 12)
    df_test['cos_hour']= (np.pi * np.cos(df_test['hour']) / 12)
    df_test['sin_dayofyear']= (np.pi * np.sin(df_test['dayofyear']) / 183)
    df_test['cos_dayofyear']= (np.pi * np.cos(df_test['dayofyear']) / 183)
    df_test = tdp.get_agg_target_lag(df_test)
        
    df_test = make_logs(df_test)
    df_test = df_test.drop('currently_scored', axis= 1)
    # Models receive features positionally, so the column order here matters
    X_test = df_test.values
    
    print(f"df_test the number of columns = {len(df_test.columns)}\n {sorted(df_test.columns.tolist())} ")

    #Predictions
    #weighted average of the first ensemble (trained on the is_consumption != 0 rows)
    test['target'] = _weighted_ensemble_prediction(m_models, X_test, model_weights)
    
    #repeat above process for target_solar (ensemble trained on the is_consumption == 0 rows)
    test['target_solar'] = _weighted_ensemble_prediction(n_models, X_test, model_weights)
    
    gc.collect()
    
    # Rows with is_consumption == 0 take the second ensemble's prediction
    test.loc[test['is_consumption']==0, "target"] = test.loc[test['is_consumption']==0, "target_solar"]  
    sample_prediction["target"] = test['target']
    
    #Sending predictions to the API
    env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
df_test the number of columns = 121
 ['10_metre_u_wind_component_fcast_mean', '10_metre_u_wind_component_fcast_mean_by_county', '10_metre_v_wind_component_fcast_mean', '10_metre_v_wind_component_fcast_mean_by_county', 'cloudcover_high_fcast_mean', 'cloudcover_high_fcast_mean_by_county', 'cloudcover_high_hist_mean', 'cloudcover_high_hist_mean_by_county', 'cloudcover_low_fcast_mean', 'cloudcover_low_fcast_mean_by_county', 'cloudcover_low_hist_mean', 'cloudcover_low_hist_mean_by_county', 'cloudcover_mid_fcast_mean', 'cloudcover_mid_fcast_mean_by_county', 'cloudcover_mid_hist_mean', 'cloudcover_mid_hist_mean_by_county', 'cloudcover_total_fcast_mean', 'cloudcover_total_fcast_mean_by_county', 'cloudcover_total_hist_mean', 'cloudcover_total_hist_mean_by_county', 'cos_dayofyear', 'cos_hour', 'county', 'datetime', 'day', 'dayofweek', 'dayofyear', 'dewpoint_fcast_mean', 'd