# Forecasting Daily Electrical Energy Consumption with LightGBM

This notebook demonstrates how to use Light Gradient Boosting Machine (LightGBM) to forecast the daily consumption of electrical energy.

In [2]:
import pandas as pd
import numpy as np

In [None]:
# Download the CSV files from Kaggle and read them.
# Each 'path' below is a placeholder: replace it with the location of the
# corresponding file before running the notebook.

train= pd.read_csv('path')

weather= pd.read_csv('path')

building= pd.read_csv('path')

In [None]:
# Number of rows in the training table
print(train.shape[0])

# EDA & FE (Exploratory Data Analysis and Feature Engineering)

In [None]:
# Column names of the training table
print(f"train columns are : {train.columns}")

In [None]:
# Column names of the weather table
print(f"weather columns are : {weather.columns}")

In [None]:
# Column names of the building table
print(f"building columns are : {building.columns}")

In [None]:
# Attach the building attributes to every training row (left join keeps all training rows)
data = pd.merge(train, building, how='left', on='building_id')

In [None]:
# Attach the weather observation that matches each row's site and timestamp
data = pd.merge(data, weather, how='left', on=['site_id', 'timestamp'])

In [None]:
# Inspect the columns of the merged frame
data.columns

In [None]:
import gc

In [None]:
# Free memory: after the two merges above everything needed lives in `data`,
# and none of the three source frames (including the large `train` table)
# is read again in the cells that follow.
del train, weather, building
gc.collect();

In [None]:
# Column dtypes before downcasting
data.dtypes

Let's save some memory

In [None]:
# Target dtype for each column of the merged frame; the narrower types
# reduce the memory used by `data`.
d_types = {
    "building_id": np.int16,
    "meter": np.int8,
    "site_id": np.int8,
    "primary_use": "category",
    "square_feet": np.int32,
    "year_built": np.float16,       # possible alternative: nullable Int16 if there are no fractions
    "floor_count": np.float16,      # possible alternative: nullable Int8/Int16 if these are integer counts
    "air_temperature": np.float32,
    "cloud_coverage": np.float16,   # possible alternative: nullable Int8 if these are bounded counts
    "dew_temperature": np.float32,
    "precip_depth_1_hr": np.float16,
    "sea_level_pressure": np.float32,
    "wind_direction": np.float16,
    "wind_speed": np.float32,
    "timestamp": "datetime64[ns]"        # convert the timestamp column to datetime
}



# Apply all conversions in one call
data=data.astype(d_types,copy=False)

In [None]:
# Column dtypes after downcasting
data.dtypes

In [None]:
def Break_timestamp(df):
  """Add calendar feature columns derived from ``df['timestamp']``.

  ``timestamp`` is re-parsed with ``pd.to_datetime`` and the columns
  ``hour``, ``dayofweek``, ``month``, ``dayofyear``, ``day`` and ``year``
  are written to ``df`` itself. The same frame is returned.
  """
  df['timestamp'] = pd.to_datetime(df['timestamp'])

  # (column name == `.dt` attribute name, unsigned type used to store it)
  calendar_parts = (
      ('hour', np.uint8),
      ('dayofweek', np.uint8),
      ('month', np.uint8),
      ('dayofyear', np.uint16),
      ('day', np.uint16),
      ('year', np.uint16),
  )
  for part_name, to_unsigned in calendar_parts:
    df[part_name] = to_unsigned(getattr(df['timestamp'].dt, part_name))

  return df

In [None]:
# Adds the calendar columns to `data` in place; the trailing ';' hides the returned frame
Break_timestamp(data);
# Check that the new columns are present
data.columns

In [None]:
def missing_values(df):
  """Print the percentage of null entries in every column of ``df``.

  The report is printed as a one-column DataFrame; nothing is returned.
  """
  null_counts = df.isnull().sum()
  percent_missing = null_counts * 100 / len(df)
  report = percent_missing.to_frame(name='Percentage of missing values')
  print(report)


In [None]:
# Share of missing values per column before any imputation
missing_values(data)

In [None]:
# Keep only the rows whose meter reading is different from zero
data = data.loc[data['meter_reading'].ne(0)]

In [None]:
# Columns still present after filtering out the zero readings
data.columns

In [None]:
# Remove the two building attributes that are not used further (done in place)
data.drop(columns=['year_built', 'floor_count'], inplace=True)

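**Smart fill (group mean)** → each missing weather value is replaced by the mean of its (site, day, month, year) group, which keeps the local site/date context.
If every row in a group is missing (i.e. there is no data at all for that group combination), the group mean is itself NaN, so nothing gets filled ❌

**Backup fill (global median)** → any value that is still missing after the group fill is replaced by the column's global median, which guarantees that no NaNs survive.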



In [None]:
def nan_filler(df):
  """Impute missing weather values in ``df`` and return the same frame.

  For every weather column, missing entries are first filled with the mean
  of their (site_id, day, month, year) group; whatever is still missing is
  then filled with the median of the whole column. ``df`` is modified in
  place.
  """
  group_keys = ['site_id', 'day', 'month', 'year']
  weather_columns = [
        'air_temperature', 'dew_temperature', 'cloud_coverage',
        'sea_level_pressure', 'precip_depth_1_hr',
        'wind_direction', 'wind_speed'
    ]

  for name in weather_columns:
    # Step 1: mean of the rows sharing the same site and calendar day
    per_group_mean = df.groupby(group_keys)[name].transform('mean')
    df[name] = df[name].fillna(per_group_mean)

    # Step 2: fallback for groups without any observation
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)

  return df

In [None]:
# nan_filler fills `data` in place and also returns it; the trailing ';' keeps
# the notebook from rendering the whole frame as the cell output.
nan_filler(data);

In [None]:
# Share of missing values per column after the imputation
missing_values(data)

In [None]:
def get_season(df):
  """Return the season name for a row-like object with a ``'month'`` entry.

  Months 12, 1, 2 map to 'Winter', 3-5 to 'Spring', 6-8 to 'Summer' and
  9-11 to 'Autumn'. Any other value yields ``None``.
  """
  month = df['month']
  season_months = (
      ('Winter', [12, 1, 2]),
      ('Spring', [3, 4, 5]),
      ('Summer', [6, 7, 8]),
      ('Autumn', [9, 10, 11]),
  )
  for season_name, months in season_months:
    if month in months:
      return season_name
  return None

In [None]:
# Vectorised month -> season lookup. It yields the same labels as calling
# get_season on every row, without a Python-level call per row.
data['Season'] = data['month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
})

In [None]:
def get_day_time(df):
  """Return 'Day' when the row's ``'hour'`` is between 6 and 18 inclusive, else 'Night'."""
  hour = df['hour']
  return 'Day' if 6 <= hour <= 18 else 'Night'



In [None]:
# Vectorised Day/Night flag: hours 6..18 (both inclusive) are 'Day', the rest
# 'Night'. It yields the same labels as calling get_day_time on every row,
# without a Python-level call per row.
data['DateTime'] = data['hour'].between(6, 18).map({True: 'Day', False: 'Night'})

In [None]:
# List the distinct values of `primary_use`
print(pd.DataFrame({
    'Builndings Categories': data['primary_use'].unique()
}))

In [None]:
# Independent copy of the rows whose `meter` code is 0
Elec_df = data[data['meter'].eq(0)].copy()

# Labeling the Data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode every object/category column of a DataFrame
def label_encode_categoricals(df):
    """Replace each object/category column of ``df`` with integer labels.

    Values are converted to lower-case strings before encoding. The encoder
    is re-fitted for every column, and ``df`` is modified in place and
    returned.
    """
    encoder = LabelEncoder()
    text_like_columns = list(df.select_dtypes(include=['object', 'category']).columns)

    for name in text_like_columns:
        lowered = df[name].astype(str).str.lower()
        df[name] = lowered
        df[name] = encoder.fit_transform(df[name])

    return df

# Apply label encoding to categorical columns.
# label_encode_categoricals modifies its argument and returns it, so
# `Elec_encode` and `Elec_df` are two names for the same DataFrame.
Elec_encode = label_encode_categoricals(Elec_df)

In [None]:
# Preview the first rows only instead of rendering the whole encoded frame
Elec_encode.head()

In [None]:
# `Elec_df` is only a second name for the frame also bound to `Elec_encode`
# (label_encode_categoricals returns its argument), so deleting it alone
# releases nothing. The full merged `data` frame is not read again below;
# dropping it as well is what actually frees memory.
del Elec_df, data
gc.collect();

# Let's visualise the data as a time series

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def lineplot_of_the_data(df):
  """Plot the total ``meter_reading`` per day of year, one line per year.

  ``df`` needs the columns ``timestamp``, ``year``, ``dayofyear`` and
  ``meter_reading``. The figure is shown with ``plt.show()``; nothing is
  returned.
  """
  ordered = df.sort_values(by='timestamp')

  # One row per (year, dayofyear) holding the summed readings of that day
  daily = (
      ordered
      .groupby(['year', 'dayofyear'])['meter_reading']
      .sum()
      .reset_index()
  )

  ax = sns.lineplot(data=daily, x='dayofyear', y='meter_reading', hue='year')
  ax.set_title("Daily Consumption by Year")
  ax.set_xlabel("Day of Year")
  ax.set_ylabel("Total Meter Reading")
  plt.show()



In [None]:
# Daily totals of the encoded electricity frame, one line per year
lineplot_of_the_data(Elec_encode)

# Machine Learning
## Forecasting using LightGBM

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import ParameterGrid

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, ParameterGrid
from lightgbm import LGBMRegressor

def rmsle_metric(y_true, y_pred):
    """Root mean squared logarithmic error between ``y_true`` and ``y_pred``.

    Predictions are clipped at zero before ``log1p`` is applied; the true
    values are used as given.
    """
    actual_log = np.log1p(np.asarray(y_true))
    predicted_log = np.log1p(np.clip(np.asarray(y_pred), 0, None))
    return np.sqrt(mean_squared_error(actual_log, predicted_log))

def LGBM(data, params):
    """Fit a LightGBM regressor on the earliest 85% of rows and score daily totals on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``timestamp`` and ``meter_reading``. Every other column
        (except a column named ``date``, if present) is used as a feature.
        The frame passed in is not modified.
    params : dict
        Keyword arguments forwarded to ``LGBMRegressor``.

    Returns
    -------
    tuple
        ``(rmsle_score, rmse_score)``, both computed on the per-day sums of
        the true and predicted readings of the hold-out rows. Both values
        are also printed.
    """
    # sort_values returns a new frame, so no extra .copy() of the data is needed
    ordered = data.sort_values('timestamp')
    # Calendar day of every row; kept outside the feature matrix (evaluation only)
    dates = pd.to_datetime(ordered['timestamp']).dt.floor('D')

    # errors='ignore': a pre-existing 'date' column is dropped, a missing one is fine
    X = ordered.drop(columns=['timestamp', 'meter_reading', 'date'], errors='ignore')
    y = ordered['meter_reading']

    # Time-aware split. The dates are split together with X and y so the test
    # dates line up with the test rows by position, not by index label.
    X_train, X_test, y_train, y_test, _, test_dates = train_test_split(
        X, y, dates, train_size=0.85, shuffle=False
    )

    # Train on log target
    y_train_log = np.log1p(y_train)
    model = LGBMRegressor(verbosity=-1, **params)
    model.fit(X_train, y_train_log)

    # Predict and back-transform
    y_pred = np.expm1(model.predict(X_test))

    # Aggregate hold-out rows to daily totals (groupby sorts the days chronologically)
    daily = (
        pd.DataFrame({
            'date': test_dates.values,
            'y_true': y_test.values,
            'y_pred': y_pred
        })
        .groupby('date')[['y_true', 'y_pred']]
        .sum()
    )

    # Metrics on daily totals
    rmse_score = np.sqrt(mean_squared_error(daily['y_true'], daily['y_pred']))
    rmsle_score = rmsle_metric(daily['y_true'], daily['y_pred'])

    print(f'Daily RMSE: {rmse_score:.3f}')
    print(f'Daily RMSLE: {rmsle_score:.3f}')
    return rmsle_score, rmse_score







In [None]:
# ---- Grid search loop ----
# NOTE: every candidate is scored by LGBM() on the same hold-out rows that are
# used for the final evaluation, so the winning score is an optimistic estimate.
grid = {
    'learning_rate': [0.05, 0.1, 1],
    'max_depth': [5, 7],
    'n_estimators': [200, 400],
    'subsample': [0.9, 1.0],
    # LightGBM only applies `subsample` when bagging is switched on with
    # subsample_freq > 0; without it the two `subsample` values give the same fit.
    'subsample_freq': [1],
    'colsample_bytree': [0.8, 0.9],
    'min_child_samples': [20, 100],
    'reg_lambda': [0.0, 0.1],
    'reg_alpha': [0.0]
}

# (lowest RMSLE seen so far, parameters that produced it)
best = (float('inf'), None)
for p in ParameterGrid(grid):
    rmsle_score, rmse_score = LGBM(Elec_encode, p)
    if rmsle_score < best[0]:
        best = (rmsle_score, p)

print("Best RMSLE:", best[0])
print("Best params:", best[1])

In [None]:
# Hyper-parameters used for the final run.
# NOTE(review): presumably copied from an earlier grid-search result — confirm
# they still match the `best[1]` printed by the search.
best_parameters = dict(
    learning_rate=1,
    max_depth=7,
    n_estimators=200,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0,
    reg_lambda=0.1,
    min_child_samples=20,
)

In [None]:
# Evaluate the model once more with the selected hyper-parameters
rmsle_score, rmse_score = LGBM(Elec_encode, best_parameters)

# Report the daily RMSLE with four decimals
print(f'RMSLE: {rmsle_score: .4f}')


Daily RMSE: 352586.469
Daily RMSLE: 0.061
RMSLE:  0.0611


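The best daily RMSLE on the hold-out period is ≈ 0.06. Note that the hyper-parameters were selected on this same hold-out set, so the score is an optimistic estimate of performance on unseen data.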