# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# 2. Load and Clean Data

In [2]:
df = pd.read_csv("energy.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16599 entries, 0 to 16598
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   States       16599 non-null  object 
 1   Regions      16599 non-null  object 
 2   latitude     16599 non-null  float64
 3   longitude    16599 non-null  float64
 4   Dates        16599 non-null  object 
 5   temperature  16599 non-null  float64
 6   humidity     16599 non-null  int64  
 7   Usage        16599 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 1.0+ MB


In [4]:
df['Dates'] = pd.to_datetime(df['Dates'], dayfirst = True)
print(f"Shape of Data :{df.shape}")
print(f"Date Range: {df['Dates'].min()} - {df['Dates'].max()}")
print(f"Unique States: {df['States'].nunique()}")

Shape of Data :(16599, 8)
Date Range: 2019-01-02 00:00:00 - 2020-12-05 00:00:00
Unique States: 33


In [5]:
df.head(10)

Unnamed: 0,States,Regions,latitude,longitude,Dates,temperature,humidity,Usage
0,Punjab,NR,31.519974,75.980003,2019-01-02,12.1,79,119.9
1,Haryana,NR,28.450006,77.019991,2019-01-02,13.9,72,130.3
2,Rajasthan,NR,26.449999,74.639981,2019-01-02,16.7,40,234.1
3,Delhi,NR,28.669993,77.230004,2019-01-02,13.7,72,85.8
4,UP,NR,27.599981,78.050006,2019-01-02,13.9,72,313.9
5,Uttarakhand,NR,30.320409,78.050006,2019-01-02,11.7,65,40.7
6,HP,NR,31.100025,77.166597,2019-01-02,2.2,71,30.0
7,J&K,NR,33.45,76.24,2019-01-02,-20.6,78,52.5
8,Chandigarh,NR,30.719997,76.780006,2019-01-02,12.9,71,5.0
9,Chhattisgarh,WR,22.09042,82.159987,2019-01-02,16.6,66,78.7


In [6]:
df = df.sort_values(by = ['States', 'Dates']).reset_index(drop = True)
df.head(10)

Unnamed: 0,States,Regions,latitude,longitude,Dates,temperature,humidity,Usage
0,Andhra Pradesh,SR,14.750429,78.570026,2019-01-02,23.1,56,164.6
1,Andhra Pradesh,SR,14.750429,78.570026,2019-01-03,24.0,57,170.1
2,Andhra Pradesh,SR,14.750429,78.570026,2019-01-04,24.0,59,165.2
3,Andhra Pradesh,SR,14.750429,78.570026,2019-01-05,23.5,64,167.4
4,Andhra Pradesh,SR,14.750429,78.570026,2019-01-06,23.7,60,171.2
5,Andhra Pradesh,SR,14.750429,78.570026,2019-01-07,24.1,57,166.4
6,Andhra Pradesh,SR,14.750429,78.570026,2019-01-08,25.1,48,160.8
7,Andhra Pradesh,SR,14.750429,78.570026,2019-01-09,25.5,41,163.0
8,Andhra Pradesh,SR,14.750429,78.570026,2019-01-10,24.2,58,168.8
9,Andhra Pradesh,SR,14.750429,78.570026,2019-01-11,24.8,63,167.7


# 3. Feature Engineering

In [7]:
# Temporal Features
df['DayOfWeek'] = df['Dates'].dt.dayofweek
df['IsWeekend'] = df['DayOfWeek'].isin([5,6]).astype(int)
df['Day'] = df['Dates'].dt.day
df['Month'] = df['Dates'].dt.month
df['Quarter'] = df['Dates'].dt.quarter

# Lag Features
for lag in [1,7,14]:
    df[f'Lag{lag}'] = df.groupby('States')['Usage'].shift(lag)

# Rolling Features
for win in [7, 14, 30]:
    df[f'Rolling{win}'] = df.groupby('States')['Usage'].transform(
        lambda x: x.shift(1).rolling(window=win, min_periods=1).mean())

# Rolling Standard(for volatility)
df['RollingStd7'] = df.groupby('States')['Usage'].transform(
    lambda x: x.shift(1).rolling(window=7, min_periods=1).std())

df.isnull().sum()

States           0
Regions          0
latitude         0
longitude        0
Dates            0
temperature      0
humidity         0
Usage            0
DayOfWeek        0
IsWeekend        0
Day              0
Month            0
Quarter          0
Lag1            33
Lag7           231
Lag14          462
Rolling7        33
Rolling14       33
Rolling30       33
RollingStd7     66
dtype: int64

# 4. Handling Missing Values

In [8]:
df['Lag1'] = df['Lag1'].fillna(df.groupby('States')['Usage'].transform('mean'))

df['Lag7'] = df['Lag7'].fillna(df.groupby('States')['Usage'].transform('mean'))

df['Lag14'] = df['Lag14'].fillna(df.groupby('States')['Usage'].transform('mean'))

df['RollingStd7'].fillna(0, inplace = True)

df = df.fillna(df.mean(numeric_only=True))

df.isnull().sum()

States         0
Regions        0
latitude       0
longitude      0
Dates          0
temperature    0
humidity       0
Usage          0
DayOfWeek      0
IsWeekend      0
Day            0
Month          0
Quarter        0
Lag1           0
Lag7           0
Lag14          0
Rolling7       0
Rolling14      0
Rolling30      0
RollingStd7    0
dtype: int64

In [9]:
df.to_csv("energy_with_features.csv", index=False)

# 5. Train Test Split

In [10]:
df.columns

Index(['States', 'Regions', 'latitude', 'longitude', 'Dates', 'temperature',
       'humidity', 'Usage', 'DayOfWeek', 'IsWeekend', 'Day', 'Month',
       'Quarter', 'Lag1', 'Lag7', 'Lag14', 'Rolling7', 'Rolling14',
       'Rolling30', 'RollingStd7'],
      dtype='object')

In [11]:
features = ['latitude', 'longitude', 'temperature', 'humidity', 'DayOfWeek',
            'IsWeekend', 'Day', 'Month', 'Quarter', 'Lag1', 'Lag7', 'Lag14', 'Rolling7',
            'Rolling14', 'Rolling30', 'RollingStd7']

X = df[features]
y = df['Usage']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, shuffle = False)

print(f"Train Sample: {X_train.shape[0]}")
print(f"Test Sample: {X_test.shape[0]}")

Train Sample: 13279
Test Sample: 3320


# Evaluation Helper

In [12]:
def evaluate(name, model, X_tr, y_tr, X_te, y_te):
    ytr_p = model.predict(X_tr)
    yte_p = model.predict(X_te)
    tr_r2   = r2_score(y_tr, ytr_p)
    te_r2   = r2_score(y_te, yte_p)
    tr_mae  = mean_absolute_error(y_tr, ytr_p)
    te_mae  = mean_absolute_error(y_te, yte_p)
    tr_rmse = np.sqrt(mean_squared_error(y_tr, ytr_p))
    te_rmse = np.sqrt(mean_squared_error(y_te, yte_p))
    te_mape = np.mean(np.abs((y_te - yte_p) / y_te)) * 100
    gap = abs(tr_r2 - te_r2)
    status = '✓ Good' if gap < 0.05 else ('⚠ Mild overfit' if gap < 0.1 else '❌ Overfitting')

    print(f'\n{"="*55}')
    print(f'  {name}')
    print(f'{"="*55}')
    print(f'  {"Metric":20s}  {"Train":>10}  {"Test":>10}')
    print(f'  {"-"*44}')
    print(f'  {"R²":20s}  {tr_r2:>10.4f}  {te_r2:>10.4f}')
    print(f'  {"MAE":20s}  {tr_mae:>10.2f}  {te_mae:>10.2f}')
    print(f'  {"RMSE":20s}  {tr_rmse:>10.2f}  {te_rmse:>10.2f}')
    print(f'  {"MAPE (Test)":20s}  {"": >10}  {te_mape:>9.2f}%')
    print(f'  Overfit check (R² gap={gap:.4f}): {status}')
    return {'name': name, 'Test R²': round(te_r2,4), 'Test MAE': round(te_mae,2),
            'Test RMSE': round(te_rmse,2), 'MAPE': round(te_mape,2), 'preds': yte_p}

# 7. Model1 - Ridge Regression

In [13]:
ridge = Ridge(alpha = 10)
ridge.fit(X_train, y_train)
res_ridge = evaluate('Ridge Regression (alpha=10)', ridge, X_train, y_train, X_test, y_test)


  Ridge Regression (alpha=10)
  Metric                     Train        Test
  --------------------------------------------
  R²                        0.9867      0.9752
  MAE                         5.34        9.71
  RMSE                       12.72       19.97
  MAPE (Test)                           10.82%
  Overfit check (R² gap=0.0115): ✓ Good


# 8. Model2 - Lasso Regression

In [14]:
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)
res_lasso = evaluate('Lasso Regression (alpha=1)', lasso, X_train, y_train, X_test, y_test)

zeroed = [f for f, c in zip(features, lasso.coef_) if c == 0]
print(f'\nFeatures zeroed by Lasso: {zeroed if zeroed else "None"}')


  Lasso Regression (alpha=1)
  Metric                     Train        Test
  --------------------------------------------
  R²                        0.9867      0.9751
  MAE                         5.27        9.67
  RMSE                       12.73       20.00
  MAPE (Test)                            9.16%
  Overfit check (R² gap=0.0116): ✓ Good

Features zeroed by Lasso: ['latitude', 'longitude', 'temperature', 'DayOfWeek', 'IsWeekend', 'Month', 'Quarter']


# 10. Model3 - Random Forest(Tuned with time series)

In [15]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {
    'n_estimators':      [100, 200, 300],
    'max_depth':         [6, 8, 10, 12],
    'min_samples_split': [5, 10],
    'min_samples_leaf':  [3, 5, 8],
    'max_features':      ['sqrt', 0.5]
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
search = RandomizedSearchCV(rf, param_grid, n_iter=20, cv=tscv,
                             scoring='r2', random_state=42, n_jobs=-1, verbose=1)
search.fit(X_train, y_train)

print(f'\nBest params: {search.best_params_}')
print(f'Best CV R²:  {search.best_score_:.4f}')

Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best params: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'max_depth': 6}
Best CV R²:  0.8295


In [16]:
best_rf = search.best_estimator_
res_rf = evaluate('Random Forest (Tuned)', best_rf, X_train, y_train, X_test, y_test)


  Random Forest (Tuned)
  Metric                     Train        Test
  --------------------------------------------
  R²                        0.9859      0.9530
  MAE                         6.67       14.92
  RMSE                       13.09       27.47
  MAPE (Test)                           16.60%
  Overfit check (R² gap=0.0329): ✓ Good


# 11. Creating Pickel Files

In [17]:
import joblib

joblib.dump(ridge, 'ridge_model.pkl')
print("Model saved as 'ridge_model.pkl'")

joblib.dump(lasso, 'lasso_model.pkl')
print("Model saved as 'lasso_model.pkl'")

joblib.dump(best_rf, 'best_rf_model.pkl')
print("Model saved as 'best_rf_model.pkl'")

Model saved as 'ridge_model.pkl'
Model saved as 'lasso_model.pkl'
Model saved as 'best_rf_model.pkl'
