In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import logging
import os
import sys
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor

In [2]:
log_dir = 'logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

log_filename = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
log_filepath = os.path.join(log_dir, log_filename)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_filepath
)

In [3]:
class CustomException(Exception):
    def __init__(self,error_message,error_details:sys):
        self.error_message = error_message
        _,_,exc_tb = error_details.exc_info()
        
        self.lineno=exc_tb.tb_lineno
        self.file_name=exc_tb.tb_frame.f_code.co_filename 
    def __str__(self):
        logging.error("Error occured in python script name [{0}] line number [{1}] error message [{2}]".format(
        self.file_name, self.lineno, str(self.error_message)))
        return "Error occured in python script name [{0}] line number [{1}] error message [{2}]".format(
        self.file_name, self.lineno, str(self.error_message))

In [4]:
try:
    logging.info(f'Loading data')
    df=pd.read_excel(r'datasets\city-power-consumption-cleaned.xlsx')
    logging.info(f'Data Loaded successfully')
    df.head(5)
except Exception as ex:
    raise CustomException(ex, sys)

In [5]:
def feature_importance_plot(model, features):
    feature_importances = model.feature_importances_

    # Create a DataFrame for better visualization
    features_df = pd.DataFrame({
        'Feature': features,
        'Importance': feature_importances
    })

    # Sort features by importance
    features_df = features_df.sort_values(by='Importance', ascending=False)

    print("\n--- Feature Importances from RandomForestRegressor ---")
    print(features_df)

In [12]:
def build_model(df):
    try:
        X = df.drop(columns=['power_consumption'])
        y = df['power_consumption']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
            
            # Scale the features
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
            
        # Define models
        models = {
            'Linear Regression': LinearRegression(),
            'Random Forest': RandomForestRegressor(n_estimators=200, random_state=32),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=32),
            'XG Boosting': XGBRegressor(n_estimators=200, random_state=32)
        }
            
        results = {}
        for name, model in models.items():
            # Train the model
            if name in ['Linear Regression', 'Ridge Regression', 'Lasso Regression']:
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)
            else:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
            
            # Calculate metrics
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            results[name] = {
                'model': model,
                'mse': mse,
                'rmse': rmse,
                'mae': mae,
                'r2_score': r2,
                'predictions': y_pred,
                'actual': y_test
            }
            
            print(f"\n{name} Performance:")
            print(f"R² Score: {r2:.4f}")
            print(f"RMSE: {rmse:.4f}")
            print(f"MAE: {mae:.4f}") 
        return results,scaler
    except Exception as ex:
        raise CustomException(ex,sys)


In [7]:
build_model(df)


Random Forest Performance:
R² Score: 0.6396
RMSE: 4837.2694
MAE: 3348.7260


({'Random Forest': {'model': RandomForestRegressor(random_state=32),
   'mse': 23399175.65268575,
   'rmse': np.float64(4837.269441811749),
   'mae': 3348.7260319012985,
   'r2_score': 0.6396205983334566,
   'predictions': array([27782.118344 , 30892.3141874, 34212.4638653, ..., 26389.0978315,
          38240.0061779, 34563.3393777], shape=(10517,)),
   'actual': 15782    26326.11410
   42971    31648.49015
   21980    36263.84106
   34653    37072.40844
   13603    34894.81163
               ...     
   11894    33640.85106
   36245    34012.03540
   50290    21767.30038
   35116    40708.67257
   16282    24174.63940
   Name: power_consumption, Length: 10517, dtype: float64}},
 RobustScaler())

In [None]:
# Extract additional polynomial feature
df_enhanced = df.copy()
df_enhanced['temp_squared'] = df_enhanced['temperature'] ** 2
df_enhanced['temp_cubed'] = df_enhanced['temperature'] ** 3
df_enhanced['temp_humidity_interaction'] = df_enhanced['temperature'] * df_enhanced['humidity']
df_enhanced['temp_wind_interaction'] = df_enhanced['temperature'] * df_enhanced['wind_speed']
    
df_enhanced['humidity_squared'] = df_enhanced['humidity'] ** 2
df_enhanced['humidity_wind_interaction'] = df_enhanced['humidity'] * df_enhanced['wind_speed']
    
df_enhanced['wind_speed_squared'] = df_enhanced['wind_speed'] ** 2
df_enhanced['wind_power'] = df_enhanced['wind_speed'] ** 3 
 
df_enhanced

Unnamed: 0,temperature,humidity,wind_speed,general_diffuse_flows,diffuse_flows,air_quality_index,cloudiness,power_consumption,temp_squared,temp_cubed,temp_humidity_interaction,temp_wind_interaction,humidity_squared,humidity_wind_interaction,wind_speed_squared,wind_power
0,6.559000,73.80,0.083,0.051,0.119,158,1,34055.69620,43.020481,282.171335,484.054200,0.544397,5446.4400,6.12540,0.006889,0.000572
1,6.414000,74.50,0.083,0.070,0.085,159,1,29814.68354,41.139396,263.868086,477.843000,0.532362,5550.2500,6.18350,0.006889,0.000572
2,6.313000,74.50,0.080,0.062,0.100,151,1,29128.10127,39.853969,251.598106,470.318500,0.505040,5550.2500,5.96000,0.006400,0.000512
3,6.121000,75.00,0.083,0.091,0.096,151,1,28228.86076,37.466641,229.333310,459.075000,0.508043,5625.0000,6.22500,0.006889,0.000572
4,5.921000,75.70,0.081,0.048,0.085,154,1,27335.69620,35.058241,207.579845,448.219700,0.479601,5730.4900,6.13170,0.006561,0.000531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52578,18.827384,75.60,0.094,436.200,38.960,157,0,34444.55696,354.470387,6673.750076,1423.350228,1.769774,5715.3600,7.10640,0.008836,0.000831
52579,18.827384,73.50,0.089,451.600,37.730,157,0,34438.48101,354.470387,6673.750076,1383.812721,1.675637,5402.2500,6.54150,0.007921,0.000705
52580,18.827384,70.10,0.085,466.400,37.490,157,0,33873.41772,354.470387,6673.750076,1319.799616,1.600328,4914.0100,5.95850,0.007225,0.000614
52581,18.827384,67.75,0.086,477.000,37.370,152,0,33988.86076,354.470387,6673.750076,1275.555263,1.619155,4590.0625,5.82650,0.007396,0.000636


In [13]:
model_results, scaler = build_model(df_enhanced)
# for key_item, value_item in model_results.items():
#     print(key_item)
#     print(value_item)
#     print('-'*10)


Linear Regression Performance:
R² Score: 0.3584
RMSE: 6454.3365
MAE: 5195.3842

Random Forest Performance:
R² Score: 0.6620
RMSE: 4684.8768
MAE: 3224.1344

Gradient Boosting Performance:
R² Score: 0.4612
RMSE: 5914.7635
MAE: 4646.3932

XG Boosting Performance:
R² Score: 0.6105
RMSE: 5028.8911
MAE: 3643.0510


In [None]:
for key_item, value_item in model_results.items():
    print(key_item)
    feature_importance_plot(value_item['model'], df_enhanced.drop(columns=['power_consumption']).columns)

In [None]:
def optimize_best_model(X, y):
    """Optimize hyperparameters for the best performing model"""
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }
    model = RandomForestRegressor(random_state=42)
    
    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    print(f"\nOptimized {model_type} Performance:")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    
    return best_model, grid_search.best_params_

In [None]:
X = df_enhanced.drop(columns=['power_consumption'])
y = df_enhanced['power_consumption']
optimize_best_model(X, y)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
