In [1]:
# Importing the required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [2]:
# Load datasets

# Single place to point the notebook at the project data; edit DATA_DIR if the
# CSV files live somewhere else on your machine.
DATA_DIR = "/Users/inutenneti/Desktop/Winter Quarter/Predictive Analytics 2/project"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV = f"{DATA_DIR}/test.csv"

# Columns that are not relevant to our analysis
IRRELEVANT_COLUMNS = ['building_name', 'site_name', 'date']

# Train data
# Read only the header row so that the first column can be left out of the real
# load (presumably a saved row index -- TODO confirm against the CSV).
temp_df = pd.read_csv(TRAIN_CSV, nrows=0)
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[1:]
train_data = pd.read_csv(TRAIN_CSV, usecols=columns_to_use)

# Test data, restricted to the same columns as the training data
test_data = pd.read_csv(TEST_CSV, usecols=columns_to_use)

# Drop the irrelevant columns and index both frames by building_id for further
# assessment. New frames are assigned (no inplace=True), so re-running this cell
# always starts again from the freshly loaded data.
train_data = train_data.drop(columns=IRRELEVANT_COLUMNS).set_index('building_id')
test_data = test_data.drop(columns=IRRELEVANT_COLUMNS).set_index('building_id')

In [3]:
# Keep only the rows recorded by 'solar' meters, then discard the now-constant
# 'meter' column from both frames.
# NOTE(review): this rebinds train_data/test_data without the 'meter' column, so
# the per-meter cells further down (which read train_data['meter']) need the
# frames as they were before this cell ran.
train_data, test_data = (
    frame.loc[frame['meter'] == 'solar'].drop(columns=['meter'])
    for frame in (train_data, test_data)
)

In [4]:
# Peek at two randomly sampled rows of each frame, with a dashed rule between them
print(
    train_data.sample(2),
    '-------------------------------------------------------------',
    test_data.sample(2),
    sep='\n',
)

             meter_reading sub_primaryspaceusage      sqm      sqft  \
building_id                                                           
71                    8.42              Academic  10551.9  113580.0   
71                  103.03              Academic  10551.9  113580.0   

                timezone  airTemperature  cloudCoverage  dewTemperature  \
building_id                                                               
71           US/Mountain        5.466886       2.170902        1.214254   
71           US/Mountain       15.610307       2.110612        8.263158   

             precipDepth1HR  precipDepth6HR  seaLvlPressure  windDirection  \
building_id                                                                  
71                 1.619233       11.996276     1024.178292     197.804775   
71                 1.006206       12.470690     1015.009984     155.936066   

             windSpeed  season  site_id  
building_id                              
71            3.2

In [5]:
# Separating into X (features) and y (target) dataframes
def _split_features_target(frame, target='meter_reading'):
    """Return ``(features, target)``: the frame without ``target``, and the ``target`` column."""
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = _split_features_target(train_data)
X_test, y_test = _split_features_target(test_data)

In [6]:
# Treat the numeric 'site_id' as a categorical label rather than a quantity
X_train = X_train.astype({'site_id': 'category'})
X_test = X_test.astype({'site_id': 'category'})

In [7]:
# Show each feature column's dtype, followed by the column labels themselves
for schema_view in (X_train.dtypes, X_train.columns):
    print(schema_view)

sub_primaryspaceusage      object
sqm                       float64
sqft                      float64
timezone                   object
airTemperature            float64
cloudCoverage             float64
dewTemperature            float64
precipDepth1HR            float64
precipDepth6HR            float64
seaLvlPressure            float64
windDirection             float64
windSpeed                 float64
season                     object
site_id                  category
dtype: object
Index(['sub_primaryspaceusage', 'sqm', 'sqft', 'timezone', 'airTemperature',
       'cloudCoverage', 'dewTemperature', 'precipDepth1HR', 'precipDepth6HR',
       'seaLvlPressure', 'windDirection', 'windSpeed', 'season', 'site_id'],
      dtype='object')


In [3]:
# Column groups consumed by the preprocessing ColumnTransformer.
# Numerical columns: building size first, then the weather measurements.
_building_size_columns = ['sqm', 'sqft']
_weather_columns = [
    'airTemperature',
    'cloudCoverage',
    'dewTemperature',
    'precipDepth1HR',
    'precipDepth6HR',
    'seaLvlPressure',
    'windDirection',
    'windSpeed',
]
numerical_features = _building_size_columns + _weather_columns

# Categorical columns (one-hot encoded by the preprocessor)
categorical_features = [
    'timezone',
    'season',
    'sub_primaryspaceusage',
    'site_id',
]

In [11]:
# Create a preprocessing pipeline: min-max scale the numerical columns and
# one-hot encode the categorical ones (categories unseen during fit are ignored).
# NOTE(review): the next cell builds this same transformer again before fitting it.
numeric_step = ('num', MinMaxScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [7]:
# Build the preprocessing transformer: 'num' min-max scales the numerical
# columns, 'cat' one-hot encodes the categorical columns.
preprocessor = ColumnTransformer([
    ('num', MinMaxScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Learn the scaling ranges and categories from the training features only,
# then apply that same fitted transformer to both splits.
X_train_processed = preprocessor.fit(X_train).transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [10]:
# Convert the processed data back to dense DataFrames
def _as_dense(matrix):
    """Return ``matrix`` as a dense array.

    The ColumnTransformer may hand back a scipy sparse matrix when the one-hot
    block makes the stacked output sparse enough; pandas does not expand such a
    matrix into one column per feature, so densify it first. Dense input is
    returned unchanged.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Feature names are identical for both splits, so look them up once.
processed_feature_names = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(_as_dense(X_train_processed), columns=processed_feature_names)
X_test_processed_df = pd.DataFrame(_as_dense(X_test_processed), columns=processed_feature_names)

In [11]:
# Checking the columns: display the feature names of the processed training frame.
# The 'num__' / 'cat__' prefixes are the ColumnTransformer step names.
X_train_processed_df.columns

Index(['num__sqm', 'num__sqft', 'num__airTemperature', 'num__cloudCoverage',
       'num__dewTemperature', 'num__precipDepth1HR', 'num__precipDepth6HR',
       'num__seaLvlPressure', 'num__windDirection', 'num__windSpeed',
       'cat__timezone_US/Mountain', 'cat__season_Fall', 'cat__season_Spring',
       'cat__season_Summer', 'cat__season_Winter',
       'cat__sub_primaryspaceusage_Academic',
       'cat__sub_primaryspaceusage_Student Center', 'cat__site_id_2'],
      dtype='object')

In [12]:
# Min-max scale the target. The scaler expects 2-D input, so each target series
# is reshaped into a single column first.
scaler = MinMaxScaler()
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_test_column = y_test.to_numpy().reshape(-1, 1)

# The scaling range is learned from the training target only ...
y_train_scaled = scaler.fit_transform(y_train_column)

# ... and merely applied to the test target, so no test information leaks into the fit.
y_test_scaled = scaler.transform(y_test_column)

## LightGBM

### LightGBM example with meter = solar

In [14]:
# Prepare the datasets for LightGBM.
# The target scaler produces (n, 1) column vectors; flatten them to 1-D labels,
# the same layout the per-meter cells further down pass to lgb.Dataset.
lgb_train = lgb.Dataset(X_train_processed, label=np.ravel(y_train_scaled))
# reference=lgb_train ties the evaluation set to the training Dataset.
lgb_eval = lgb.Dataset(X_test_processed, label=np.ravel(y_test_scaled), reference=lgb_train)

In [15]:
# LightGBM training parameters: regression objective scored by RMSE, using the
# 'rf' (random forest) boosting type.
# NOTE(review): confirm whether 'learning_rate' has any effect with boosting_type='rf'.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='rf',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    force_col_wise=True,
)

In [16]:
# Train the model for 10 rounds with the parameters defined above.
# lgb_eval is passed as a validation set so the metric is evaluated on it; no
# early-stopping callback is configured in this call.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_eval],
    num_boost_round=10,
)

[LightGBM] [Info] Total Bins 2058
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 16
[LightGBM] [Info] Start training from score 0.070944




In [18]:
# Predict on the processed test features.
# NOTE(review): the visible training call sets up no early stopping, so confirm
# what gbm.best_iteration holds here.
best_round = gbm.best_iteration
y_pred = gbm.predict(X_test_processed, num_iteration=best_round)

# Mean squared error, measured on the min-max scaled target
mse = mean_squared_error(y_true=y_test_scaled, y_pred=y_pred)

# Negated so that larger values mean a better fit
negative_mse = -mse

print(f'Negative Mean Squared Error: {negative_mse}')

Negative Mean Squared Error: -0.029281184887184574


### LightGBM for all meter values

In [6]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# This cell needs the frame with every meter type. The solar-only cell earlier
# in the notebook drops the 'meter' column, so fail with an actionable message
# instead of a bare KeyError deep inside the loop setup.
if 'meter' not in train_data.columns:
    raise KeyError(
        "train_data has no 'meter' column (the solar-only filtering cell drops it); "
        "re-run the data-loading cell before running the per-meter loop."
    )

# LightGBM parameters: identical for every meter, so defined once outside the loop
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'rf',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'force_col_wise': True
}

unique_meters = train_data['meter'].unique()
results = []

for meter_value in unique_meters:
    print(f"Processing meter value: {meter_value}")

    # Select this meter's rows once, then split them into features and target
    meter_rows = train_data[train_data['meter'] == meter_value]
    X = meter_rows.drop(columns=['meter_reading', 'meter'])
    y = meter_rows['meter_reading']

    # Convert 'site_id' from numeric to categorical
    X['site_id'] = X['site_id'].astype('category')

    # Hold out 20% of this meter's rows for evaluation. The split is carved out
    # of train_data; test_data is not used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define and apply the preprocessing pipeline (fitted on the training split only)
    preprocessor = ColumnTransformer(transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Scale the target variable. The scaler is refitted for each meter, so the
    # reported errors are in each meter's own scaled units.
    y_scaler = MinMaxScaler()
    y_train_scaled = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))
    y_test_scaled = y_scaler.transform(np.array(y_test).reshape(-1, 1))

    # Preparing datasets for LightGBM (labels flattened back to 1-D)
    lgb_train = lgb.Dataset(X_train_processed, label=y_train_scaled.flatten())
    lgb_eval = lgb.Dataset(X_test_processed, label=y_test_scaled.flatten(), reference=lgb_train)

    # Model training; lgb_eval is only used to evaluate the metric
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=10,
        valid_sets=[lgb_eval]
    )

    # Model prediction and evaluation on the held-out split
    y_pred_scaled = gbm.predict(X_test_processed, num_iteration=gbm.best_iteration)
    mse = mean_squared_error(y_test_scaled, y_pred_scaled)
    negative_mse = -mse

    results.append({'meter_value': meter_value, 'negative_mse': negative_mse})

# Convert results to a DataFrame and print
results_df = pd.DataFrame(results)
print(results_df)


Processing meter value: electricity
[LightGBM] [Info] Total Bins 2586
[LightGBM] [Info] Number of data points in the train set: 176851, number of used features: 48
[LightGBM] [Info] Start training from score 0.023853
Processing meter value: chilledwater
[LightGBM] [Info] Total Bins 2503
[LightGBM] [Info] Number of data points in the train set: 68515, number of used features: 36
[LightGBM] [Info] Start training from score 0.006933
Processing meter value: gas
[LightGBM] [Info] Total Bins 2209
[LightGBM] [Info] Number of data points in the train set: 29280, number of used features: 27
[LightGBM] [Info] Start training from score 0.020290
Processing meter value: hotwater
[LightGBM] [Info] Total Bins 2163
[LightGBM] [Info] Number of data points in the train set: 23716, number of used features: 31
[LightGBM] [Info] Start training from score 0.016442
Processing meter value: solar
[LightGBM] [Info] Total Bins 1930
[LightGBM] [Info] Number of data points in the train set: 878, number of used fea

### LightGBM for all meter values with hyperparameter tuning

In [4]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# This cell needs the frame with every meter type. The solar-only cell earlier
# in the notebook drops the 'meter' column, so fail with an actionable message
# instead of a bare KeyError deep inside the loop setup.
if 'meter' not in train_data.columns:
    raise KeyError(
        "train_data has no 'meter' column (the solar-only filtering cell drops it); "
        "re-run the data-loading cell before running the per-meter tuning loop."
    )

# Define the parameter grid for hyperparameter tuning
# NOTE(review): confirm that 'learning_rate' influences boosting_type='rf' fits;
# if it does not, this entry triples the number of fits without changing scores.
param_grid = {
    'num_leaves': [20, 31, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.8, 0.9],
}

unique_meters = train_data['meter'].unique()
results = []

for meter_value in unique_meters:
    print(f"Processing meter value: {meter_value}")

    # Select this meter's rows once, then split them into features and target
    meter_rows = train_data[train_data['meter'] == meter_value]
    X = meter_rows.drop(columns=['meter_reading', 'meter'])
    y = meter_rows['meter_reading']

    # Convert 'site_id' from numeric to categorical
    X['site_id'] = X['site_id'].astype('category')

    # Hold out 20% of this meter's rows for evaluation. The split is carved out
    # of train_data; test_data is not used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define and apply the preprocessing pipeline (fitted on the training split only)
    preprocessor = ColumnTransformer(transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Scale the target variable. The scaler is refitted for each meter, so the
    # reported errors are in each meter's own scaled units.
    y_scaler = MinMaxScaler()
    y_train_scaled = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))
    y_test_scaled = y_scaler.transform(np.array(y_test).reshape(-1, 1))

    # Base estimator whose hyperparameters are tuned
    model = LGBMRegressor(boosting_type='rf', force_col_wise=True)

    # Set up GridSearchCV: 3-fold cross-validation scored by negative MSE
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)

    # Fit the grid search to the training split
    grid.fit(X_train_processed, y_train_scaled.flatten())

    # Get the best parameters
    best_params = grid.best_params_

    # GridSearchCV (refit=True by default) has already refitted the base
    # estimator with the best parameters on the whole training split, so reuse
    # that model instead of training an identical one a second time.
    final_model = grid.best_estimator_

    # Model prediction and evaluation on the held-out split
    y_pred_scaled = final_model.predict(X_test_processed)
    mse = mean_squared_error(y_test_scaled, y_pred_scaled)
    negative_mse = -mse

    results.append({
        'meter_value': meter_value,
        'best_params': best_params,
        'negative_mse': negative_mse
    })

# Convert results to a DataFrame and print
results_df = pd.DataFrame(results)
print(results_df)


Processing meter value: electricity
[LightGBM] [Info] Total Bins 2565
[LightGBM] [Info] Number of data points in the train set: 117900, number of used features: 48
[LightGBM] [Info] Start training from score 0.023886
[LightGBM] [Info] Total Bins 2567
[LightGBM] [Info] Number of data points in the train set: 117901, number of used features: 48
[LightGBM] [Info] Start training from score 0.023905
[LightGBM] [Info] Total Bins 2560
[LightGBM] [Info] Number of data points in the train set: 117901, number of used features: 48
[LightGBM] [Info] Start training from score 0.023770
[LightGBM] [Info] Total Bins 2565
[LightGBM] [Info] Number of data points in the train set: 117900, number of used features: 48
[LightGBM] [Info] Start training from score 0.023886
[LightGBM] [Info] Total Bins 2567
[LightGBM] [Info] Number of data points in the train set: 117901, number of used features: 48
[LightGBM] [Info] Start training from score 0.023905
[LightGBM] [Info] Total Bins 2560
[LightGBM] [Info] Number 