In [1]:
# Importing the required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow.keras.layers as layers
from scipy.sparse import isspmatrix
import matplotlib.pyplot as plt
#from sklearn.model_selection import GridSearchCV
#from scikeras.wrappers import KerasRegressor

In [2]:
# Load datasets

# Single place to repoint the notebook at another copy of the data.
DATA_DIR = "/Users/inutenneti/Desktop/Winter Quarter/Predictive Analytics 2/project"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV = f"{DATA_DIR}/test.csv"

# Train data: read only the header row first so that the leading column of the
# file can be excluded by name when the full file is loaded.
temp_df = pd.read_csv(TRAIN_CSV, nrows=0)
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[1:]
train_data = pd.read_csv(TRAIN_CSV, usecols=columns_to_use)

# Test data: loaded with the same column selection as the training file.
test_data = pd.read_csv(TEST_CSV, usecols=columns_to_use)

# Drop the columns that are not relevant to the analysis.
IRRELEVANT_COLUMNS = ['building_name', 'site_name']
train_data = train_data.drop(columns=IRRELEVANT_COLUMNS)
test_data = test_data.drop(columns=IRRELEVANT_COLUMNS)

# 'building_id' is kept as a regular column rather than the index: later cells
# read it back with X_train['building_id'] and sort on it.

In [3]:
# Keep only the rows recorded by the 'solar' meter, then discard the 'meter'
# column, which is constant after the filter.
# NOTE(review): this cell was previously described as filtering for electricity
# readings, but the value being matched is 'solar' -- confirm the intended meter.
train_data = train_data.loc[train_data['meter'] == 'solar'].drop(columns=['meter'])
test_data = test_data.loc[test_data['meter'] == 'solar'].drop(columns=['meter'])

In [4]:
# Peek at two randomly sampled rows from each frame, with a divider line between them.
divider = '-------------------------------------------------------------'
for position, frame in enumerate((train_data, test_data)):
    if position:
        print(divider)
    print(frame.sample(2))

             date  meter_reading sub_primaryspaceusage      sqm      sqft  \
30816  2016-03-13         117.69        Student Center  17358.0  186840.0   
30958  2016-08-02         193.06        Student Center  17358.0  186840.0   

          timezone  airTemperature  cloudCoverage  dewTemperature  \
30816  US/Mountain       11.916960       2.035447        5.470264   
30958  US/Mountain       23.411894       2.275106       17.420044   

       precipDepth1HR  precipDepth6HR  seaLvlPressure  windDirection  \
30816        0.807454       12.221662     1019.473356     134.487438   
30958        0.897203       17.677378     1014.908378     161.312014   

       windSpeed  season  building_id  site_id  
30816   3.277313  Spring           72        2  
30958   3.426872  Summer           72        2  
-------------------------------------------------------------
             date  meter_reading sub_primaryspaceusage      sqm      sqft  \
26044  2017-05-10      3538.9423              Academic  1

In [5]:
# Split each frame into its predictors (X) and the target series (y).
target_col = 'meter_reading'

X_train, y_train = train_data.drop(columns=[target_col]), train_data[target_col]
X_test, y_test = test_data.drop(columns=[target_col]), test_data[target_col]

In [6]:
# Apply the same dtype fixes to both feature frames.
for _features in (X_train, X_test):
    # 'site_id' is numeric in the raw data but acts as a label: make it categorical.
    _features['site_id'] = _features['site_id'].astype('category')
    # Parse 'date' into pandas datetimes.
    _features['date'] = pd.to_datetime(_features['date'])

In [7]:
# Show each feature column's dtype, followed by the column index itself.
for schema_view in (X_train.dtypes, X_train.columns):
    print(schema_view)

date                     datetime64[ns]
sub_primaryspaceusage            object
sqm                             float64
sqft                            float64
timezone                         object
airTemperature                  float64
cloudCoverage                   float64
dewTemperature                  float64
precipDepth1HR                  float64
precipDepth6HR                  float64
seaLvlPressure                  float64
windDirection                   float64
windSpeed                       float64
season                           object
building_id                       int64
site_id                        category
dtype: object
Index(['date', 'sub_primaryspaceusage', 'sqm', 'sqft', 'timezone',
       'airTemperature', 'cloudCoverage', 'dewTemperature', 'precipDepth1HR',
       'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed',
       'season', 'building_id', 'site_id'],
      dtype='object')


In [8]:
# Column groups consumed by the preprocessing cells that follow.
numerical_features = [
    'sqm', 'sqft',
    'airTemperature', 'cloudCoverage', 'dewTemperature',
    'precipDepth1HR', 'precipDepth6HR',
    'seaLvlPressure', 'windDirection', 'windSpeed',
]
categorical_features = ['timezone', 'season', 'sub_primaryspaceusage', 'site_id']

# Columns that are set aside before preprocessing and re-attached afterwards.
date_feature, id_feature = 'date', 'building_id'

In [9]:
# Keep the identifier and date values as plain arrays so they can be
# re-attached once the remaining columns have been preprocessed.
building_ids_train, dates_train = X_train[id_feature].values, X_train[date_feature].values
building_ids_test, dates_test = X_test[id_feature].values, X_test[date_feature].values

In [10]:
# Remove the identifier and date columns from both frames before preprocessing.
held_out_columns = [id_feature, date_feature]
X_train = X_train.drop(columns=held_out_columns)
X_test = X_test.drop(columns=held_out_columns)

In [11]:
# Preprocessing: min-max scale the numeric columns and one-hot encode the
# categorical ones.
numeric_step = ('num', MinMaxScaler(), numerical_features)
# handle_unknown='ignore' avoids an error on categories not seen during fitting.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training data only, then apply the fitted transformer to both the
# training and the test features.
X_train_processed = preprocessor.fit(X_train).transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [12]:
# Convert the processed data back to dense DataFrames
def _as_dense(matrix):
    """Return `matrix` as a dense array, densifying it if it exposes `toarray()`.

    ColumnTransformer can return a SciPy sparse matrix when the stacked output
    is sparse enough (see its `sparse_threshold` parameter), and such a matrix
    cannot be wrapped directly in a DataFrame with named columns. Inputs that
    are already dense are returned unchanged.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Resolve the output column names once and reuse them for both frames.
processed_feature_names = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(_as_dense(X_train_processed), columns=processed_feature_names)
X_test_processed_df = pd.DataFrame(_as_dense(X_test_processed), columns=processed_feature_names)

In [13]:
# Put 'building_id' and 'date' back alongside the processed features, then order
# each frame by building and, within a building, by date.
# NOTE(review): y_train / y_test are not re-ordered in this cell -- confirm that
# downstream code re-aligns the targets with the sorted rows.
sort_keys = [id_feature, date_feature]

X_train_processed_df = (
    X_train_processed_df
    .assign(**{id_feature: building_ids_train, date_feature: dates_train})
    .sort_values(by=sort_keys)
)
X_test_processed_df = (
    X_test_processed_df
    .assign(**{id_feature: building_ids_test, date_feature: dates_test})
    .sort_values(by=sort_keys)
)

In [14]:
# Check the column names after preprocessing: the transformer-prefixed feature
# columns plus the re-attached 'building_id' and 'date'.
X_train_processed_df.columns

Index(['num__sqm', 'num__sqft', 'num__airTemperature', 'num__cloudCoverage',
       'num__dewTemperature', 'num__precipDepth1HR', 'num__precipDepth6HR',
       'num__seaLvlPressure', 'num__windDirection', 'num__windSpeed',
       'cat__timezone_US/Mountain', 'cat__season_Fall', 'cat__season_Spring',
       'cat__season_Summer', 'cat__season_Winter',
       'cat__sub_primaryspaceusage_Academic',
       'cat__sub_primaryspaceusage_Student Center', 'cat__site_id_2',
       'building_id', 'date'],
      dtype='object')

## LightGBM

In [15]:
# Display the preprocessed training feature matrix produced by `preprocessor.transform`.
X_train_processed

array([[0.103275  , 0.103276  , 0.2006084 , ..., 1.        , 0.        ,
        1.        ],
       [0.103275  , 0.103276  , 0.23011522, ..., 1.        , 0.        ,
        1.        ],
       [0.103275  , 0.103276  , 0.21693751, ..., 1.        , 0.        ,
        1.        ],
       ...,
       [1.        , 1.        , 0.21150202, ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 0.14297168, ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 0.18044175, ..., 0.        , 1.        ,
        1.        ]])

In [None]:
import lightgbm as lgb

# Wrap the preprocessed feature matrices and their targets in LightGBM datasets.
# The evaluation set is created with reference to the training set.
lgb_train = lgb.Dataset(X_train_processed, label=y_train)
lgb_eval = lgb.Dataset(X_test_processed, label=y_test, reference=lgb_train)

# Model parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (rather than a set) keeps the metric order identical on every run.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5
}

# Train for a fixed number of boosting rounds. No early stopping is configured,
# so `valid_sets` is only used to score the model during training.
# NOTE(review): `lgb_eval` is built from the test data. If early stopping is
# added later (LightGBM 4 takes it as `callbacks=[lgb.early_stopping(...)]`
# rather than an `early_stopping_rounds` argument), use a separate validation
# split so the reported test error stays unbiased.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
)

# Predict on the test features.
# NOTE(review): `best_iteration` is only meaningful after early stopping; without
# it LightGBM is expected to fall back to using every trained round -- confirm
# against the installed version.
y_pred = gbm.predict(X_test_processed, num_iteration=gbm.best_iteration)

# Mean squared error on the test set, reported with its sign flipped.
mse = mean_squared_error(y_test, y_pred)
negative_mse = -mse

print(f'Negative Mean Squared Error: {negative_mse}')