<a href="https://colab.research.google.com/github/orfi9/google_ml_course/blob/main/energy_consumption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns

## Get data from Kaggle

In [None]:
import kagglehub

# Download the latest version of the energy-consumption dataset from Kaggle.
# `path` holds the value returned by kagglehub; the load cell further down
# appends the CSV file names to it.
path = kagglehub.dataset_download("govindaramsriram/energy-consumption-dataset-linear-regression")

# Print the location so it can be checked by eye.
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/energy-consumption-dataset-linear-regression


## Load and preprocess data

In [None]:
# Training split: read from the dataset location stored in `path`.
df_train = pd.read_csv(path+"/train_energy_data.csv")

# Testing split: read from the same location, kept in a separate frame.
df_test = pd.read_csv(path+"/test_energy_data.csv")

In [None]:
# Preview the first rows of the raw training frame (columns and value types).
df_train.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,7063,76,10,29.84,Weekday,2713.95
1,Commercial,44372,66,45,16.72,Weekday,5744.99
2,Industrial,19255,37,17,14.3,Weekend,4101.24
3,Residential,13265,14,41,32.82,Weekday,3009.14
4,Commercial,13375,26,18,11.92,Weekday,3279.17


In [None]:
# Preview the first rows of the raw test frame to compare its columns with the training frame.
df_test.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,24563,15,4,28.52,Weekday,2865.57
1,Commercial,27583,56,23,23.07,Weekend,4283.8
2,Commercial,45313,4,44,33.56,Weekday,5067.83
3,Residential,41625,84,17,27.39,Weekend,4624.3
4,Residential,36720,58,47,17.08,Weekday,4820.59


In [None]:
# Summary statistics of the training frame before any encoding or scaling.
df_train.describe()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,25462.388,48.372,25.606,22.61139,4166.25257
std,14294.554,29.061972,14.105166,7.139943,933.313064
min,560.0,1.0,1.0,10.05,1683.95
25%,13169.75,22.0,13.0,16.475,3509.4825
50%,25477.0,47.0,26.0,22.815,4175.73
75%,37446.25,73.25,38.0,28.85,4863.85
max,49997.0,99.0,49.0,34.99,6530.6


In [None]:
# Same summary for the test frame, for comparison with the training statistics.
df_test.describe()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption
count,100.0,100.0,100.0,100.0,100.0
mean,25881.92,47.23,26.97,22.0433,4187.5783
std,13711.075264,29.905526,14.237846,6.957951,832.55985
min,1161.0,2.0,1.0,10.4,2351.97
25%,14161.0,21.0,16.75,15.6825,3621.925
50%,27582.5,47.0,27.5,21.97,4249.39
75%,38109.5,73.0,39.25,27.4925,4797.175
max,49354.0,99.0,49.0,34.71,6042.56


In [None]:
# Columns holding category labels that must become numeric indicator columns.
categorical_cols = ['Building Type', 'Day of Week']

# Training frame: one indicator column per category, minus the first category
# of each column (drop_first=True), which becomes the implicit baseline.
df_train_encoded = pd.get_dummies(df_train, columns=categorical_cols, prefix_sep=' ', sparse=False, dtype=float, drop_first=True)

# Test frame: encode EVERY category and then align to the training columns.
# Encoding the two frames independently with drop_first=True would drop a
# different baseline (or produce a different set of columns) whenever the test
# split does not contain exactly the same categories as the training split.
# After the reindex the test frame has exactly the training columns, in the
# same order; a training category absent from the test split becomes a column
# of zeros, and a category never seen in training is dropped (all-zero row,
# i.e. treated as the baseline).
df_test_encoded = (
    pd.get_dummies(df_test, columns=categorical_cols, prefix_sep=' ', sparse=False, dtype=float)
    .reindex(columns=df_train_encoded.columns, fill_value=0.0)
)

In [None]:
# Check the result of the one-hot encoding on the first training rows.
df_train_encoded.head()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type Industrial,Building Type Residential,Day of Week Weekend
0,7063,76,10,29.84,2713.95,0.0,1.0,0.0
1,44372,66,45,16.72,5744.99,0.0,0.0,0.0
2,19255,37,17,14.3,4101.24,1.0,0.0,1.0
3,13265,14,41,32.82,3009.14,0.0,1.0,0.0
4,13375,26,18,11.92,3279.17,0.0,0.0,0.0


## Standard Scaler

In [None]:
cols_to_standardize = ['Appliances Used', 'Average Temperature', 'Number of Occupants', 'Square Footage', 'Energy Consumption']

# Capture the training-set statistics BEFORE the training columns are
# overwritten; both frames are scaled with these same values.
train_mean = df_train_encoded[cols_to_standardize].mean()
train_std = df_train_encoded[cols_to_standardize].std()

# z-score the training columns.
df_train_encoded[cols_to_standardize] = (df_train_encoded[cols_to_standardize] - train_mean) / train_std

# Scale the test columns with the TRAINING mean/std. Standardising the test
# split with its own statistics would put it on a different scale from the one
# the model is fitted on, so the learned weights would not apply to it.
df_test_encoded[cols_to_standardize] = (df_test_encoded[cols_to_standardize] - train_mean) / train_std


In [None]:
# Check the standardized values; the indicator columns were not in cols_to_standardize and stay 0/1.
df_train_encoded.head()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type Industrial,Building Type Residential,Day of Week Weekend
0,-1.287161,0.950658,-1.106403,1.012419,-1.556072,0.0,1.0,0.0
1,1.322854,0.606566,1.374957,-0.825131,1.691541,0.0,0.0,0.0
2,-0.434248,-0.391302,-0.610131,-1.16407,-0.069658,1.0,0.0,1.0
3,-0.853289,-1.182714,1.091373,1.429789,-1.23979,0.0,1.0,0.0
4,-0.845594,-0.769803,-0.539235,-1.497406,-0.950466,0.0,0.0,0.0


In [None]:
# Pairwise correlations between all numeric columns of the encoded training
# frame; the "Energy Consumption" row shows how each column relates to the label.
df_train_encoded.corr(numeric_only=True)

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type Industrial,Building Type Residential,Day of Week Weekend
Square Footage,1.0,0.033379,-0.013478,0.027273,0.774873,0.035415,0.018532,0.011354
Number of Occupants,0.033379,1.0,0.023646,0.014122,0.354485,0.028786,-0.016206,0.014296
Appliances Used,-0.013478,0.023646,1.0,-0.06287,0.312792,0.014161,-0.028652,-0.009046
Average Temperature,0.027273,0.014122,-0.06287,1.0,-0.034487,0.015166,0.025854,-0.011106
Energy Consumption,0.774873,0.354485,0.312792,-0.034487,1.0,0.415468,-0.378708,-0.004393
Building Type Industrial,0.035415,0.028786,0.014161,0.015166,0.415468,1.0,-0.496624,0.015987
Building Type Residential,0.018532,-0.016206,-0.028652,0.025854,-0.378708,-0.496624,1.0,-0.029712
Day of Week Weekend,0.011354,0.014296,-0.009046,-0.011106,-0.004393,0.015987,-0.029712,1.0


In [None]:
def plot_histograms(df, columns, fig):
  """Add one histogram per column name to an existing subplot figure.

  Args:
    df: frame providing the data; indexed as df[column_name].
    columns: nested list of column names. The outer index selects the subplot
      row and the inner index the subplot column (both 1-based in the figure).
    fig: figure with a subplot grid large enough for every entry in `columns`;
      it is modified in place.

  Returns:
    None.
  """
  for row_number, row_columns in enumerate(columns, start=1):
    for col_number, column_name in enumerate(row_columns, start=1):
      histogram = px.histogram(df[column_name]).data[0]
      # add_trace(row=, col=) replaces the deprecated append_trace.
      fig.add_trace(histogram, row=row_number, col=col_number)
  return

In [None]:
# Subplot layout: each inner list is one row of the 2x3 grid (last slot unused).
columns = [["Square Footage", "Number of Occupants", "Appliances Used"], ["Average Temperature", "Energy Consumption"]]

# Subplot titles are the column names read row by row.
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=tuple(title for grid_row in columns for title in grid_row),
)

plot_histograms(df_train_encoded, columns, fig)

fig.update_layout(
    height=600,
    width=1000,
    title_text="Numeric Data",
    template="plotly_dark",
)
fig.show()

## Define plotting functions

In [None]:
def make_plots(df, feature_names, label_name, model_output, sample_size=20):
  """Show the training loss curve and, for one or two features, the fitted model.

  With fewer than three features a two-panel figure is built: the loss curve
  on the left and, on the right, a random sample of the data together with the
  model (drawn by plot_data / plot_model). With three or more features only
  the loss curve is shown.

  Args:
    df: frame containing the feature and label columns.
    feature_names: list of feature column names.
    label_name: name of the label column.
    model_output: (weights, bias, epochs, rmse) as returned by train_model.
    sample_size: number of rows to draw for the data plot; capped at len(df).

  Returns:
    None. The figure is displayed with fig.show().
  """
  # Never ask for more rows than exist: DataFrame.sample raises if n exceeds
  # the frame length (without replacement).
  random_sample = df.sample(n=min(sample_size, len(df))).copy()
  weights, bias, epochs, rmse = model_output

  num_features = len(feature_names)
  if num_features < 3:
    # One feature -> 2-D scatter panel, two features -> 3-D surface panel.
    model_plot_type = "scatter" if num_features == 1 else "surface"
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=("Loss Curve", "Model Plot"),
                        specs=[[{"type": "scatter"}, {"type": model_plot_type}]])
    plot_data(random_sample, feature_names, label_name, fig)
    plot_loss_curve(epochs, rmse, fig)
    plot_model(random_sample, feature_names, weights, bias, fig)
  else:
    # No model plot is drawn for three or more features: loss curve only.
    fig = px.line(x=epochs, y=rmse)
    fig.update_traces(line_color='#ff0000', line_width=3)
    fig.update_xaxes(title_text="Epoch")
    fig.update_yaxes(title_text="Root Mean Squared Error", range=[rmse.min()*0.8, rmse.max()])
    fig.update_layout(height=400, width=600, title_text="Loss Curve", template="plotly_dark")

  fig.show()
  return

def plot_loss_curve(epochs, rmse, fig):
  """Draw the RMSE-per-epoch curve in the left panel (row 1, col 1) of `fig`.

  Args:
    epochs: x values of the curve.
    rmse: y values of the curve; must provide .min() and .max(), which are
      used for the y-axis range.
    fig: subplot figure to draw into; modified in place.

  Returns:
    None.
  """
  curve = px.line(x=epochs, y=rmse)
  curve.update_traces(line_color='#ff0000', line_width=3)

  # add_trace(row=, col=) replaces the deprecated append_trace.
  fig.add_trace(curve.data[0], row=1, col=1)
  fig.update_xaxes(title_text="Epoch", row=1, col=1)
  # Lower bound at 80% of the smallest RMSE leaves some room under the curve.
  fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1, range=[rmse.min()*0.8, rmse.max()])

  return

def plot_data(df, features, label, fig):
  """Scatter the observed data in the right panel (row 1, col 2) of `fig`.

  One feature gives a 2-D scatter of feature vs. label; otherwise the first
  two features and the label are drawn as a 3-D scatter.

  Args:
    df: frame containing the feature and label columns.
    features: list of feature column names (the first one or two are used).
    label: name of the label column.
    fig: subplot figure to draw into; modified in place.

  Returns:
    None.
  """
  if len(features) == 1:
    scatter = px.scatter(df, x=features[0], y=label)
    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(scatter.data[0], row=1, col=2)
    fig.update_xaxes(title_text=features[0], row=1, col=2)
    fig.update_yaxes(title_text=label, row=1, col=2)
  else:
    scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)
    fig.add_trace(scatter.data[0], row=1, col=2)
    fig.update_layout(scene1=dict(xaxis_title=features[0], yaxis_title=features[1], zaxis_title=label))

  return

def plot_model(df, features, weights, bias, fig):
  """Draw the fitted linear model in the right panel (row 1, col 2) of `fig`.

  One feature gives a red prediction line; otherwise the model is drawn as a
  plane over the first two features.

  Args:
    df: frame with the feature columns. A column named
      'Energy Consumption Prediction' is added to it in place.
    features: list of feature column names.
    weights: per-feature weights, read as weights[i][0].
    bias: model bias, read as bias[0].
    fig: subplot figure to draw into; modified in place.

  Returns:
    None.
  """
  prediction_col = 'Energy Consumption Prediction'

  # Prediction = bias + sum(weight_i * feature_i), accumulated column by column.
  df[prediction_col] = bias[0]
  for index, feature in enumerate(features):
    df[prediction_col] = df[prediction_col] + weights[index][0] * df[feature]

  if len(features) == 1:
    model = px.line(df, x=features[0], y=prediction_col)
    model.update_traces(line_color='#ff0000', line_width=3)
  else:
    # Evaluate the plane z = w0*x + w1*y + b on a 3x3 grid spanning the data.
    # The earlier construction used (max - min) / 2 as the "middle" value and
    # repeated one z row for every y, so the drawn surface matched the model
    # only on the grid diagonal; it also divided by w0, which fails for w0 == 0.
    w0, w1, b = float(weights[0][0]), float(weights[1][0]), float(bias[0])
    x_low, x_high = float(df[features[0]].min()), float(df[features[0]].max())
    y_low, y_high = float(df[features[1]].min()), float(df[features[1]].max())
    x = [x_low, (x_low + x_high) / 2, x_high]
    y = [y_low, (y_low + y_high) / 2, y_high]
    # Surface convention: z[i][j] is the height at (x[j], y[i]).
    z = [[w0 * x_value + w1 * y_value + b for x_value in x] for y_value in y]

    light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
    model = go.Figure(data=go.Surface(x=x, y=y, z=z,
                                      colorscale=light_yellow))

  fig.add_trace(model.data[0], row=1, col=2)

  return

def model_info(feature_names, label_name, model_output):
  """Build a text report of the trained weights, the bias and the model equation.

  Args:
    feature_names: feature names, in the same order as the weights.
    label_name: name of the predicted quantity; starts the equation line.
    model_output: sequence whose first item holds the weights (read as
      weights[i][0]) and whose second item holds the bias (read as bias[0]).

  Returns:
    A string: an 80-character banner, one "Weight for feature[...]" line per
    feature, the bias line, a blank line and the equation.
  """
  weights, bias = model_output[0], model_output[1]

  rule = "-" * 80
  title_row = "|" + "MODEL INFO".center(78) + "|"
  banner = "\n".join([rule, title_row, rule])

  equation_start = label_name + " = "
  weight_lines = []
  equation_terms = []
  for position, feature in enumerate(feature_names):
    coefficient = weights[position][0]
    weight_lines.append("Weight for feature[{}]: {:.3f}\n".format(feature, coefficient))
    equation_terms.append("{:.3f} * {} + ".format(coefficient, feature))

  info = "".join(weight_lines) + "Bias: {:.3f}\n".format(bias[0])
  equation = equation_start + "".join(equation_terms) + "{:.3f}\n".format(bias[0])

  return banner + "\n" + info + "\n" + equation

# Status line printed once every definition in this cell has been executed.
print("SUCCESS: defining plotting functions complete.")



SUCCESS: defining plotting functions complete.


## ML Functions

In [None]:
import keras

In [None]:
def build_model(my_learning_rate, num_features):
  """Create and compile a linear-regression model (a single Dense unit).

  Args:
    my_learning_rate: learning rate passed to the RMSprop optimizer.
    num_features: number of input features (size of the input vector).

  Returns:
    The compiled keras.Model, using mean squared error as the loss and
    tracking root mean squared error as a metric.
  """
  input_layer = keras.Input(shape=(num_features,))
  prediction = keras.layers.Dense(units=1)(input_layer)
  model = keras.Model(inputs=input_layer, outputs=prediction)

  optimizer = keras.optimizers.RMSprop(learning_rate=my_learning_rate)
  rmse_metric = keras.metrics.RootMeanSquaredError()
  model.compile(optimizer=optimizer,
                loss="mean_squared_error",
                metrics=[rmse_metric])

  return model


def train_model(model, features, label, epochs, batch_size):
  """Fit `model` and return its learned parameters with the RMSE history.

  Args:
    model: object providing fit(x=, y=, batch_size=, epochs=) and get_weights().
    features: feature values passed to fit as `x`.
    label: label values passed to fit as `y`.
    epochs: number of epochs to train for.
    batch_size: batch size used during fitting.

  Returns:
    (trained_weight, trained_bias, epochs, rmse): the first and second entries
    of model.get_weights(), the epoch list recorded in the fit history, and
    the "root_mean_squared_error" column of that history as a pandas Series.
  """
  history = model.fit(x=features, y=label, batch_size=batch_size, epochs=epochs)

  parameters = model.get_weights()
  trained_weight = parameters[0]
  trained_bias = parameters[1]

  # Per-epoch metrics recorded during fit; only the RMSE column is returned.
  metrics_by_epoch = pd.DataFrame(history.history)
  rmse = metrics_by_epoch["root_mean_squared_error"]

  return trained_weight, trained_bias, history.epoch, rmse


def run_model(df, feature_names, label_name, learning_rate, epochs, batch_size):
  """Run one training experiment: build, train, report and plot.

  Args:
    df: frame holding the feature and label columns.
    feature_names: list of feature column names.
    label_name: name of the label column.
    learning_rate: learning rate handed to build_model.
    epochs: number of training epochs.
    batch_size: training batch size.

  Returns:
    The trained model.
  """
  print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

  # Pull plain arrays out of the frame for fitting.
  feature_values = df.loc[:, feature_names].values
  label_values = df[label_name].values

  model = build_model(learning_rate, len(feature_names))
  model_output = train_model(model, feature_values, label_values, epochs, batch_size)

  print('\nSUCCESS: training experiment complete\n')
  report = model_info(feature_names, label_name, model_output)
  print('{}'.format(report))
  make_plots(df, feature_names, label_name, model_output)

  return model

# Status line printed once every definition in this cell has been executed.
print("SUCCESS: defining linear regression functions complete.")



SUCCESS: defining linear regression functions complete.


## Model Execution

In [None]:
#@title Code - Experiment 1

# The following variables are the hyperparameters.
learning_rate = 0.01
epochs = 20
batch_size = 20 #default 20

# Specify the feature and the label.
# NOTE: `features` and `label` are notebook-level names; the Experiment 2 cell
# reassigns them and the prediction cell reads them, so their value depends on
# which experiment cell ran last.
features = ['Square Footage']
label = 'Energy Consumption'

# Single-feature run: run_model trains on the training frame, prints the model
# report, shows the plots and returns the trained model.
model_1 = run_model(df_train_encoded, features, label, learning_rate, epochs, batch_size)

INFO: starting training experiment with features=['Square Footage'] and label=Energy Consumption

Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5.6386 - root_mean_squared_error: 2.3736
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.6908 - root_mean_squared_error: 1.9200
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.1333 - root_mean_squared_error: 1.4601
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2433 - root_mean_squared_error: 1.1125
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.5896 - root_mean_squared_error: 0.7675
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4375 - root_mean_squared_error: 0.6609
Epoch 7/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3725 - roo

In [None]:
#@title Code - Experiment 2

# The following variables are the hyperparameters.
learning_rate = 0.01
epochs = 20
batch_size = 20

# Specify the feature and the label.
# NOTE: the prediction cell further down reads these notebook-level `features`
# and `label` values, so this cell must be the last experiment run before it.
features = ['Square Footage', 'Building Type Industrial', 'Building Type Residential', 'Number of Occupants', 'Appliances Used']
label = 'Energy Consumption'

# Five-feature run on the same training frame; the returned model is the one
# used for the predictions below.
model_2 = run_model(df_train_encoded, features, label, learning_rate, epochs, batch_size)

INFO: starting training experiment with features=['Square Footage', 'Building Type Industrial', 'Building Type Residential', 'Number of Occupants', 'Appliances Used'] and label=Energy Consumption

Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4465 - root_mean_squared_error: 0.6617
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0501 - root_mean_squared_error: 0.2211
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0051 - root_mean_squared_error: 0.0706
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0028 - root_mean_squared_error: 0.0524
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0025 - root_mean_squared_error: 0.0501
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0026 - root_mean_squared_error: 0.0513
Epoch

In [None]:
#@title Code - Define functions to make predictions
def format_currency(x):
  """Return `x` formatted as a string with exactly two decimal places."""
  two_decimals = "{:.2f}"
  return two_decimals.format(x)

def build_batch(df, batch_size):
  """Draw `batch_size` random rows from `df`, relabelled 0..batch_size-1.

  The rows are copied, so `df` itself is left untouched.
  """
  sampled_rows = df.sample(n=batch_size).copy()
  # Positional labels let callers address rows as 0, 1, 2, ...
  sampled_rows.index = np.arange(batch_size)
  return sampled_rows

def predict_fare(model, df, features, label, batch_size=12):
  """Predict the label for a random batch and tabulate predictions vs. observations.

  Works for any number of features (the earlier version indexed features[0]
  through features[4] and therefore required exactly five).

  Args:
    model: object providing predict_on_batch(x=...); each prediction is read
      as predicted_values[i][0].
    df: frame containing the feature and label columns.
    features: list of feature column names the model expects, in order.
    label: name of the label column.
    batch_size: number of random rows to predict.

  Returns:
    A DataFrame with the columns PREDICTED_Consumption, OBSERVED_Consumption
    and L1_LOSS (absolute error), each formatted as a two-decimal string,
    followed by one column per feature. The first feature keeps its raw value;
    the remaining features are formatted as two-decimal strings (this matches
    the previous output for the five-feature case).
  """
  batch = build_batch(df, batch_size)
  predicted_values = model.predict_on_batch(x=batch.loc[:, features].values)

  data = {"PREDICTED_Consumption": [], "OBSERVED_Consumption": [], "L1_LOSS": []}
  # One output column per feature, however many the model was trained on.
  for feature in features:
    data[feature] = []

  for i in range(batch_size):
    predicted = predicted_values[i][0]
    observed = batch.at[i, label]
    data["PREDICTED_Consumption"].append(format_currency(predicted))
    data["OBSERVED_Consumption"].append(format_currency(observed))
    data["L1_LOSS"].append(format_currency(abs(observed - predicted)))
    for position, feature in enumerate(features):
      value = batch.at[i, feature]
      data[feature].append(value if position == 0 else "{:.2f}".format(value))

  output_df = pd.DataFrame(data)
  return output_df

def show_predictions(output):
  """Print a "PREDICTIONS" banner and hand `output` back unchanged.

  Returning the object lets it be displayed as the calling cell's result.
  """
  rule = "-" * 80
  title_row = "|" + "PREDICTIONS".center(78) + "|"
  print("\n".join([rule, title_row, rule]))
  return output

In [None]:
#@title Code - Make predictions

# `features` and `label` are whatever the most recently run experiment cell
# assigned. model_2 was trained with the Experiment 2 feature list, so that
# cell has to be the last experiment executed before this one.
output = predict_fare(model_2, df_test_encoded, features, label)
# show_predictions prints a banner and returns `output`, so the frame is
# rendered as this cell's result.
show_predictions(output)

--------------------------------------------------------------------------------
|                                 PREDICTIONS                                  |
--------------------------------------------------------------------------------


Unnamed: 0,PREDICTED_Consumption,OBSERVED_Consumption,L1_LOSS,Square Footage,Building Type Industrial,Building Type Residential,Number of Occupants,Appliances Used
0,-1.3,-1.43,0.12,-1.18349,0.0,1.0,1.33,-0.98
1,-0.73,-0.81,0.09,-0.434388,0.0,0.0,-0.68,-0.7
2,-0.66,-0.71,0.04,0.159585,0.0,1.0,-1.24,0.42
3,0.81,0.97,0.16,0.241052,0.0,0.0,1.16,0.84
4,-0.19,-0.18,0.01,-1.463264,1.0,0.0,0.29,0.99
5,0.46,0.5,0.04,1.360731,0.0,1.0,-1.31,1.2
6,0.71,0.85,0.15,0.985195,0.0,1.0,0.16,1.48
7,-0.19,-0.22,0.03,-0.224995,0.0,0.0,-1.35,1.27
8,-1.09,-1.23,0.15,-0.121939,0.0,1.0,-0.64,-0.91
9,0.2,0.23,0.03,-0.08168,1.0,0.0,0.13,-1.05
