In [1]:
# install missing packages
%pip install plotly
%pip install py-cpuinfo

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# check system details
import os
import psutil
import cpuinfo

try:
    ram_info = psutil.virtual_memory()
    print(f"Total RAM: {ram_info.total / 1024 / 1024 / 1024:.2f} GB")
    print(f"Available RAM: {ram_info.available / 1024 / 1024 / 1024:.2f} GB")
    print(f"Used RAM: {ram_info.used / 1024 / 1024 / 1024:.2f} GB")
    print(f"Percentage Usage Of RAM: {ram_info.percent}%")
    print(f"CPU Cores: {os.cpu_count()}")
    print(f"CPU Speed: {cpuinfo.get_cpu_info()['hz_actual_friendly']}")
except:
    print("RAM and CPU info not available on this system")

Total RAM: 15.47 GB
Available RAM: 14.18 GB
Used RAM: 1.03 GB
Percentage Usage Of RAM: 8.3%
CPU Cores: 4
CPU Speed: 2.5000 GHz


In [3]:
# import requirements
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from regression import Regression

In [4]:
# get the data
energy = pd.read_csv("energy.csv")
energy = energy.head(int(0.4 * energy.shape[0]))  # only use 40% of the data


Columns (716,717) have mixed types. Specify dtype option on import or set low_memory=False.



In [5]:
# split up the data into training and testing
y = energy[["NWEIGHT"]]
X = energy.drop(columns="NWEIGHT")
trainX = X.head(int(0.8 * X.shape[0]))
trainy = y.head(int(0.8 * y.shape[0]))
testX = X.tail(int(0.2 * X.shape[0])).reset_index(drop=True)
testy = y.tail(int(0.2 * y.shape[0])).reset_index(drop=True)

In [6]:
# build the model
print("\n---- Energy Regression Analysis ----\n")
model = Regression(name="Energy Regression Analysis", frac=0.6)  # only use 60% of the data for preprocessing
try:
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except:
    model.fit(trainX, trainy)  # build the machine learning pipeline
    predictions = model.predict(testX)


---- Energy Regression Analysis ----

1/6) Model Training
> Renaming Features
> Extracting Time Features
> Transforming Categorical Features
> Renaming Features
> Filling In Missing Values
> Removing Constant Features
> Scaling Features
> Selecting Features
> Computing Atwoood Numbers
> Binning Features
> Computing Reciprocals
> Computing Interactions
> Removing Constant Features
> Selecting Features
> Transforming The Training Data
> Training XGBoost
3.77 Minutes
2/6) Model Performance
> Transforming The Testing Data
9.42 Seconds
3/6) Model Deployment
> Transforming All The Data
> Training XGBoost
44.06 Seconds
4/6) Model Indicators
0.15 Seconds
5/6) Model Prediction
> Transforming The New Data
8.45 Seconds
6/6) Model Monitoring
1.82 Seconds


In [7]:
# model diagnostics
print("Model Diagnostics:")
print(f"> RMSE: {model.rmse}")
print(f"> R2: {model.r2}")
print(f"> In Control: {model.in_control}")
print("> Model Indicators:")
for i, indicator in enumerate(model.indicators["Indicator"][:10]):
    print(f"{i+1}. {indicator}")
print("> Feature Drift:")
for i, feature in enumerate(model.drift.loc[model.drift["pvalue"] < 0.05, "Feature"][:10]):
    print(f"{i+1}. {feature}")
if model.drift.loc[model.drift["pvalue"] < 0.05].shape[0] == 0:
    print("None")

Model Diagnostics:
> RMSE: 2787.4270834963154
> R2: 0.6478002321561322
> In Control: 97.3%
> Model Indicators:
1. DIVISION_3*1/GND_HDD65
2. DIVISION_3*DesignDBT99
3. DIVISION_4*1/CDD80
4. REPORTABLE_DOMAIN_1*1/HDD50
5. DIPSTICK_-2*DIVISION_2
6. DIVISION_3*1/HDD50
7. NHSLDMEM_1*1/DesignDBT99
8. REPORTABLE_DOMAIN_2*TOTSQFT_EN
9. REPORTABLE_DOMAIN_23*KWHWTH
10. REPORTABLE_DOMAIN_20*1/TOTUCSQFT
> Feature Drift:
1. TOTCSQFT*GND_HDD65(0.4-0.6)
2. DIPSTICK_-2*1/BTUELCOL
3. ZWHEATAGE_0*1/HDD65
4. CENACHP_0*1/BTUELCOL
5. DOLLARLP*1/KWHCOL
6. DEFROST_0*TOTSQFT_EN
7. DEFROST_0*TOTCSQFT
8. NWEIGHT


In [8]:
# score the model
rmse = mean_squared_error(
    y_true=testy.iloc[:,0].to_numpy(),
    y_pred=predictions,
    squared=False,
)
r2 = r2_score(
    y_true=testy.iloc[:,0].to_numpy(),
    y_pred=predictions,
)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 2189.9059018092776
R2: 0.7810552438309766


In [9]:
# save the machine learning pipeline
model.dump()

In [10]:
# refit the model to include the test data
model.refit(testX, testy)

Model Retraining:
> Transforming The New Data
> Training XGBoost
16.62 Seconds
