In [1]:
# install missing packages
# `%pip` installs into the environment of the running kernel; the kernel may
# need a restart before freshly installed packages can be imported.
# NOTE(review): no versions are pinned, so a fresh environment may resolve
# different releases than the ones this notebook was last run with.
# plotly and pandas-datareader are not imported directly by this notebook —
# presumably they are dependencies of lm_regressor; confirm.
%pip install plotly
# py-cpuinfo provides the `cpuinfo` module used by the system-details cell
%pip install py-cpuinfo
%pip install pandas-datareader
# presumably required by lm_regressor (its import triggers an nltk download) — confirm
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# check system details
import os
import psutil
import cpuinfo

# psutil reports sizes in bytes; dividing by 1024**3 converts them to the
# "GB" figures printed below (one shared constant instead of repeated / 1024).
BYTES_PER_GB = 1024 ** 3

# memory of the machine running this notebook
ram_info = psutil.virtual_memory()
print(f"Total RAM: {ram_info.total / BYTES_PER_GB:.2f} GB")
print(f"Available RAM: {ram_info.available / BYTES_PER_GB:.2f} GB")
print(f"Used RAM: {ram_info.used / BYTES_PER_GB:.2f} GB")
print(f"Percentage Usage Of RAM: {ram_info.percent}%")

# processor
print(f"CPU Cores: {os.cpu_count()}")
# py-cpuinfo leaves out the frequency fields on platforms where it cannot
# determine them, so fall back to a placeholder instead of raising KeyError
cpu_speed = cpuinfo.get_cpu_info().get("hz_actual_friendly", "Unknown")
print(f"CPU Speed: {cpu_speed}")

# disk usage for the path of the current working directory
disk_info = psutil.disk_usage(os.getcwd())
print(f"Total Disk: {disk_info.total / BYTES_PER_GB:.2f} GB")
print(f"Available Disk: {disk_info.free / BYTES_PER_GB:.2f} GB")
print(f"Used Disk: {disk_info.used / BYTES_PER_GB:.2f} GB")
print(f"Percentage Usage Of Disk: {disk_info.percent}%")

Total RAM: 15.47 GB
Available RAM: 13.69 GB
Used RAM: 1.46 GB
Percentage Usage Of RAM: 11.5%
CPU Cores: 4
CPU Speed: 2.5000 GHz
Total Disk: 24.99 GB
Available Disk: 12.35 GB
Used Disk: 12.64 GB
Percentage Usage Of Disk: 50.6%


In [3]:
# import requirements
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from lm_regressor import Regression

[nltk_data] Downloading package vader_lexicon to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# get the data
# the file name is resolved relative to the notebook's working directory
PRICES_FILE = "prices.csv"
prices = pd.read_csv(PRICES_FILE)

In [5]:
# remove the id column
# (rebinding `prices` means re-running this cell alone raises KeyError, because "Id" is already gone)
prices = prices.drop(labels="Id", axis="columns")

In [6]:
# replace missing values with the literal text "None" in string (object-dtype) columns
object_columns = prices.columns[prices.dtypes == "object"]
for name in object_columns:
    column = prices[name]
    # columns without any missing entry are left untouched
    if column.isna().any():
        prices[name] = column.fillna("None")

In [7]:
# convert YrSold to a date
# DateSold is the year rendered as the string "<year>-01-01"; YrSold itself is then dropped
prices = (
    prices
    .assign(DateSold=lambda frame: frame["YrSold"].astype(str) + "-01-01")
    .drop(columns="YrSold")
)

In [8]:
# fill in missing values for LotFrontage, MasVnrArea, and GarageYrBlt with 0
for zero_filled in ("LotFrontage", "MasVnrArea", "GarageYrBlt"):
    prices[zero_filled] = prices[zero_filled].fillna(0)

In [9]:
# shuffle the data
# random_state=0 makes the row order reproducible; the index is renumbered from 0 afterwards
prices = (
    prices
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

In [10]:
# make sales price the first column
# every other column keeps its current relative order
other_columns = [name for name in prices.columns if name != "SalePrice"]
prices = prices[["SalePrice", *other_columns]]

In [11]:
# get the testing data
# y is a one-column frame holding the target; X holds every other column
y = prices[["SalePrice"]]
X = prices.drop(columns="SalePrice")
# the last 20% of the rows (rounded down) form the test set
test_rows = int(0.2 * len(X))
testX = X.tail(test_rows).reset_index(drop=True)
testy = y.tail(test_rows).reset_index(drop=True)

In [12]:
# build the model
print("\n---- House Regression Analysis ----\n")
# The keyword switches are passed to lm_regressor.Regression unchanged; their
# exact meaning is defined by that module, which is not visible in this notebook.
model = Regression(
    name="Lasso Without Feature Engineering",
    path=None,
    rename=False,
    time=True,
    text=False,
    binary=True,
    imputation=False,
    variance=True,
    scale=True,
    atwood=False,
    binning=False,
    reciprocal=False,
    interaction=False,
    selection=False,
    plots=True,
)
try:
    # reuse a previously saved pipeline when one can be loaded and applied
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except Exception as load_error:
    # Only ordinary errors trigger a rebuild: a bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and start a long training run
    # when the user is trying to stop the cell. The reason is printed so a
    # broken saved pipeline is not silently hidden.
    print(f"Saved pipeline not used ({type(load_error).__name__}: {load_error}); building a new one.")
    model.explore(prices)
    # NOTE(review): X still contains the rows held out in testX; whether
    # validate() sets them aside internally is not visible here — confirm the
    # test score computed later is out-of-sample.
    model.validate(X, y)  # build the machine learning pipeline
    predictions = model.predict(testX)
    print("\nModel Performance:")
    print(f"R2: {model.r2}")
    print(f"RMSE: {model.rmse}")
    print(f"In Control: {model.in_control}")


---- House Regression Analysis ----

Visualizing The Data:
> Plotting Correlations
> SalePrice vs. GrLivArea
> TotalBsmtSF vs. 1stFlrSF
> Plotting SalePrice
> Plotting LotFrontage
> Plotting LotArea
> Plotting YearBuilt
> Plotting YearRemodAdd
> Plotting MasVnrArea
> Plotting BsmtFinSF1
> Plotting BsmtFinSF2
> Plotting BsmtUnfSF
> Plotting TotalBsmtSF
> Plotting 1stFlrSF
> Plotting 2ndFlrSF
> Plotting GrLivArea
> Plotting GarageYrBlt
> Plotting GarageArea
> Plotting WoodDeckSF
> Plotting OpenPorchSF
> Plotting EnclosedPorch
> Plotting ScreenPorch
> Plotting MSZoning
> Plotting Street
> Plotting Alley
> Plotting LotShape
> Plotting LandContour
> Plotting Utilities
> Plotting LotConfig
> Plotting LandSlope
> Plotting Neighborhood
> Plotting Condition1
> Plotting Condition2
> Plotting BldgType
> Plotting HouseStyle
> Plotting RoofStyle
> Plotting RoofMatl
> Plotting Exterior1st
> Plotting Exterior2nd
> Plotting MasVnrType
> Plotting ExterQual
> Plotting ExterCond
> Plotting Foundation
> 

In [17]:
# model diagnostics
# NOTE(review): this cell's execution count (17) is higher than that of the
# cells that follow it, so it was last run out of order — confirm it still
# works on a fresh Restart & Run All.
print("Model Indicators:")
top_indicators = model.indicators["Indicator"][:10].tolist()
for rank, name in enumerate(top_indicators, start=1):
    print(f"{rank}. {name}")
print(" ")
print("Feature Drift:")
# features whose drift p-value is below 0.05; at most the first ten are listed
drifted = model.drift.loc[model.drift["pvalue"] < 0.05]
for rank, name in enumerate(drifted["Feature"][:10].tolist(), start=1):
    print(f"{rank}. {name}")
if drifted.shape[0] == 0:
    print("None")

Model Indicators:
1. RoofMatl_ClyTile
2. GrLivArea
3. Condition2_PosN
4. OverallQual_10
5. OverallQual_9
6. PoolArea_555
7. RoofMatl_WdShngl
8. TotalBsmtSF
9. FullBath_3
10. Neighborhood_StoneBr
 
Feature Drift:
None


In [14]:
# score the model
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises TypeError.
# This form gives the same value on old and new releases.
actual = testy.iloc[:, 0].to_numpy()  # target column as a 1-D array, reused for both metrics
rmse = mean_squared_error(y_true=actual, y_pred=predictions) ** 0.5
r2 = r2_score(y_true=actual, y_pred=predictions)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 23499.204036992076
R2: 0.8878847904438694


In [15]:
# save the machine learning pipeline
# NOTE(review): this runs before the refit in the next cell, so the saved
# pipeline presumably does not include the test rows — confirm that is intended.
model.dump()

In [16]:
# refit the model to include the test data
# testX / testy are the held-out rows scored earlier; refit() prints its own
# progress log (transform, scaling, training, feature importance) as it runs
model.refit(testX, testy)

Model Retraining:
> Transforming The Updated Data
> Extracting Time Features
> Transforming Categorical Features
> Removing Constant Features
> Scaling Features
> Training Lasso
2.95 Seconds
Model Indicators:
> Extracting Important Features
0.66 Seconds
