In [1]:
# Dataset: House Sales in King County, USA
# https://www.kaggle.com/datasets/harlfoxem/housesalesprediction

In [2]:
import torch
from torch import nn

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_columns", 200)

In [3]:
df = pd.read_csv("./data/kc_house_data.csv")
df.sample(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
7213,3319500299,20140806T000000,304000.0,2,1.5,950,676,2.0,0,0,3,7,850,100,2003,0,98144,47.6005,-122.306,950,1280
12484,1172000150,20140829T000000,238000.0,1,1.0,530,6350,1.0,0,0,5,5,530,0,1941,0,98103,47.6946,-122.357,1200,6350
15566,3062600050,20140714T000000,745000.0,3,2.75,3010,12432,1.0,0,0,4,8,1890,1120,1970,0,98052,47.6392,-122.108,2500,12432
18512,4473400155,20150417T000000,1137500.0,4,3.5,3160,4200,2.0,0,4,3,8,2180,980,1999,0,98144,47.5963,-122.292,2180,5200
4652,2322059039,20140821T000000,238000.0,3,1.0,1470,32670,1.0,0,0,3,7,1020,450,1958,0,98042,47.3811,-122.144,2640,24100


In [4]:
df.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
df.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [7]:
df["year"] = df["date"].apply(lambda x: x[0:4]).astype("int64")
df.drop(labels="date", axis=1, inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   price          21613 non-null  float64
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21613 non-null  float64
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  int64  
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  21613 non-null  int64  
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   21613 non-null  int64  
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 18  sqft_l

In [9]:
df.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year
0,7129300520,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,2014
2,5631500400,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,2015
3,2487200875,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,2014
4,1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015


In [10]:
df.loc[df.duplicated()]

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year
3951,1825069031,550000.0,4,1.75,2410,8447,2.0,0,3,4,8,2060,350,1936,1980,98074,47.6499,-122.088,2520,14789,2014
20054,8648900110,555000.0,3,2.5,1940,3211,2.0,0,0,3,8,1940,0,2009,0,98027,47.5644,-122.093,1880,3078,2014


In [11]:
df.query("id == 8648900110")

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year
20053,8648900110,555000.0,3,2.5,1940,3211,2.0,0,0,3,8,1940,0,2009,0,98027,47.5644,-122.093,1880,3078,2014
20054,8648900110,555000.0,3,2.5,1940,3211,2.0,0,0,3,8,1940,0,2009,0,98027,47.5644,-122.093,1880,3078,2014


In [12]:
df = df.loc[~df.duplicated(subset=["id"])].reset_index(drop=True).copy()

In [13]:
df.drop(labels="id", axis=1, inplace=True)

In [14]:
df.shape

(21436, 20)

In [15]:
df.nunique()

price            4013
bedrooms           13
bathrooms          30
sqft_living      1038
sqft_lot         9782
floors              6
waterfront          2
view                5
condition           5
grade              12
sqft_above        946
sqft_basement     306
yr_built          116
yr_renovated       70
zipcode            70
lat              5034
long              752
sqft_living15     777
sqft_lot15       8689
year                2
dtype: int64

In [16]:
df.drop(labels="yr_renovated", axis=1, inplace=True)

In [17]:
def one_hot_encode(column):
    data = pd.concat([df, pd.get_dummies(df[column], prefix=column, dtype=int)], axis=1).copy()
    data.drop(labels=column, axis=1, inplace=True)
    return data

In [18]:
df = one_hot_encode("zipcode")

In [19]:
df.shape

(21436, 88)

In [20]:
df.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'lat', 'long', 'sqft_living15',
       'sqft_lot15', 'year', 'zipcode_98001', 'zipcode_98002', 'zipcode_98003',
       'zipcode_98004', 'zipcode_98005', 'zipcode_98006', 'zipcode_98007',
       'zipcode_98008', 'zipcode_98010', 'zipcode_98011', 'zipcode_98014',
       'zipcode_98019', 'zipcode_98022', 'zipcode_98023', 'zipcode_98024',
       'zipcode_98027', 'zipcode_98028', 'zipcode_98029', 'zipcode_98030',
       'zipcode_98031', 'zipcode_98032', 'zipcode_98033', 'zipcode_98034',
       'zipcode_98038', 'zipcode_98039', 'zipcode_98040', 'zipcode_98042',
       'zipcode_98045', 'zipcode_98052', 'zipcode_98053', 'zipcode_98055',
       'zipcode_98056', 'zipcode_98058', 'zipcode_98059', 'zipcode_98065',
       'zipcode_98070', 'zipcode_98072', 'zipcode_98074', 'zipcode_98075',
       'zipcode_98077', 'zipcode_9

In [21]:
df.shape

(21436, 88)

In [22]:
X = df.drop("price", axis=1).copy()
y = df["price"].copy()

In [23]:
scalar = StandardScaler()
X = scalar.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [26]:
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared Score:", r2)

Mean Squared Error: 21844005658.944714
Mean Absolute Error: 93434.92893444728
R-squared Score: 0.815250734645228


In [27]:
device = "cuda" if torch.cuda.is_available else "cpu"

In [28]:
X = torch.tensor(X, dtype=torch.float32, device=device)
y = torch.tensor(y, dtype=torch.float32, device=device)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
class HousePricePredictor(nn.Module):
    def __init__(self, input_features, hidden_units, output_features):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features=input_features, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=output_features)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)

house_price_predictor = HousePricePredictor(input_features=X.shape[1], hidden_units=20, output_features=1).to(device)

In [31]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(house_price_predictor.parameters(), lr=0.01)

In [32]:
from torchmetrics.regression import R2Score

r2score = R2Score().to(device)
epochs = 10000

for epoch in range(epochs):
    # ----- TRAINING -----
    house_price_predictor.train()
    pred = house_price_predictor(X_train).flatten()
    loss = loss_fn(pred, y_train)
    r_loss = torch.sqrt(loss)
    r2_train = r2score(y_train, pred)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # ----- TESTING -----
    if epoch % 1000 == 0:
        house_price_predictor.eval()
        with torch.inference_mode():
            test_pred = house_price_predictor(X_test).flatten()
            test_loss = loss_fn(test_pred, y_test)
            r_test_loss = torch.sqrt(test_loss)
            r2_test = r2score(y_test, test_pred)
        print(f"Epoch: {epoch} | MSE: {test_loss}, RMSE: {r_test_loss}, R2Score: {r2_test} |")

Epoch: 0 | MSE: 401053908992.0, RMSE: 633288.1875, R2Score: -64391189889024.0 |
Epoch: 1000 | MSE: 22228736000.0, RMSE: 149093.046875, R2Score: 0.8160134553909302 |
Epoch: 2000 | MSE: 15378348032.0, RMSE: 124009.46875, R2Score: 0.8593145608901978 |
Epoch: 3000 | MSE: 14144121856.0, RMSE: 118929.0625, R2Score: 0.8691863417625427 |
Epoch: 4000 | MSE: 13806979072.0, RMSE: 117503.1015625, R2Score: 0.8723907470703125 |
Epoch: 5000 | MSE: 13596606464.0, RMSE: 116604.484375, R2Score: 0.8745642900466919 |
Epoch: 6000 | MSE: 13406648320.0, RMSE: 115787.078125, R2Score: 0.8766495585441589 |
Epoch: 7000 | MSE: 13162549248.0, RMSE: 114728.15625, R2Score: 0.8789782524108887 |
Epoch: 8000 | MSE: 12796166144.0, RMSE: 113120.140625, R2Score: 0.883062481880188 |
Epoch: 9000 | MSE: 12774634496.0, RMSE: 113024.9296875, R2Score: 0.8835712671279907 |
