## California Housing in PyTorch

In [13]:
# import libraries
import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_california_housing
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
import numpy as np
from datetime import datetime

# report the installed PyTorch version
print(f"PyTorch version: {torch.__version__}")

# select the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

# created date (fixed timestamp recorded when the notebook was first written)
print("Created date: 2023-06-25 13:50:27.768534")

# modified date (timestamp of the current run)
print(f"Modified date: {datetime.now()}")

PyTorch version: 1.12.1+cu102
Device: cuda
Created date: 2023-06-25 13:50:27.768534
Modified date: 2023-06-25 22:24:05.354285


#### 1. Dataset

 California Housing dataset
 > https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset

In [14]:
# load the California Housing dataset through scikit-learn's fetcher
california_dataset = fetch_california_housing()

# NOTE(review): displaying the whole object prints the data/target arrays,
# the feature names and the full DESCR text; consider showing only a summary
california_dataset

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [15]:
# features: the `data` array of the loaded dataset (rows are samples, columns are features)
data = california_dataset.data
data.shape

(20640, 8)

In [16]:
# labels: the `target` array of the loaded dataset (the regression target, one value per sample)
target = california_dataset.target
target.shape

(20640,)

In [17]:
# cast features and labels to float32
# (presumably to match the float32 parameters of the PyTorch model -- TODO confirm)
X, y = data.astype(np.float32), target.astype(np.float32)

X.shape, y.shape

((20640, 8), (20640,))

In [18]:
# hold out 20% of the samples as the test set; the fixed random_state keeps the split repeatable
raw_X_train, raw_X_test, raw_y_train, raw_y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# check shapes of every split, one line per array
for split_name, split_array in (
    ("raw_X_train", raw_X_train),
    ("raw_X_test", raw_X_test),
    ("raw_y_train", raw_y_train),
    ("raw_y_test", raw_y_test),
):
    print(f"{split_name}: {split_array.shape}")

raw_X_train: (16512, 8)
raw_X_test: (4128, 8)
raw_y_train: (16512,)
raw_y_test: (4128,)


In [20]:
# per-feature mean and standard deviation, computed on the training split only
X_mean = raw_X_train.mean(axis=0)
X_stddev = raw_X_train.std(axis=0)

# standardise both splits with the training statistics so the test set
# does not influence the scaling
X_train = (raw_X_train - X_mean) / X_stddev
X_test = (raw_X_test - X_mean) / X_stddev

X_train.shape, X_test.shape

((16512, 8), (4128, 8))

In [21]:
# turn the 1-D label vectors into column vectors of shape (n_samples, 1)
y_train = np.reshape(raw_y_train, (-1, 1))
y_test = np.reshape(raw_y_test, (-1, 1))

y_train.shape, y_test.shape

((16512, 1), (4128, 1))

#### 2. Modeling

In [22]:
class CaliforniaHousingRegressionNeuralNetwork(nn.Module):
    def __init__(self) -> None:
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(out_features=64, in_features=8),
            nn.ReLU(),
            nn.Linear(out_features=64, in_features=64),
            nn.ReLU(),
            nn.Linear(out_features=1, in_features=64)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)

In [23]:
# set seed before the model is built so the weight initialisation is repeatable
torch.manual_seed(42)

# model
california_model = CaliforniaHousingRegressionNeuralNetwork()

# init weights: Xavier-uniform weights and zero biases for every Linear layer
for module in california_model.modules():
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        nn.init.constant_(module.bias, 0.0)

# copy model to device BEFORE constructing the optimizer: the torch.optim
# documentation asks for optimized parameters to already live in their final
# location when the optimizer is created
california_model.to(device)

# loss function: mean squared error for regression
loss_fn = nn.MSELoss()

# optimizer: Adam with its default hyper-parameters
optimizer = torch.optim.Adam(params=california_model.parameters())

# show the model architecture as the cell output
california_model

CaliforniaHousingRegressionNeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [24]:
# preparing data
batch_size = 32

# wrap the NumPy arrays as tensors and pair features with labels
train_tensordataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_tensordataset = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders: only the training data is shuffled
train_loader = DataLoader(train_tensordataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_tensordataset, batch_size=batch_size, shuffle=False)

# epochs
epochs = 500

for epoch in range(epochs):

    ## training ----

    # train mode: set once per epoch (eval() below switches it off again),
    # rather than on every batch
    california_model.train()

    accumulated_train_losses = 0.0
    train_samples = 0

    # NOTE: loop variables are deliberately not called `input` / `target`:
    # `input` shadows the builtin and `target` would overwrite the notebook's
    # label array defined in an earlier cell
    for batch_inputs, batch_targets in train_loader:

        # copy to device
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # forward pass
        output = california_model(batch_inputs)

        # calculate loss (mean over the batch)
        loss = loss_fn(output, batch_targets)

        # accumulate the batch's summed loss; detach() so the running total
        # does not keep every batch's autograd graph alive until epoch end
        batch_count = batch_inputs.size(0)
        accumulated_train_losses += loss.detach() * batch_count
        train_samples += batch_count

        # zero gradients
        optimizer.zero_grad()

        # backward propagation
        loss.backward()

        # update parameters
        optimizer.step()

    # per-sample average, correct even when the last batch is smaller
    train_loss = accumulated_train_losses / train_samples

    ## testing ----

    # eval mode
    california_model.eval()
    with torch.inference_mode():

        # accumulate losses
        accumulated_test_losses = 0.0
        test_samples = 0

        for batch_inputs, batch_targets in test_loader:

            # copy to device
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            # forward pass
            output = california_model(batch_inputs)

            # calculate loss (mean over the batch)
            loss = loss_fn(output, batch_targets)

            # accumulate the batch's summed loss
            batch_count = batch_inputs.size(0)
            accumulated_test_losses += loss * batch_count
            test_samples += batch_count

    test_loss = accumulated_test_losses / test_samples

    # report 1-based epoch numbers so the final line reads epochs/epochs
    print(f"Epoch: {epoch + 1}/{epochs} | "
          f"train_loss: {train_loss.item() : 0.4f} | "
          f"test_loss: {test_loss.item() : 0.4f}")





Epoch: 0/500 | train_loss:  0.9754 | train_loss:  0.4464
Epoch: 1/500 | train_loss:  0.4904 | train_loss:  0.3893
Epoch: 2/500 | train_loss:  0.3684 | train_loss:  0.3700
Epoch: 3/500 | train_loss:  0.3994 | train_loss:  0.3616
Epoch: 4/500 | train_loss:  0.3575 | train_loss:  0.3388
Epoch: 5/500 | train_loss:  0.3338 | train_loss:  0.3365
Epoch: 6/500 | train_loss:  0.3239 | train_loss:  0.3245
Epoch: 7/500 | train_loss:  0.3157 | train_loss:  0.3132
Epoch: 8/500 | train_loss:  0.3080 | train_loss:  0.3172
Epoch: 9/500 | train_loss:  0.3000 | train_loss:  0.3086
Epoch: 10/500 | train_loss:  0.3143 | train_loss:  0.3119
Epoch: 11/500 | train_loss:  0.3340 | train_loss:  0.3258
Epoch: 12/500 | train_loss:  0.2983 | train_loss:  0.3048
Epoch: 13/500 | train_loss:  0.2912 | train_loss:  0.2976
Epoch: 14/500 | train_loss:  0.2860 | train_loss:  0.2961
Epoch: 15/500 | train_loss:  0.2817 | train_loss:  0.2919
Epoch: 16/500 | train_loss:  0.2880 | train_loss:  0.3104
Epoch: 17/500 | train_lo

In [27]:
# evaluation: a single forward pass over the whole test set
test_features = torch.from_numpy(X_test).to(device)

# eval mode, and no autograd bookkeeping while predicting
california_model.eval()
with torch.inference_mode():
    # move the result back to the CPU for inspection
    predictions = california_model(test_features).cpu()

In [28]:
# print the first four predictions next to their true values
for i in range(4):
    predicted_value = predictions[i].squeeze()
    true_value = y_test[i].squeeze()
    print(f"Prediction: {predicted_value: 0.2f} | True value: {true_value: 0.2f}")

Prediction:  0.45 | True value:  0.48
Prediction:  0.69 | True value:  0.46
Prediction:  4.95 | True value:  5.00
Prediction:  2.33 | True value:  2.19
