In [53]:
#@title Import relevant modules
import os

import pandas as pd
import torch
from matplotlib import pyplot as plt

# Keep rendered tables compact: show at most 10 rows and one decimal place for floats.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

In [59]:
# Download the California housing training set into a DataFrame.
training_df = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv"
)

# Express the label in thousands of dollars so its magnitude is easier to work with.
training_df["median_house_value"] = training_df["median_house_value"] / 1000.0

# Summary statistics of every column (rendered as the cell output).
training_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207.3
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,116.0
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,15.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180.4
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500.0


In [55]:
# Preview the first five rows. `median_house_value` is in thousands of dollars
# only once the scaling cell has been run, so re-run this cell after it to
# refresh the preview.
training_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66900.0
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80100.0
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85700.0
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73400.0
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65500.0


In [60]:
# Report the dataset's dimensions and the names of its columns.
dataset_shape = training_df.shape
print(
    f"The shape of the dataset is {dataset_shape}",
    f"The number of rows and attributes are {dataset_shape[0]} and {dataset_shape[1]} respectively",
    f"The column attributes are {training_df.columns}",
    sep="\n",
)

The shape of the dataset is (17000, 9)
The number of rows and attributes are 17000 and 9 respectively
The column attributes are Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')


In [92]:
class RegressionModel(torch.nn.Module):
    """Linear regression model ``y = Wx + b`` backed by one ``torch.nn.Linear`` layer.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of values predicted per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Single affine layer implementing y = Wx + b.
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the linear prediction for a batch ``x`` of shape ``(N, input_dim)``."""
        return self.linear(x)

    def train(self, training_data=True, attributes=None, num_epochs=None,
              learning_rate=None, label_column="median_house_value"):
        """Fit the model on a DataFrame, or toggle training mode.

        This method shares its name with ``torch.nn.Module.train(mode)``, which
        ``Module.eval()`` calls internally as ``self.train(False)``. To keep
        ``model.eval()`` / ``model.train()`` working, a boolean first argument
        is forwarded to the base implementation; anything else is treated as
        training data.

        Args:
            training_data: A pandas DataFrame containing the feature columns
                and the label column, or a bool to switch training mode.
            attributes: List of feature column names used as model inputs.
            num_epochs: Number of full-batch optimisation steps to run.
            learning_rate: Learning rate passed to the RMSprop optimizer.
            label_column: Name of the column holding the regression target.

        Returns:
            ``self`` when called with a bool (matching ``Module.train``),
            otherwise ``None``.

        Raises:
            TypeError: If training data is given without ``attributes``,
                ``num_epochs`` and ``learning_rate``.
        """
        # Module.eval() and Module.train(mode) end up here with a bool argument.
        if isinstance(training_data, bool):
            return super().train(training_data)

        if attributes is None or num_epochs is None or learning_rate is None:
            raise TypeError(
                "train() requires attributes, num_epochs and learning_rate "
                "when called with training data"
            )

        self._fit(training_data, attributes, num_epochs, learning_rate, label_column)

    def _fit(self, training_data, attributes, num_epochs, learning_rate, label_column):
        """Run ``num_epochs`` full-batch RMSprop steps minimising the MSE loss."""
        criterion = torch.nn.MSELoss()

        # self.parameters() yields every trainable parameter of the model.
        optimizer = torch.optim.RMSprop(self.parameters(), lr=learning_rate)

        inputs = torch.from_numpy(training_data[attributes].values).float()
        # Give the labels an explicit trailing dimension, (N,) -> (N, 1), so they
        # line up with the (N, output_dim) predictions. Without it MSELoss
        # broadcasts (N, 1) against (N,) into an (N, N) matrix and the loss
        # compares every prediction with every label.
        labels = torch.from_numpy(training_data[label_column].values).float().reshape(-1, 1)

        # Train for a fixed number of epochs on the whole dataset (no early stopping).
        for epoch in range(num_epochs):

            # Reset the gradients to zero before backpropagation.
            optimizer.zero_grad()

            # Forward pass.
            y_pred = self.forward(inputs)

            # Compute the loss.
            loss = criterion(y_pred, labels)

            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()

            # Print the loss every 10 epochs.
            if (epoch + 1) % 10 == 0:
                print("Epoch: {}/{}, Loss: {}".format(epoch + 1, num_epochs, loss.item()))

In [97]:
selected_attributes = ["total_rooms", "population", "households", "median_income"]
no_attributes = len(selected_attributes)

# Where the trained weights are cached between notebook runs.
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "housing_price.pt")

# One input per selected attribute, one output (the predicted house value).
model = RegressionModel(no_attributes, 1)

if os.path.exists(MODEL_PATH):
    # Reuse previously trained weights instead of retraining. The checkpoint only
    # fits a model built with the same number of attributes; delete the file
    # after changing `selected_attributes` to force retraining.
    # NOTE: torch.load unpickles the file, so only load checkpoints you created.
    state = torch.load(MODEL_PATH, map_location="cpu")
    model.load_state_dict(state_dict=state)
    # Report success only once the weights are actually in the model.
    print("Model Loaded!")
else:
    model.train(training_data=training_df, attributes=selected_attributes, num_epochs=30, learning_rate=0.01)

    # Create the model directory if needed; exist_ok avoids a separate existence check.
    os.makedirs(MODEL_DIR, exist_ok=True)
    # Save the trained weights for future runs.
    torch.save(model.state_dict(), MODEL_PATH)

Model Loaded!
