In [2]:
# Learning Objective: Create a minimal set of features that performs just as well as a more complex feature set
# So far, we've thrown all of our features into the model. Models with fewer features use fewer resources and are easier to maintain. Let's see if we can build a model on a minimal set of housing features that performs just as well as one that uses all the features in the data set.

In [3]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

In [5]:
# Restrict TensorFlow's logging to messages at ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)

In [6]:
# Notebook-wide pandas display settings: show at most 10 rows of any frame and
# render floats with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

In [7]:
# Fixed seed for the row shuffle, so the head()/tail() train/validation split
# taken from this frame is identical on every Restart & Run All.
SHUFFLE_SEED = 42

# Load the California housing training set (comma-separated) from its public URL.
california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")

# Shuffle the rows: reindex by a random permutation of the existing index.
# A private, seeded RandomState is used so the ordering is reproducible
# without reseeding NumPy's global random generator.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.RandomState(SHUFFLE_SEED).permutation(
        california_housing_dataframe.index))

In [10]:
# Preview the shuffled frame; pandas elides the middle rows because of the
# display.max_rows option set earlier in the notebook.
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
1293,-117.2,33.0,4.0,15029.0,2279.0,5613.0,1696.0,7.3,450400.0
520,-117.0,33.7,8.0,5330.0,1529.0,2143.0,1107.0,2.1,94400.0
13776,-122.0,37.9,18.0,2808.0,337.0,1038.0,337.0,8.4,353600.0
6152,-118.2,33.9,35.0,1255.0,344.0,1782.0,343.0,2.2,95100.0
2936,-117.8,33.5,24.0,2105.0,346.0,712.0,332.0,10.6,500001.0
...,...,...,...,...,...,...,...,...,...
9313,-119.2,34.2,26.0,5444.0,1293.0,3700.0,1158.0,2.8,213200.0
2121,-117.3,33.6,23.0,6859.0,1535.0,3405.0,1351.0,2.5,109200.0
10814,-120.7,35.6,31.0,3476.0,644.0,1476.0,567.0,3.3,195200.0
16638,-122.7,38.4,15.0,3265.0,690.0,1629.0,629.0,3.7,167600.0


In [11]:
# Pairwise correlation matrix over every column, median_house_value included.
california_housing_dataframe.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.9,-0.1,0.0,0.1,0.1,0.1,-0.0,-0.0
latitude,-0.9,1.0,0.0,-0.0,-0.1,-0.1,-0.1,-0.1,-0.1
housing_median_age,-0.1,0.0,1.0,-0.4,-0.3,-0.3,-0.3,-0.1,0.1
total_rooms,0.0,-0.0,-0.4,1.0,0.9,0.9,0.9,0.2,0.1
total_bedrooms,0.1,-0.1,-0.3,0.9,1.0,0.9,1.0,-0.0,0.0
population,0.1,-0.1,-0.3,0.9,0.9,1.0,0.9,-0.0,-0.0
households,0.1,-0.1,-0.3,0.9,1.0,0.9,1.0,0.0,0.1
median_income,-0.0,-0.1,-0.1,0.2,-0.0,-0.0,0.0,1.0,0.7
median_house_value,-0.0,-0.1,0.1,0.1,0.0,-0.0,0.1,0.7,1.0


In [22]:
# Raw input columns used as features; median_house_value (the label) is
# deliberately left out.
feature_columns = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
selected_features = california_housing_dataframe[feature_columns]
# Work on an explicit copy so new columns can be added to it safely.
processed_features = selected_features.copy()

In [23]:
 processed_features["rooms_per_person"] = (
    california_housing_dataframe["total_rooms"] /
    california_housing_dataframe["population"])

In [25]:
def preprocess_features(california_housing_dataframe):
  """Builds the model's input features from California housing rows.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set, including the eight raw feature
      columns selected below.
  Returns:
    A new DataFrame holding the eight raw feature columns followed by the
    synthetic `rooms_per_person` column. The input frame is left unchanged.
  """
  raw_feature_names = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
  ]
  # Synthetic feature: total_rooms divided by population, row by row.
  rooms_per_person = (
    california_housing_dataframe["total_rooms"] /
    california_housing_dataframe["population"])
  # Selecting with a list yields a new frame, and assign() returns a further
  # copy with the extra column appended as the last column.
  return california_housing_dataframe[raw_feature_names].assign(
    rooms_per_person=rooms_per_person)

In [26]:
def preprocess_targets(california_housing_dataframe):
  """Extracts the label column from California housing rows.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set, including `median_house_value`.
  Returns:
    A single-column DataFrame named `median_house_value`, holding the input
    values divided by 1000 (i.e. expressed in thousands).
  """
  # Scale the label down by a factor of 1000.
  value_in_thousands = (
    california_housing_dataframe["median_house_value"] / 1000.0)
  return pd.DataFrame({"median_house_value": value_in_thousands})

In [28]:
# Training split: the first 12000 (out of 17000) examples.
training_rows = california_housing_dataframe.head(12000)
training_examples = preprocess_features(training_rows)
training_targets = preprocess_targets(training_rows)

# Validation split: the last 5000 (out of 17000) examples.
validation_rows = california_housing_dataframe.tail(5000)
validation_examples = preprocess_features(validation_rows)
validation_targets = preprocess_targets(validation_rows)

# Sanity check: summary statistics for both feature sets, then both target sets.
for split_frame in (training_examples, validation_examples,
                    training_targets, validation_targets):
  display.display(split_frame.describe())

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,35.6,-119.6,28.6,2648.6,540.5,1429.0,501.7,3.9,2.0
std,2.1,2.0,12.6,2217.7,425.7,1135.9,387.3,1.9,1.1
min,32.5,-124.3,1.0,2.0,1.0,3.0,1.0,0.5,0.0
25%,33.9,-121.8,18.0,1457.8,296.0,785.0,281.0,2.6,1.5
50%,34.2,-118.5,29.0,2118.0,432.0,1170.0,408.0,3.5,1.9
75%,37.7,-118.0,37.0,3145.2,650.0,1717.0,607.0,4.8,2.3
max,42.0,-114.6,52.0,37937.0,6445.0,28566.0,6082.0,15.0,52.0


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,35.6,-119.6,28.7,2631.7,536.8,1431.0,500.1,3.9,2.0
std,2.1,2.0,12.5,2086.7,411.4,1176.2,377.7,1.9,1.3
min,32.5,-124.3,1.0,15.0,3.0,9.0,3.0,0.5,0.1
25%,33.9,-121.8,18.0,1471.0,298.0,797.8,282.0,2.6,1.5
50%,34.2,-118.5,29.0,2155.5,438.0,1159.5,412.0,3.6,1.9
75%,37.7,-118.0,37.0,3165.0,645.2,1733.2,601.2,4.8,2.3
max,41.9,-114.3,52.0,32054.0,5290.0,35682.0,5050.0,15.0,55.2


Unnamed: 0,median_house_value
count,12000.0
mean,207.4
std,116.4
min,15.0
25%,119.4
50%,180.1
75%,264.9
max,500.0


Unnamed: 0,median_house_value
count,5000.0
mean,207.1
std,115.0
min,22.5
25%,119.6
50%,180.6
75%,265.4
max,500.0


In [29]:
# Task 1: Develop a Good Feature Set

In [31]:
# Training features plus the label (as a "target" column), so that the
# correlation matrix below also relates each feature to the label.
correlation_dataframe = training_examples.assign(
    target=training_targets["median_house_value"])

# How to read the values:
# -1.0: perfect negative correlation
# 0.0: no correlation
# 1.0: perfect positive correlation
correlation_dataframe.corr()


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person,target
latitude,1.0,-0.9,0.0,-0.0,-0.1,-0.1,-0.1,-0.1,0.1,-0.1
longitude,-0.9,1.0,-0.1,0.1,0.1,0.1,0.1,-0.0,-0.1,-0.0
housing_median_age,0.0,-0.1,1.0,-0.4,-0.3,-0.3,-0.3,-0.1,-0.1,0.1
total_rooms,-0.0,0.1,-0.4,1.0,0.9,0.9,0.9,0.2,0.1,0.1
total_bedrooms,-0.1,0.1,-0.3,0.9,1.0,0.9,1.0,-0.0,0.1,0.0
population,-0.1,0.1,-0.3,0.9,0.9,1.0,0.9,0.0,-0.1,-0.0
households,-0.1,0.1,-0.3,0.9,1.0,0.9,1.0,0.0,-0.0,0.1
median_income,-0.1,-0.0,-0.1,0.2,-0.0,0.0,0.0,1.0,0.2,0.7
rooms_per_person,0.1,-0.1,-0.1,0.1,0.1,-0.1,-0.0,0.2,1.0,0.2
target,-0.1,-0.0,0.1,0.1,0.0,-0.0,0.1,0.7,0.2,1.0
