In [1]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Restrict TensorFlow logging to the ERROR level so lower-severity messages
# do not appear in cell output.
tf.logging.set_verbosity(tf.logging.ERROR)
# Pandas display settings: show at most 10 rows of any frame and render
# floats with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

In [2]:
# Load the California housing training set directly from the hosted CSV
# (requires network access when the cell is run).
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")

In [3]:
# Randomize the row order and express median_house_value in thousands.
# NOTE: this cell rebinds the frame it reads from, so running it twice
# divides median_house_value by 1000 a second time; re-run the load cell first.
california_housing_dataframe = (
    california_housing_dataframe
    .reindex(np.random.permutation(california_housing_dataframe.index))
    .assign(median_house_value=lambda frame: frame["median_house_value"] / 1000.0)
)
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
9339,-119.2,34.2,16.0,4609.0,1220.0,2147.0,1007.0,3.4,218.8
8050,-118.4,34.1,48.0,2413.0,261.0,770.0,248.0,15.0,500.0
8967,-118.9,34.4,30.0,2861.0,613.0,2065.0,586.0,3.2,176.1
1010,-117.1,32.8,49.0,4449.0,711.0,1606.0,709.0,5.8,281.6
10116,-119.8,36.8,28.0,2268.0,336.0,752.0,330.0,5.3,151.5
...,...,...,...,...,...,...,...,...,...
15834,-122.4,37.8,52.0,1314.0,317.0,473.0,250.0,4.3,500.0
7644,-118.4,34.0,32.0,4018.0,564.0,1400.0,568.0,8.7,439.1
13344,-121.9,37.9,13.0,2085.0,292.0,852.0,264.0,7.3,366.7
16842,-123.4,39.4,21.0,1081.0,254.0,715.0,275.0,1.6,71.5


# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
california_housing_dataframe.describe()

### Selecting input features and targets

The input feature is `total_rooms` and the target is `median_house_value` (expressed in thousands after the rescaling above).

Preliminary hypothesis: more rooms → higher median house value.

In [5]:
# Define the input feature: total_rooms.
# The double brackets select a one-column DataFrame rather than a Series, so
# the column name is preserved as a key when the features are turned into a dict.
my_feature = california_housing_dataframe[["total_rooms"]]

# Configure a numeric feature column for total_rooms.
# The key passed here must match the column name selected in my_feature.
feature_columns = [tf.feature_column.numeric_column("total_rooms")]

In [6]:
# Define the label: median_house_value, which was divided by 1000 when the
# data was shuffled, so the values are in thousands.
targets = california_housing_dataframe["median_house_value"]

### Configuration of linear regression model

Next, we'll configure a linear regression model using LinearRegressor. We'll train this model using the `GradientDescentOptimizer`, which implements Mini-Batch Stochastic Gradient Descent (SGD). The `learning_rate` argument controls the size of the gradient step.

In [10]:
# Plain gradient descent with a learning rate of 0.0000001 (1e-7).
base_optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-7)

# Wrap the optimizer so gradients are clipped to a maximum norm of 5.0 before
# being applied. Clipping is a common way to mitigate exploding gradients.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    base_optimizer, clip_norm=5.0)

# Configure the linear regression model with our feature columns and the
# gradient-clipped optimizer.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=my_optimizer,
)

### Feeding data into the model

In [11]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Builds a tf.data input pipeline and returns its next-batch tensors.

    This function does not train anything itself; it only supplies
    (features, labels) batches built from the given pandas data.

    Args:
      features: pandas DataFrame of features (one column per feature).
      targets: pandas Series or DataFrame of targets, row-aligned with
        `features`.
      batch_size: Size of batches to be passed to the model.
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated.
        None = repeat indefinitely.
    Returns:
      Tuple of (features, labels) for next data batch, where `features` is a
      dict keyed by column name.
    """

    # Convert pandas data into a dict of np arrays, keyed by column name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual (features, target) examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle individual examples *before* batching and repeating. Shuffling
    # after batch() would only reorder whole batches (the same rows would
    # always travel together), and shuffling after repeat() would mix
    # examples across epoch boundaries.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # Group examples into batches, then repeat for the requested epochs.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels