In [1]:
# Estimating Multiple Regression Coefficients (Gradient Descent)

In [1]:
import graphlab
# 1. If you’re using SFrames, import graphlab and load the house data
# (the command below is the graphlab way; you can also download the CSV).
# e.g. in Python with SFrames:
# The path is relative, so 'kc_house_data.gl/' must sit in the working directory.
sales = graphlab.SFrame('kc_house_data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to mikael.baymani@gmail.com and will expire on May 13, 2020.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1564478530.log


In [2]:
# 2. If you’re using python: to do the matrix operations required to
# perform a gradient descent we will be using the popular python library
# ‘numpy’ which is a computational library specialized for operations on
# arrays. For students unfamiliar with numpy we have created a numpy tutorial
# (see useful resources). It is common to import numpy under the name ‘np’ for
# short, to do this execute:
import numpy as np

In [3]:
# Show the installed GraphLab Create version (parenthesised form prints the same text).
print(graphlab.version)

2.1


In [4]:
# 3. Next write a function that takes a data set, a list of features
# (e.g. [‘sqft_living’, ‘bedrooms’]), to be used as inputs, and a name of
# the output (e.g. ‘price’). This function should return a features_matrix
# (2D array) consisting of first a column of ones followed by columns containing
# the values of the input features in the data set in the same order as the input
# list. It should also return an output_array which is an array of the values of
# the output in the data set (e.g. ‘price’). e.g. if you’re using SFrames and numpy
# you can complete the following function:

In [5]:
def get_numpy_data(data_sframe, features, output):
    """Build the numpy inputs for regression from an SFrame.

    Parameters
    ----------
    data_sframe : SFrame holding the feature columns and the output column.
    features : list of column names to use as inputs, in the desired order.
    output : name of the target column.

    Returns
    -------
    (features_matrix, output_array) : the matrix has a leading 'constant'
    column of ones followed by one column per entry of ``features``; the
    array holds the values of the ``output`` column.

    Side effect: a 'constant' column is written into ``data_sframe`` itself,
    so the caller's SFrame carries that column after the call.
    """
    # The intercept column is stored on the caller's SFrame, then selected below.
    data_sframe['constant'] = 1

    # Intercept first, then the requested features (the caller's list is left as is).
    columns = ['constant'] + features

    # Copy the selected columns, in order, into a fresh SFrame.
    features_sframe = graphlab.SFrame()
    for column in columns:
        features_sframe[column] = data_sframe[column]

    # to_numpy() needs GraphLab Create >= 1.7.
    features_matrix = features_sframe.to_numpy()
    output_array = data_sframe[output].to_numpy()
    return features_matrix, output_array

# Sanity check on the full data set: expect a leading column of ones followed by
# the bedrooms and bathrooms columns, plus the matching price array.
get_numpy_data(sales, ["bedrooms", "bathrooms"], "price")

(array([[1.  , 3.  , 1.  ],
        [1.  , 3.  , 2.25],
        [1.  , 2.  , 1.  ],
        ...,
        [1.  , 2.  , 0.75],
        [1.  , 3.  , 2.5 ],
        [1.  , 2.  , 0.75]]),
 array([221900., 538000., 180000., ..., 402101., 400000., 325000.]))

In [6]:
# 4. If the features matrix (including a column of 1s for the constant)
# is stored as a 2D array (or matrix) and the regression weights are stored
# as a 1D array then the predicted output is just the dot product between
# the features matrix and the weights (with the weights on the right).
# Write a function ‘predict_outcome’ which accepts a 2D array ‘feature_matrix’
# and a 1D array ‘weights’ and returns a 1D array ‘predictions’. e.g. in python:

In [7]:
def predict_outcome(feature_matrix, weights):
    """Return one prediction per row of ``feature_matrix``.

    Entry i of the result is the dot product of row i of ``feature_matrix``
    with ``weights`` (weights on the right): y_hat_i = h(x_i)^T w.
    """
    return np.dot(feature_matrix, weights)

In [8]:
# 5. If we have the values of a single input feature in an array ‘feature’
# and the prediction ‘errors’ (predictions - output) then the derivative of
# the regression cost function with respect to the weight of ‘feature’ is just
# twice the dot product between ‘feature’ and ‘errors’. Write a function that
# accepts a ‘feature’ array and ‘error’ array and returns the ‘derivative’
# (a single number). e.g. in python:

In [9]:
def feature_derivative(errors, feature):
    """Return the derivative of the RSS cost with respect to one weight.

    ``errors`` holds (predictions - output) and ``feature`` is the column of
    feature values belonging to that weight, so the result is
    2 * sum_i feature[i] * errors[i]  (a single number).
    """
    return 2 * np.dot(feature, errors)

In [10]:
# 6. Now we will use our predict_outcome and feature_derivative to write a gradient
# descent function. Although we can compute the derivative for all the features
# simultaneously (the gradient) we will explicitly loop over the features individually
# for simplicity. Write a gradient descent function that does the following:

# - Accepts a numpy feature_matrix 2D array, a 1D output array, an array of initial weights,
#   a step size and a convergence tolerance.
# - While not converged updates each feature weight by subtracting the step size times the
#   derivative for that feature given the current weights
# - At each step computes the magnitude/length of the gradient (square root of the sum of
#   squared components)
# - When the magnitude of the gradient is smaller than the input tolerance returns the
#   final weight vector.

In [11]:
from math import sqrt
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by gradient descent on the RSS cost.

    Every pass computes the errors once from the current weights, then moves
    each weight against its partial derivative. The loop ends as soon as the
    gradient magnitude measured during a pass is below ``tolerance``.

    Parameters
    ----------
    feature_matrix : 2D numpy array; column i belongs to weights[i].
    output : 1D array of observed target values.
    initial_weights : sequence of starting weights. It is copied, so the
        caller's object is never modified.
    step_size : multiplier applied to each partial derivative.
    tolerance : gradient magnitude below which the fit counts as converged.
    max_iterations : optional cap on the number of passes. The default
        ``None`` keeps the original unbounded behaviour; pass an integer to
        guarantee termination when the step size may make the fit diverge.

    Returns
    -------
    1D float numpy array with the final weights.
    """
    # Copy as float: with integer starting weights numpy would keep an integer
    # array and silently truncate every update written into it.
    weights = np.array(initial_weights, dtype=float)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # Errors are computed once per pass, so all partials below are
        # evaluated at the weights held at the start of the pass.
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i]
            partial = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += partial ** 2
            weights[i] = weights[i] - step_size * partial

        iterations += 1
        # Gradient magnitude = square root of the sum of squared partials.
        if sqrt(gradient_sum_squares) < tolerance:
            break
    return weights

In [12]:
# 7. Split the sales data into training and test sets, with a 0.8 fraction going
# to training. As in the previous notebooks, it’s important to use the same seed.
train_data, test_data = sales.random_split(0.8, seed=0)

In [13]:
# 8. Now we will run the regression_gradient_descent function on some actual data.
# In particular we will use the gradient descent to estimate the model from Week 1
# using just an intercept and slope. Use the following parameters:

In [14]:
# Model 1: one input feature plus the intercept column added by get_numpy_data.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings: starting weights in [intercept, sqft_living] order,
# the step size, and the gradient-magnitude tolerance.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [15]:
# Fit model 1 on the training data; this runs until the gradient magnitude is below `tolerance`.
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [16]:
# Fitted weights in [intercept, sqft_living] order.
simple_weights

array([-46999.88716555,    281.91211912])

In [18]:
# 9. Quiz Question: What is the value of the weight for sqft_living -- the second
# element of ‘simple_weights’ (rounded to 1 decimal place)?
print("%.1f" % simple_weights[1])

281.9


In [19]:
# 10. Now build a corresponding ‘test_simple_feature_matrix’ and ‘test_output’
# using test_data. Using ‘test_simple_feature_matrix’ and ‘simple_weights’ compute
# the predicted house prices on all the test data.

In [21]:
# Same feature list and target as in training, this time taken from the test split.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)

In [34]:
# Apply the weights fitted on the training data to the test feature matrix.
test_predictions = predict_outcome(test_simple_feature_matrix, simple_weights)

In [30]:
# 11. Quiz Question: What is the predicted price for the 1st house in the Test data
# set for model 1 (round to nearest dollar)?

In [35]:
print("%.0f" % test_predictions[0])

356134


In [32]:
# 12. Now compute RSS on all test data for this model. Record the value and store it for later

In [33]:
def RSS(output, predictions):
    """Return the residual sum of squares between ``output`` and ``predictions``.

    Both arguments are expected to be numpy arrays of the same length: the
    element-wise differences are squared and then added up.
    """
    residuals = output - predictions
    return sum(residuals ** 2)

In [36]:
# Test-set RSS for model 1; arguments are (actual prices, predicted prices).
RSS(test_output, test_predictions)

275400047593155.7

In [37]:
# 13. Now we will use the gradient descent to fit a model with more than
# 1 predictor variable (and an intercept). Use the following parameters:

In [38]:
# Model 2: two input features plus the intercept column added by get_numpy_data.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings: starting weights in
# [intercept, sqft_living, sqft_living15] order, step size and tolerance.
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9

In [39]:
# Fit model 2 on the training data with the settings defined for it.
multiple_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [40]:
# Fitted weights in [intercept, sqft_living, sqft_living15] order.
multiple_weights

array([-9.99999688e+04,  2.45072603e+02,  6.52795277e+01])

In [41]:
# 14. Use the regression weights from this second model (using sqft_living and
# sqft_living15) and predict the outcome of all the house prices on the TEST data.
test_multiple_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)
test_predictions_multiple_feature = predict_outcome(test_multiple_feature_matrix, multiple_weights)

In [44]:
# 15. Quiz Question: What is the predicted price for the 1st house in the TEST
# data set for model 2 (round to nearest dollar)?
print("%.0f" % test_predictions_multiple_feature[0])

366651


In [45]:
# 16. What is the actual price for the 1st house in the Test data set?
# The whole record is displayed; the answer is its 'price' field.
test_data[0]

{'bathrooms': 1.0,
 'bedrooms': 3.0,
 'condition': 4,
 'constant': 1,
 'date': datetime.datetime(2014, 5, 28, 0, 0, tzinfo=GMT +0.0),
 'floors': '1.5',
 'grade': 7,
 'id': '0114101516',
 'lat': 47.75584254,
 'long': -122.22874498,
 'price': 310000.0,
 'sqft_above': 1430,
 'sqft_basement': 0,
 'sqft_living': 1430.0,
 'sqft_living15': 1780.0,
 'sqft_lot': 19901,
 'sqft_lot15': 12697.0,
 'view': 0,
 'waterfront': 0,
 'yr_built': 1927,
 'yr_renovated': 0,
 'zipcode': '98028'}

In [47]:
# 17. Quiz Question: Which estimate was closer to the true price for the
# 1st house on the TEST data set, model 1 or model 2?
# Answer: Model 1

In [48]:
# 18. Now compute RSS on all test data for the second model.
# Record the value and store it for later.
# Arguments are (actual prices, predicted prices), the same order used for model 1.
RSS(test_output, test_predictions_multiple_feature)

270263446465243.97

In [None]:
# 19. Quiz Question: Which model (1 or 2) has lowest RSS on all of
# the TEST data?
# Answer: Model 2