In [1]:
import graphlab
from math import sqrt

# 1.

In [2]:
# Load the house-sales data as an SFrame from the relative path 'kc_house_data.gl/'.
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v1.10.1 started. Logging: /tmp/graphlab_server_1466446054.log


This non-commercial license of GraphLab Create is assigned to [email redacted] and will expire on May 27, 2017. For commercial licensing options, visit https://dato.com/buy/.


# 2.

In [3]:
import numpy as np

# 3.

In [4]:
def get_numpy_data(data_sframe, features, output):
    """Extract a feature matrix and an output vector as numpy arrays.

    Side effect: a column named 'constant' (filled with 1) is written onto
    ``data_sframe`` itself. That column becomes the first column of the
    returned matrix, so it lines up with the first (intercept) weight.

    Parameters
    ----------
    data_sframe : table object supporting column assignment, selection by a
        list of column names, and ``.to_numpy()`` on the selections
        (an SFrame in this notebook).
    features : list of str
        Names of the feature columns; the list itself is not modified.
    output : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(feature_matrix, output_array)`` with 'constant' as column 0 of
        ``feature_matrix``.
    """
    # The intercept column is stored on the caller's frame (in-place write).
    data_sframe['constant'] = 1
    # Concatenation builds a new list, so the caller's `features` is untouched.
    columns_with_intercept = ['constant'] + features
    # Selected columns -> 2-D numpy matrix, one column per entry above.
    feature_matrix = data_sframe[columns_with_intercept].to_numpy()
    # Target column -> numpy array.
    output_array = data_sframe[output].to_numpy()
    return (feature_matrix, output_array)

# 4.

In [5]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions for every row of ``feature_matrix``.

    ``feature_matrix`` holds one feature per column and ``weights`` holds one
    weight per feature; the result is their product computed with ``np.dot``.
    """
    return np.dot(feature_matrix, weights)

# 5.

In [6]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    Both arguments are expected to be numpy arrays of the same length
    (one entry per data point).
    """
    dot_product = np.dot(errors, feature)
    return 2 * dot_product

# 6.

In [7]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by gradient descent.

    Each pass computes the prediction errors with the current weights, then
    moves every weight against its derivative. Iteration stops once the
    gradient magnitude measured during a pass falls below ``tolerance``.

    Parameters
    ----------
    feature_matrix : 2-D numpy array, one column per weight.
    output : numpy array of target values, one per row of ``feature_matrix``.
    initial_weights : array-like of starting weights; it is copied, never
        modified.
    step_size : float multiplier applied to each derivative.
    tolerance : float threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of passes. ``None`` (the default) keeps the
        original behaviour of iterating until the tolerance is met.

    Returns
    -------
    numpy.ndarray
        The float weights after the last pass.
    """
    # Force a float copy: with an integer dtype the in-place updates below
    # would be silently truncated back to integers.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # compute the predictions based on feature_matrix and the current weights
        predictions = predict_output(feature_matrix, weights)
        # errors are predictions - output; they are fixed for the whole pass
        errors = predictions - output
        gradient_sum_squares = 0  # accumulates the squared derivatives of this pass
        for i in range(len(weights)):  # loop over each weight
            # feature_matrix[:, i] is the feature column associated with weights[i]
            derivative = feature_derivative(errors, feature_matrix[:, i])
            # add the squared derivative to the running total (for assessing convergence)
            gradient_sum_squares += derivative * derivative
            # subtract the step size times the derivative from the current weight
            weights[i] -= step_size * derivative
        iterations += 1
        # the square root of the summed squares is the gradient magnitude
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            break
    return weights

# 7.

In [8]:
# Split the sales data into training and test sets (.8 split fraction); seed=0 fixes the random split.
train_data,test_data = sales.random_split(.8,seed=0)

# 8.

In [9]:
# Model 1: predict 'price' from 'sqft_living' alone.
simple_features = ['sqft_living']
my_output= 'price'
# NOTE: get_numpy_data also writes a 'constant' column onto train_data.
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
# Starting weights, ordered [constant (intercept), sqft_living] to match the matrix columns.
initial_weights = np.array([-47000., 1.])
# Step size for each update and the gradient-magnitude threshold that ends the descent.
step_size = 7e-12
tolerance = 2.5e7

In [10]:
# Fit model 1 on the training data by gradient descent.
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size, tolerance)

In [11]:
# Learned weights for model 1, ordered [constant (intercept), sqft_living].
print simple_weights

[-46999.88716555    281.91211912]


# 9.

In [12]:
# Index 1 is the sqft_living weight (index 0 belongs to the constant column).
print simple_weights[1]

281.912119116


# 10.

In [13]:
# Build the model 1 feature matrix and output for the test set (this also adds 'constant' to test_data).
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)

In [14]:
# Model 1 predictions on the test set.
test_predicted = predict_output(test_simple_feature_matrix, simple_weights)

# 11.

In [15]:
# Model 1's predicted price for the first house in the test set.
print test_predicted[0]

356134.443171


# 12.

In [16]:
# Wrap the numpy predictions in an SArray; the next cell subtracts it from the SFrame price column.
test_predicted_sf = graphlab.SArray(test_predicted)

In [17]:
# Model 1 residuals on the test set: actual price minus predicted price.
t = (test_data['price'] - test_predicted_sf)

In [18]:
# Residual sum of squares (RSS) of model 1 on the test set.
sum(t*t)

275400047593155.7

# 13.

In [19]:
# Model 2: predict 'price' from 'sqft_living' and 'sqft_living15'.
features_2 = ['sqft_living', 'sqft_living15']
my_output_2 = 'price'
# get_numpy_data prepends the 'constant' column, so the matrix has three columns.
(feature_matrix_2, output_2) = get_numpy_data(train_data, features_2, my_output_2)
# Starting weights, ordered [constant (intercept), sqft_living, sqft_living15].
initial_weights_2 = np.array([-100000., 1., 1.])
# Step size for each update and the gradient-magnitude threshold that ends the descent.
step_size_2 = 4e-12
tolerance_2 = 1e9

In [20]:
# Fit model 2 on the training data by gradient descent.
weights_2 = regression_gradient_descent(feature_matrix_2, output_2, initial_weights_2, step_size_2, tolerance_2)

In [21]:
# Learned weights for model 2, ordered [constant (intercept), sqft_living, sqft_living15].
print weights_2

[ -9.99999688e+04   2.45072603e+02   6.52795277e+01]


# 14.

In [26]:
# Build the model 2 feature matrix and output for the test set.
(test_feature_matrix_2, test_output_2) = get_numpy_data(test_data, features_2, my_output_2)
# Model 2 predictions on the test set.
predicted_2 = predict_output(test_feature_matrix_2, weights_2)

In [27]:
# Display the model 2 test predictions (the array repr is abbreviated with '...').
predicted_2

array([ 366651.41203656,  762662.39786164,  386312.09499712, ...,
        682087.39928241,  585579.27865729,  216559.20396617])

# 15.

In [28]:
# Model 2's predicted price for the first house in the test set.
print predicted_2[0]

366651.412037


# 16.

In [31]:
# Actual price of the first house in the test set.
print test_data[0]['price']

310000.0


# 17.

In [32]:
# First test house, in order: actual price, model 1 prediction, model 2 prediction.
print test_data['price'][0]
print test_predicted[0]
print predicted_2[0]

310000.0
356134.443171
366651.412037


### Model 1 (sqft_living only) is closer to the actual price of the first test house.

# 18.

In [33]:
# Wrap the model 2 numpy predictions in an SArray; the next cell subtracts it from the price column.
predicted_2_sf = graphlab.SArray(predicted_2)

In [34]:
# Model 2 residuals on the test set: actual price minus predicted price.
t_2 = (test_data['price'] - predicted_2_sf)

In [35]:
# Residual sum of squares (RSS) of model 2 on the test set.
sum(t_2*t_2)

270263446465244.03

# 19.

In [36]:
# Test-set RSS side by side: model 1 first, then model 2.
print (sum(t*t))
print (sum(t_2*t_2))

2.75400047593e+14
2.70263446465e+14


### Model 2 has the lower RSS on the test data.