In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

## Importing house sales, training and test data

In [2]:
sales = pd.read_csv('../kc_house_data.csv')
train_data = pd.read_csv('../kc_house_train_data.csv')
test_data = pd.read_csv('../kc_house_test_data.csv')

## Get Numpy Data to get Feature Matrix and Output Array

In [3]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1
    features = ['constant'] + features
    features_sframe = data_sframe[features]
    features_matrix = features_sframe.as_matrix()
    output_sarray = data_sframe[output]
    output_array = output_sarray.as_matrix()
    return (features_matrix, output_array)

## Predict the outcome

In [4]:
def predict_outcome(features_matrix, weights):
    predictions = np.dot(features_matrix, weights)
    return (predictions)

In [5]:
def feature_derivative(errors, feature):
    derivative = 2 * np.dot(errors, feature)
    return (derivative)

In [6]:
def regression_gradient_descent(features_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        predictions = predict_outcome(features_matrix, weights)
        errors = predictions - output
        gradient_sum_squares = 0
        for i in range(len(weights)):
            derivative = feature_derivative(errors, features_matrix[:, i])
            gradient_sum_squares += derivative ** 2
            weights[i] -= step_size * derivative
        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if(gradient_magnitude < tolerance):
            converged = True
    return (weights)

In [7]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [8]:
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [9]:
simple_weights

array([-46999.88716555,    281.91211918])

## Building a Test Feature and Test Output Matrix

In [10]:
simple_features_test_data = ['sqft_living']
my_output_test_data = 'price'
(simple_feature_matrix_test_data, output_test_data) = get_numpy_data(test_data, simple_features_test_data, my_output_test_data)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [11]:
simple_weights_test_data = regression_gradient_descent(simple_feature_matrix_test_data, output_test_data, initial_weights, step_size, tolerance)

In [12]:
simple_weights_test_data

array([-46999.87880043,    282.3594539 ])

## Compute RSS on the test data

In [13]:
predicted_outcomes = predict_outcome(simple_feature_matrix_test_data, simple_weights_test_data)
rss_test_data = ((output_test_data - predicted_outcomes) ** 2).sum()

In [14]:
predicted_outcomes[0]

356774.14027533506

In [15]:
rss_test_data

275395691278132.81

## Gradient Descent to fit more than one model

In [16]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [17]:
model_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [18]:
model_weights

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

## Predicted Price for the first house in the Test Data Set for model 2

In [19]:
(feature_matrix_test_data, output_test_data) = get_numpy_data(test_data, model_features, my_output)
model2_predicted_outcomes = predict_outcome(feature_matrix_test_data, model_weights)

In [20]:
model2_predicted_outcomes[0]

366651.41162949387

## Actual price for the house in the Test Data Set

In [21]:
output_test_data[0]

310000.0

##  Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2?

model2 estimates are much closer

## Compute RSS on Test Data on Second Model

In [22]:
rss_test_data_model2 = ((model2_predicted_outcomes - output_test_data) ** 2).sum()

In [23]:
rss_test_data_model2

270263443629803.56

## Which model has lower RSS model 1 or model 2 ?

model2 has lower RSS than model1