In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split

In [17]:
train_data = pd.read_csv('kc_house_train_data.csv') 
test_data = pd.read_csv('kc_house_test_data.csv')

In [18]:
def get_numpy_data(data, features, output):
    data['constant'] = 1
    
    features = ['constant'] + features
    
    data_features = data[features]
    
    return (np.array(data_features), np.array(data[output]))

In [21]:
feature_matrix, output_matrix = get_numpy_data(train_data, ['sqft_living', 'bedrooms'], 'price')

In [22]:
feature_matrix

array([[   1, 1180,    3],
       [   1, 2570,    3],
       [   1,  770,    2],
       ...,
       [   1, 1530,    3],
       [   1, 1600,    3],
       [   1, 1020,    2]])

In [23]:
output_matrix

array([221900., 538000., 180000., ..., 360000., 400000., 325000.])

In [24]:
train_data.shape

(17384, 22)

In [25]:
test_data.shape

(4229, 21)

## Predict Outcome

In [26]:
def predict_outcome(feature_matrix, weights):
    result = np.dot(feature_matrix,weights)
    #print(result)
    return result

In [27]:
def feature_derivative(error_vector, feature_matrix):
    #Error_vector = actual - predicted
    #error_vector = error_vector.reshape(1, error_vector.shape[0])
    #error_vector = error_vector.reshape(1,-1)
    derivative = 2*(np.dot(error_vector, feature_matrix))
    return derivative

In [39]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    #initialize the weights
    weights = np.array(initial_weights)
    count = 0 
    #iterations = 1000
    while not converged:
        #Predict the output using the feature matrix and the weights
        error_vector = (predict_outcome(feature_matrix, weights) - output)
        #initialize the gradient
        gradient_sum_squares = 0
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = feature_derivative(error_vector, feature_matrix[:, i])
            #print(derivative.shape)
            
            # add the squared derivative to the gradient magnitude
            #derivative = derivative.ravel()
            gradient_sum_squares += (derivative*derivative)  
            
            # update the weight based on step size and derivative:
            weights[i] -= (step_size * derivative) 
            
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        #print(gradient_magnitude)
        #count += 1 
        #print(count)
        #iterations -= 1
        
        if(gradient_magnitude < tolerance):
            converged = True
        
    return weights   

## Load the Training and Testing Data

In [29]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [30]:
simple_feature_matrix[:, :].shape

(17384, 2)

In [31]:
output.shape

(17384,)

In [32]:
initial_weights.shape

(2,)

In [33]:
initial_weights

array([-4.7e+04,  1.0e+00])

In [34]:
error_vector = (predict_outcome(simple_feature_matrix, initial_weights) - output)

In [35]:
error_vector

array([-267720., -582430., -226230., ..., -405470., -445400., -370980.])

In [36]:
error_vector.shape

(17384,)

In [40]:
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size, tolerance)

In [41]:
simple_weights

array([-46999.88716555,    281.91211918])

##  Quiz Question: What is the value of the weight for sqft_living -- the second element of ‘simple_weights’ (rounded to 1 decimal place)?

In [42]:
281.91

281.91

### Now build a corresponding ‘test_simple_feature_matrix’ and ‘test_output’ using test_data. Using ‘test_simple_feature_matrix’ and ‘simple_weights’ compute the predicted house prices on all the test data.

In [43]:
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, 
                                                           simple_features,
                                                           my_output)

In [44]:
predicted_output = predict_outcome(test_simple_feature_matrix, simple_weights)

In [45]:
list(map(lambda x: x.shape, [test_simple_feature_matrix, simple_weights]))

[(4229, 2), (2,)]

In [46]:
predicted_output

array([356134.443255  , 784640.86440132, 435069.83662406, ...,
       663418.65315598, 604217.10812919, 240550.47439317])

In [49]:
test_simple_feature_matrix

array([[   1, 1430],
       [   1, 2950],
       [   1, 1710],
       ...,
       [   1, 2520],
       [   1, 2310],
       [   1, 1020]])

In [51]:
np.dot(test_simple_feature_matrix[1], np.array([-46999.88716555,    281.91211918]))

784640.8644154499

In [53]:
difference_model_1_1st_house = np.abs(285656.41 - test_output[0])

In [54]:
difference_model_1_1st_house

24343.590000000026

##  Quiz Question: What is the predicted price for the 1st house in the Test data set for model 1 (round to nearest dollar)?

In [55]:
356134.443255

356134.443255

### Now compute RSS on all test data for this model. Record the value and store it for later

In [60]:
test_residual_1 = predicted_output - test_output
test_RSS_1 = (test_residual_1**2).sum()
test_RSS_1

275400044902128.3

In [61]:
test_residual_1**2

array([2.12838685e+09, 1.81281624e+10, 4.08322189e+10, ...,
       2.78083818e+09, 4.17046273e+10, 2.60985723e+10])

### Now we will use the gradient descent to fit a model with more than 1 predictor variable (and an intercept). Use the following parameters:

In [62]:
model_features = ['sqft_living', 'sqft_living15']
model_output = 'price'
#intercept, sqft_living, and sqft_living_15 respectively
initial_weights = np.array([-100000., 1., 1.])
#Please use a decimal point after integers, else it will not converge
step_size = 4e-12
tolerance = 1e9

In [63]:
train_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'constant'],
      dtype='object')

In [64]:
#Get the modified feature vectoe
(train_feature_matrix_multiple_features, train_output_multiple_features) = get_numpy_data(train_data, model_features, model_output)

In [65]:
(train_feature_matrix_multiple_features, train_output_multiple_features)

(array([[   1, 1180, 1340],
        [   1, 2570, 1690],
        [   1,  770, 2720],
        ...,
        [   1, 1530, 1530],
        [   1, 1600, 1410],
        [   1, 1020, 1020]]),
 array([221900., 538000., 180000., ..., 360000., 400000., 325000.]))

In [66]:
train_feature_matrix_multiple_features.shape

(17384, 3)

In [67]:
train_output_multiple_features.shape

(17384,)

In [68]:
weights_multiple_features = regression_gradient_descent(train_feature_matrix_multiple_features, train_output_multiple_features, initial_weights, step_size, tolerance)

In [69]:
weights_multiple_features

array([-9.99999688e+04,  2.45072603e+02,  6.52795267e+01])

### Quiz Question: What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?

In [70]:
(test_model_feature_matrix, test_model_output) = get_numpy_data(test_data, model_features, model_output)

In [71]:
test_predictions = predict_outcome(test_model_feature_matrix, weights_multiple_features)

In [72]:
test_predictions[0]

366651.4116294939

In [87]:
predicted_output[0]

356134.4432550024

In [73]:
test_data['price'][0]

310000.0

### Which model (1 or 2) has lowest RSS on all of the TEST data?

In [74]:
test_residuals_2 = test_model_output - test_predictions
test_RSS_2 = (test_residuals_2**2).sum()
print (test_RSS_2)

270263443629803.56


In [75]:
test_residuals_2

array([ -56651.41162949, -112662.39850726, -153312.09557541, ...,
        -71402.39916306, -185579.27901327,  185541.79608214])

In [76]:
if(test_RSS_2 < test_RSS_1):
    print("Model 2")
else:
    print("Model 1")

Model 2


In [85]:
test_output[0] == test_model_output[0]

True

In [81]:
difference_model_1_1st_house = np.abs(predicted_output[0] - test_output[0])

In [82]:
difference_model_2_1st_house = np.abs(test_predictions[0] - test_output[0])

In [86]:
if(difference_model_1_1st_house>=difference_model_2_1st_house):
    print("Model 2")
else: 
    print("Model 1")

Model 1
