In [2]:
import pandas as pd
import numpy as np

# Column name -> Python type, handed to pd.read_csv so every column is parsed
# with a fixed dtype. Entries are grouped by target type for readability.
# NOTE(review): 'floors' is parsed as str even though it looks numeric - confirm.
dtype_dict = {
    # floating-point columns
    'bathrooms': float,
    'bedrooms': float,
    'price': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'sqft_above': int,
    'sqft_basement': int,
    'sqft_lot': int,
    'yr_built': int,
    'yr_renovated': int,
    # columns kept as text
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
}

In [3]:
# Read the training split first, then the test split, parsing both with the
# same column dtypes. A generator is used so no loop variable is left behind.
train_data, test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ("kc_house_train_data.csv", "kc_house_test_data.csv")
)

In [4]:
def get_numpy_data(data_pd, features, output):
    """Build the feature matrix and output vector for a regression model.

    A column of ones named 'constant' (the intercept term) is placed in front
    of the requested feature columns. The caller's DataFrame is NOT modified:
    the intercept column is added to a copy, so calling this function several
    times on the same frame has no lasting side effect.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Source data containing the feature and output columns.
    features : sequence of str
        Names of the feature columns, in the order they should appear.
    output : str
        Name of the column holding the target values.

    Returns
    -------
    tuple
        ``(features_matrix, output_array)`` where ``features_matrix`` is a 2-D
        numpy array whose first column is all ones, and ``output_array`` is a
        1-D numpy array of the target values.
    """
    # assign() returns a new frame, so the intercept column never leaks into
    # the DataFrame owned by the caller.
    with_intercept = data_pd.assign(constant=1)
    feature_columns = ['constant'] + list(features)

    features_matrix = np.array(with_intercept[feature_columns])
    output_array = np.array(with_intercept[output])

    return (features_matrix, output_array)

In [5]:
def predict_outcome(feature_matrix, weights):
    """Return the predictions obtained by multiplying `feature_matrix` by `weights`.

    The product is computed with ``feature_matrix.dot(weights)``, i.e. one
    prediction per row of the feature matrix.
    """
    return feature_matrix.dot(weights)

In [6]:
def feature_derivative(errors, feature):
    """Return twice the dot product of `errors` with `feature`.

    This is the partial derivative of the residual sum of squares with
    respect to the weight attached to `feature`.
    """
    inner_product = errors.dot(feature)
    return 2 * inner_product

In [21]:
import math

def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None, verbose=True):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Each iteration computes the residuals ``feature_matrix.dot(weights) - output``,
    forms the gradient ``2 * feature_matrix.T.dot(residuals)``, takes one step of
    size `step_size` against it, and stops once the magnitude of that gradient
    is below `tolerance`.

    Parameters
    ----------
    feature_matrix : array-like, 2-D
        One row per observation, one column per feature.
    output : array-like, 1-D
        Observed target values, one per row of `feature_matrix`.
    initial_weights : array-like, 1-D
        Starting weights, one per column of `feature_matrix`. The input is
        copied, never modified.
    step_size : float
        Multiplier applied to the gradient at every update.
    tolerance : float
        Convergence threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of updates; ``None`` (default) means no bound.
    verbose : bool, optional
        When true (default), print the gradient magnitude at every iteration.

    Returns
    -------
    numpy.ndarray
        The weights after the last update, as a float array.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN or infinite, which means the
        iteration has diverged and would otherwise never terminate.
    """
    feature_matrix = np.asarray(feature_matrix, dtype=float)
    output = np.asarray(output, dtype=float)
    # Force a float copy: integer initial weights would otherwise yield an
    # integer array in which every update is silently truncated.
    weights = np.array(initial_weights, dtype=float)

    iterations = 0
    while True:
        errors = feature_matrix.dot(weights) - output
        # All partial derivatives at once: d(RSS)/dw = 2 * X^T (Xw - y).
        gradient = 2 * feature_matrix.T.dot(errors)
        gradient_magnitude = float(np.sqrt(gradient.dot(gradient)))

        if not np.isfinite(gradient_magnitude):
            # NaN < tolerance is always False, so without this check a
            # diverging run would loop forever.
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); "
                "try a smaller step_size")

        weights -= step_size * gradient
        iterations += 1

        if verbose:
            # Parenthesised single-argument print works on Python 2 and 3.
            print(gradient_magnitude)
        if gradient_magnitude < tolerance:
            break
        if max_iterations is not None and iterations >= max_iterations:
            break
    return weights

### Testing code

In [22]:
# Simple model: a single feature (plus the intercept added by get_numpy_data).
my_output = 'price'
simple_features = ['sqft_living']
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings for the simple model: starting weights
# (intercept, sqft_living), step size and convergence threshold.
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [23]:
# Train the simple model; the gradient magnitude is printed at each iteration.
simple_weights = regression_gradient_descent(
    feature_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

5.0551530785e+13
1.31274510263e+13
3.40899608324e+12
8.85263580285e+11
2.29889265768e+11
59698688272.2
15502826425.3
4025844402.35
1045449748.39
271487891.953
70504114.8434
18320017.2673


In [24]:
# Show the fitted weights of the simple model: [intercept, sqft_living coefficient].
simple_weights

array([-46999.88716555,    281.91211918])

In [25]:
# Feature matrix and observed outputs of the test split for the simple model.
simple_test_feature_matrx, output_test = get_numpy_data(
    test_data, simple_features, my_output
)

In [26]:
# Predictions of the simple model on the test split.
test_predictions = predict_outcome(
    feature_matrix=simple_test_feature_matrx,
    weights=simple_weights,
)

In [27]:
# First five test-set predictions of the simple model.
# print(...) with one argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(test_predictions[:5])

[ 356134.443255    784640.86440132  435069.83662406  607036.22932094
  260284.32273543]


## Compute RSS on test data

In [28]:
# Residual sum of squares of the simple model on the test split, computed
# with one vectorised numpy expression instead of a Python-level loop.
RSS = np.sum((test_predictions - output_test) ** 2)
# NOTE(review): the value printed is sqrt(RSS), not RSS itself - confirm which
# one is wanted. print(...) is used so the cell runs on Python 2 and Python 3.
print(math.sqrt(RSS))

16595181.376


### Model Number 2

In [30]:
# Second model: two features (plus the intercept added by get_numpy_data).
my_output = 'price'
model_features = ['sqft_living', 'sqft_living15']
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings for the second model: starting weights
# (intercept, sqft_living, sqft_living15), step size and convergence threshold.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

adv_weights = regression_gradient_descent(
    feature_matrix, output, initial_weights, step_size, tolerance
)

7.3072020556e+13
2.26732209675e+13
7.06079458349e+12
2.27568239659e+12
9.28984110636e+11
6.56307431863e+11
6.10615358663e+11
5.9307877202e+11
5.78705926686e+11
5.64945682565e+11
5.51538687675e+11
5.38452428981e+11
5.25676918666e+11
5.13204549506e+11
5.01028105997e+11
4.89140564644e+11
4.77535070645e+11
4.66204932038e+11
4.55143615657e+11
4.44344743348e+11
4.33802088283e+11
4.23509571376e+11
4.13461257776e+11
4.03651353442e+11
3.94074201805e+11
3.84724280507e+11
3.75596198212e+11
3.66684691503e+11
3.57984621843e+11
3.49490972614e+11
3.41198846224e+11
3.33103461282e+11
3.25200149843e+11
3.17484354712e+11
3.09951626824e+11
3.0259762267e+11
2.95418101798e+11
2.88408924365e+11
2.81566048754e+11
2.74885529238e+11
2.68363513707e+11
2.61996241449e+11
2.55780040981e+11
2.49711327926e+11
2.43786602956e+11
2.38002449767e+11
2.3235553311e+11
2.26842596872e+11
2.21460462192e+11
2.16206025635e+11
2.11076257397e+11
2.0606819956e+11
2.01178964388e+11
1.96405732658e+11
1.91745752039e+11
1.87196335501e+

### Predicting model2 on test data

In [32]:
# Evaluate the second model on the test split and compare the first five
# predictions with the first five observed outputs.
adv_test_feature_matrx, output_test = get_numpy_data(test_data, model_features, my_output)
test_predictions2 = predict_outcome(adv_test_feature_matrx, adv_weights)
# print(...) with one argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(test_predictions2[:5])
print(output_test[:5])

[ 366651.41162949  762662.39850726  386312.09557541  636989.65007208
  269618.02584477]
[ 310000.  650000.  233000.  580500.  535000.]


## Compute RSS

In [33]:
# Residual sum of squares of the second model on the test split, computed
# with one vectorised numpy expression instead of a Python-level loop.
RSS = np.sum((test_predictions2 - output_test) ** 2)
# NOTE(review): the value printed is sqrt(RSS), not RSS itself - confirm which
# one is wanted. print(...) is used so the cell runs on Python 2 and Python 3.
print(math.sqrt(RSS))

16439691.1051
