In [1]:
import os

import pandas as pd

In [2]:
# Column name -> Python type handed to pd.read_csv(dtype=...) for the house CSVs.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Folder holding the house CSVs. Set the KC_HOUSE_DATA_DIR environment variable
# to point elsewhere; the fallback is the folder this notebook originally used.
data_dir = os.environ.get('KC_HOUSE_DATA_DIR', 'C:\\Users\\HP PAVILION LAPTOP\\Downloads')
data = pd.read_csv(os.path.join(data_dir, 'kc_house_data.csv'), dtype=dtype_dict)

In [4]:
# Display the dtype pandas assigned to each column after applying dtype_dict.
data.dtypes


id                object
date              object
price            float64
bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot           int32
floors            object
waterfront         int32
view               int32
condition          int32
grade              int32
sqft_above         int32
sqft_basement      int32
yr_built           int32
yr_renovated       int32
zipcode           object
lat              float64
long             float64
sqft_living15    float64
sqft_lot15       float64
dtype: object

In [5]:
import numpy as np

In [6]:
def get_numpy_data(data, features, output):
    """Convert DataFrame columns into NumPy arrays for regression.

    Parameters:
        data: pandas DataFrame containing the feature and output columns.
        features: list of column names to use as features.
        output: name of the target column.

    Returns:
        (feature_matrix, output_array): feature_matrix is a 2-D array whose
        first column is a constant 1 (the intercept term) followed by the
        requested feature columns in order; output_array is a 1-D array of
        the target column.

    The input DataFrame is left untouched: the 'constant' column is added to
    a derived frame rather than written back into `data`.
    """
    ordered_columns = ['constant'] + list(features)
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    feature_frame = data[list(features)].assign(constant=1)[ordered_columns]
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    feature_matrix = feature_frame.values
    output_array = data[output].values
    return (feature_matrix, output_array)

In [7]:
# The feature names are passed as a list, hence the [] around 'sqft_living'.
example_features, example_output = get_numpy_data(data, ['sqft_living'], 'price')


In [8]:
# print() with a single argument behaves the same on Python 2 and Python 3.
print(example_features[0, :])  # first row of the data; ':' selects all columns
print(example_output[0])

[  1.00000000e+00   1.18000000e+03]
221900.0


In [9]:
my_weights = np.array([1., 1.])  # the example weights
my_features = example_features[0, :]  # we'll use the first data point
predicted_value = np.dot(my_features, my_weights)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(predicted_value)

1181.0


In [10]:
def predict_output(feature_matrix, weights):
    """Return the dot product of feature_matrix and weights.

    For a 2-D feature_matrix and 1-D weights this is one prediction per row.
    """
    return np.dot(feature_matrix, weights)

In [11]:
test_predictions = predict_output(example_features, my_weights)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(test_predictions[0])  # should be 1181.0
print(test_predictions[1])  # should be 2571.0

1181.0
2571.0


In [12]:
def feature_derivative(errors, feature):
    """Return twice the dot product of errors and feature.

    The gradient-descent routine in this notebook uses this value as the
    derivative of the cost with respect to one feature's weight.
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [13]:
example_features, example_output = get_numpy_data(data, ['sqft_living'], 'price')
my_weights = np.array([0., 0.])  # all-zero weights make every prediction 0
test_predictions = predict_output(example_features, my_weights)
# NumPy arrays subtract elementwise with '-'; with zero predictions the
# errors are simply -example_output.
errors = test_predictions - example_output
feature = example_features[:, 0]  # the 'constant' column; ':' selects all rows
derivative = feature_derivative(errors, feature)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(derivative)
print(-np.sum(example_output) * 2)  # should be the same as derivative

-23345850016.0
-23345850016.0


In [14]:
from math import sqrt

In [15]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by gradient descent.

    Parameters:
        feature_matrix: 2-D array with one column per weight.
        output: 1-D array of target values, one per row of feature_matrix.
        initial_weights: starting weights (any array-like); never modified.
        step_size: multiplier applied to each derivative when updating a weight.
        tolerance: iteration stops once the gradient magnitude drops below this.
        max_iterations: optional cap on the number of update passes; None
            (the default) keeps iterating until the tolerance is reached.

    Returns:
        NumPy float array of fitted weights.

    Raises:
        ValueError: if the gradient magnitude becomes NaN. NaN never compares
            below the tolerance, so without this check the loop would not end.
    """
    # Force a float copy: an integer initial_weights would otherwise produce
    # an integer array and every update below would be truncated.
    weights = np.array(initial_weights, dtype=float)
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        predictions = predict_output(feature_matrix, weights)
        errors = predictions - output
        gradient_sum_squares = 0
        # errors is computed once per pass, so every weight is updated from
        # the same set of predictions.
        for i in range(len(weights)):
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares = gradient_sum_squares + (derivative * derivative)
            weights[i] = weights[i] - (step_size * derivative)
        gradient_magnitude = sqrt(gradient_sum_squares)
        if np.isnan(gradient_magnitude):
            raise ValueError('gradient descent diverged: gradient magnitude is NaN '
                             '(try a smaller step_size)')
        if gradient_magnitude < tolerance:
            break
        iteration += 1
    return(weights)

In [16]:
# Folder holding the house CSVs. Set the KC_HOUSE_DATA_DIR environment variable
# to point elsewhere; the fallback is the folder this notebook originally used.
data_dir = os.environ.get('KC_HOUSE_DATA_DIR', 'C:\\Users\\HP PAVILION LAPTOP\\Downloads')
train = pd.read_csv(os.path.join(data_dir, 'kc_house_train_data.csv'), dtype=dtype_dict)

In [17]:
# Folder holding the house CSVs. Set the KC_HOUSE_DATA_DIR environment variable
# to point elsewhere; the fallback is the folder this notebook originally used.
data_dir = os.environ.get('KC_HOUSE_DATA_DIR', 'C:\\Users\\HP PAVILION LAPTOP\\Downloads')
test = pd.read_csv(os.path.join(data_dir, 'kc_house_test_data.csv'), dtype=dtype_dict)

In [18]:
# Simple model: a single feature, sqft_living, with price as the target.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train, simple_features, my_output)

# Gradient-descent settings for the simple model.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [20]:
new_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(new_weights)

[-46999.88716555    281.91211918]


In [36]:
# Build the test-set feature matrix and target array for the simple model.
test_simple_feature_matrix, test_output = get_numpy_data(test, simple_features, my_output)

In [37]:
# Predict test-set prices with the weights fitted for the simple model.
test_pred = predict_output(test_simple_feature_matrix, new_weights)

In [38]:
# Simple-model prediction for the first row of the test set.
test_pred[0]

356134.44325500238

In [40]:
# Elementwise prediction error on the test set (prediction minus actual).
residual = np.subtract(test_pred, test_output)

In [56]:
# Residual sum of squares for the simple model: the residual dotted with itself.
rss1 = residual.dot(residual)

In [55]:
# Two-feature model. sqft_living15 is the average squarefeet for the nearest
# 15 neighbors.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train, model_features, my_output)

# Gradient-descent settings for the two-feature model.
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9

In [57]:
# print() with a single argument behaves the same on Python 2 and Python 3.
print(rss1)

2.75400044902e+14


In [58]:
# Fit the two-feature model with the settings defined for it.
new_weights1 = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

In [59]:
# Build the test-set feature matrix and target array for the two-feature model.
test_simple_feature_matrix1, test_output1 = get_numpy_data(test, model_features, my_output)

In [60]:
# Predict test-set prices with the weights fitted for the two-feature model.
test_pred1 = predict_output(test_simple_feature_matrix1, new_weights1)

In [61]:
# Two-feature-model prediction for the first row of the test set.
test_pred1[0]

366651.41162949387

In [62]:
# Actual price recorded under index label 0 of the test set, for comparison.
test.loc[0, 'price']

310000.0

In [63]:
# Elementwise prediction error of the two-feature model on the test set.
residual1 = np.subtract(test_pred1, test_output1)

In [64]:
# Residual sum of squares for the two-feature model.
rss2 = residual1.dot(residual1)

In [65]:
# print() with a single argument behaves the same on Python 2 and Python 3.
print(rss2)


2.7026344363e+14
