In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import numpy as np
from math import sqrt
from sklearn.pipeline import Pipeline

In [None]:
# Load the dataset and hold out 20% of the rows for evaluation.
# A fixed random_state makes the split, and every number derived from it in
# later cells, reproducible across kernel restarts.
data = pd.read_csv("data.csv")
train_data, test_data = train_test_split(data, test_size=.2, random_state=0)

In [None]:
def get_numpy_data(data, features, output):
    """Build the feature matrix and output array for a regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is left unmodified.
    features : list of str
        Names of the columns to use as features.
    output : str
        Name of the column holding the values to predict.

    Returns
    -------
    (feature_matrix, output_matrix) : tuple of numpy.ndarray
        feature_matrix has a leading column of ones (the 'constant' term)
        followed by the requested feature columns in the given order;
        output_matrix holds the values of data[output].
    """
    feature_names = list(features)
    # Selecting with a list yields a new frame and assign() returns another
    # new frame, so the constant column never leaks into the caller's data.
    design = data[feature_names].assign(constant=1)
    # .values replaces DataFrame.as_matrix(), which newer pandas releases
    # no longer provide.
    feature_matrix = design[['constant'] + feature_names].values
    output_matrix = data[output].values
    return (feature_matrix, output_matrix)

In [None]:
# Smoke-test get_numpy_data on the full dataset with a single feature.
# The feature names are passed as a list, hence the [] around 'sqft_living'.
example_features, example_output = get_numpy_data(data, ['sqft_living'], 'price')
print(example_features)  # the whole feature matrix: constant column plus sqft_living
print(example_output)    # the corresponding 'price' values

In [None]:
# Predict a single observation by hand, using placeholder weights.
my_weights = np.array([1., 1.])  # example weights, one per feature column
my_features = example_features[0, :]  # first row, all columns
predicted_value = my_features.dot(my_weights)
print(predicted_value)

In [None]:
def predict_output(feature_matrix, weights):
    """Return the predictions of a linear model for every row of feature_matrix.

    feature_matrix is expected to hold one feature per column and weights one
    coefficient per feature; the result is their dot product.
    """
    return np.dot(feature_matrix, weights)

In [None]:
# Run the predictor over every row at once and inspect the first two results.
test_predictions = predict_output(example_features, my_weights)
print(test_predictions[0])
print(test_predictions[1])

In [None]:
def feature_derivative(errors, feature):
    """Return twice the dot product of errors and feature.

    With errors = predictions - output, this is the derivative of the residual
    sum of squares with respect to the weight attached to `feature`.
    """
    return 2 * np.dot(errors, feature)

In [None]:
# Sanity-check feature_derivative on the constant column. With all-zero
# weights every prediction is 0, so errors == -output and the derivative for
# the constant column must equal -2 * sum(output).
example_features, example_output = get_numpy_data(data, ['sqft_living'], 'price')
my_weights = np.zeros(2)
test_predictions = predict_output(example_features, my_weights)
errors = test_predictions - example_output
feature = example_features[:, 0]  # column 0 is the constant column
derivative = feature_derivative(errors, feature)
print(derivative)
print(-2 * np.sum(example_output))  # should be the same as derivative

In [None]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Each iteration computes errors = predictions - output, takes the gradient
    of the residual sum of squares (2 * errors . feature column, one entry per
    weight), and moves every weight by -step_size times its derivative. The
    loop stops once the gradient magnitude falls below `tolerance`.

    Parameters
    ----------
    feature_matrix : 2-D array-like
        One row per observation, one column per weight.
    output : 1-D array-like
        Observed values, one per row of feature_matrix.
    initial_weights : array-like
        Starting weights; copied, never modified in place.
    step_size : float
        Multiplier applied to the gradient on each update.
    tolerance : float
        Convergence threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of updates. None (the default) keeps
        iterating until the tolerance is met.

    Returns
    -------
    numpy.ndarray
        The weights after the last update.

    Raises
    ------
    ValueError
        If the gradient magnitude becomes inf or NaN (the descent diverged,
        typically because step_size is too large); without this check the
        loop would never terminate.
    """
    features = np.asarray(feature_matrix)
    # Force a float copy: integer initial weights would otherwise truncate
    # every update, and the caller's array must stay untouched.
    weights = np.array(initial_weights, dtype=float)
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        errors = np.dot(features, weights) - output
        # All partial derivatives at once; entry i equals
        # 2 * dot(errors, features[:, i]).
        gradient = 2 * np.dot(errors, features)
        weights -= step_size * gradient
        # The magnitude uses the sum over ALL squared derivatives, not just
        # the one belonging to the last weight.
        gradient_magnitude = np.sqrt(np.dot(gradient, gradient))
        if not np.isfinite(gradient_magnitude):
            raise ValueError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size")
        if gradient_magnitude < tolerance:
            break
        iteration += 1
    return weights

In [None]:
# Simple regression via gradient descent: fit price against sqft_living on
# the training split.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

test_weight = regression_gradient_descent(
    simple_feature_matrix, output, initial_weights, step_size, tolerance)
print(test_weight)

# Apply the fitted weights to the held-out rows and compare the summed
# predictions with the summed observed prices.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)
prediction = predict_output(test_simple_feature_matrix, test_weight)
print('%s %s' % ('prediction: ', prediction))
print('%s %s' % ('prediction_sum: ', prediction.sum()))
print('%s %s' % ('output: ', test_output.sum()))

In [None]:
## using gradient descent for multiple regression
# sqft_living15 is the average squarefeet for the nearest 15 neighbors.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9
test_weight_multiple = regression_gradient_descent(
    feature_matrix, output, initial_weights, step_size, tolerance)

# Evaluate the fitted weights on the held-out rows.
test_simple_feature_matrix_multiple, test_output_multiple = get_numpy_data(
    test_data, model_features, my_output)
prediction_multiple = predict_output(test_simple_feature_matrix_multiple, test_weight_multiple)
print('%s %s' % ('weights: ', test_weight_multiple))
print('%s %s' % ('prediction_multiple: ', prediction_multiple))
print('%s %s' % ('prediction_multiple_sum: ', prediction_multiple.sum()))
# NOTE(review): this prints the raw output array, whereas the simple-regression
# cell prints test_output.sum() -- confirm whether a sum was intended here.
print('%s %s' % ('output_multiple: ', test_output_multiple))

In [None]:
## using scikit_learn for simple regression
# Features and prices are shaped as (n_samples, 1) column arrays before
# being handed to the estimator.
test_feature = test_data[simple_features].values.reshape(-1, 1)
test_output = test_data['price'].values.reshape(-1, 1)
train_feature = train_data[simple_features].values.reshape(-1, 1)
train_output = train_data['price'].values.reshape(-1, 1)
regr = linear_model.LinearRegression(fit_intercept=True)
regr.fit(train_feature, train_output)
prediction = regr.predict(test_feature)
print('%s %s' % ('prediction: ', prediction))
print('%s %s' % ('prediction_sum: ', prediction.sum()))
print('%s %s' % ('intercept', regr.intercept_))
print('%s %s' % ('coefficients', regr.coef_))
# score() takes (X, y): features first, observed prices second. With the
# arguments swapped the fitted model is fed prices as its input, so the
# reported R^2 (computed on the training split) is meaningless.
print('%s %s' % ('R^2: ', regr.score(train_feature, train_output)))

In [None]:
## using scikit learn for multiple regression
# Two feature columns per row; prices are shaped as (n_samples, 1).
train_feature_multiple = train_data[model_features].values.reshape(-1, 2)
train_output_multiple = train_data['price'].values.reshape(-1, 1)
test_feature_multiple = test_data[model_features].values.reshape(-1, 2)
test_output_multiple = test_data['price'].values.reshape(-1, 1)

regr = linear_model.LinearRegression(fit_intercept=True)
regr.fit(train_feature_multiple, train_output_multiple)
prediction = regr.predict(test_feature_multiple)

print('%s %s' % ('prediction: ', prediction))
print('%s %s' % ('prediction_sum: ', prediction.sum()))
print('%s %s' % ('intercept', regr.intercept_))
print('%s %s' % ('coefficients', regr.coef_))
# R^2 is not reported for this model: the disabled call passed `outputs` and
# `simple_feature` to regr.score, names that the visible cells never define.