In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Read Data**

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
df_house_data=pd.read_csv("kc_house_data.csv", dtype=dtype_dict)
df_house_train=pd.read_csv("kc_house_train_data.csv", dtype=dtype_dict)
df_house_test=pd.read_csv("kc_house_test_data.csv", dtype=dtype_dict)

In [3]:
from sklearn import linear_model

def simple_regression_scikit(input_feature, output):
    clf = linear_model.LinearRegression()
    clf.fit(input_feature,output)
    return clf
    
    
model = simple_regression_scikit(df_house_train[['sqft_living']], df_house_train[['price']])

(model.intercept_[0],model.coef_[0][0])


(-47116.079072893248, 281.95883963034214)

In [4]:
predictions = model.predict(df_house_train[['sqft_living']])
from sklearn.metrics import mean_squared_error
MSE=mean_squared_error(df_house_train[['price']],predictions)
#MSE = RSS/n
RSS=MSE*len(predictions) 
print(RSS)

1.20191835418e+15


In [5]:
df_house_train.index

RangeIndex(start=0, stop=17384, step=1)

In [6]:
df_house_train.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [7]:
df_house_train['bedrooms_squared']=df_house_train['bedrooms']*df_house_train['bedrooms']
df_house_test['bedrooms_squared']=df_house_test['bedrooms']*df_house_test['bedrooms']
df_house_train[['bedrooms_squared','bedrooms']].head()
df_house_train['bed_bath_rooms']=df_house_train['bedrooms']*df_house_train['bathrooms']
df_house_test['bed_bath_rooms']=df_house_test['bedrooms']*df_house_test['bathrooms']
df_house_train['log_sqft_living']=np.log(df_house_train['sqft_living'])
df_house_test['log_sqft_living']=np.log(df_house_test['sqft_living'])
df_house_train['lat_plus_long']=df_house_train['lat']+df_house_train['long']
df_house_test['lat_plus_long']=df_house_test['lat']+df_house_test['long']
df_house_train[['log_sqft_living','sqft_living']].head()

Unnamed: 0,log_sqft_living,sqft_living
0,7.07327,1180.0
1,7.851661,2570.0
2,6.646391,770.0
3,7.5807,1960.0
4,7.426549,1680.0


In [8]:
model1 = linear_model.LinearRegression().fit(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long']], df_house_train[['price']])
model2 = linear_model.LinearRegression().fit(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms']], df_house_train[['price']])
model3 = linear_model.LinearRegression().fit(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']], df_house_train[['price']])

In [9]:
model1.coef_, model2.coef_, model3.coef_


(array([[  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
           6.58619264e+05,  -3.09374351e+05]]),
 array([[  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
           6.54844630e+05,  -2.94298969e+05,   2.55796520e+04]]),
 array([[  5.29422820e+02,   3.45142296e+04,   6.70607813e+04,
           5.34085611e+05,  -4.06750711e+05,  -8.57050439e+03,
          -6.78858667e+03,  -5.61831484e+05,   1.27334900e+05]]))

In [10]:
predictions_model1 = model1.predict(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long']])
predictions_model2 = model2.predict(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms']])
predictions_model3 = model3.predict(df_house_train[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']])

predictions_model1_test = model1.predict(df_house_test[['sqft_living', 'bedrooms', 'bathrooms','lat','long']])
predictions_model2_test = model2.predict(df_house_test[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms']])
predictions_model3_test = model3.predict(df_house_test[['sqft_living', 'bedrooms', 'bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','log_sqft_living','lat_plus_long']])

In [11]:
RSS_model1 = ((df_house_train[['price']] - predictions_model1)**2).sum()
RSS_model2 = ((df_house_train[['price']] - predictions_model2)**2).sum()
RSS_model3 = ((df_house_train[['price']] - predictions_model3)**2).sum()

RSS_test_model1 = ((df_house_test[['price']] - predictions_model1_test)**2).sum()
RSS_test_model2 = ((df_house_test[['price']] - predictions_model2_test)**2).sum()
RSS_test_model3 = ((df_house_test[['price']] - predictions_model3_test)**2).sum()

print(RSS_model1, RSS_model2, RSS_model3)
print(RSS_test_model1, RSS_test_model2, RSS_test_model3)

price    9.678800e+14
dtype: float64 price    9.584196e+14
dtype: float64 price    9.034365e+14
dtype: float64
price    2.255005e+14
dtype: float64 price    2.233775e+14
dtype: float64 price    2.592363e+14
dtype: float64


<h1>Gradient Descent Algo</h1>


In [12]:
df_house_train.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'bedrooms_squared',
       'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'],
      dtype='object')

In [13]:
df_house_train['constant']=1

In [14]:
df_house_train['constant'].head()

0    1
1    1
2    1
3    1
4    1
Name: constant, dtype: int64

In [15]:
df_house_train['price'].values[0:10]


array([  221900.,   538000.,   180000.,   604000.,   510000.,  1225000.,
         257500.,   291850.,   229500.,   323000.])

In [16]:
def get_numpy_data(data_frame, features, output):
    data_frame['constant']=1
    features=['constant']+features
    
    features_np = data_frame[features].values
    output_np = data_frame[[output]].values
    
    return features_np, output_np
    

In [17]:
print(get_numpy_data(df_house_train, ['sqft_living','bedrooms','bathrooms'], 'price'))

(array([[  1.00000000e+00,   1.18000000e+03,   3.00000000e+00,
          1.00000000e+00],
       [  1.00000000e+00,   2.57000000e+03,   3.00000000e+00,
          2.25000000e+00],
       [  1.00000000e+00,   7.70000000e+02,   2.00000000e+00,
          1.00000000e+00],
       ..., 
       [  1.00000000e+00,   1.53000000e+03,   3.00000000e+00,
          2.50000000e+00],
       [  1.00000000e+00,   1.60000000e+03,   3.00000000e+00,
          2.50000000e+00],
       [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00,
          7.50000000e-01]]), array([[ 221900.],
       [ 538000.],
       [ 180000.],
       ..., 
       [ 360000.],
       [ 400000.],
       [ 325000.]]))


If the features matrix (including a column of 1s for the constant) is stored as a 2D array (or matrix) and the regression weights are stored as a 1D array then the predicted output is just the dot product between the features matrix and the weights (with the weights on the right). Write a function ‘predict_output’ which accepts a 2D array ‘feature_matrix’ and a 1D array ‘weights’ and returns a 1D array ‘predictions’. e.g. in python:

In [18]:
def predict_outcome(feature_matrix, weights):
    predictions=np.dot(feature_matrix,weights)
    return(predictions)

 If we have a the values of a single input feature in an array ‘feature’ and the prediction ‘errors’ (predictions - output) then the derivative of the regression cost function with respect to the weight of ‘feature’ is just twice the dot product between ‘feature’ and ‘errors’. Write a function that accepts a ‘feature’ array and ‘error’ array and returns the ‘derivative’ (a single number). e.g. in python:

In [19]:
def feature_derivative(errors, feature):
    derivative = 2*np.dot(feature,errors)
    return(derivative)

Now we will use our predict_output and feature_derivative to write a gradient descent function. Although we can compute the derivative for all the features simultaneously (the gradient) we will explicitly loop over the features individually for simplicity. Write a gradient descent function that does the following:

Accepts a numpy feature_matrix 2D array, a 1D output array, an array of initial weights, a step size and a convergence tolerance.
While not converged updates each feature weight by subtracting the step size times the derivative for that feature given the current weights
At each step computes the magnitude/length of the gradient (square root of the sum of squared components)
When the magnitude of the gradient is smaller than the input tolerance returns the final weight vector.

In [63]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights).reshape((2,1))
    while not converged:
        # compute the predictions based on feature_matrix and weights:
        # compute the errors as predictions - output:
        print(feature_matrix.ndim, weights.shape)
        predictions = predict_outcome(feature_matrix, weights)
        print("predictions",predictions.shape, predictions[:10])
        print("output",output[:10])
        errors = predictions - output
        print("errors",errors[:10])
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            partial_derivative = feature_derivative(errors,feature_matrix[:,i])
            print("before", i, partial_derivative, weights[0],weights[1])
            
            
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares += partial_derivative**2
            # update the weight based on step size and derivative:
            weights[i] = weights[i] - step_size*partial_derivative
            print("after",i, partial_derivative, weights[0],weights[1])
        
            
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)


Now we will run the regression_gradient_descent function on some actual data. In particular we will use the gradient descent to estimate the model from Week 1 using just an intercept and slope. Use the following parameters:

features: ‘sqft_living’
output: ‘price’
initial weights: -47000, 1 (intercept, sqft_living respectively)
step_size = 7e-12
tolerance = 2.5e7
e.g. in python with numpy and SFrames:

In [43]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(df_house_train, simple_features, my_output)
print(simple_feature_matrix.shape, output.shape)
initial_weights = np.array([-47000., 1.])
print(initial_weights.shape)
step_size = 7e-12
tolerance = 2.5e7
print(simple_feature_matrix[:10])

(17384, 2) (17384, 1)
(2,)
[[  1.00000000e+00   1.18000000e+03]
 [  1.00000000e+00   2.57000000e+03]
 [  1.00000000e+00   7.70000000e+02]
 [  1.00000000e+00   1.96000000e+03]
 [  1.00000000e+00   1.68000000e+03]
 [  1.00000000e+00   5.42000000e+03]
 [  1.00000000e+00   1.71500000e+03]
 [  1.00000000e+00   1.06000000e+03]
 [  1.00000000e+00   1.78000000e+03]
 [  1.00000000e+00   1.89000000e+03]]


In [64]:
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,tolerance)

2 (2, 1)
predictions (17384, 1) [[-45820.]
 [-44430.]
 [-46230.]
 [-45040.]
 [-45320.]
 [-41580.]
 [-45285.]
 [-45940.]
 [-45220.]
 [-45110.]]
output [[  221900.]
 [  538000.]
 [  180000.]
 [  604000.]
 [  510000.]
 [ 1225000.]
 [  257500.]
 [  291850.]
 [  229500.]
 [  323000.]]
errors [[ -267720.]
 [ -582430.]
 [ -226230.]
 [ -649040.]
 [ -555320.]
 [-1266580.]
 [ -302785.]
 [ -337790.]
 [ -274720.]
 [ -368110.]]
before 0 [ -2.03144765e+10] [-47000.] [ 1.]
after 0 [ -2.03144765e+10] [-46999.85779866] [ 1.]
before 1 [ -5.05515267e+13] [-46999.85779866] [ 1.]
after 1 [ -5.05515267e+13] [-46999.85779866] [ 354.86068692]
2 (2, 1)
predictions (17384, 1) [[  371735.75276992]
 [  864992.10759223]
 [  226242.87113168]
 [  648527.08856949]
 [  549166.09623118]
 [ 1876345.06532143]
 [  561586.22027347]
 [  329152.47033921]
 [  584652.16492343]
 [  623686.84048491]]
output [[  221900.]
 [  538000.]
 [  180000.]
 [  604000.]
 [  510000.]
 [ 1225000.]
 [  257500.]
 [  291850.]
 [  229500.]
 [  32

In [65]:
print(simple_weights)

[[-46999.88716555]
 [   281.91211918]]
