In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics,model_selection
from sklearn.linear_model import LinearRegression

In [2]:
# Load the train and test CSVs into DataFrames; both paths are relative to
# the current working directory.
train=pd.read_csv('kc_house_train_data.csv')
test=pd.read_csv('kc_house_test_data.csv')

In [3]:
def get_numpy_data(df, feat, out):
    """Build the design matrix and target vector for a linear model.

    An intercept column named ``'constant'`` (all ones) is placed in front of
    the requested feature columns. The column is added to a copy of ``df``,
    so the caller's DataFrame is left unchanged and the function can be
    re-run on the same frame without side effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table containing the feature and output columns.
    feat : sequence of str
        Names of the feature columns, in the order they should follow the
        intercept column. Lists and tuples are both accepted.
    out : str or list of str
        Name of the output column (a one-element list is also accepted).

    Returns
    -------
    feat_mat : numpy.ndarray
        2-D array with one row per row of ``df``; column 0 is the constant 1.
    out_arr : numpy.ndarray
        1-D array holding the output values.
    """
    columns = ['constant'] + list(feat)

    # assign() returns a new frame, so `df` itself never gains the column.
    with_intercept = df.assign(constant=1)
    feat_mat = np.array(with_intercept.loc[:, columns])

    # Flatten so that both out='price' and out=['price'] yield a 1-D array.
    out_arr = np.array(with_intercept.loc[:, out]).reshape(-1,)

    return feat_mat, out_arr

In [4]:
def predict_outcome(feat_mat, wts):
    """Return the linear-model predictions: the dot product of the feature
    matrix with the weight vector."""
    predictions = np.dot(feat_mat, wts)
    return predictions

In [5]:
def feature_derivative(feat_mat,out,wts):
    """Gradient of the residual sum of squares with respect to the weights.

    For RSS(w) = sum((out - feat_mat . w) ** 2) the gradient is
    -2 * feat_mat.T . (out - feat_mat . w), computed here for all weights
    at once.

    Parameters
    ----------
    feat_mat : numpy.ndarray
        Feature matrix (one row per observation, one column per weight).
    out : numpy.ndarray
        Observed output values, one per row of ``feat_mat``.
    wts : array-like
        Current weight vector.

    Returns
    -------
    numpy.ndarray
        One partial derivative per weight.
    """
    # Residuals are computed once and reused; no pre-allocated buffer is
    # needed because the dot product already yields the full gradient.
    residuals = out - predict_outcome(feat_mat, wts)
    return -2 * np.dot(feat_mat.T, residuals)

In [8]:
def regression_gradient_descent(feat_mat, out, init_wts, step, tol, max_iter=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Each iteration takes one step ``wts - step * grad`` and then stops if the
    magnitude of the gradient used for that step is below ``tol``.

    Parameters
    ----------
    feat_mat : numpy.ndarray
        Feature matrix (one row per observation, one column per weight).
    out : numpy.ndarray
        Observed output values.
    init_wts : array-like
        Starting weights; copied, never modified.
    step : float
        Step size multiplying the gradient.
    tol : float
        Convergence threshold on the Euclidean norm of the gradient.
    max_iter : int or None, optional
        Upper bound on the number of update steps. ``None`` (the default)
        means no bound. When the bound is hit, the weights reached so far are
        returned even though the tolerance was not met.

    Returns
    -------
    numpy.ndarray
        The weight vector after the final update.

    Raises
    ------
    FloatingPointError
        If the gradient becomes NaN or infinite. A non-finite gradient can
        never satisfy ``grad_mag < tol``, so without this check a diverging
        run (step too large) would loop forever.
    """
    wts = np.array(init_wts, dtype=float)
    n_iter = 0
    while max_iter is None or n_iter < max_iter:
        grad = feature_derivative(feat_mat, out, wts)
        if not np.all(np.isfinite(grad)):
            raise FloatingPointError(
                "gradient descent diverged after %d iterations "
                "(non-finite gradient); try a smaller step" % n_iter
            )

        wts = wts - step * grad
        n_iter += 1

        # Convergence is judged on the gradient that produced this step.
        grad_mag = np.sqrt(np.sum(grad ** 2))
        if grad_mag < tol:
            break
    return wts

In [13]:
# Model 1: a single feature column, 'sqft_living', with 'price' as the output.
feat=['sqft_living']
out=['price']
# feat_mat gets an intercept column of ones in front of the feature column.
feat_mat,out_arr = get_numpy_data(train,feat,out)


In [10]:
# Gradient-descent settings for model 1.
# Starting weights, ordered [intercept, sqft_living].
init_wts = [-47000,1]
# Step size that multiplies the gradient in each update.
step = 7e-12
# Stop once the gradient magnitude falls below this value.
tol = 2.5e7

In [None]:
# Fit model 1 on the training data by gradient descent.
wts = regression_gradient_descent(feat_mat,out_arr,init_wts,step,tol)

In [None]:
# Learned weights for model 1, ordered [intercept, sqft_living].
print(wts)

In [None]:
# Predict on the test data with the model-1 weights.
# Names prefixed with "t" hold arrays built from the test set.
tfeat_mat,tout_arr = get_numpy_data(test,feat,out)
y_pred=predict_outcome(tfeat_mat,wts)

In [None]:
# Model-1 predicted price for the first house in the test set.
y_pred[0]

In [None]:
# Residual sum of squares (RSS)
def error(feat_mat, wts, out):
    """Return the residual sum of squares of a linear model.

    Predictions are obtained from ``feat_mat`` and ``wts``; the result is the
    sum of the squared differences between ``out`` and those predictions.
    """
    residuals = out - predict_outcome(feat_mat, wts)
    return np.sum(residuals ** 2)

In [None]:
# RSS of model 1 (single feature) on the test data.
err=error(tfeat_mat,wts,tout_arr)
print(err)

In [None]:
## Model 2: feature matrix with more than one feature.
## Names prefixed with "m" belong to this multiple-feature model.
model_feat = ['sqft_living', 'sqft_living15']
my_out= 'price'
mfeat_mat, mout = get_numpy_data(train, model_feat ,my_out)
# Starting weights, ordered [intercept, sqft_living, sqft_living15].
minit_wts = np.array([-100000., 1., 1.])
# Step size and gradient-magnitude tolerance for the descent.
mstep = 4e-12
mtol = 1e9

In [None]:
# Fit model 2 on the training data by gradient descent.
mwts = regression_gradient_descent( mfeat_mat,mout,minit_wts,mstep,mtol )

In [None]:
# Learned weights for model 2, ordered [intercept, sqft_living, sqft_living15].
print(mwts)

In [None]:
# Predict on the test data with the model-2 weights.
# NOTE: this rebinds y_pred, replacing the model-1 predictions.
mtfeat_mat,mtout_arr = get_numpy_data(test,model_feat,my_out)
y_pred=predict_outcome(mtfeat_mat,mwts)

In [None]:
# Model-2 predicted price for the first house in the test set.
y_pred[0]

In [None]:
# Actual price of the first house in the test set.
mtout_arr[0]        ### model 1's prediction is closer to the actual price than the multiple-feature model's

In [None]:
## RSS of the second (multiple-feature) model on the test data.
err2 = error(mtfeat_mat,mwts,mtout_arr)

In [None]:
print(err2)    ## this model has a lower test RSS than the single-feature model