In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [2]:
# Load the full dataset and its train / test / validation splits from CSV
# files given by relative path (resolved against the working directory).
# NOTE(review): `data` is not referenced by any other cell visible in this notebook.
data=pd.read_csv('kc_house_data_small.csv')
train=pd.read_csv('kc_house_data_small_train.csv')
test=pd.read_csv('kc_house_data_small_test.csv')
valid=pd.read_csv('kc_house_data_validation.csv')

In [3]:
def get_numpy_data(df, feat, out):
    """Convert selected DataFrame columns into NumPy arrays for regression.

    A column of ones named ``'constant'`` is placed in front of the requested
    feature columns, so the first column of the returned matrix is the
    intercept term.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It is not modified; the constant column is added to a
        copy.
    feat : sequence of str
        Names of the feature columns, in the order they should appear.
    out : str or list of str
        Name(s) of the target column(s).

    Returns
    -------
    feat_mat : numpy.ndarray
        Array with ``len(feat) + 1`` columns: the constant, then the features.
    out_arr : numpy.ndarray
        Target values flattened to one dimension.
    """
    feat = ['constant'] + list(feat)

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'constant' column as a side effect of this call.
    feat_mat = np.array(df.assign(constant=1).loc[:, feat])

    out_arr = np.array(df.loc[:, out]).reshape(-1,)

    return feat_mat, out_arr

In [4]:
def normalize_features(feat_mat):
    """Scale every column of ``feat_mat`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    feat_mat : array-like
        Two-dimensional feature matrix, one column per feature.

    Returns
    -------
    feat_mat : numpy.ndarray
        New array with each column divided by its scaling factor.
    norms : numpy.ndarray
        Per-column scaling factors: the column's L2 norm, or 1.0 for a
        column whose norm is zero. Divide other data sets by this array to
        put them on the same scale.
    """
    feat_mat = np.asarray(feat_mat)
    norms = np.linalg.norm(feat_mat, axis=0)

    # An all-zero column has norm 0; dividing by it would turn the column
    # into NaN and poison every distance computed from the matrix.
    # Scale such columns by 1 instead, which leaves them all-zero.
    norms = np.where(norms == 0, 1.0, norms)

    return feat_mat / norms, norms

In [5]:
# All column names of the training frame.
# NOTE(review): `cols` is not used by any other cell visible in this notebook.
cols=list(train.columns)
# The 17 feature columns used for the nearest-neighbour distance.
feat=['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view',
 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated','lat',
 'long', 'sqft_living15', 'sqft_lot15']
# Target column to predict.
out=['price']

In [6]:
# Turn each split into a (feature matrix, target vector) pair; the matrices
# have the constant column first, followed by the columns listed in `feat`.
feat_train,out_train = get_numpy_data(train,feat,out)
feat_test,out_test = get_numpy_data(test,feat,out)
feat_val,out_val = get_numpy_data(valid,feat,out)

In [7]:
# Normalise the training features, then divide the test and validation
# features by the *training* norms so all three splits share one scale.
feat_train,norms = normalize_features(feat_train)
feat_test = feat_test/norms
feat_val = feat_val/norms

In [23]:
## query house: first row of the test features; the 10th training row
## (index 9) is picked as the point to measure the distance to
h1_test = feat_test[0]
h10_train = feat_train[9]
h1_test.shape

(18,)

In [26]:
## Euclidean distance = sqrt( (xj[1]-xq[1])^2 + ... + (xj[d]-xq[d])^2 )
## (unweighted: every feature contributes with weight 1)
edis = np.sqrt( np.sum( (h10_train - h1_test)**2 ) )
edis

0.05972359371398078

In [27]:
## Euclidean distance from the query house to each of the first 10 training houses
q_house = feat_test[0]
# A comprehension keeps scratch names (loop index, current row, per-row
# distance) out of the notebook namespace, so the `edis` value computed in an
# earlier cell is not overwritten. `dist` stays a plain list because a later
# cell calls dist.index().
dist = [np.sqrt(np.sum((point - q_house) ** 2)) for point in feat_train[:10]]
dist

[0.06027470916295592,
 0.08546881147643746,
 0.06149946435279315,
 0.05340273979294363,
 0.05844484060170442,
 0.059879215098128345,
 0.05463140496775461,
 0.055431083236146074,
 0.052383627840220305,
 0.05972359371398078]

In [34]:
# Position of the smallest distance among the first 10 training houses.
min_val = min(dist)
dist.index( min_val )   ## index 8 -> the 9th training house is closest to the query point

8

## 1-NN (one nearest neighbour) regression

In [37]:
# Distances from the query house to every training row at once:
# broadcasting subtracts q_house from each row, then squares are summed per row.
diff = feat_train - q_house
edis = np.sqrt( np.sum( diff**2, axis=1 ) )

In [38]:
# Spot check: distance between the query house and training row index 100.
edis[100]

0.023708232416678195

In [39]:
def comp_dist(feat_inst, query):
    """Euclidean distance from ``query`` to one or many feature vectors.

    Parameters
    ----------
    feat_inst : array-like
        Either a matrix with one feature vector per row, or a single
        one-dimensional feature vector.
    query : array-like
        Query feature vector, broadcast against ``feat_inst``.

    Returns
    -------
    numpy.ndarray or numpy scalar
        One distance per row for matrix input; a single distance when
        ``feat_inst`` is one-dimensional.
    """
    diff = np.asarray(feat_inst) - np.asarray(query)
    # Summing over the last axis equals axis=1 for a matrix and also handles
    # a lone 1-D vector, which axis=1 would reject.
    return np.sqrt(np.sum(diff ** 2, axis=-1))

In [45]:
# Query is the third test house (index 2); compute its distance to every
# training house and show the index of the closest one.
query = feat_test[2]
edist = comp_dist(feat_train,query)
## min distance house
edist.argmin()

382

In [46]:
## Predicted price using 1-NN: the price of the closest training house.
## The index comes from the distances instead of being hard-coded, so the
## cell stays correct if the query or the training data changes.
out_train[edist.argmin()]

249000

## k-NN (k nearest neighbours) regression

In [57]:
def k_nrst_ngbrs(k, feat_train, query):
    """Return the row indices of the ``k`` training points closest to ``query``.

    Parameters
    ----------
    k : int
        Number of neighbours to return; must be at least 1.
    feat_train : numpy.ndarray
        Training feature matrix, one point per row.
    query : numpy.ndarray
        Feature vector of the query point.

    Returns
    -------
    numpy.ndarray
        Indices into ``feat_train`` ordered from nearest to farthest. If
        ``k`` exceeds the number of rows, all rows are returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # A non-positive k would otherwise slip through the slice below:
    # 0 yields no neighbours and a negative value drops rows from the far end.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    edist = comp_dist(feat_train, query)
    return np.argsort(edist)[:k]

In [58]:
## k neighbors
query = feat_test[2]
ngbrs = k_nrst_ngbrs(4,feat_train,query)
ngbrs

array([ 382, 1149, 4087, 3142], dtype=int64)

In [61]:
def pred_out_of_query(k, feat_train, out_train, query):
    """Predict the target of a single query point with k-NN regression.

    The prediction is the arithmetic mean of the ``out_train`` entries that
    belong to the ``k`` training rows nearest to ``query``.
    """
    nearest_idx = k_nrst_ngbrs(k, feat_train, query)
    neighbour_targets = out_train[nearest_idx]
    return np.mean(neighbour_targets)

In [64]:
## predictio for query house
pred = pred_out_of_query(4,feat_train,out_train,query)
pred

413987.5

In [65]:
### prediction of all query houses matrix
def pred_output(k, feat_train, out_train, query):
    m,n=query.shape
    pred=[]
    for i in range(m):
        pred1 = pred_out_of_query(k,feat_train,out_train,query[i])
        pred.append(pred1)
    return pred

In [70]:
# 10-NN predictions for the first 10 test houses.
query = feat_test[0:10]
pred = pred_output(10,feat_train,out_train,query)
pred

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.7,
 484000.0,
 457235.0]

In [76]:
### min predicted value
min_val = min(pred)
print(min_val)
min_ind = pred.index(min_val)
print(min_ind)

350032.0
6


### Choosing the best value of k using the validation set

In [88]:
# Residual sum of squares (RSS) on the validation set for k = 1..15;
# rss[k-1] holds the RSS obtained with k neighbours.
rss=[]
for k in range(1,16):
    pred = pred_output(k,feat_train,out_train,feat_val)
    err = np.sum( (out_val-pred)**2 )
    rss.append(err)

In [91]:
# Validation RSS for k = 1..15, in order of k.
rss

[105453830251561.0,
 83445073504025.5,
 72692096019202.56,
 71946721652091.69,
 69846517419718.6,
 68899544353180.836,
 68341973450051.09,
 67361678735491.5,
 68372727958976.09,
 69335048668556.74,
 69523855215598.83,
 69049969587246.17,
 70011254508263.69,
 70908698869034.34,
 71106928385945.16]

In [89]:
# k with the smallest validation RSS; rss is indexed from 0 while k starts
# at 1, hence the +1.
min_val = min(rss)
min_ind = rss.index(min_val)
best_k = min_ind+1
best_k

8

In [90]:
### rss on test data
pred = pred_output(best_k,feat_train,out_train,feat_test)
err = np.sum( (out_test-pred)**2 )
err

133118823551516.81