In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column name -> dtype mapping passed to pd.read_csv so each CSV is parsed with the same types.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
def numpy_data(data_pd, features, output):
    """Convert a DataFrame into a float32 feature matrix and an output array.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Source table. It is read only; no column is added to it.
    features : list of str
        Names of the columns to use as features, in the desired column order.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array) : tuple
        ``feature_matrix`` is a float32 array of shape
        ``(len(data_pd), 1 + len(features))`` whose first column is the
        constant 1 (intercept) followed by the requested feature columns.
        ``output_array`` is ``np.array(data_pd[output])``.
    """
    # Build the intercept column on the side instead of writing a 'constant'
    # column into the caller's DataFrame, so the call has no side effects.
    intercept = np.ones((len(data_pd), 1), dtype=np.float32)
    selected = np.array(data_pd[list(features)], dtype=np.float32)
    feature_matrix = np.hstack([intercept, selected])
    output_array = np.array(data_pd[output])

    return (feature_matrix, output_array)

In [4]:
def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (2-)norm.

    Parameters
    ----------
    features : array-like
        Numeric matrix with one row per observation and one column per feature.

    Returns
    -------
    (X_normalized, norms) : tuple
        ``X_normalized`` is ``features`` with each column divided by its
        2-norm. ``norms`` holds those column norms exactly as computed by
        ``np.linalg.norm(features, axis=0)``.

    Notes
    -----
    A column whose entries are all zero has norm 0. Such a column is left as
    zeros instead of becoming NaN (0/0). The returned ``norms`` still contains
    the 0, so callers that divide other matrices by ``norms`` must handle it.
    """
    norms = np.linalg.norm(features, axis = 0)
    # Divide by 1 wherever the norm is 0: the column is all zeros anyway, and
    # this avoids the 0/0 = NaN that a plain division would produce.
    safe_norms = np.where(norms == 0, np.ones_like(norms), norms)
    X_normalized = features / safe_norms

    return (X_normalized, norms)

## Compute a single distance

In [5]:
# Read the train / test / validation splits, all parsed with the same column dtypes.
train, test, valid = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_data_small_train.csv',
                     'kc_house_data_small_test.csv',
                     'kc_house_data_validation.csv')
)

In [10]:
# Columns used as features; numpy_data puts a constant column in front of them.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

# One (feature matrix, price vector) pair per data split.
features_train, output_train = numpy_data(train, feature_list, 'price')
features_test, output_test = numpy_data(test, feature_list, 'price')
features_valid, output_valid = numpy_data(valid, feature_list, 'price')

# Normalize the training columns, then scale the other two splits by the
# *training* norms so all three matrices share the same feature scale.
features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_valid = features_valid / norms

In [11]:
# Show the normalized first test house (the query) and the 10th training house.
print(features_test[0])
print(features_train[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


# Quiz Question: What is the Euclidean distance between the query house and the 10th house of the training set?

In [26]:
def euclidean(a,b):
    """Return the Euclidean distance between ``a`` and ``b``.

    The result is the square root of the sum of all squared elementwise
    differences ``a - b``.
    """
    delta = a - b
    return np.sqrt(np.sum(delta ** 2))

In [27]:
# Distance between the first test house (the query) and the 10th training house (index 9).
euclidean(features_test[0], features_train[9])

0.059723593

In [28]:
# (index, distance) pairs from the query house (first test row) to the first 10 training houses.
ll = [(i, euclidean(features_test[0], train_row))
      for i, train_row in enumerate(features_train[:10])]

In [29]:
# Show the (index, distance) pairs computed in the previous cell.
print(ll)

[(0, 0.060274705), (1, 0.085468821), (2, 0.061499465), (3, 0.053402741), (4, 0.058444839), (5, 0.059879214), (6, 0.054631405), (7, 0.055431083), (8, 0.052383628), (9, 0.059723593)]


# Quiz Question: Among the first 10 training houses, which house is the closest to the query house?

In [30]:
# Fold the (index, distance) pairs down to the pair with the smallest distance.
# The comparison is strict, so on an exact tie the later pair is kept.
reduce(lambda a,b: a if a[1] < b[1] else b, ll)

(8, 0.052383628)

## Vectorization

In [31]:
# Broadcasting check: subtracting the query row from a 3-row slice in one step
# must match subtracting it from each row individually, so every line printed
# below should be all 0's.
results = features_train[0:3] - features_test[0]
for row in range(3):
    print(results[row] - (features_train[row] - features_test[0]))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [36]:
# Broadcasting subtracts the query row (first test house) from every training row at once.
diff = features_train - features_test[0]

In [37]:
# Sum of the elementwise differences between the last training row and the query row.
diff[-1].sum()

-0.093433984

In [38]:
# Row-wise sum of squared differences: one *squared* distance per training house.
distances = np.sum(diff**2, axis=1)

In [39]:
# Spot check: squared distance for training row 15 taken from the vectorized result.
distances[15]

0.0033070587

In [40]:
# Same quantity computed directly from row 15; it should equal distances[15].
np.sum(diff[15]**2)

0.0033070587

In [41]:
# Square root turns the squared distances into Euclidean distances.
dist = np.sqrt(distances)

In [42]:
def euclidean(train, query):
    """Euclidean distance from ``query`` to each row of ``train``.

    Parameters
    ----------
    train : numpy.ndarray
        Either a matrix with one observation per row, or a single feature
        vector.
    query : numpy.ndarray
        Feature vector that broadcasts against ``train``.

    Returns
    -------
    For a matrix ``train``: a 1-D array with one distance per row.
    For a 1-D ``train``: a single scalar distance, so calls of the form
    ``euclidean(vector_a, vector_b)`` written for the earlier vector-to-vector
    definition of this name keep working after this cell has run.
    """
    diff = train - query
    # A single vector has no axis 1, so sum over all elements in that case.
    # Matrices keep the original row-wise reduction.
    axis = 1 if np.ndim(diff) > 1 else None
    dist = np.sqrt(np.sum(diff**2, axis=axis))
    return dist

In [43]:
# Distance from the query (first test house) to training row 100, read from the
# `dist` array computed as np.sqrt(distances) above.
dist[100]

0.02370823

# Quiz Question: Take the query house to be the third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?

In [46]:
# Distances from every training house to the third test house (features_test[2]).
dists = euclidean(features_train, features_test[2])

In [47]:
# Pair every distance with the index of its training row: [(0, d0), (1, d1), ...].
ll = list(enumerate(dists))

In [48]:
# Reduce the (index, distance) pairs to the one with the smallest distance.
# The comparison is strict, so on an exact tie the later pair is kept.
reduce(lambda a,b: a if a[1] < b[1] else b, ll) 

(382, 0.0028604956)

# Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?

In [49]:
# Price of training house 382, the nearest neighbour found above; this is the 1-NN prediction.
output_train[382]

249000.0

## Perform K-nearest neighbors

In [52]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Find the ``k`` training rows closest to a single query vector.

    Parameters
    ----------
    k : int
        Number of neighbours to return.
    feature_train : numpy.ndarray
        Training feature matrix, one house per row.
    features_query : numpy.ndarray
        Feature vector of the query house.

    Returns
    -------
    list of (int, distance) tuples
        At most ``k`` pairs ``(row_index, distance)`` in ascending order of
        distance. Rows at equal distance keep their original index order.
    """
    # Distances must come from the matrix passed in as `feature_train`, not
    # from the notebook-level `features_train` variable.
    dists = euclidean(feature_train, features_query)
    # A stable sort keeps the lower row index first on ties, matching a stable
    # sort of (index, distance) pairs by distance.
    order = np.argsort(dists, kind='mergesort')[:k]

    return [(int(i), dists[i]) for i in order]

# Quiz Question: Take the query house to be the third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [53]:
# Four nearest training houses to the third test house, as (index, distance) pairs.
k_nearest_neighbors(4, features_train, features_test[2])

[(382, 0.0028604956),
 (1149, 0.0032258409),
 (4087, 0.0035021554),
 (3142, 0.0035931543)]

In [54]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean output of its k nearest rows.

    Parameters
    ----------
    k : int
        Number of nearest neighbours to average; must be at least 1.
    features_train : numpy.ndarray
        Training feature matrix, one house per row.
    output_train : array-like
        Output value of each training row, aligned with ``features_train``.
    features_query : numpy.ndarray
        Feature vector of the query house.

    Returns
    -------
    The mean of ``output_train`` over the ``k`` rows of ``features_train``
    closest to ``features_query``. If fewer than ``k`` rows exist, all of
    them are averaged.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. Slicing with ``k = 0`` would average an
        empty set (NaN) and a negative ``k`` would silently average every row
        except the farthest ones.
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got %r' % (k,))

    dists = euclidean(features_train, features_query)
    # A stable argsort selects the same rows, in the same order, as sorting
    # (index, distance, output) tuples by distance, without building tuples.
    nearest = np.argsort(dists, kind='mergesort')[:k]
    prediction = np.mean(np.asarray(output_train)[nearest])

    return prediction

# Quiz Question: Again taking the query house to be the third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.

In [55]:
# Average price of the 4 nearest training houses for the third test house.
predict_output_of_query(4, features_train, output_train, features_test[2])

413987.5

In [56]:
def predict_output(k, features_train, output_train, features_query):
    """Run ``predict_output_of_query`` on every row of ``features_query``.

    Returns a list holding one prediction per query row, in row order.
    """
    predictions = []
    for query_row in features_query:
        predictions.append(
            predict_output_of_query(k, features_train, output_train, query_row)
        )

    return predictions

# Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?

In [57]:
# k=10 predictions for the first 10 test houses; list position i corresponds to test house i.
predict_output(10, features_train, output_train, features_test[:10])

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.70000000001,
 484000.0,
 457235.0]

## Choosing the best value of k using a validation set



In [64]:
# Candidate neighbour counts k = 1..15 (the upper bound of xrange is exclusive).
ks = xrange(1,16)

In [65]:
# Show the candidate k values. The variable defined in the previous cell is
# `ks`; a bare `k` is not defined on a fresh kernel and raises NameError.
print(ks)

xrange(1, 16)


In [66]:
# (k, predictions) pairs: for each candidate k, predict every validation house
# from the training set. This makes one full pass over features_valid per k.
preds_on_valid = [(k , predict_output(k, features_train, output_train, features_valid)) for k in ks]

In [71]:
def rss(preds, actual):
    """Residual sum of squares between predictions and actual values.

    Parameters
    ----------
    preds : array-like
        Predicted values. May be a plain Python list such as the one returned
        by ``predict_output``.
    actual : array-like
        True values, same length as ``preds``.

    Returns
    -------
    The sum over all elements of ``(preds - actual) ** 2``.
    """
    # np.subtract also accepts two plain lists, where `preds - actual` would
    # raise TypeError.
    residuals = np.subtract(preds, actual)
    return np.sum(residuals**2)

In [72]:
# (k, validation RSS) pairs. Predictions are passed first and the true prices
# second, matching the rss(preds, actual) signature. The squared difference is
# symmetric, so the values are the same in either order.
diff_on_valid = [(k_value, rss(k_preds, output_valid)) for k_value, k_preds in preds_on_valid]

In [73]:
# Pick the (k, RSS) pair with the smallest validation RSS.
# The comparison is strict, so on an exact tie the later (larger) k is kept.
best = reduce(lambda a,b: a if a[1] < b[1] else b, diff_on_valid)

In [74]:
# Display the selected (k, validation RSS) pair.
best

(8, 67361678735491.5)

# Quiz Question: What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set.

In [75]:
# Predict the test set with the k chosen on the validation set (best[0]) rather
# than a hard-coded value, so this cell stays correct if the selection changes.
preds_on_test = predict_output(best[0], features_train, output_train, features_test)

In [76]:
# RSS of the test-set predictions against the true test prices, summed over every test house.
rss(preds_on_test, output_test)

133091689366516.81