In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Create a feature Matrix
def get_numpy_data(data, features, output):
    '''
    output is the target column name
    '''
    features = ['Constant'] + features
    data['Constant'] = 1
    
    feature_matrix, output_array = np.array(data[features]), np.array(data[output])
    
    return (feature_matrix, output_array)    

In [3]:
temp_arr = np.array([[1, 3, 5],[2, 4, 6]])
norms = np.linalg.norm(temp_arr, axis=0)
normalized_temp = temp_arr/norms


In [4]:
normalized_temp

array([[0.4472136 , 0.6       , 0.6401844 ],
       [0.89442719, 0.8       , 0.76822128]])

In [5]:
#Normalize the features
def normalize_features(features):
    
    norms = np.linalg.norm(np.array(training[features]), axis=0)
    normalized_features = np.array(training[features])/norms
    
    return (normalized_features, norms)

### Using get_numpy_data (or equivalent), extract numpy arrays of the training, test, and validation sets.

In [6]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
              'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
              'price':float, 'bedrooms':float, 'zipcode':str, 
              'long':float, 'sqft_lot15':float, 'sqft_living':float, 
              'floors':float, 'condition':int, 'lat':float, 'date':str,
              'sqft_basement':int, 'yr_built':int, 'id':str, 
              'sqft_lot':int, 'view':int}

In [7]:
#Read the training, test and validation data sets
training = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv', dtype = dtype_dict)
validation = pd.read_csv('kc_house_data_small_validation.csv/kc_house_data_validation.csv', dtype=dtype_dict)

In [8]:
training.dtypes

id                object
date              object
price            float64
bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode           object
lat              float64
long             float64
sqft_living15    float64
sqft_lot15       float64
dtype: object

In [9]:
#Setting the features
features = training.columns.tolist()[3:]
output = 'price'


In [10]:
features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [11]:
#zipcode is of object type
#remove it
features.remove('zipcode')

In [12]:
training.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [13]:
features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [14]:
#Saving the norms obtained using the training set
#These will be used for normalizing validation and test sets
features_train = features
features_train, norms = normalize_features(features_train)
features_test = np.array(test[features]) / norms
features_valid = np.array(validation[features]) / norms

In [15]:
[(training_matrix, training_output), (test_matrix, test_output), (validation_matrix, validation_output)] = [get_numpy_data(data, features, output) for data in [training, test, validation]]

### 7. To start, let's just explore computing the “distance” between two given houses. We will take our query house to be the first house of the test set and look at the distance between this house and the 10th house of the training set.

To see the features associated with the query house, print the first row (index 0) of the test feature matrix. You should get an 18-dimensional vector whose components are between 0 and 1. Similarly, print the 10th row (index 9) of the training feature matrix.

In [16]:
#Query house
print (features_test[0,:])

[ 0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]


In [17]:
#10th house of the training set
print (features_train[9,:])

[ 0.01163464  0.00602491  0.0083488   0.00050756  0.01279425  0.
  0.          0.01938684  0.01390535  0.0096309   0.          0.01302544
  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


### Quiz Question: What is the Euclidean distance between the query house and the 10th house of the training set? 

In [18]:
def calculate_distance(h_one, h_two):
    return(np.sqrt(np.sum([(h_o-h_t)**2 
                for h_o, h_t in zip(h_one, h_two)])))

In [19]:
calculate_distance(features_test[0,:], features_train[9,:])

0.05972359371398078

### Quiz Question: Among the first 10 training houses, which house is the closest to the query house?

In [20]:
query_house = features_test[0,:]

In [21]:
import timeit

In [22]:
#Calculate the time required for the operation
import sys
start_time = timeit.time.time()
min_dist = sys.maxsize
index = ''
for i in range(10):
    if(calculate_distance(query_house, features_train[i,:]) < min_dist):
        min_dist =  calculate_distance(query_house, features_train[i,:])
        index = i
print("The closest house is the {}_th house at a distance {}." .format(index, min_dist))
end_time = timeit.time.time()
print("The execution time is {}" .format(end_time - start_time))

The closest house is the 8_th house at a distance 0.052383627840220305.
The execution time is 0.002563953399658203


In [23]:
#Calculate the time required for the operation using numpy array
start_time = timeit.time.time()
distance = np.min(np.array([calculate_distance(query_house, features_train[i,:])
             for i in range(10)]))
print(distance)
end_time = timeit.time.time()
print("The execution time is {}" .format(end_time - start_time))

0.052383627840220305
The execution time is 0.003358125686645508


In [24]:
# verify that vectorization works
results = features_train[0:3] - features_test[0]
print (results[0] - (features_train[0]-features_test[0]))

# should print all 0's if results[0] == (features_train[0]-features_test[0])


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [25]:
print (results[1] - (features_train[1]-features_test[0]))
# should print all 0's if results[1] == (features_train[1]-features_test[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [26]:

print (results[2] - (features_train[2]-features_test[0]))
# should print all 0's if results[2] == (features_train[2]-features_test[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


# Perform 1-nearest neighbor regression

In [27]:
diff = query_house - features_train

In [28]:
diff[-1].sum()

0.09343399874654643

### By default, ‘np.sum’ sums up everything in the matrix and returns a single number. To instead sum only over a row or column, we need to specifiy the ‘axis’ parameter described in the np.sum documentation. In particular, ‘axis=1’ computes the sum across each row.

In [29]:
np.sum(diff**2, axis=1)

array([0.00363304, 0.00730492, 0.00378218, ..., 0.0032681 , 0.00325555,
       0.00341846])

### Verify that the following two expressions lead the same results

In [30]:
np.sum(diff**2, axis=1)[15]

0.0033070590284564457

In [32]:
np.sum(diff[15]**2)

0.0033070590284564453

### 14. With this result in mind, write a single-line expression to compute the Euclidean distances from the query to all the instances. Assign the result to variable distances.

In [31]:
distances = np.sqrt(np.sum(diff**2, axis=1))

In [32]:
distances[100]

0.023708232416678195

### 15. Now you are ready to write a function that computes the distances from a query house to all training houses. The function should take two parameters: (i) the matrix of training features and (ii) the single feature vector associated with the query.



In [33]:
def compute_distances(features_instances, features_query):
    diff = features_query - features_instances
    
    #Sum of squares of the row(axis=1)
    distances = np.sqrt(np.sum(diff**2, axis=1))
    return (distances)

### 16. Quiz Question: Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?


In [34]:
compute_distances(features_train, features_test[2])

array([0.01954476, 0.06861035, 0.02165079, ..., 0.02433478, 0.02622734,
       0.02637942])

In [35]:
min_dist = sys.maxsize
index = ''
distances = compute_distances(features_train, features_test[2])
for dist, i in zip(distances, range(len(distances))):
        
    if(dist<=min_dist):
        min_dist =  dist
        index = i
print("The closest house is the {}_th house at a distance {}." .format(index, min_dist))

The closest house is the 382_th house at a distance 0.0028604955575117085.


### 17. Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?

In [36]:
'''
The value of the house =
value of the nearest house  
'''
training_output[index]

249000.0

# Perform k-nearest neighbor regression

In [37]:
def k_nearest_neighbors(k, feature_train, features_query):
    neighbors = []
    
    for k_i in range(k):
        min_dist = sys.maxsize
        distances = compute_distances(features_train, features_query)
        for dist, i in zip(distances, range(len(distances))):
            if(i in neighbors):
                continue
            if(dist<=min_dist):
                min_dist =  dist
                index = i
        neighbors.append(index)
    return neighbors

### 19. Quiz Question: Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [38]:
k_nearest_neighbors(4, features_train, features_test[2])

[382, 1149, 4087, 3142]

In [39]:
k_nearest_neighbors(4, features_train, features_test[30])

[1821, 4692, 3181, 3462]

In [40]:
for ind in [382, 1149, 4087, 3142]:
    print(training_output[ind])

249000.0
477000.0
499950.0
430000.0


### 20. Now that we know how to find the k-nearest neighbors, write a function that predicts the value of a given query house. For simplicity, take the average of the prices of the k nearest neighbors in the training set. The function should have the following parameters:

    the value of k;
    the feature matrix for the instances;
    the output values (prices) of the instances; and
    the feature of the query, whose price we’re predicting.

In [41]:
'''
Predict the value of the given query house as
the average of the prices of the k nearest neighbors
in the training set.
'''
def predict_output_of_query(k, features_train, output_train, features_query):
    k_nearest_indices = k_nearest_neighbors(k, features_train, features_query)
    
    #Find the average of the values present at the indices
    prediction = np.mean([output_train[k_th] for k_th in k_nearest_indices])
    
    return prediction

### Quiz Question: Again taking the query house to be third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.

In [42]:
np.round(predict_output_of_query(4, features_train, training_output, features_test[2]))

413988.0

In [43]:
for i in range(10):
    print(np.round(predict_output_of_query(4, features_train, training_output, features_test[i])))

923750.0
434400.0
413988.0
552750.0
869625.0
683238.0
332830.0
577500.0
436250.0
454975.0


### 22. Finally, write a function to predict the value of each and every house in a query set. (The query set can be any subset of the dataset, be it the test set or validation set.) The idea is to have a loop where we take each house in the query set as the query house and make a prediction for that specific house. The new function should take the following parameters:

    the value of k;
    the feature matrix for the training set;
    the output values (prices) of the training houses; and
    the feature matrix for the query set.



In [44]:
def predict_output(k, features_train, output_train, features_query):
    index = ''
    min_predicted_value = sys.maxsize
    for ind in range(len(features_query)):
        prediction = predict_output_of_query(k, features_train, output_train, features_query[ind])
        if(prediction <= min_predicted_value):
            min_predicted_value = prediction
            index = ind
        #print("Prediction:- %.2f\nUpdated Index:- %.2f" %(prediction, index))    
        
    return (index, min_predicted_value)

### 23. Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?

In [45]:
index, lowest_predicted_value = predict_output(10, features_train, 
                                               training_output, 
                                               features_test[:10])
print("Index:- %.2f\nPredicted Value:- %.2f" %(index, lowest_predicted_value))

Index:- 6.00
Predicted Value:- 350032.00


### Choosing the best value of k using a validation set
24. There remains a question of choosing the value of k to use in making predictions. Here, we use a validation set to choose this value. Write a loop that does the following:

For k in [1, 2, … 15]:

Make predictions for the VALIDATION data using the k-nearest neighbors from the TRAINING data.
Compute the RSS on VALIDATION data
Report which k produced the lowest RSS on validation data.

In [46]:
features_valid

array([[ 0.01551285,  0.01054359,  0.00966076, ..., -0.01346548,
         0.00900138,  0.00202001],
       [ 0.01939106,  0.01506227,  0.01353699, ..., -0.01344655,
         0.01440221,  0.0028414 ],
       [ 0.01551285,  0.01054359,  0.01389479, ..., -0.01346845,
         0.00938716,  0.00202813],
       ...,
       [ 0.00775643,  0.01506227,  0.00584416, ..., -0.01347054,
         0.00630097,  0.00041495],
       [ 0.00775643,  0.00903736,  0.00560563, ..., -0.01346042,
         0.0072654 ,  0.00047012],
       [ 0.01163464,  0.01204982,  0.00888551, ..., -0.01345964,
         0.00900138,  0.00049892]])

In [47]:
validation_output

array([385000., 285000., 687500., ..., 329000., 362500., 429000.])

### Note: The code below may take a bit long (upto 10minutes or more)

In [48]:
k_min = ''
lowest_RSS = sys.maxsize
RSS_all = []
for k_ in range(1, 16):
    for ind in range(len(features_valid)):
        prediction = predict_output_of_query(k_, features_train, training_output, features_valid[ind])
    print(prediction)
    #Compute the RSS on VALIDATION data 
    diff = validation_output - prediction
    current_RSS = np.sum(diff**2)
    RSS_all.append(current_RSS)
    if(current_RSS<=lowest_RSS):
        lowest_RSS = current_RSS
        k_min = k_
print(current_RSS)
print(k_min)

429900.0
464950.0
455300.0
469725.0
469180.0
485150.0
488700.0
472612.5
484544.44444444444
490990.0
480536.36363636365
481320.8333333333
484603.8461538461
487810.71428571426
490590.0
188606828225538.0
10


In [49]:
validation_output

array([385000., 285000., 687500., ..., 329000., 362500., 429000.])

In [50]:
current_RSS

188606828225538.0

In [51]:
RSS_all

[204121575521838.0,
 193871810438338.0,
 196342052853838.0,
 192748330317213.0,
 192873251808238.0,
 189566202254338.0,
 188930512025838.0,
 192100698406806.75,
 189678248739072.56,
 188539638177538.0,
 190446403279358.62,
 190292429652084.53,
 189667211032835.06,
 189086358942641.56,
 188606828225538.0]

In [52]:
for err in [355632427476622.0, 317939124951086.5, 313153111376088.5, 301621468995236.0, 294266734341982.37, 287781925015337.87, 287842561046849.31, 286179146468967.94, 281718696883431.62, 280358603702662.75, 278687700531166.91, 278744728841428.25, 275043861135800.91, 273895810640073.47, 272162684453609.75]:
    print(err)

355632427476622.0
317939124951086.5
313153111376088.5
301621468995236.0
294266734341982.4
287781925015337.9
287842561046849.3
286179146468967.94
281718696883431.6
280358603702662.75
278687700531166.9
278744728841428.25
275043861135800.9
273895810640073.47
272162684453609.75


### 25. Quiz Question: What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set.

In [None]:
actual = 
predicted = 