In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import random

k = 3

data = pd.read_csv('./kc_house_data.csv')
cols_for_knn = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 
                'floors', 'condition', 'grade', 'sqft_above', 'sqft_basement', 
                'yr_built', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price']

data = data.loc[0:, cols_for_knn]
data.head()
data.shape

(21612, 15)

In [2]:
# normalize all columns except label
def normalize_column(column):
    min = column.min()
    max = column.max()
    diff = max - min
    
    def normalize_value(val):
        val = (val - min) / diff
        return val
    
    column = column.apply(normalize_value)
    return column

for column in data.columns:
    if column == 'price':
        continue
    data[column] = normalize_column(data[column])

# separate test data and training data
test_data = pd.DataFrame(columns = list(data.columns))
rows_count = data.shape[0]

for i in range(0, 100):
    idx = random.randrange(rows_count)
    test_row = data.iloc[idx:idx + 1]
    test_data = pd.concat([test_data, test_row])
    data.drop(idx, inplace = True)    
    
train_data = data
del data

In [3]:
last_col_idx = test_data.shape[1] - 1

test_data_features = test_data.iloc[0:, 0:last_col_idx]
test_data_labels = test_data.iloc[0:, last_col_idx:]

def predict_price(row, train_data):
    train_data_features = train_data.iloc[0:, 0:last_col_idx]
    train_data_labels = train_data.iloc[0:, last_col_idx]

    train_data_features = (train_data_features - row) ** 2
    distance = train_data_features.sum(axis = 1) ** 0.5
    distance = distance.rename('distance')

    train_data_labels = pd.concat([distance, train_data_labels], axis = 1)
    train_data_labels = train_data_labels.sort_values(by = ['distance'])
    train_data_labels = train_data_labels[0:k]

    predicted_price = round(train_data_labels['price'].mean(), 2)
    return predicted_price

predictions = []

for idx in range(len(test_data_features.index)):
    row = test_data_features.iloc[idx]
    predicted_price = predict_price(row, train_data)
    predictions.append(predicted_price)

test_data['prediction'] = predictions
test_data['error_perc'] = (test_data['prediction'] - test_data['price']) / test_data['price'] * 100

def calc_average_error(errors):
    errors = abs(errors)
    return errors.sum() / len(errors)

average_error = calc_average_error(test_data['error_perc'])
print('The average deviation of prediction from real price is:', round(average_error, 2), 'percent')
print('Describe error percent column:')
test_data['error_perc'].describe()

The average deviation of prediction from real price is: 10.94 percent
Describe error percent column:


count    100.000000
mean       1.313000
std       14.843592
min      -32.659091
25%       -7.460816
50%       -0.449266
75%        8.148065
max       49.540231
Name: error_perc, dtype: float64