In [188]:
import numpy as np
import pandas as pd
# Globally silence pandas' SettingWithCopyWarning.
# NOTE(review): this appears to be needed because calculate_errors_for_knn adds a
# 'pred' column to DF80/DF90, which are filtered selections of DF. The setting
# hides the warning for the whole notebook, not only for those assignments.
pd.options.mode.chained_assignment = None

In [189]:
# Load the raw dataset from an Excel workbook (path is relative to the working directory).
# Later cells read the columns 'rnthsg', 'tothsg', 'pop', 'pctstu', 'avginc', 'year' and 'rent'.
DF = pd.read_excel('data/data8.xls')

In [190]:
# Rented housing units as a percentage of all housing units
DF['pctrnths'] = DF['rnthsg'].div(DF['tothsg']).mul(100)

# Standardise each feature (z-score: subtract the column mean, divide by the
# column standard deviation) so that no single feature dominates the distance.
# Statistics are computed over the whole of DF, i.e. across all years together.
for raw_name, scaled_name in (
    ('pop', 'pop_scaled'),
    ('pctstu', 'pctstu_scaled'),
    ('pctrnths', 'pctrnths_scaled'),
    ('avginc', 'avginc_scaled'),
):
    centred = DF[raw_name] - DF[raw_name].mean()
    DF[scaled_name] = centred / DF[raw_name].std()

In [191]:
# Split the data into the rows with year == 80 and the rows with year == 90.
# `.copy()` makes each subset an independent DataFrame, so columns can be added
# to it later without pandas treating the assignment as a write to a slice of DF
# (the source of SettingWithCopyWarning).
DF80 = DF[DF['year'] == 80].copy()
DF90 = DF[DF['year'] == 90].copy()

In [192]:
# Predict rent from the scaled 'pop', 'pctstu', 'pctrnths' and 'avginc' features
# using k-nearest neighbours with Euclidean distance
def get_knn_rent_prediction(df, target, k):
    """Predict rent for ``target`` as the mean rent of its ``k`` nearest rows in ``df``.

    Distance is the Euclidean distance over the four ``*_scaled`` feature columns.
    The target's own row is excluded from the neighbours (leave-one-out), so a
    row never contributes its own rent to its prediction.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate neighbours. Must contain 'pop_scaled', 'pctstu_scaled',
        'pctrnths_scaled', 'avginc_scaled' and 'rent'.
    target : pandas.Series or mapping
        The observation to predict for; must provide the four scaled features.
        If it has a ``name`` (as rows produced by ``DataFrame.apply(axis=1)``
        do), the row(s) of ``df`` carrying that index label are left out.
        This assumes the label identifies the same observation in ``df``.
    k : int
        Number of neighbours to average.

    Returns
    -------
    float
        Mean 'rent' of the ``k`` closest remaining rows (NaN if there are none).
    """
    feature_cols = ('pop_scaled', 'pctstu_scaled', 'pctrnths_scaled', 'avginc_scaled')

    # Exclude the target by index label rather than by "skip the first sorted row":
    # the positional approach leaks the target's own rent when another row has
    # identical features (distance 0 tie), and wrongly discards the true nearest
    # neighbour when the target is not part of df at all.
    own_label = getattr(target, 'name', None)
    if own_label is None:
        candidates = df
    else:
        candidates = df.drop(index=own_label, errors='ignore')

    # Feature values are subtracted one scalar at a time so that a mixed-dtype
    # (object) target row still yields float distances.
    squared_dist = sum((candidates[col] - target[col]) ** 2 for col in feature_cols)

    # assign() works on a copy, so neither df nor the caller's frame is modified.
    # A stable sort keeps tied distances in their original row order.
    ranked = candidates.assign(dist=np.sqrt(squared_dist)).sort_values(
        by='dist', kind='mergesort'
    )
    return ranked.head(k)['rent'].mean()

In [193]:
def mean_square_error(prediction, actual):
    """Return the mean of the squared differences between ``prediction`` and ``actual``.

    Both arguments must support element-wise subtraction with each other
    (e.g. two pandas Series, two NumPy arrays, or two scalars).
    """
    residual = prediction - actual
    return np.mean(residual ** 2)

In [194]:
def mean_absolute_error(prediction, actual):
    """Return the mean of the absolute differences between ``prediction`` and ``actual``.

    Both arguments must support element-wise subtraction with each other
    (e.g. two pandas Series, two NumPy arrays, or two scalars).
    """
    residual = prediction - actual
    return np.mean(np.absolute(residual))

In [195]:
def calculate_errors_for_knn(k):
    """Run the k-NN rent prediction within each year's data and print its errors.

    For each of the module-level frames DF80 and DF90, every row's rent is
    predicted from the other rows of the same frame via get_knn_rent_prediction,
    and the result is stored in a 'pred' column on that frame (overwriting any
    previous 'pred'). The MSE and MAE of 'pred' against 'rent' are then printed.

    Parameters
    ----------
    k : int
        Number of neighbours passed to get_knn_rent_prediction.

    Returns
    -------
    None
    """
    # Fill in the predictions first (80s, then 90s) so all printing happens afterwards.
    for frame in (DF80, DF90):
        frame['pred'] = frame.apply(
            lambda row: get_knn_rent_prediction(frame, row, k), axis=1
        )

    print("For k =", k)
    report = (
        ("MSE for 80s: ", mean_square_error, DF80),
        ("MSE for 90s: ", mean_square_error, DF90),
        ("MAE for 80s: ", mean_absolute_error, DF80),
        ("MAE for 90s: ", mean_absolute_error, DF90),
    )
    for label, metric, frame in report:
        print(label, metric(frame['pred'], frame['rent']))

In [196]:
# Try every neighbourhood size k = 1 .. 9 (the upper bound of range is exclusive)
# and print the resulting errors for each.
candidate_ks = range(1, 10)
for k in candidate_ks:
    calculate_errors_for_knn(k)

For k = 1
MSE for 80s:  1123.328125
MSE for 90s:  7784.65625
MAE for 80s:  22.515625
MAE for 90s:  56.5
For k = 2
MSE for 80s:  1547.8828125
MSE for 90s:  7225.515625
MAE for 80s:  24.15625
MAE for 90s:  55.453125
For k = 3
MSE for 80s:  1518.6302083333335
MSE for 90s:  6765.368055555555
MAE for 80s:  23.796875
MAE for 90s:  52.82291666666667
For k = 4
MSE for 80s:  1635.421875
MSE for 90s:  6516.6572265625
MAE for 80s:  23.90625
MAE for 90s:  50.57421875
For k = 5
MSE for 80s:  1685.7475000000002
MSE for 90s:  7162.996875000001
MAE for 80s:  24.125000000000004
MAE for 90s:  53.33437500000001
For k = 6
MSE for 80s:  1679.046875
MSE for 90s:  6911.506076388889
MAE for 80s:  23.786458333333336
MAE for 90s:  52.505208333333336
For k = 7
MSE for 80s:  1716.4639668367347
MSE for 90s:  6970.433035714284
MAE for 80s:  24.569196428571427
MAE for 90s:  53.1875
For k = 8
MSE for 80s:  1587.22900390625
MSE for 90s:  7245.0634765625
MAE for 80s:  23.9453125
MAE for 90s:  53.01171875
For k = 9
MSE 