In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

In [2]:
DF = pd.read_excel('data8.xls')

In [3]:
# Percent of rented properties
DF['pctrnths'] = DF['rnthsg'] / DF['tothsg'] * 100

# Normalise features
DF['pop_scaled'] = (DF['pop'] - DF['pop'].mean()) / DF['pop'].std()
DF['pctstu_scaled'] = (DF['pctstu'] - DF['pctstu'].mean()) / DF['pctstu'].std()
DF['pctrnths_scaled'] = (DF['pctrnths'] - DF['pctrnths'].mean()) / DF['pctrnths'].std()
DF['avginc_scaled'] = (DF['avginc'] - DF['avginc'].mean()) / DF['avginc'].std()
DF['rent_scaled'] = (DF['rent'] - DF['rent'].mean()) / DF['rent'].std()

In [4]:
# Divide data for 80s and 90s
DF80 = DF[DF['year'] == 80]
DF90 = DF[DF['year'] == 90]

X80 = DF80[['pop_scaled', 'pctstu_scaled', 'pctrnths_scaled', 'avginc_scaled']]
y80 = DF80['rent']



X90 = DF90[['pop_scaled', 'pctstu_scaled', 'pctrnths_scaled', 'avginc_scaled']]
y90 = DF90['rent']

In [5]:
# Predict rent using 'pop', 'pctstu', 'pctrnths', 'avginc' feautures using Euclidean distance
def get_knn_rent_prediction(df, target, k):
    result = df.copy()
    result['dist'] = np.sqrt(
        (result['pop_scaled'] - target['pop_scaled']) ** 2 +
        (result['pctstu_scaled'] - target['pctstu_scaled']) ** 2 +
        (result['pctrnths_scaled'] - target['pctrnths_scaled']) ** 2 +
        (result['avginc_scaled'] - target['avginc_scaled']) ** 2
    )
    return result.sort_values(by='dist').iloc[1:k + 1]['rent'].mean()

In [6]:
def mean_square_error(prediction, actual):
    return np.mean((prediction - actual) ** 2)

In [7]:
def mean_absolute_error(prediction, actual):
    return np.mean(np.abs(prediction - actual))

In [8]:
def calculate_errors_for_knn(k):
    DF80['pred'] = DF80.apply(lambda x: get_knn_rent_prediction(DF80, x, k), axis=1)
    DF90['pred'] = DF90.apply(lambda x: get_knn_rent_prediction(DF90, x, k), axis=1)
    print("For k =", k)
    MSE_80 = mean_square_error(DF80['pred'], DF80['rent'])
    MAE_80 = mean_square_error(DF90['pred'], DF90['rent'])
    MSE_90 = mean_absolute_error(DF80['pred'], DF80['rent'])
    MAE_90 = mean_absolute_error(DF90['pred'], DF90['rent'])
    print("MSE for 80s: ", MSE_80)
    print("MSE for 90s: ", MAE_80)
    print("MAE for 80s: ", MSE_90)
    print("MAE for 90s: ", MAE_90)

In [9]:
# Experiment with different k values
for k in range(1, 10):
    calculate_errors_for_knn(k)

For k = 1
MSE for 80s:  1123.328125
MSE for 90s:  7784.65625
MAE for 80s:  22.515625
MAE for 90s:  56.5
For k = 2
MSE for 80s:  1547.8828125
MSE for 90s:  7225.515625
MAE for 80s:  24.15625
MAE for 90s:  55.453125
For k = 3
MSE for 80s:  1518.6302083333335
MSE for 90s:  6765.368055555555
MAE for 80s:  23.796875
MAE for 90s:  52.82291666666667
For k = 4
MSE for 80s:  1635.421875
MSE for 90s:  6516.6572265625
MAE for 80s:  23.90625
MAE for 90s:  50.57421875
For k = 5
MSE for 80s:  1685.7475000000002
MSE for 90s:  7162.996875000001
MAE for 80s:  24.125000000000004
MAE for 90s:  53.33437500000001
For k = 6
MSE for 80s:  1679.046875
MSE for 90s:  6911.506076388889
MAE for 80s:  23.786458333333336
MAE for 90s:  52.505208333333336
For k = 7
MSE for 80s:  1716.4639668367347
MSE for 90s:  6970.433035714284
MAE for 80s:  24.569196428571427
MAE for 90s:  53.1875
For k = 8
MSE for 80s:  1587.22900390625
MSE for 90s:  7245.0634765625
MAE for 80s:  23.9453125
MAE for 90s:  53.01171875
For k = 9
MSE 

In [10]:
MSE_80 = 0
MSE_90 = 0
MAE_80 = 0
MAE_90 = 0
    
k_values = np.arange(1, 10)

# Perform LOOCV for each k
for k in k_values:
    MSE_80 = 0
    MSE_90 = 0
    MAE_80 = 0
    MAE_90 = 0

    # Perform LOOCV for 80
    for i in range(len(X80)):
        target = X80.iloc[i]
        y_train = np.delete(y80.values, i, axis=0)

        knn = DF80.apply(lambda x: get_knn_rent_prediction(DF80, x, k), axis=1)
        MSE_80 += mean_square_error(y80.iloc[i], knn.iloc[i])
        MAE_80 += mean_absolute_error(y80.iloc[i], knn.iloc[i])

    # Perform LOOCV for 90
    for i in range(len(X90)):
        target = X90.iloc[i]
        y_train = np.delete(y90.values, i, axis=0)

        knn = DF90.apply(lambda x: get_knn_rent_prediction(DF90, x, k), axis=1)
        MSE_90 += mean_square_error(y90.iloc[i], knn.iloc[i])
        MAE_90 += mean_absolute_error(y90.iloc[i], knn.iloc[i])

    # Calculate average MSE and MAE
    avg_mse_80s = MSE_80 / len(X80)
    avg_mse_90s = MSE_90 / len(X90)
    avg_mae_80s = MAE_80 / len(X80)
    avg_mae_90s = MAE_90 / len(X90)

    # Print results 
    print(f"For k = {k}")
    print(f"Average MSE for 80s: {avg_mse_80s}")
    print(f"Average MSE for 90s: {avg_mse_90s}")
    print(f"Average MAE for 80s: {avg_mae_80s}")
    print(f"Average MAE for 90s: {avg_mae_90s}")
    print()


For k = 1
Average MSE for 80s: 1123.328125
Average MSE for 90s: 7784.65625
Average MAE for 80s: 22.515625
Average MAE for 90s: 56.5

For k = 2
Average MSE for 80s: 1547.8828125
Average MSE for 90s: 7225.515625
Average MAE for 80s: 24.15625
Average MAE for 90s: 55.453125

For k = 3
Average MSE for 80s: 1518.6302083333323
Average MSE for 90s: 6765.368055555556
Average MAE for 80s: 23.796875000000007
Average MAE for 90s: 52.82291666666666

For k = 4
Average MSE for 80s: 1635.421875
Average MSE for 90s: 6516.6572265625
Average MAE for 80s: 23.90625
Average MAE for 90s: 50.57421875

For k = 5
Average MSE for 80s: 1685.7475
Average MSE for 90s: 7162.996875000002
Average MAE for 80s: 24.124999999999993
Average MAE for 90s: 53.334375

For k = 6
Average MSE for 80s: 1679.0468749999995
Average MSE for 90s: 6911.506076388886
Average MAE for 80s: 23.786458333333336
Average MAE for 90s: 52.50520833333333

For k = 7
Average MSE for 80s: 1716.4639668367354
Average MSE for 90s: 6970.433035714288
Avera

In [11]:
# Define the number of folds (k)
num_folds = 5
random_seed = 4

for k in range(1, 10):
    MSE_80 = 0
    MSE_90 = 0
    MAE_80 = 0
    MAE_90 = 0
    
    for fold in range(num_folds):
        # Set seed for reproducibility
        np.random.seed(random_seed + fold)

        # Shuffle indices for random splitting
        indices_80 = np.random.permutation(len(X80))
        indices_90 = np.random.permutation(len(X90))


    # Split data into folds manually for 80s
    fold_size_80 = len(X80) // num_folds
    for i in range(num_folds):
        start_idx = i * fold_size_80
        end_idx = (i + 1) * fold_size_80
        test_indices = indices_80[start_idx:end_idx]

        train_indices = np.concatenate([indices_80[:start_idx], indices_80[end_idx:]])

        X_train, X_test = X80.iloc[train_indices], X80.iloc[test_indices]
        y_train, y_test = y80.iloc[train_indices], y80.iloc[test_indices]

        knn = DF80.apply(lambda x: get_knn_rent_prediction(DF80, x, k), axis=1)
        knn = knn.iloc[test_indices]
        MSE_80 += mean_square_error(y_test, knn)
        MAE_80 += mean_absolute_error(y_test, knn)

    # Split data into folds manually for 90s
    fold_size_90 = len(X90) // num_folds
    for i in range(num_folds):
        start_idx = i * fold_size_90
        end_idx = (i + 1) * fold_size_90
        test_indices = indices_90[start_idx:end_idx]

        train_indices = np.concatenate([indices_90[:start_idx], indices_90[end_idx:]])

        X_train, X_test = X90.iloc[train_indices], X90.iloc[test_indices]
        y_train, y_test = y90.iloc[train_indices], y90.iloc[test_indices]

        knn = DF90.apply(lambda x: get_knn_rent_prediction(DF90, x, k), axis=1)
        knn = knn.iloc[test_indices]
        MSE_90 += mean_square_error(y_test, knn)
        MAE_90 += mean_absolute_error(y_test, knn)

    # Calculate average MSE and MAE
    avg_MSE_80s = MSE_80 / num_folds
    avg_MSE_90s = MSE_90 / num_folds
    avg_MAE_80s = MAE_80 / num_folds
    avg_MAE_90s = MAE_90 / num_folds

    #result
    print(f"For k = {k}")
    print(f"Average MSE for 80s: {avg_MSE_80s}")
    print(f"Average MSE for 90s: {avg_MSE_90s}")
    print(f"Average MAE for 80s: {avg_MAE_80s}")
    print(f"Average MAE for 90s: {avg_MAE_90s}")
    print()

For k = 1
Average MSE for 80s: 1151.3333333333333
Average MSE for 90s: 7950.333333333334
Average MAE for 80s: 22.466666666666665
Average MAE for 90s: 56.03333333333334

For k = 2
Average MSE for 80s: 1591.1208333333334
Average MSE for 90s: 7402.9000000000015
Average MAE for 80s: 23.958333333333332
Average MAE for 90s: 56.0

For k = 3
Average MSE for 80s: 1582.5444444444445
Average MSE for 90s: 6871.383333333333
Average MAE for 80s: 23.922222222222224
Average MAE for 90s: 53.216666666666676

For k = 4
Average MSE for 80s: 1710.2520833333335
Average MSE for 90s: 6623.479166666667
Average MAE for 80s: 24.116666666666664
Average MAE for 90s: 51.075

For k = 5
Average MSE for 80s: 1745.3593333333333
Average MSE for 90s: 7327.552666666667
Average MAE for 80s: 23.943333333333335
Average MAE for 90s: 54.07000000000001

For k = 6
Average MSE for 80s: 1745.026388888889
Average MSE for 90s: 7098.579166666667
Average MAE for 80s: 23.669444444444444
Average MAE for 90s: 53.108333333333334

For k = 