## KNN Regression

### Necessary libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

### Fetch data from original source

In [2]:
# Location of the whitespace-delimited text file that is parsed below.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a normal string literal is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on current Python versions.
# skiprows=22 drops the first 22 lines of the file (presumably a descriptive
# preamble -- verify against the source file); header=None keeps integer
# column labels.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

### Preprocessing the data

In [3]:
# The parsed frame is read as alternating row pairs: every even-indexed row
# contributes all of its columns, and the odd-indexed row that follows
# contributes its first two columns as extra features and its third column
# as the regression target.
raw_values = raw_df.values
even_rows = raw_values[::2, :]
odd_rows = raw_values[1::2, :]

data = np.concatenate([even_rows, odd_rows[:, :2]], axis=1)
target = odd_rows[:, 2]

### Train-Test-Validation sets

In [4]:
# Step 1: hold out 30% of all samples as the validation set.
X_remaining, X_val, y_remaining, y_val = train_test_split(
    data, target, test_size=0.3, random_state=42
)
# Step 2: split the remaining 70% into training (80%) and test (20%) parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_remaining, y_remaining, test_size=0.2, random_state=42
)

### Feature Standardization

In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to every split so that no information from the
# validation or test data influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = [
    scaler.transform(split) for split in (X_train, X_val, X_test)
]

### KNN regression with hyperparameter tuning

In [6]:
def knn_regression(X_train, y_train, X_val, y_val, k):
    """Fit a k-nearest-neighbours regressor and score it on validation data.

    Parameters
    ----------
    X_train, y_train : features and targets used to fit the model.
    X_val, y_val : features and targets used to evaluate the fitted model.
    k : number of neighbours passed to ``KNeighborsRegressor``.

    Returns
    -------
    The mean squared error between ``y_val`` and the model's predictions
    for ``X_val``.
    """
    regressor = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    return mean_squared_error(y_val, regressor.predict(X_val))

### Hyperparameter Tuning (k-value)

In [7]:
# Candidate neighbourhood sizes to compare on the validation split.
k_values = [3, 5, 7, 10]

# Validation MSE for every candidate, in the same order as k_values.
mse_scores = [
    knn_regression(X_train, y_train, X_val, y_val, k) for k in k_values
]

# Pick the candidate with the lowest validation MSE; on ties the earliest
# entry of k_values wins.
best_k, _ = min(zip(k_values, mse_scores), key=lambda pair: pair[1])
print(best_k)

3


## Function for Distance Weighting

In [8]:
def distance_weight(distances):
    """Turn neighbour distances into normalized inverse-distance weights.

    Parameters
    ----------
    distances : array-like of non-negative distances. For a 2-D input of
        shape (n_queries, n_neighbors) each row is treated as one query.

    Returns
    -------
    numpy.ndarray with the same shape as ``distances``. Weights are
    proportional to ``1 / distance`` and are normalized along the last axis,
    so every query's weights sum to 1 (for a 1-D input this equals
    normalizing by the total sum).

    Notes
    -----
    If a query has one or more neighbours at distance exactly 0, a plain
    ``1 / distance`` would produce ``inf`` and the normalization would turn
    the row into NaN. In that case the weight is shared equally among the
    zero-distance neighbours and all other neighbours get weight 0.
    """
    dist = np.asarray(distances, dtype=float)
    # Work on at least one axis so the per-query reductions below are valid
    # even for a scalar input; the original shape is restored on return.
    work = np.atleast_1d(dist)

    exact_match = work == 0
    # Division by zero is expected here and handled explicitly right after.
    with np.errstate(divide="ignore"):
        weights = 1.0 / work

    # For queries with an exact match, keep only the exact matches.
    row_has_match = exact_match.any(axis=-1, keepdims=True)
    weights = np.where(row_has_match, exact_match.astype(float), weights)

    # Normalize per query (last axis) rather than over the whole array.
    weights = weights / weights.sum(axis=-1, keepdims=True)
    return weights.reshape(dist.shape)

### KNN Regression with Distance Weighting

In [9]:
class WeightedKNNRegressor(KNeighborsRegressor):
    """KNeighborsRegressor whose predictions are distance-weighted averages.

    Only ``predict`` is overridden; construction and fitting are inherited
    from ``KNeighborsRegressor``.
    """

    def predict(self, X):
        """Predict targets as weighted averages over each query's neighbours.

        Looks up the nearest neighbours of every row in ``X`` with
        ``kneighbors``, converts their distances to weights with
        ``distance_weight`` and averages the neighbours' stored targets
        along the neighbour axis.

        NOTE(review): the targets are read from the private ``_y`` attribute
        of the fitted estimator, and the weights are assumed to have the
        same shape as ``self._y[indices]`` (i.e. single-output targets) --
        confirm before using with multi-output ``y``.
        """
        neighbor_distances, neighbor_indices = self.kneighbors(X)
        neighbor_targets = self._y[neighbor_indices]
        return np.average(
            neighbor_targets,
            axis=1,
            weights=distance_weight(neighbor_distances),
        )

# Fit the distance-weighted model on the training split. Note that best_k
# was chosen by knn_regression, which scores a plain KNeighborsRegressor.
model = WeightedKNNRegressor(n_neighbors=best_k).fit(X_train, y_train)

# Predictions for the held-out test split, scored in the evaluation cell.
y_pred = model.predict(X_test)

### Model Evaluation

In [10]:
# Score the test-set predictions: mean squared error and the coefficient of
# determination (R^2).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [11]:
# Report both test-set metrics, one per line.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared Score: {r2}",
    sep="\n",
)

Mean Squared Error: 33.51687725132402
R-squared Score: 0.5526600345348396
