## KNN Regression

### Necessary libraries 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

### Fetch the dataset from the original source

In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

### Split the data into features and target

In [3]:
X = pd.DataFrame(data, columns=["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"])
y = pd.DataFrame(target, columns=["MEDV"])

### Basic exploratory data analysis

In [4]:
print(X.head())
print(X.describe())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  
0     15.3  396.90   4.98  
1     17.8  396.90   9.14  
2     17.8  392.83   4.03  
3     18.7  394.63   2.94  
4     18.7  396.90   5.33  
             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    

### Feature standardization

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_features, columns=X.columns)

### Split the dataset into training, validation and testing sets

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Custom WeightedKNNRegressor class with distance weighting

In [7]:
class WeightedKNNRegressor(KNeighborsRegressor):
    def predict(self, X):
        distances, indices = self.kneighbors(X)
        weights = 1 / distances  # Inverse distance weighting
        weights = weights / np.sum(weights, axis=1)[:, np.newaxis]  # Normalize weights
        y_pred = np.sum(self._y[indices] * weights[:, :, np.newaxis], axis=1)  # Weighted average
        return y_pred


### Hyperparameter tuning

In [8]:
k_values = [3, 5, 7, 10]
best_k = None
best_mse = float('inf')

In [9]:
for k in k_values:
    knn_reg = KNeighborsRegressor(n_neighbors=k)
    knn_reg.fit(X_train, y_train)
    y_pred_val = knn_reg.predict(X_val)
    mse = mean_squared_error(y_val, y_pred_val)
    if mse < best_mse:
        best_mse = mse
        best_k = k

print("Best k value:", best_k)

Best k value: 3


### Model training with the best k value

In [10]:
final_knn_reg = WeightedKNNRegressor(n_neighbors=best_k)
final_knn_reg.fit(X_train, y_train)

### Model Evaluation

In [11]:
y_pred_test = final_knn_reg.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)


In [12]:
print("Mean Squared Error on Test Set:", mse_test)
print(f"R-squared Score: {r2}")


Mean Squared Error on Test Set: 24.319580666120505
R-squared Score: 0.6940138951218369


### Analysis

In [13]:
print("Impact of the choice of k on model performance: As k increases, the model becomes smoother but may lose detail.")

Impact of the choice of k on model performance: As k increases, the model becomes smoother but may lose detail.
