# **House Price Prediction using KNN Regression - Hyperparameter Tuning**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

> **1. Data Pre-Processing:**

In [2]:
# Load the housing dataset from the Kaggle input directory into a DataFrame.
housing_csv = '/kaggle/input/housing-price-prediction/Housing.csv'
df = pd.read_csv(housing_csv)

In [3]:
# Preview the freshly loaded frame: `price` is the target, the remaining 12 columns are features.
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [6]:
# Count missing values in each column
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [7]:
# Count rows that exactly repeat an earlier row
duplicate_rows = df.duplicated()
duplicate_rows.sum()

0

In [8]:
# Summarise dtypes and non-null counts; the object-typed columns are the ones encoded further down.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [13]:
from sklearn.preprocessing import LabelEncoder
# One shared LabelEncoder instance. Each later cell calls fit_transform on it, which refits it,
# so after the last encoding cell it only holds the classes of the last column encoded.
encoder = LabelEncoder()

In [16]:
# Encode the 'mainroad' yes/no flag as integers, printing the distinct values before and after.
print(df['mainroad'].unique())
mainroad_codes = encoder.fit_transform(df['mainroad'])
df['mainroad'] = mainroad_codes
print(df['mainroad'].unique())

['yes' 'no']
[1 0]


In [17]:
# Encode the 'guestroom' yes/no flag as integers, printing the distinct values before and after.
print(df['guestroom'].unique())
guestroom_codes = encoder.fit_transform(df['guestroom'])
df['guestroom'] = guestroom_codes
print(df['guestroom'].unique())

['no' 'yes']
[0 1]


In [18]:
# Encode the 'basement' yes/no flag as integers, printing the distinct values before and after.
print(df['basement'].unique())
basement_codes = encoder.fit_transform(df['basement'])
df['basement'] = basement_codes
print(df['basement'].unique())

['no' 'yes']
[0 1]


In [19]:
# Encode the 'hotwaterheating' yes/no flag as integers, printing the distinct values before and after.
print(df['hotwaterheating'].unique())
hotwaterheating_codes = encoder.fit_transform(df['hotwaterheating'])
df['hotwaterheating'] = hotwaterheating_codes
print(df['hotwaterheating'].unique())

['no' 'yes']
[0 1]


In [20]:
# Encode the 'airconditioning' yes/no flag as integers, printing the distinct values before and after.
print(df['airconditioning'].unique())
airconditioning_codes = encoder.fit_transform(df['airconditioning'])
df['airconditioning'] = airconditioning_codes
print(df['airconditioning'].unique())

['yes' 'no']
[1 0]


In [22]:
# Encode the 'prefarea' yes/no flag as integers, printing the distinct values before and after.
print(df['prefarea'].unique())
prefarea_codes = encoder.fit_transform(df['prefarea'])
df['prefarea'] = prefarea_codes
print(df['prefarea'].unique())

['yes' 'no']
[1 0]


In [21]:
# Encode the three-level 'furnishingstatus' column as integer codes, printing the distinct
# values before and after.
# NOTE(review): integer codes give KNN an ordering and equal spacing between the three
# categories - confirm that is intended, or one-hot encode this column instead.
print(df['furnishingstatus'].unique())
furnishingstatus_codes = encoder.fit_transform(df['furnishingstatus'])
df['furnishingstatus'] = furnishingstatus_codes
print(df['furnishingstatus'].unique())

['furnished' 'semi-furnished' 'unfurnished']
[0 1 2]


In [24]:
# Preview the frame after encoding: every column should now be numeric.
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,2
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,2
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0


In [25]:
from sklearn.model_selection import train_test_split

# Target is the sale price; every other column is used as a predictor.
Y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

# Show the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((436, 12), (109, 12), (436,), (109,))

> **2. Create and Train KNN Regressor:**

In [26]:
from sklearn.neighbors import KNeighborsRegressor

In [27]:
# Baseline model: average the prices of the 5 nearest training rows.
# NOTE(review): the features are fed in unscaled. `area` is in the thousands while the other
# columns are small integers or 0/1 flags, so neighbour distances are driven almost entirely
# by `area`. Consider standardising the features (fit the scaler on x_train only) before KNN.
knn_regr = KNeighborsRegressor(n_neighbors=5)
knn_regr.fit(X=x_train, y=y_train)

> **3. Predict Test Set Results:**

In [28]:
# Baseline predictions for the held-out test rows (one predicted price per row of x_test).
y_pred = knn_regr.predict(x_test)

In [29]:
# Side-by-side table of true and predicted prices, keeping the test rows' original index labels.
y_test.to_frame(name='Actual').assign(Predicted=y_pred)

Unnamed: 0,Actual,Predicted
316,4060000,3885000.0
77,6650000,5754000.0
360,3710000,3722600.0
90,6440000,4982600.0
493,2800000,4060000.0
...,...,...
15,9100000,6491800.0
357,3773000,5275200.0
39,7910000,7168000.0
54,7350000,7212800.0


> **4. Evaluate Model Performance:**

In [30]:
from sklearn.metrics import mean_squared_error, r2_score

In [31]:
# Baseline test-set metrics.
# R^2: coefficient of determination (1.0 is a perfect fit).
r2 = r2_score(y_test, y_pred)

# MSE: mean squared error, in squared price units.
mse = mean_squared_error(y_test, y_pred)

# Display both as a (mse, r2) pair.
mse, r2

(3232292808532.11, 0.36052074637601783)

> **5. Hyperparameter Tuning:**

In [33]:
from sklearn.model_selection import GridSearchCV

In [32]:
# Hyperparameter values to search over.
# 'minkowski' is intentionally not listed: with KNeighborsRegressor's default p=2 it is the
# same distance as 'euclidean', so including it only re-evaluated identical candidates.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # k values (odd numbers of neighbours)
    'metric': ['euclidean', 'manhattan']  # distinct distance metrics
}

In [34]:
# Create an unfitted KNN regressor with default settings; the grid search below
# supplies the n_neighbors and metric values to try.
knn = KNeighborsRegressor()

In [35]:
# Exhaustively evaluate every combination in param_grid with 5-fold cross-validation.
# Only the training split is used, so the test set stays untouched during tuning.
# No `scoring` is passed, so candidates are ranked by the estimator's own score (R^2 here).
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

In [37]:
# Pull the winning combination out of the fitted search and display it.
best_params = grid_search.best_params_
best_k, best_metric = best_params['n_neighbors'], best_params['metric']
best_k, best_metric

(9, 'manhattan')

In [39]:
# Take the tuned model straight from the grid search instead of training it again.
# GridSearchCV defaults to refit=True, so `best_estimator_` is already a KNeighborsRegressor
# built with the best hyperparameters and fitted on the full x_train / y_train.
# Constructing and fitting a second copy would repeat that work and give the same predictions.
best_knn = grid_search.best_estimator_
best_knn

In [40]:
# Predictions of the tuned model on the same held-out test rows used for the baseline.
new_y_pred = best_knn.predict(x_test)

In [41]:
# Evaluate the tuned model on the held-out test set.
# Note: this rebinds `mse` and `r2`, so the baseline values computed earlier are overwritten.
# NOTE(review): in the recorded run the tuned R^2 (~0.355) does not beat the baseline (~0.361).
# The search optimises cross-validated score on the training split, which does not guarantee
# a better test score - worth stating in the write-up rather than implying an improvement.

# R^2: coefficient of determination
r2 = r2_score(y_test, new_y_pred)

# MSE: mean squared error
mse = mean_squared_error(y_test, new_y_pred)

mse, r2

(3260108807617.3516, 0.35501760808146665)