## kNN - Part 1

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

In [4]:
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [10]:
df = pd.DataFrame(housing.data, columns = housing.feature_names)
df.shape

(20640, 8)

In [12]:
df.head(3)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24


In [14]:
X = df[['AveRooms','AveBedrms']]
X.head(3)

Unnamed: 0,AveRooms,AveBedrms
0,6.984127,1.02381
1,6.238137,0.97188
2,8.288136,1.073446


In [16]:
y = housing.target
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state = 110 ) 

#### Manhatan distance

In [175]:
# Assume that we are in 2D and have 2 points : p and q
# p: (p_1,p_2) , q : (q_1,q_2)
# compute diff : (p_1 - q_1),(p_2 - q_2)
# Absolute  diffs
# Sum of Absolute
# Take the square root

def manhatan_distance(p,q):
    return np.sum(np.abs(p-q))
    

#### Euclidean distance

In [178]:
# we are in 2D and have 2 points : p and q
# p: (p_1,p_2) , q : (q_1,q_2)
# compute diff : (p_1 - q_1),(p_2 - q_2)
# Square diffs
# Sum of Squares
# Take the square root

def euclidean_distance(p,q):
    return np.sqrt(sum((p-q)**2))
    

#### Minkowski distance

In [213]:
def minkowski_distance(p,q,n=3):
    return np.power(np.sum((p-q)**n),1/n)
    

In [211]:
p = np.array([3,4])
q = np.array([0,0])
minkowski_distance(p,q)

4.497941445275415

In [67]:
subset_X_train = X_train.iloc[0:10]
subset_X_train

Unnamed: 0,AveRooms,AveBedrms
1200,5.0,1.004132
3026,5.495103,1.048966
13675,3.745014,1.078348
1884,6.794224,1.494585
19470,4.896552,1.068966
19712,6.53317,0.972973
1719,5.533981,1.056311
15197,7.425926,0.985185
17589,4.116711,1.075597
7322,3.22031,0.956971


In [232]:
def knn_reg(X_train,y_train,new_sample,k=3):
    distances = []
    for x in X_train.to_numpy():
        distance = euclidean_distance(new_sample,x)
        distances.append(distance)
    sorted_distances = np.argsort(distances)
    k_nearest_targets = y_train[sorted_distances[:k]]    
    prediction = np.mean(k_nearest_targets)
    
    return prediction

In [182]:
prediction

1.0416666666666667

In [234]:
new_sample =  np.array([4,5])
knn_reg(X_train,y_train,new_sample,k=3)

1.0416666666666667

In [217]:
test_sample = X_test.iloc[30]

In [236]:
print ("prediction is :" , knn_reg(X_train,y_train,test_sample,k=5))

prediction is : 1.2744


In [221]:
print("Actual price is : ",y_test[30])

Actual price is :  1.657


In [223]:
y_hats = []
for x in X_test.to_numpy():
    y_hat = knn_reg(X_train,y_train,x,k=5)
    y_hats.append(y_hat)
    

  return np.power(np.sum((p-q)**n),1/n)


### MSE

In [225]:
# 1/n * sum (y-y_hat)**2

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test,y_hats)
mse

1.212343575902785

In [238]:
for k in range(5,15,2):
    y_hats = []
    for x in X_test.to_numpy():
        y_hat = knn_reg(X_train,y_train,x,k=5)
        y_hats.append(y_hat)
    print("Error for k = ", k , " is ", mean_squared_error(y_test,y_hats))
    print()    

Error for k =  5  is  1.1877747641786822

Error for k =  7  is  1.1877747641786822

Error for k =  9  is  1.1877747641786822

Error for k =  11  is  1.1877747641786822



KeyboardInterrupt: 

## Weighted kNN

In [None]:
### weighted avg = weighted sum/ sum of weights


In [253]:
def weighted_knn_reg(X_train,y_train,new_sample,k=3):
    distances = []
    for x in X_train.to_numpy():
        distance = euclidean_distance(new_sample,x)
        distances.append(distance)
    sorted_distances = np.argsort(distances)

    #get the distances
    k_nearest_distances = np.array(distances)[sorted_distances[:k]]
    k_nearest_targets = y_train[sorted_distances[:k]] 

    weights = 1 / (k_nearest_distances + 1e-20)

    #weighted avg
    weighted_sum = sum(weights * k_nearest_targets)
    weights_sum = sum(weights)
    prediction = weighted_sum/weights_sum
    
    return prediction

In [243]:
sorted_distances[:3]

array([  999, 14391, 18389])

In [247]:
np.array(distances)[sorted_distances[:k]]

array([2.49615088, 2.65900592, 3.14042113, 3.20156212, 3.23526177,
       3.35701225, 3.38972862, 3.42915587, 3.45160546, 3.49020952,
       3.50008251, 3.51200169, 3.52299748])

In [255]:
y_hats = []
for x in X_test.to_numpy():
    y_hat = weighted_knn_reg(X_train,y_train,x,k=5)
    y_hats.append(y_hat)

In [256]:
mse = mean_squared_error(y_test,y_hats)
mse

1.224115740356256