In [75]:
import pandas as pd
import numpy as np

In [76]:
df_train = pd.read_csv('trainHome_data.csv')
df_test = pd.read_csv('testingHome_data.csv')

In [77]:
df_train.drop(['id', 'date'], axis = 1, inplace = True)
df_test.drop(['id', 'date'], axis = 1, inplace = True)

In [78]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          15999 non-null  int64  
 1   bedrooms       15999 non-null  int64  
 2   bathrooms      15999 non-null  float64
 3   sqft_living    15999 non-null  int64  
 4   sqft_lot       15999 non-null  int64  
 5   floors         15999 non-null  float64
 6   waterfront     15999 non-null  int64  
 7   view           15999 non-null  int64  
 8   condition      15999 non-null  int64  
 9   grade          15999 non-null  int64  
 10  sqft_above     15999 non-null  int64  
 11  sqft_basement  15999 non-null  int64  
 12  yr_built       15999 non-null  int64  
 13  yr_renovated   15999 non-null  int64  
 14  zipcode        15999 non-null  int64  
 15  lat            15999 non-null  float64
 16  long           15999 non-null  float64
 17  sqft_living15  15999 non-null  int64  
 18  sqft_l

In [79]:
df_train = df_train.astype({"bathrooms": int,"floors": int})
df_test = df_test.astype({"bathrooms": int,"floors": int})

In [80]:
df_train.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900,3,1,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000,3,2,2570,7242,2,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000,2,1,770,10000,1,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000,4,3,1960,5000,1,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000,3,2,1680,8080,1,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


Nominal:    
- bedroom
- bathrooms
- floors
- zip code

Binary:
- waterfront(asymmetric)
    
Ordinal
- view
- condition    
- grade
- yr_built
- yr_renovated
    
Numeric:
- sqft_living (discrete)
- sqft_lot (discrete)
- sqft_above (discrete)
- sqft_basement (discrete)
- lat (continous)
- long (continous)
- sqft_living15 (discrete)
- sqft_lot15 (discrete)

## Pre-processing

In [81]:
df_train_normalized = df_train.copy()
df_train_numeric = df_train.filter(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1)
df_train_normalized.drop(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1,inplace=True)
df_train_normalized.head()

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1,0,0,3,7,1955,0,98178
1,3,2,2,0,0,3,7,1951,1991,98125
2,2,1,1,0,0,3,6,1933,0,98028
3,4,3,1,0,0,5,7,1965,0,98136
4,3,2,1,0,0,3,8,1987,0,98074


In [82]:
df_test_normalized = df_test.copy()
df_test_numeric = df_test.filter(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1)
df_test_normalized.drop(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1,inplace=True)
df_test_normalized.head()

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2,0,0,3,7,2001,0,98059
1,3,1,1,0,0,3,7,1979,0,98074
2,3,1,1,0,0,3,7,1979,0,98074
3,4,2,1,0,0,3,8,1978,0,98075
4,4,3,1,0,0,5,7,1969,0,98034


In [83]:
df_train_numeric.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15
0,221900,1180,5650,1180,0,47.5112,-122.257,1340,5650
1,538000,2570,7242,2170,400,47.721,-122.319,1690,7639
2,180000,770,10000,770,0,47.7379,-122.233,2720,8062
3,604000,1960,5000,1050,910,47.5208,-122.393,1360,5000
4,510000,1680,8080,1680,0,47.6168,-122.045,1800,7503


In [84]:
df_test_numeric.head(5)

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15
0,465950,2340,6896,2340,0,47.4896,-122.14,2950,6775
1,325000,1780,11096,1210,570,47.617,-122.051,1780,10640
2,500000,1780,11096,1210,570,47.617,-122.051,1780,10640
3,450000,2450,20348,1410,1040,47.5887,-122.064,2450,50094
4,475000,2410,8284,1210,1200,47.7202,-122.22,2050,7940


### Ordinal Normalization

In [85]:
def rank_by_interval(x, data):
    z= (x-1)/((data.shape[0]+1)-1)
    return z

#### Train Data

In [86]:
df_train_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1,0,0,3,7,1955,0,98178
1,3,2,2,0,0,3,7,1951,1991,98125
2,2,1,1,0,0,3,6,1933,0,98028
3,4,3,1,0,0,5,7,1965,0,98136
4,3,2,1,0,0,3,8,1987,0,98074
...,...,...,...,...,...,...,...,...,...,...
15994,5,1,1,0,2,3,7,1955,0,98155
15995,3,1,1,0,1,5,7,1910,0,98136
15996,2,1,1,0,0,4,6,1942,0,98146
15997,3,2,1,0,2,5,8,1986,0,98022


In [87]:
#view to rank [1...M]
df_train_normalized.view = df_train_normalized["view"]+1

In [88]:
#year built to rank [1...M]
train_yr_built_sorted_unique = sorted(set(df_train.yr_built))
train_yr_built_ordinal_map = {val: i for i, val in enumerate(train_yr_built_sorted_unique, 1)}
train_yr_built_ordinals = [train_yr_built_ordinal_map[val] for val in df_train.yr_built]
train_yr_built_ordinals = pd.Series(train_yr_built_ordinals)

In [89]:
#year renovated to rank [1...M]
train_yr_renovated_sorted_unique = sorted(set(df_train.yr_renovated))
train_yr_renovated_ordinal_map = {val: i for i, val in enumerate(train_yr_renovated_sorted_unique, 1)}
train_yr_renovated_ordinals = [train_yr_renovated_ordinal_map[val] for val in df_train.yr_renovated]
train_yr_renovated_ordinals = pd.Series(train_yr_renovated_ordinals)

In [90]:
#map ordinals to [0,1]
df_train_normalized.view = rank_by_interval(df_train_normalized["view"], df_train["view"])
df_train_normalized.condition = rank_by_interval(df_train_normalized["condition"], df_train["condition"])
df_train_normalized.grade = rank_by_interval(df_train_normalized["grade"], df_train["grade"])
df_train_normalized.yr_built = rank_by_interval(train_yr_built_ordinals, df_train["yr_built"])
df_train_normalized.yr_renovated = rank_by_interval(train_yr_renovated_ordinals, df_train["yr_renovated"])

In [91]:
df_train_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1,0,0.000000,0.000125,0.000375,0.003438,0.000000,98178
1,3,2,2,0,0.000000,0.000125,0.000375,0.003188,0.002813,98125
2,2,1,1,0,0.000000,0.000125,0.000313,0.002063,0.000000,98028
3,4,3,1,0,0.000000,0.000250,0.000375,0.004063,0.000000,98136
4,3,2,1,0,0.000000,0.000125,0.000438,0.005438,0.000000,98074
...,...,...,...,...,...,...,...,...,...,...
15994,5,1,1,0,0.000125,0.000125,0.000375,0.003438,0.000000,98155
15995,3,1,1,0,0.000063,0.000250,0.000375,0.000625,0.000000,98136
15996,2,1,1,0,0.000000,0.000188,0.000313,0.002625,0.000000,98146
15997,3,2,1,0,0.000125,0.000250,0.000438,0.005375,0.000000,98022


#### Test Data

In [92]:
df_test_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2,0,0,3,7,2001,0,98059
1,3,1,1,0,0,3,7,1979,0,98074
2,3,1,1,0,0,3,7,1979,0,98074
3,4,2,1,0,0,3,8,1978,0,98075
4,4,3,1,0,0,5,7,1969,0,98034
...,...,...,...,...,...,...,...,...,...,...
5609,3,2,3,0,0,3,8,2009,0,98103
5610,4,2,2,0,0,3,8,2014,0,98146
5611,2,0,2,0,0,3,7,2009,0,98144
5612,3,2,2,0,0,3,8,2004,0,98027


In [93]:
#view to rank [1...M]
df_test_normalized.view = df_test_normalized["view"]+1

In [94]:
#year built to rank [1...M]
yr_built_sorted_unique = sorted(set(df_test.yr_built))
yr_built_ordinal_map = {val: i for i, val in enumerate(yr_built_sorted_unique, 1)}
yr_built_ordinals = [yr_built_ordinal_map[val] for val in df_test.yr_built]
yr_built_ordinals = pd.Series(yr_built_ordinals)

In [95]:
#year renovated to rank [1...M]
yr_renovated_sorted_unique = sorted(set(df_test.yr_renovated))
yr_renovated_ordinal_map = {val: i for i, val in enumerate(yr_renovated_sorted_unique, 1)}
yr_renovated_ordinals = [yr_renovated_ordinal_map[val] for val in df_test.yr_renovated]
yr_renovated_ordinals = pd.Series(yr_renovated_ordinals)

In [96]:
#map ordinals to [0,1]
df_test_normalized.view = rank_by_interval(df_test_normalized["view"], df_train["view"])
df_test_normalized.condition = rank_by_interval(df_test_normalized["condition"], df_train["condition"])
df_test_normalized.grade = rank_by_interval(df_test_normalized["grade"], df_train["grade"])
df_test_normalized.yr_built = rank_by_interval(yr_built_ordinals, df_train["yr_built"])
df_test_normalized.yr_renovated = rank_by_interval(yr_renovated_ordinals, df_train["yr_renovated"])

In [97]:
df_test_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,4,3,1,0,0.0,0.000250,0.000375,0.004313,0.0,98034
...,...,...,...,...,...,...,...,...,...,...
5609,3,2,3,0,0.0,0.000125,0.000438,0.006813,0.0,98103
5610,4,2,2,0,0.0,0.000125,0.000438,0.007125,0.0,98146
5611,2,0,2,0,0.0,0.000125,0.000375,0.006813,0.0,98144
5612,3,2,2,0,0.0,0.000125,0.000438,0.006500,0.0,98027


### Numeric Normalization

In [98]:
def Min_Max_Normalize(x, data, new_min, new_max):
    minimum = np.min(data)
    maximum = np.max(data)
    normalized = ((x - minimum) / (maximum - minimum)) * (new_max - new_min) + new_min
    return normalized

In [99]:
def Z_Score_Normalize(x, data):
    std = np.std(data)
    mean = np.mean(data)
    z_normalized = ((x-mean)/std)
    return z_normalized

In [100]:
def Decimal_Scalling(x, data):
    maximum = np.max(data)
    length = len(str(abs(maximum)))
    d_normalized = x/10**length
    return d_normalized

### Train Data

#### Min Max Normalization

In [101]:
df_train_numeric_min_max = df_train_numeric.copy()

In [102]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_min_max.iloc[:,i] = Min_Max_Normalize(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i],0,1)

In [103]:
df_train_min_max = df_train_numeric_min_max.join(df_train_normalized)

#### Z-score Normalization

In [104]:
df_train_numeric_zscore= df_train_numeric.copy()

In [105]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_zscore.iloc[:,i] = Z_Score_Normalize(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [106]:
df_train_zscore = df_train_numeric_zscore.join(df_train_normalized)

#### Decimal Scalling

In [107]:
df_train_numeric_decimal = df_train_numeric.copy()

In [108]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_decimal.iloc[:,i] = Decimal_Scalling(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [109]:
df_train_decimal = df_train_numeric_decimal.join(df_train_normalized)

### Test Data

#### Min Max Normalization

In [110]:
df_test_numeric_min_max = df_test_numeric.copy()

In [111]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_min_max.iloc[:,i] = Min_Max_Normalize(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i],0,1)

In [112]:
df_test_min_max = df_test_numeric_min_max.join(df_test_normalized)

#### Z-score Normalization

In [113]:
df_test_numeric_zscore= df_test_numeric.copy()

In [114]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_zscore.iloc[:,i] = Z_Score_Normalize(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [115]:
df_test_zscore = df_test_numeric_zscore.join(df_test_normalized)

#### Decimal Scalling

In [116]:
df_test_numeric_decimal = df_test_numeric.copy()

In [117]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_decimal.iloc[:,i] = Decimal_Scalling(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [118]:
df_test_decimal = df_test_numeric_decimal.join(df_test_normalized)

In [119]:
df_train_min_max.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.019266,0.061503,0.003108,0.089602,0.0,0.571498,0.217608,0.16477,0.005732,3,1,1,0,0.0,0.000125,0.000375,0.003438,0.0,98178
1,0.060721,0.167046,0.004072,0.199115,0.082988,0.908959,0.166113,0.226055,0.008017,3,2,2,0,0.0,0.000125,0.000375,0.003188,0.002813,98125
2,0.01377,0.030372,0.005743,0.044248,0.0,0.936143,0.237542,0.406409,0.008503,2,1,1,0,0.0,0.000125,0.000313,0.002063,0.0,98028
3,0.069377,0.120729,0.002714,0.075221,0.188797,0.586939,0.104651,0.168272,0.004985,4,3,1,0,0.0,0.00025,0.000375,0.004063,0.0,98136
4,0.057049,0.099468,0.004579,0.144912,0.0,0.741354,0.393688,0.245316,0.007861,3,2,1,0,0.0,0.000125,0.000438,0.005438,0.0,98074


In [120]:
df_train_zscore.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,-0.848292,-0.96371,-0.241246,-0.706075,-0.674535,-0.348228,-0.299702,-0.94319,-0.278186,3,1,1,0,0.0,0.000125,0.000375,0.003438,0.0,98178
1,0.013119,0.571009,-0.203638,0.522771,0.213019,1.152804,-0.746161,-0.422419,-0.206756,3,2,2,0,0.0,0.000125,0.000375,0.003188,0.002813,98125
2,-0.962474,-1.416396,-0.138486,-1.214991,-0.674535,1.273716,-0.126879,1.110134,-0.191565,2,1,1,0,0.0,0.000125,0.000313,0.002063,0.0,98028
3,0.192977,-0.102501,-0.256601,-0.867438,1.34465,-0.279544,-1.279031,-0.913432,-0.30153,4,3,1,0,0.0,0.00025,0.000375,0.004063,0.0,98136
4,-0.063184,-0.411652,-0.183842,-0.085445,-0.674535,0.407296,1.226899,-0.258748,-0.211641,3,2,1,0,0.0,0.000125,0.000438,0.005438,0.0,98074


In [121]:
df_train_decimal.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.02219,0.0118,0.000565,0.118,0.0,5e-06,-1.2e-05,0.134,0.00565,3,1,1,0,0.0,0.000125,0.000375,0.003438,0.0,98178
1,0.0538,0.0257,0.000724,0.217,0.04,5e-06,-1.2e-05,0.169,0.007639,3,2,2,0,0.0,0.000125,0.000375,0.003188,0.002813,98125
2,0.018,0.0077,0.001,0.077,0.0,5e-06,-1.2e-05,0.272,0.008062,2,1,1,0,0.0,0.000125,0.000313,0.002063,0.0,98028
3,0.0604,0.0196,0.0005,0.105,0.091,5e-06,-1.2e-05,0.136,0.005,4,3,1,0,0.0,0.00025,0.000375,0.004063,0.0,98136
4,0.051,0.0168,0.000808,0.168,0.0,5e-06,-1.2e-05,0.18,0.007503,3,2,1,0,0.0,0.000125,0.000438,0.005438,0.0,98074


In [122]:
df_test_min_max.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.051272,0.149582,0.003862,0.21792,0.0,0.536754,0.314784,0.446682,0.007024,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,0.032787,0.107062,0.006406,0.09292,0.118257,0.741676,0.388704,0.241814,0.011464,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,0.055738,0.107062,0.006406,0.09292,0.118257,0.741676,0.388704,0.241814,0.011464,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,0.04918,0.157935,0.012011,0.115044,0.215768,0.696156,0.377907,0.359132,0.056785,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,0.052459,0.154897,0.004703,0.09292,0.248963,0.907673,0.248339,0.289091,0.008363,4,3,1,0,0.0,0.00025,0.000375,0.004313,0.0,98034


In [123]:
df_test_zscore.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,-0.183226,0.317063,-0.211811,0.733785,-0.674535,-0.502767,0.542809,1.452355,-0.237785,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,-0.567332,-0.301241,-0.112595,-0.668837,0.590229,0.408727,1.183694,-0.288507,-0.098983,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,-0.090436,-0.301241,-0.112595,-0.668837,0.590229,0.408727,1.183694,-0.288507,-0.098983,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,-0.226692,0.438515,0.105966,-0.420585,1.633105,0.206252,1.090081,0.708397,1.317909,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,-0.158564,0.394351,-0.179023,-0.668837,1.988127,1.14708,-0.033267,0.113231,-0.195947,4,3,1,0,0.0,0.00025,0.000375,0.004313,0.0,98034


In [124]:
df_test_decimal.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.046595,0.0234,0.00069,0.234,0.0,5e-06,-1.2e-05,0.295,0.006775,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,0.0325,0.0178,0.00111,0.121,0.057,5e-06,-1.2e-05,0.178,0.01064,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,0.05,0.0178,0.00111,0.121,0.057,5e-06,-1.2e-05,0.178,0.01064,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,0.045,0.0245,0.002035,0.141,0.104,5e-06,-1.2e-05,0.245,0.050094,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,0.0475,0.0241,0.000828,0.121,0.12,5e-06,-1.2e-05,0.205,0.00794,4,3,1,0,0.0,0.00025,0.000375,0.004313,0.0,98034


## Testing

In [125]:
def Nominal_Diff(nominal_train,nominal_test):    
    p = nominal_train.shape[1]
    nominal_table=np.zeros([nominal_test.shape[0],nominal_train.shape[0]])
    for i in range(nominal_test.shape[0]):        
        for k in range(nominal_train.shape[0]):
            m=0
            for j in range(nominal_test.shape[1]):
                if nominal_train[k,j] == nominal_test[i,j]:
                    m+=1
            dist = (p - m)/p
            nominal_table[i,k] = dist
    return nominal_table

In [126]:
def Binary_Diff(binary_train,binary_test):
    q=diss=0;
    binary_table=np.zeros([binary_test.shape[0],binary_train.shape[0]])
    for i in range(binary_test.shape[0]):
        q=diss=0;
        for j in range(binary_train.shape[0]): 
            if binary_train[j] ==1 and binary_test[i]==1:
                q+=1    
            elif binary_train[j] ==0 and binary_test[i]==0:
                continue
            else:
                diss+=1                
        binary_table[i,j]=diss/(diss+q)
    return binary_table

In [127]:
def Manhattan_Distance(numeric_train,numeric_test):        
    manhattan_dist=np.zeros([numeric_test.shape[0],numeric_train.shape[0]])
    for j in range(numeric_test.shape[0]):
        distance=0
        for i in range(numeric_test.shape[1]):
            dist = (numeric_train.iloc[:,i])-(numeric_test.iloc[j,i])
            abs_dist = np.abs(dist[~np.isnan(dist)])        
            distance += abs_dist
        manhattan_dist[j,:]=distance
    return manhattan_dist

In [128]:
def Euclidean_Distance(numeric_train,numeric_test):        
    euclidean_dist=np.zeros([numeric_test.shape[0],numeric_train.shape[0]])
    for j in range(numeric_test.shape[0]):
        distance=0
        for i in range(numeric_test.shape[1]):
            dist = (numeric_train.iloc[:,i])-(numeric_test.iloc[j,i])
            squared = np.square(dist[~np.isnan(dist)])
            distance += squared
        euclidean_dist[j,:] = np.sqrt(distance)
    return euclidean_dist

In [129]:
# MSE calculation function
def MSE(y_true,y_pred):
    m = y_true.shape[0]
    return (1/(2*m)) * (np.sum(((y_true-y_pred)**2)))

In [130]:
def KNN(numeric_data, binary_data, nominal_data, price_train, price_test, k):
    predictions=[]
    for i in range(numeric_data.shape[0]):
        numeric = pd.DataFrame(numeric_data.iloc[i,:])
        binary = pd.DataFrame(binary_data.iloc[i,:])
        nominal = pd.DataFrame(nominal_data.iloc[i,:])
        
        result = pd.concat([numeric,binary,nominal,price_train],ignore_index = True,axis=1)
        index=(np.sum(result,axis=1)/3).argsort()[:k]
        pred = np.mean(result.iloc[index.tolist(),:].iloc[:,-1])   
        predictions.append(pred)
    mean_squared_error = MSE(price_test,predictions)
    
    return mean_squared_error, predictions

### Nominal and Binary fields

In [131]:
# Nominal data fields
nominal_data_train = df_train_min_max.copy()
nominal_data_train = df_train_min_max.filter(['bedrooms','bathrooms','floors','zipcode'], axis=1)
nominal_data_test = df_test_min_max.copy()
nominal_data_test = df_test_min_max.filter(['bedrooms','bathrooms','floors','zipcode'], axis=1)

In [132]:
# Binary data fields
binary_data_train = df_train_min_max.copy()
binary_data_train = df_train_min_max.filter(['waterfront'], axis=1)
binary_data_test = df_test_min_max.copy()
binary_data_test = df_test_min_max.filter(['waterfront'], axis=1)

In [133]:
nominal_data_comp = Nominal_Diff(nominal_data_train.to_numpy().astype(int),nominal_data_test.to_numpy().astype(int))
nominal_df = pd.DataFrame(nominal_data_comp)
nominal_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,1.00,0.50,1.00,0.75,0.75,0.75,0.50,1.00,1.00,0.50,...,1.00,0.50,0.75,0.50,0.75,1.00,1.00,1.00,0.75,1.00
1,0.25,0.75,0.50,0.75,0.25,0.75,0.75,0.25,0.25,0.75,...,0.25,0.75,0.50,0.75,0.50,0.50,0.25,0.50,0.50,0.50
2,0.25,0.75,0.50,0.75,0.25,0.75,0.75,0.25,0.25,0.75,...,0.25,0.75,0.50,0.75,0.50,0.50,0.25,0.50,0.50,0.50
3,0.75,0.75,0.75,0.50,0.50,0.50,0.75,0.75,0.75,0.75,...,0.75,0.75,1.00,0.25,0.50,0.75,0.75,0.75,0.50,0.75
4,0.75,1.00,0.75,0.25,0.75,0.50,1.00,0.75,0.75,1.00,...,0.75,1.00,1.00,0.50,0.50,0.75,0.75,0.75,0.75,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.75,0.50,1.00,1.00,0.50,1.00,0.50,0.75,0.75,0.50,...,0.75,0.50,0.75,0.50,1.00,1.00,0.75,1.00,0.50,1.00
5610,1.00,0.50,1.00,0.75,0.75,0.75,0.50,1.00,0.75,0.50,...,1.00,0.50,0.75,0.50,0.75,1.00,1.00,0.75,0.75,1.00
5611,1.00,0.75,0.75,1.00,1.00,1.00,0.75,1.00,1.00,0.75,...,1.00,0.75,0.75,1.00,1.00,1.00,1.00,0.75,1.00,0.75
5612,0.75,0.25,1.00,1.00,0.50,1.00,0.25,0.75,0.75,0.25,...,0.75,0.25,0.50,0.75,1.00,1.00,0.75,1.00,0.50,1.00


In [134]:
binary_data_comp = Binary_Diff(binary_data_train.to_numpy(),binary_data_test.to_numpy())
binary_df = pd.DataFrame(binary_data_comp)
binary_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Min Max Normalization

In [135]:
df_train_numeric_min_max = df_train_min_max.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_test_numeric_min_max = df_test_min_max.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_train_min_max_price = df_train_min_max.filter(['price'], axis=1)
df_test_min_max_price = df_test_min_max.filter(['price'], axis=1)

### Manhattan Distance

In [136]:
manhattan_distance_min_max = Manhattan_Distance(df_train_numeric_min_max,df_test_numeric_min_max)
manhattan_min_max_df = pd.DataFrame(manhattan_distance_min_max)
manhattan_min_max_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,0.635151,0.867900,0.817459,0.904640,0.610484,1.586748,0.686421,0.748056,0.746121,0.468726,...,0.899631,0.691917,0.577591,0.963158,0.321473,0.961641,0.834049,0.809525,0.846133,1.261893
1,0.595984,0.617428,0.760406,0.625410,0.192642,1.556428,0.990757,0.768221,0.463132,0.732899,...,0.917437,0.891509,0.489700,0.385516,0.598218,0.632434,0.521093,0.773253,0.948451,0.646342
2,0.595984,0.617428,0.760406,0.625410,0.192642,1.556428,0.990757,0.768221,0.463132,0.732899,...,0.917437,0.891509,0.489700,0.385516,0.598218,0.632434,0.521093,0.773253,0.948451,0.646342
3,0.878417,0.844907,0.899245,0.739429,0.535815,1.273995,1.068675,1.050653,0.679175,0.762477,...,1.193929,1.113115,0.592077,0.666651,0.522094,0.667015,0.570333,1.055686,1.113559,0.872662
4,0.842129,0.435907,0.582363,0.702891,0.713763,1.516742,1.143598,1.020163,0.643284,1.149766,...,1.171737,1.125522,0.642291,0.429622,0.736153,0.356073,0.533795,1.019272,1.429996,0.414444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.485971,0.335341,0.523864,0.639755,0.470430,1.969244,0.811415,0.618136,0.590122,1.022311,...,0.862015,0.873571,0.354226,0.314782,0.694854,0.577076,0.627375,0.514140,1.252411,0.439452
5610,0.390243,0.524893,0.924433,0.485313,0.560138,1.939821,0.537112,0.475017,0.368659,0.681840,...,0.732904,0.580005,0.480318,0.573732,0.470804,0.932087,0.467782,0.391485,0.867169,0.901042
5611,0.264755,0.681932,0.646953,0.528006,0.518523,2.143764,0.833834,0.445264,0.522957,0.998218,...,0.670027,0.849478,0.375041,0.499047,0.674747,0.779826,0.584010,0.240277,1.228443,0.605955
5612,0.294345,0.792771,0.864554,0.585339,0.247177,1.872418,0.751975,0.564879,0.562780,0.534300,...,0.702446,0.633068,0.498148,0.685376,0.574497,1.005143,0.677714,0.470354,0.764400,0.883634


### k=5

In [137]:
error_min_max_manhattan_k5,prediction = KNN(manhattan_min_max_df,binary_df,nominal_df,df_train_min_max_price,df_test_min_max_price.price,5)
print("MSE: ",error_min_max_manhattan_k5)

MSE:  0.0003395361475802469


### k=10

In [138]:
error_min_max_manhattan_k10,prediction = KNN(manhattan_min_max_df,binary_df,nominal_df,df_train_min_max_price,df_test_min_max_price.price,10)
print("MSE: ",error_min_max_manhattan_k10)

MSE:  0.0003396513235991349


### k=20

In [139]:
error_min_max_manhattan_k20,prediction = KNN(manhattan_min_max_df,binary_df,nominal_df,df_train_min_max_price,df_test_min_max_price.price,20)
print("MSE: ",error_min_max_manhattan_k20)

MSE:  0.0003509224538973662


### Euclidean Distance

In [140]:
euclidean_distance_min_max = Euclidean_Distance(df_train_numeric_min_max,df_test_numeric_min_max)
euclidean_min_max_df = pd.DataFrame(euclidean_distance_min_max)
euclidean_min_max_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,0.338170,0.465704,0.459888,0.425484,0.310611,0.620368,0.361154,0.345046,0.339402,0.243849,...,0.445051,0.400346,0.274977,0.430734,0.190819,0.493344,0.367397,0.429677,0.500038,0.591972
1,0.283351,0.306475,0.331680,0.339957,0.129612,0.704034,0.566092,0.421006,0.293628,0.437398,...,0.571619,0.564097,0.253154,0.253524,0.261340,0.329885,0.305480,0.376305,0.667524,0.337833
2,0.283351,0.306475,0.331680,0.339957,0.129612,0.704034,0.566092,0.421006,0.293628,0.437398,...,0.571619,0.564097,0.253154,0.253524,0.261340,0.329885,0.305480,0.376305,0.667524,0.337833
3,0.371755,0.367539,0.386975,0.359851,0.261842,0.565864,0.551213,0.455582,0.301622,0.424344,...,0.584489,0.566012,0.299338,0.305794,0.271525,0.333875,0.295114,0.459674,0.656804,0.408914
4,0.447370,0.222977,0.307498,0.378457,0.344186,0.653743,0.716137,0.577633,0.368261,0.646218,...,0.750914,0.744926,0.323066,0.195455,0.365095,0.156669,0.309507,0.487527,0.876747,0.186535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.316795,0.143963,0.258046,0.352710,0.287779,0.823474,0.639529,0.471691,0.344449,0.613847,...,0.670393,0.671542,0.211303,0.169945,0.313576,0.274584,0.315577,0.323630,0.835724,0.224690
5610,0.194862,0.351911,0.460484,0.251978,0.324998,0.779830,0.341835,0.239096,0.210979,0.375181,...,0.407928,0.381608,0.227466,0.299743,0.243690,0.466553,0.242662,0.214720,0.563900,0.472643
5611,0.150873,0.304144,0.382248,0.253828,0.268994,0.910163,0.514267,0.317710,0.249992,0.502908,...,0.506466,0.523745,0.248619,0.224986,0.351363,0.413158,0.277705,0.154204,0.688313,0.329016
5612,0.170735,0.389714,0.436705,0.336301,0.150878,0.806988,0.444940,0.298843,0.286224,0.321551,...,0.432694,0.419396,0.276394,0.338799,0.267935,0.478808,0.343529,0.282415,0.528412,0.461040


### k=5

In [141]:
error_min_max_euclidean_k5,prediction = KNN(euclidean_min_max_df,binary_df,nominal_df,df_train_min_max_price,df_test_min_max_price.price,5)
print("MSE: ",error_min_max_euclidean_k5)

MSE:  0.0004309244038327973


### k=10

In [142]:
error_min_max_euclidean_k10,prediction = KNN(euclidean_min_max_df,binary_df,nominal_df,df_train_min_max_price,df_test_min_max_price.price,10)
print("MSE: ",error_min_max_euclidean_k10)

MSE:  0.00043924469793113756


### k=20

In [143]:
error_min_max_euclidean_k20,prediction = KNN(euclidean_min_max_df,binary_df,nominal_df,df_train_min_max_price,df_test_min_max_price.price,20)
print("MSE: ",error_min_max_euclidean_k20)

MSE:  0.0004597767328814279


## Z-Score Normalization

In [144]:
df_train_numeric_zscore = df_train_zscore.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_test_numeric_zscore = df_test_zscore.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_train_zscore_price = df_train_zscore.filter(['price'], axis=1)
df_test_zscore_price = df_test_zscore.filter(['price'], axis=1)

#### Manhattan Distance

In [145]:
manhattan_distance_zscore = Manhattan_Distance(df_train_numeric_zscore,df_test_numeric_zscore)
manhattan_zscore_df = pd.DataFrame(manhattan_distance_zscore)
manhattan_zscore_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,6.185938,6.216969,6.594485,8.561731,4.908254,19.238505,5.162715,6.943929,7.225344,3.577303,...,7.529913,4.940743,4.853888,8.082803,2.284790,7.629877,7.632118,7.739271,6.549163,10.158065
1,5.168859,5.452292,6.621687,5.275236,2.217429,19.946506,7.070156,5.892052,3.539775,5.279456,...,6.391340,6.165883,4.903360,3.543892,6.076142,4.772701,4.509889,6.735070,6.756687,4.997030
2,5.168859,5.452292,6.621687,5.275236,2.217429,19.946506,7.070156,5.892052,3.539775,5.279456,...,6.391340,6.165883,4.903360,3.543892,6.076142,4.772701,4.509889,6.735070,6.756687,4.997030
3,9.536012,8.248378,9.399982,7.736066,6.617875,15.579353,9.578000,10.259205,7.196885,7.279292,...,10.526798,9.793753,7.319052,7.217478,6.572079,6.381471,6.420144,11.102223,8.830534,8.171962
4,7.021586,4.437129,6.284071,5.221391,6.445811,19.011246,8.120659,7.939402,4.694885,8.520975,...,8.525386,7.805760,5.953613,3.885510,6.503966,3.441905,3.905469,8.587672,10.684676,3.815347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,3.349150,3.788989,5.092700,5.177948,3.892073,24.178072,4.737583,4.077681,4.656351,7.106046,...,5.466353,5.258194,2.611379,3.025585,6.349593,5.613506,5.305398,3.590330,9.020087,4.401647
5610,4.202292,3.427294,7.625617,5.065885,4.630147,22.434052,3.731369,4.447538,4.110188,5.251585,...,5.955244,3.833687,4.117973,4.846192,3.739868,7.483955,4.603294,4.249149,6.602738,7.186928
5611,1.984878,6.420955,5.044348,5.000234,5.055620,25.903969,5.983733,2.946357,4.854911,7.948942,...,4.198414,6.101090,3.562657,4.706268,7.209786,6.716176,5.732876,1.716143,9.863109,5.198303
5612,2.846561,6.566216,6.959716,5.802551,1.888319,22.736668,5.271874,4.767501,5.508437,3.924647,...,5.234452,4.222678,3.960839,5.706073,5.525880,8.175658,6.566130,4.369510,5.838689,7.110766


### k=5

In [146]:
error_zscore_manhattan_k5,prediction = KNN(manhattan_zscore_df,binary_df,nominal_df,df_train_zscore_price,df_test_zscore_price.price,5)
print("MSE: ",error_zscore_manhattan_k5)

MSE:  0.3131428428943477


### k=10

In [147]:
error_zscore_manhattan_k10,prediction = KNN(manhattan_zscore_df,binary_df,nominal_df,df_train_zscore_price,df_test_zscore_price.price,10)
print("MSE: ",error_zscore_manhattan_k10)

MSE:  0.30613199624080967


### k=20

In [148]:
error_zscore_manhattan_k20,prediction = KNN(manhattan_zscore_df,binary_df,nominal_df,df_train_zscore_price,df_test_zscore_price.price,20)
print("MSE: ",error_zscore_manhattan_k20)

MSE:  0.2993762412007134


#### Euclidean Distance

In [149]:
euclidean_distance_zscore = Euclidean_Distance(df_train_numeric_zscore,df_test_numeric_zscore)
euclidean_zscore_df = pd.DataFrame(euclidean_distance_zscore)
euclidean_zscore_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,3.191931,2.968998,3.245247,3.973539,2.329743,7.288764,2.381237,3.192978,3.262175,1.619262,...,3.339195,2.530217,2.124751,3.403697,1.091138,3.321532,3.445827,3.941702,2.925747,4.356112
1,2.300131,2.576601,2.751861,2.763895,1.404524,8.182698,3.357827,2.850231,2.232400,2.521608,...,3.152670,3.056572,2.346606,2.185930,2.685678,2.396311,2.586387,3.145613,3.399397,2.477549
2,2.300131,2.576601,2.751861,2.763895,1.404524,8.182698,3.357827,2.850231,2.232400,2.521608,...,3.152670,3.056572,2.346606,2.185930,2.685678,2.396311,2.586387,3.145613,3.399397,2.477549
3,3.873978,3.300164,3.810134,3.436370,3.094887,6.388866,4.039209,4.045144,2.956823,3.324071,...,4.229604,3.962426,3.434263,3.074912,3.210882,2.657883,3.003616,4.534843,3.922490,3.344048
4,3.516970,2.323370,3.418646,2.314898,3.217716,7.602678,4.167972,3.887800,2.166790,4.071495,...,4.402518,4.235976,3.050335,1.691281,3.372676,1.505492,1.887656,3.897392,4.949247,1.695334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,1.635044,1.710345,2.359415,2.547758,2.311529,9.297953,3.007837,2.262340,2.250806,3.578953,...,3.153792,3.154756,1.487879,1.704458,2.715740,2.580675,2.469115,1.726213,4.476927,2.269524
5610,2.151827,1.817935,3.437256,2.687605,2.624367,8.401696,1.864911,2.244840,2.335142,2.810561,...,2.746651,2.031013,1.824454,2.326607,1.867954,3.238088,2.549581,2.316462,3.468826,3.332534
5611,0.877841,2.765764,2.817739,2.483153,2.446305,10.098703,2.975504,1.661917,2.257625,3.555000,...,2.435150,2.877381,2.184919,2.192987,3.540799,3.303308,2.752262,0.900411,4.142384,2.543907
5612,1.545535,2.774062,3.050315,3.190814,0.893806,9.044434,2.764618,2.234171,2.693777,1.976376,...,2.425257,2.169958,2.197968,2.750697,2.481986,3.461792,3.230081,2.496013,2.732509,3.300825


### k=5

In [150]:
error_zscore_euclidean_k5,prediction = KNN(euclidean_zscore_df,binary_df,nominal_df,df_train_zscore_price,df_test_zscore_price.price,5)
print("MSE: ",error_zscore_euclidean_k5)

MSE:  0.46957622605143307


### k=10

In [151]:
error_zscore_euclidean_k10,prediction = KNN(euclidean_zscore_df,binary_df,nominal_df,df_train_zscore_price,df_test_zscore_price.price,10)
print("MSE: ",error_zscore_euclidean_k10)

MSE:  0.4583901256483562


### k=20

In [152]:
error_zscore_euclidean_k20,prediction = KNN(euclidean_zscore_df,binary_df,nominal_df,df_train_zscore_price,df_test_zscore_price.price,20)
print("MSE: ",error_zscore_euclidean_k20)

MSE:  0.4552281318508417


## Decimal Scalling Normalization

In [153]:
df_train_numeric_decimal = df_train_decimal.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_test_numeric_decimal = df_test_decimal.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_train_decimal_price = df_train_decimal.filter(['price'], axis=1)
df_test_decimal_price = df_test_decimal.filter(['price'], axis=1)

#### Manhattan Distance

In [154]:
manhattan_distance_decimal = Manhattan_Distance(df_train_numeric_decimal,df_test_numeric_decimal)
manhattan_decimal_df = pd.DataFrame(manhattan_distance_decimal)
manhattan_decimal_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,0.292725,0.192137,0.201610,0.387140,0.189384,0.624708,0.140377,0.276393,0.328558,0.106454,...,0.279868,0.175068,0.153085,0.317798,0.073312,0.257711,0.321065,0.332534,0.167015,0.409423
1,0.117035,0.137849,0.210725,0.101050,0.111001,0.800399,0.159199,0.094268,0.036077,0.192124,...,0.096874,0.139703,0.156020,0.041708,0.278423,0.126870,0.075375,0.156844,0.160229,0.116962
2,0.117035,0.137849,0.210725,0.101050,0.111001,0.800399,0.159199,0.094268,0.036077,0.192124,...,0.096874,0.139703,0.156020,0.041708,0.278423,0.126870,0.075375,0.156844,0.160229,0.116962
3,0.298114,0.265528,0.257805,0.210529,0.248080,0.619319,0.208803,0.275347,0.185156,0.209128,...,0.276972,0.278708,0.177099,0.183187,0.271977,0.125879,0.148454,0.337923,0.258898,0.229242
4,0.209853,0.218068,0.250131,0.122018,0.201070,0.707830,0.199268,0.190914,0.097242,0.229993,...,0.194389,0.231572,0.195838,0.104802,0.306308,0.124232,0.059944,0.249537,0.243411,0.141819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.065531,0.143642,0.214915,0.167116,0.051564,0.862088,0.097966,0.075698,0.158863,0.132641,...,0.071173,0.063062,0.093396,0.123207,0.224692,0.255016,0.194191,0.104597,0.109795,0.190528
5610,0.178617,0.077995,0.264869,0.273032,0.074517,0.740442,0.107982,0.161651,0.213816,0.103395,...,0.165126,0.061015,0.148851,0.203690,0.103045,0.298969,0.237044,0.218425,0.054148,0.294681
5611,0.057048,0.250159,0.209233,0.143633,0.158207,0.968731,0.204484,0.078815,0.169381,0.239159,...,0.078690,0.169579,0.193438,0.175725,0.331335,0.261533,0.212708,0.021114,0.216437,0.149046
5612,0.061014,0.147725,0.234399,0.161199,0.055648,0.866172,0.102050,0.095182,0.176947,0.136725,...,0.090656,0.067146,0.112879,0.141336,0.228776,0.273099,0.212275,0.100081,0.113878,0.184612


### k=5

In [155]:
error_decimal_manhattan_k5,prediction = KNN(manhattan_decimal_df,binary_df,nominal_df,df_train_decimal_price,df_test_decimal_price.price,5)
print("MSE: ",error_decimal_manhattan_k5)

MSE:  0.0002775753600209813


### k=10

In [156]:
error_decimal_manhattan_k10,prediction = KNN(manhattan_decimal_df,binary_df,nominal_df,df_train_decimal_price,df_test_decimal_price.price,10)
print("MSE: ",error_decimal_manhattan_k10)

MSE:  0.00029016586170160785


### k=20

In [157]:
error_decimal_manhattan_k20,prediction = KNN(manhattan_decimal_df,binary_df,nominal_df,df_train_decimal_price,df_test_decimal_price.price,20)
print("MSE: ",error_decimal_manhattan_k20)

MSE:  0.00031274258221680945


#### Euclidean Distance

In [158]:
euclidean_distance_decimal = Euclidean_Distance(df_train_numeric_decimal,df_test_numeric_decimal)
euclidean_decimal_df = pd.DataFrame(euclidean_distance_decimal)
euclidean_decimal_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,0.198799,0.133374,0.159513,0.224111,0.132762,0.300480,0.094947,0.182927,0.188941,0.071985,...,0.184422,0.132930,0.101891,0.181638,0.048935,0.157387,0.181661,0.228996,0.116960,0.236634
1,0.072508,0.098329,0.118903,0.056676,0.073981,0.423777,0.088956,0.060801,0.022802,0.107736,...,0.061315,0.089196,0.087953,0.019342,0.165193,0.088005,0.040126,0.096531,0.099191,0.066098
2,0.072508,0.098329,0.118903,0.056676,0.073981,0.423777,0.088956,0.060801,0.022802,0.107736,...,0.061315,0.089196,0.087953,0.019342,0.165193,0.088005,0.040126,0.096531,0.099191,0.066098
3,0.160645,0.132153,0.133037,0.124123,0.132835,0.347712,0.118850,0.142362,0.092500,0.122474,...,0.144078,0.143775,0.114524,0.098120,0.161340,0.061907,0.073637,0.185802,0.137525,0.132952
4,0.140026,0.130092,0.145255,0.076727,0.131487,0.395204,0.131742,0.128104,0.056869,0.142168,...,0.129182,0.142410,0.128162,0.063343,0.187373,0.071449,0.034482,0.159542,0.146560,0.086039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.040336,0.078225,0.141637,0.104462,0.031537,0.441739,0.073400,0.049511,0.091201,0.093499,...,0.046866,0.040309,0.085122,0.076659,0.144184,0.143800,0.109377,0.065972,0.061205,0.112806
5610,0.123749,0.044971,0.178608,0.162458,0.063409,0.379811,0.072401,0.126972,0.145844,0.070130,...,0.125651,0.044185,0.098297,0.127412,0.073883,0.170060,0.151934,0.144260,0.034034,0.175307
5611,0.036158,0.140026,0.172021,0.097729,0.102548,0.507618,0.140492,0.063667,0.105919,0.162619,...,0.059968,0.110863,0.144333,0.099097,0.216384,0.178448,0.131661,0.012324,0.130614,0.096468
5612,0.043118,0.076069,0.155516,0.106601,0.040320,0.446948,0.083789,0.059994,0.098897,0.102436,...,0.056741,0.041461,0.097581,0.082317,0.147950,0.155441,0.117664,0.062149,0.063345,0.115027


### k=5

In [159]:
error_decimal_euclidean_k5,prediction = KNN(euclidean_decimal_df,binary_df,nominal_df,df_train_decimal_price,df_test_decimal_price.price,5)
print("MSE: ",error_decimal_euclidean_k5)

MSE:  0.000291548420527058


### k=10

In [160]:
error_decimal_euclidean_k10,prediction = KNN(euclidean_decimal_df,binary_df,nominal_df,df_train_decimal_price,df_test_decimal_price.price,10)
print("MSE: ",error_decimal_euclidean_k10)

MSE:  0.00031498096402542884


### k=5

In [161]:
error_decimal_euclidean_k20,prediction = KNN(euclidean_decimal_df,binary_df,nominal_df,df_train_decimal_price,df_test_decimal_price.price,20)
print("MSE: ",error_decimal_euclidean_k20)

MSE:  0.0003412376316839488


# Results

### Min-Max Normalization

In [162]:
print("Manhattan Distance and k=5: ", error_min_max_manhattan_k5)
print("Manhattan Distance and k=10: ", error_min_max_manhattan_k10)
print("Manhattan Distance and k=20: ", error_min_max_manhattan_k20)
print('\n')
print("Euclidean Distance and k=5: ", error_min_max_euclidean_k5)
print("Euclidean Distance and k=10: ", error_min_max_euclidean_k10)
print("Euclidean Distance and k=20: ", error_min_max_euclidean_k20)

Manhattan Distance and k=5:  0.0003395361475802469
Manhattan Distance and k=10:  0.0003396513235991349
Manhattan Distance and k=20:  0.0003509224538973662


Euclidean Distance and k=5:  0.0004309244038327973
Euclidean Distance and k=10:  0.00043924469793113756
Euclidean Distance and k=20:  0.0004597767328814279


### Z-Score

In [163]:
print("Manhattan Distance and k=5: ", error_zscore_manhattan_k5)
print("Manhattan Distance and k=10: ", error_zscore_manhattan_k10)
print("Manhattan Distance and k=20: ", error_zscore_manhattan_k10)
print('\n')
print("Euclidean Distance and k=5: ", error_zscore_euclidean_k5)
print("Euclidean Distance and k=10: ", error_zscore_euclidean_k10)
print("Euclidean Distance and k=20: ", error_zscore_euclidean_k20)

Manhattan Distance and k=5:  0.3131428428943477
Manhattan Distance and k=10:  0.30613199624080967
Manhattan Distance and k=20:  0.30613199624080967


Euclidean Distance and k=5:  0.46957622605143307
Euclidean Distance and k=10:  0.4583901256483562
Euclidean Distance and k=20:  0.4552281318508417


### Normalization By Decimal Scalling

In [164]:
print("Manhattan Distance and k=5: ", error_decimal_manhattan_k5)
print("Manhattan Distance and k=10: ", error_decimal_manhattan_k10)
print("Manhattan Distance and k=20: ", error_decimal_manhattan_k20)
print('\n')
print("Euclidean Distance and k=5: ", error_decimal_euclidean_k5)
print("Euclidean Distance and k=10: ", error_decimal_euclidean_k10)
print("Euclidean Distance and k=20: ", error_decimal_euclidean_k20)

Manhattan Distance and k=5:  0.0002775753600209813
Manhattan Distance and k=10:  0.00029016586170160785
Manhattan Distance and k=20:  0.00031274258221680945


Euclidean Distance and k=5:  0.000291548420527058
Euclidean Distance and k=10:  0.00031498096402542884
Euclidean Distance and k=20:  0.0003412376316839488
