In [186]:
import pandas as pd
import numpy as np

In [187]:
df_train = pd.read_csv('trainHome_data.csv')
df_test = pd.read_csv('testingHome_data.csv')

In [188]:
df_train.drop(['id', 'date'], axis = 1, inplace = True)
df_test.drop(['id', 'date'], axis = 1, inplace = True)

In [189]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          15999 non-null  int64  
 1   bedrooms       15999 non-null  int64  
 2   bathrooms      15999 non-null  float64
 3   sqft_living    15999 non-null  int64  
 4   sqft_lot       15999 non-null  int64  
 5   floors         15999 non-null  float64
 6   waterfront     15999 non-null  int64  
 7   view           15999 non-null  int64  
 8   condition      15999 non-null  int64  
 9   grade          15999 non-null  int64  
 10  sqft_above     15999 non-null  int64  
 11  sqft_basement  15999 non-null  int64  
 12  yr_built       15999 non-null  int64  
 13  yr_renovated   15999 non-null  int64  
 14  zipcode        15999 non-null  int64  
 15  lat            15999 non-null  float64
 16  long           15999 non-null  float64
 17  sqft_living15  15999 non-null  int64  
 18  sqft_l

In [190]:
df_train = df_train.astype({"bathrooms": int,"floors": int})
df_test = df_test.astype({"bathrooms": int,"floors": int})

In [191]:
df_train.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900,3,1,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000,3,2,2570,7242,2,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000,2,1,770,10000,1,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000,4,3,1960,5000,1,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000,3,2,1680,8080,1,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


Nominal:    
- bedroom
- bathrooms
- floors
- zip code

Binary:
- waterfront(asymmetric)
    
Ordinal
- view
- condition    
- grade
- yr_built
- yr_renovated
    
Numeric:
- sqft_living (discrete)
- sqft_lot (discrete)
- sqft_above (discrete)
- sqft_basement (discrete)
- lat (continous)
- long (continous)
- sqft_living15 (discrete)
- sqft_lot15 (discrete)

## Pre-processing

In [192]:
df_train_normalized = df_train.copy()
df_train_numeric = df_train.filter(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1)
df_train_normalized.drop(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1,inplace=True)
df_train_normalized.head()

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1,0,0,3,7,1955,0,98178
1,3,2,2,0,0,3,7,1951,1991,98125
2,2,1,1,0,0,3,6,1933,0,98028
3,4,3,1,0,0,5,7,1965,0,98136
4,3,2,1,0,0,3,8,1987,0,98074


In [193]:
df_test_normalized = df_test.copy()
df_test_numeric = df_test.filter(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1)
df_test_normalized.drop(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1,inplace=True)
df_test_normalized.head()

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2,0,0,3,7,2001,0,98059
1,3,1,1,0,0,3,7,1979,0,98074
2,3,1,1,0,0,3,7,1979,0,98074
3,4,2,1,0,0,3,8,1978,0,98075
4,4,3,1,0,0,5,7,1969,0,98034


In [194]:
df_train_numeric.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15
0,221900,1180,5650,1180,0,47.5112,-122.257,1340,5650
1,538000,2570,7242,2170,400,47.721,-122.319,1690,7639
2,180000,770,10000,770,0,47.7379,-122.233,2720,8062
3,604000,1960,5000,1050,910,47.5208,-122.393,1360,5000
4,510000,1680,8080,1680,0,47.6168,-122.045,1800,7503


In [195]:
df_test_numeric.head(5)

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15
0,465950,2340,6896,2340,0,47.4896,-122.14,2950,6775
1,325000,1780,11096,1210,570,47.617,-122.051,1780,10640
2,500000,1780,11096,1210,570,47.617,-122.051,1780,10640
3,450000,2450,20348,1410,1040,47.5887,-122.064,2450,50094
4,475000,2410,8284,1210,1200,47.7202,-122.22,2050,7940


### Ordinal Normalization

In [196]:
def rank_by_interval(x, data):
    z= (x-1)/((data.shape[0]+1)-1)
    return z

#### Train Data

In [197]:
df_train_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1,0,0,3,7,1955,0,98178
1,3,2,2,0,0,3,7,1951,1991,98125
2,2,1,1,0,0,3,6,1933,0,98028
3,4,3,1,0,0,5,7,1965,0,98136
4,3,2,1,0,0,3,8,1987,0,98074
...,...,...,...,...,...,...,...,...,...,...
15994,5,1,1,0,2,3,7,1955,0,98155
15995,3,1,1,0,1,5,7,1910,0,98136
15996,2,1,1,0,0,4,6,1942,0,98146
15997,3,2,1,0,2,5,8,1986,0,98022


In [198]:
#view to rank [1...M]
df_train_normalized.view = df_train_normalized["view"]+1

In [199]:
#year built to rank [1...M]
train_yr_built_sorted_unique = sorted(set(df_train.yr_built))
train_yr_built_ordinal_map = {val: i for i, val in enumerate(train_yr_built_sorted_unique, 1)}
train_yr_built_ordinals = [train_yr_built_ordinal_map[val] for val in df_train.yr_built]
train_yr_built_ordinals = pd.Series(train_yr_built_ordinals)

In [200]:
#year renovated to rank [1...M]
train_yr_renovated_sorted_unique = sorted(set(df_train.yr_renovated))
train_yr_renovated_ordinal_map = {val: i for i, val in enumerate(train_yr_renovated_sorted_unique, 1)}
train_yr_renovated_ordinals = [train_yr_renovated_ordinal_map[val] for val in df_train.yr_renovated]
train_yr_renovated_ordinals = pd.Series(train_yr_renovated_ordinals)

In [201]:
#map ordinals to [0,1]
df_train_normalized.view = rank_by_interval(df_train_normalized["view"], df_train["view"])
df_train_normalized.condition = rank_by_interval(df_train_normalized["condition"], df_train["condition"])
df_train_normalized.grade = rank_by_interval(df_train_normalized["grade"], df_train["grade"])
df_train_normalized.yr_built = rank_by_interval(train_yr_built_ordinals, df_train["yr_built"])
df_train_normalized.yr_renovated = rank_by_interval(train_yr_renovated_ordinals, df_train["yr_renovated"])

In [202]:
df_train_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1,0,0.000000,0.000125,0.000375,0.003438,0.000000,98178
1,3,2,2,0,0.000000,0.000125,0.000375,0.003188,0.002813,98125
2,2,1,1,0,0.000000,0.000125,0.000313,0.002063,0.000000,98028
3,4,3,1,0,0.000000,0.000250,0.000375,0.004063,0.000000,98136
4,3,2,1,0,0.000000,0.000125,0.000438,0.005438,0.000000,98074
...,...,...,...,...,...,...,...,...,...,...
15994,5,1,1,0,0.000125,0.000125,0.000375,0.003438,0.000000,98155
15995,3,1,1,0,0.000063,0.000250,0.000375,0.000625,0.000000,98136
15996,2,1,1,0,0.000000,0.000188,0.000313,0.002625,0.000000,98146
15997,3,2,1,0,0.000125,0.000250,0.000438,0.005375,0.000000,98022


#### Test Data

In [203]:
df_test_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2,0,0,3,7,2001,0,98059
1,3,1,1,0,0,3,7,1979,0,98074
2,3,1,1,0,0,3,7,1979,0,98074
3,4,2,1,0,0,3,8,1978,0,98075
4,4,3,1,0,0,5,7,1969,0,98034
...,...,...,...,...,...,...,...,...,...,...
5609,3,2,3,0,0,3,8,2009,0,98103
5610,4,2,2,0,0,3,8,2014,0,98146
5611,2,0,2,0,0,3,7,2009,0,98144
5612,3,2,2,0,0,3,8,2004,0,98027


In [204]:
#view to rank [1...M]
df_test_normalized.view = df_test_normalized["view"]+1

In [205]:
#year built to rank [1...M]
yr_built_sorted_unique = sorted(set(df_test.yr_built))
yr_built_ordinal_map = {val: i for i, val in enumerate(yr_built_sorted_unique, 1)}
yr_built_ordinals = [yr_built_ordinal_map[val] for val in df_test.yr_built]
yr_built_ordinals = pd.Series(yr_built_ordinals)

In [206]:
#year renovated to rank [1...M]
yr_renovated_sorted_unique = sorted(set(df_test.yr_renovated))
yr_renovated_ordinal_map = {val: i for i, val in enumerate(yr_renovated_sorted_unique, 1)}
yr_renovated_ordinals = [yr_renovated_ordinal_map[val] for val in df_test.yr_renovated]
yr_renovated_ordinals = pd.Series(yr_renovated_ordinals)

In [207]:
#map ordinals to [0,1]
df_test_normalized.view = rank_by_interval(df_test_normalized["view"], df_train["view"])
df_test_normalized.condition = rank_by_interval(df_test_normalized["condition"], df_train["condition"])
df_test_normalized.grade = rank_by_interval(df_test_normalized["grade"], df_train["grade"])
df_test_normalized.yr_built = rank_by_interval(yr_built_ordinals, df_train["yr_built"])
df_test_normalized.yr_renovated = rank_by_interval(yr_renovated_ordinals, df_train["yr_renovated"])

In [208]:
df_test_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,4,3,1,0,0.0,0.000250,0.000375,0.004313,0.0,98034
...,...,...,...,...,...,...,...,...,...,...
5609,3,2,3,0,0.0,0.000125,0.000438,0.006813,0.0,98103
5610,4,2,2,0,0.0,0.000125,0.000438,0.007125,0.0,98146
5611,2,0,2,0,0.0,0.000125,0.000375,0.006813,0.0,98144
5612,3,2,2,0,0.0,0.000125,0.000438,0.006500,0.0,98027


### Numeric Normalization

In [209]:
def Min_Max_Normalize(x, data, new_min, new_max):
    minimum = np.min(data)
    maximum = np.max(data)
    normalized = ((x - minimum) / (maximum - minimum)) * (new_max - new_min) + new_min
    return normalized

In [210]:
def Z_Score_Normalize(x, data):
    std = np.std(data)
    mean = np.mean(data)
    z_normalized = ((x-mean)/std)
    return z_normalized

In [211]:
def Decimal_Scalling(x, data):
    maximum = np.max(data)
    length = len(str(abs(maximum)))
    d_normalized = x/10**length
    return d_normalized

### Train Data

#### Min Max Normalization

In [212]:
df_train_numeric_min_max = df_train_numeric.copy()

In [213]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_min_max.iloc[:,i] = Min_Max_Normalize(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i],0,1)

In [214]:
df_train_min_max = df_train_numeric_min_max.join(df_train_normalized)

#### Z-score Normalization

In [215]:
df_train_numeric_zscore= df_train_numeric.copy()

In [216]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_zscore.iloc[:,i] = Z_Score_Normalize(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [217]:
df_train_zscore = df_train_numeric_zscore.join(df_train_normalized)

#### Decimal Scalling

In [218]:
df_train_numeric_decimal = df_train_numeric.copy()

In [219]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_decimal.iloc[:,i] = Decimal_Scalling(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [220]:
df_train_decimal = df_train_numeric_decimal.join(df_train_normalized)

### Test Data

#### Min Max Normalization

In [221]:
df_test_numeric_min_max = df_test_numeric.copy()

In [222]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_min_max.iloc[:,i] = Min_Max_Normalize(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i],0,1)

In [223]:
df_test_min_max = df_test_numeric_min_max.join(df_test_normalized)

#### Z-score Normalization

In [224]:
df_test_numeric_zscore= df_test_numeric.copy()

In [225]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_zscore.iloc[:,i] = Z_Score_Normalize(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [226]:
df_test_zscore = df_test_numeric_zscore.join(df_test_normalized)

#### Decimal Scalling

In [227]:
df_test_numeric_decimal = df_test_numeric.copy()

In [228]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_decimal.iloc[:,i] = Decimal_Scalling(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [229]:
df_test_decimal = df_test_numeric_decimal.join(df_test_normalized)

In [230]:
df_train_min_max.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.019266,0.061503,0.003108,0.089602,0.0,0.571498,0.217608,0.16477,0.005732,3,1,1,0,0.0,0.000125,0.000375,0.003438,0.0,98178
1,0.060721,0.167046,0.004072,0.199115,0.082988,0.908959,0.166113,0.226055,0.008017,3,2,2,0,0.0,0.000125,0.000375,0.003188,0.002813,98125
2,0.01377,0.030372,0.005743,0.044248,0.0,0.936143,0.237542,0.406409,0.008503,2,1,1,0,0.0,0.000125,0.000313,0.002063,0.0,98028
3,0.069377,0.120729,0.002714,0.075221,0.188797,0.586939,0.104651,0.168272,0.004985,4,3,1,0,0.0,0.00025,0.000375,0.004063,0.0,98136
4,0.057049,0.099468,0.004579,0.144912,0.0,0.741354,0.393688,0.245316,0.007861,3,2,1,0,0.0,0.000125,0.000438,0.005438,0.0,98074


In [231]:
df_train_zscore.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,-0.848292,-0.96371,-0.241246,-0.706075,-0.674535,-0.348228,-0.299702,-0.94319,-0.278186,3,1,1,0,0.0,0.000125,0.000375,0.003438,0.0,98178
1,0.013119,0.571009,-0.203638,0.522771,0.213019,1.152804,-0.746161,-0.422419,-0.206756,3,2,2,0,0.0,0.000125,0.000375,0.003188,0.002813,98125
2,-0.962474,-1.416396,-0.138486,-1.214991,-0.674535,1.273716,-0.126879,1.110134,-0.191565,2,1,1,0,0.0,0.000125,0.000313,0.002063,0.0,98028
3,0.192977,-0.102501,-0.256601,-0.867438,1.34465,-0.279544,-1.279031,-0.913432,-0.30153,4,3,1,0,0.0,0.00025,0.000375,0.004063,0.0,98136
4,-0.063184,-0.411652,-0.183842,-0.085445,-0.674535,0.407296,1.226899,-0.258748,-0.211641,3,2,1,0,0.0,0.000125,0.000438,0.005438,0.0,98074


In [232]:
df_train_decimal.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.02219,0.0118,0.000565,0.118,0.0,5e-06,-1.2e-05,0.134,0.00565,3,1,1,0,0.0,0.000125,0.000375,0.003438,0.0,98178
1,0.0538,0.0257,0.000724,0.217,0.04,5e-06,-1.2e-05,0.169,0.007639,3,2,2,0,0.0,0.000125,0.000375,0.003188,0.002813,98125
2,0.018,0.0077,0.001,0.077,0.0,5e-06,-1.2e-05,0.272,0.008062,2,1,1,0,0.0,0.000125,0.000313,0.002063,0.0,98028
3,0.0604,0.0196,0.0005,0.105,0.091,5e-06,-1.2e-05,0.136,0.005,4,3,1,0,0.0,0.00025,0.000375,0.004063,0.0,98136
4,0.051,0.0168,0.000808,0.168,0.0,5e-06,-1.2e-05,0.18,0.007503,3,2,1,0,0.0,0.000125,0.000438,0.005438,0.0,98074


In [233]:
df_test_min_max.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.051272,0.149582,0.003862,0.21792,0.0,0.536754,0.314784,0.446682,0.007024,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,0.032787,0.107062,0.006406,0.09292,0.118257,0.741676,0.388704,0.241814,0.011464,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,0.055738,0.107062,0.006406,0.09292,0.118257,0.741676,0.388704,0.241814,0.011464,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,0.04918,0.157935,0.012011,0.115044,0.215768,0.696156,0.377907,0.359132,0.056785,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,0.052459,0.154897,0.004703,0.09292,0.248963,0.907673,0.248339,0.289091,0.008363,4,3,1,0,0.0,0.00025,0.000375,0.004313,0.0,98034


In [234]:
df_test_zscore.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,-0.183226,0.317063,-0.211811,0.733785,-0.674535,-0.502767,0.542809,1.452355,-0.237785,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,-0.567332,-0.301241,-0.112595,-0.668837,0.590229,0.408727,1.183694,-0.288507,-0.098983,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,-0.090436,-0.301241,-0.112595,-0.668837,0.590229,0.408727,1.183694,-0.288507,-0.098983,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,-0.226692,0.438515,0.105966,-0.420585,1.633105,0.206252,1.090081,0.708397,1.317909,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,-0.158564,0.394351,-0.179023,-0.668837,1.988127,1.14708,-0.033267,0.113231,-0.195947,4,3,1,0,0.0,0.00025,0.000375,0.004313,0.0,98034


In [235]:
df_test_decimal.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.046595,0.0234,0.00069,0.234,0.0,5e-06,-1.2e-05,0.295,0.006775,4,2,2,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,0.0325,0.0178,0.00111,0.121,0.057,5e-06,-1.2e-05,0.178,0.01064,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,0.05,0.0178,0.00111,0.121,0.057,5e-06,-1.2e-05,0.178,0.01064,3,1,1,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,0.045,0.0245,0.002035,0.141,0.104,5e-06,-1.2e-05,0.245,0.050094,4,2,1,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,0.0475,0.0241,0.000828,0.121,0.12,5e-06,-1.2e-05,0.205,0.00794,4,3,1,0,0.0,0.00025,0.000375,0.004313,0.0,98034


## Testing

In [236]:
def Nominal_Diff(nominal_train,nominal_test):    
    p = nominal_train.shape[1]
    nominal_table=np.zeros(nominal_test.shape[0])
    for i in range(nominal_test.shape[0]):
        m=0
        for j in range(nominal_test.shape[1]):
            if nominal_train[i,j] == nominal_test[i,j]:
                m+=1
        dist = (p - m)/p
        nominal_table[i] = dist
    return nominal_table

In [237]:
def Nominal_Diff2(nominal_train,nominal_test):    
    p = nominal_train.shape[1]
    nominal_table=np.zeros([nominal_test.shape[0],nominal_train.shape[0]])
    for i in range(nominal_test.shape[0]):        
        for k in range(nominal_train.shape[0]):
            m=0
            for j in range(nominal_test.shape[1]):
                if nominal_train[k,j] == nominal_test[i,j]:
                    m+=1
            dist = (p - m)/p
            nominal_table[i,k] = dist
    return nominal_table

In [238]:
def Binary_Diff(binary_train,binary_test):
    q=diss=0;
    binary_table=np.zeros([binary_test.shape[0],binary_train.shape[0]])
    for i in range(binary_test.shape[0]):
        q=diss=0;
        for j in range(binary_train.shape[0]): 
            if binary_train[j] ==1 and binary_test[i]==1:
                q+=1    
            elif binary_train[j] ==0 and binary_test[i]==0:
                continue
            else:
                diss+=1                
        binary_table[i,j]=diss/(diss+q)
    return binary_table

In [239]:
def Manhattan_Distance(numeric_train,numeric_test):        
    manhattan_dist=np.zeros([numeric_test.shape[0],numeric_train.shape[0]])
    for j in range(numeric_test.shape[0]):
        distance=0
        for i in range(numeric_test.shape[1]):
            dist = (numeric_train.iloc[:,i])-(numeric_test.iloc[j,i])
            abs_dist = np.abs(dist[~np.isnan(dist)])        
            distance += abs_dist
        manhattan_dist[j,:]=distance
    return manhattan_dist

In [240]:
def Euclidean_Distance(numeric_train,numeric_test):        
    euclidean_dist=np.zeros(numeric_test.shape[0])
    for i in range(numeric_train.shape[1]):
        dist = (numeric_train.iloc[:,i])-(numeric_test.iloc[:,i])
        squared = np.square(dist[~np.isnan(dist)])
        euclidean_dist += squared
    euclidean_dist = np.sqrt(euclidean_dist)
    return euclidean_dist

In [241]:
# Nominal data fields
nominal_data_train = df_train_min_max.copy()
nominal_data_train = df_train_min_max.filter(['bedrooms','bathrooms','floors','zipcode'], axis=1)
nominal_data_test = df_test_min_max.copy()
nominal_data_test = df_test_min_max.filter(['bedrooms','bathrooms','floors','zipcode'], axis=1)

In [242]:
# Binary data fields
binary_data_train = df_train_min_max.copy()
binary_data_train = df_train_min_max.filter(['waterfront'], axis=1)
binary_data_test = df_test_min_max.copy()
binary_data_test = df_test_min_max.filter(['waterfront'], axis=1)

In [124]:
nominal_data_comp = Nominal_Diff2(nominal_data_train.to_numpy().astype(int),nominal_data_test.to_numpy().astype(int))
nominal_df = pd.DataFrame(nominal_data_comp)
nominal_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,1.00,0.50,1.00,0.75,0.75,0.75,0.50,1.00,1.00,0.50,...,1.00,0.50,0.75,0.50,0.75,1.00,1.00,1.00,0.75,1.00
1,0.25,0.75,0.50,0.75,0.25,0.75,0.75,0.25,0.25,0.75,...,0.25,0.75,0.50,0.75,0.50,0.50,0.25,0.50,0.50,0.50
2,0.25,0.75,0.50,0.75,0.25,0.75,0.75,0.25,0.25,0.75,...,0.25,0.75,0.50,0.75,0.50,0.50,0.25,0.50,0.50,0.50
3,0.75,0.75,0.75,0.50,0.50,0.50,0.75,0.75,0.75,0.75,...,0.75,0.75,1.00,0.25,0.50,0.75,0.75,0.75,0.50,0.75
4,0.75,1.00,0.75,0.25,0.75,0.50,1.00,0.75,0.75,1.00,...,0.75,1.00,1.00,0.50,0.50,0.75,0.75,0.75,0.75,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.75,0.50,1.00,1.00,0.50,1.00,0.50,0.75,0.75,0.50,...,0.75,0.50,0.75,0.50,1.00,1.00,0.75,1.00,0.50,1.00
5610,1.00,0.50,1.00,0.75,0.75,0.75,0.50,1.00,0.75,0.50,...,1.00,0.50,0.75,0.50,0.75,1.00,1.00,0.75,0.75,1.00
5611,1.00,0.75,0.75,1.00,1.00,1.00,0.75,1.00,1.00,0.75,...,1.00,0.75,0.75,1.00,1.00,1.00,1.00,0.75,1.00,0.75
5612,0.75,0.25,1.00,1.00,0.50,1.00,0.25,0.75,0.75,0.25,...,0.75,0.25,0.50,0.75,1.00,1.00,0.75,1.00,0.50,1.00


In [129]:
binary_data_comp = Binary_Diff(binary_data_train.to_numpy(),binary_data_test.to_numpy())
binary_df = pd.DataFrame(binary_data_comp)
binary_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Min Max Normalization

In [243]:
df_train_numeric_min_max = df_train_min_max.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_test_numeric_min_max = df_test_min_max.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_train_min_max_price = df_train_min_max.filter(['price'], axis=1)
df_test_min_max_price = df_test_min_max.filter(['price'], axis=1)

#### Manhattan Distance

In [178]:
manhattan_distance_min_max = Manhattan_Distance(df_train_numeric_min_max,df_test_numeric_min_max)
manhattan_min_max_df = pd.DataFrame(manhattan_distance_min_max)
#manhattan_min_max_df = manhattan_min_max_df.join(nominal_df)
#manhattan_min_max_df = manhattan_min_max_df.join(binary_df)
#manhattan_min_max_df = manhattan_min_max_df.join(df_test_min_max.Price)
manhattan_min_max_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15989,15990,15991,15992,15993,15994,15995,15996,15997,15998
0,0.635151,0.867900,0.817459,0.904640,0.610484,1.586748,0.686421,0.748056,0.746121,0.468726,...,0.899631,0.691917,0.577591,0.963158,0.321473,0.961641,0.834049,0.809525,0.846133,1.261893
1,0.595984,0.617428,0.760406,0.625410,0.192642,1.556428,0.990757,0.768221,0.463132,0.732899,...,0.917437,0.891509,0.489700,0.385516,0.598218,0.632434,0.521093,0.773253,0.948451,0.646342
2,0.595984,0.617428,0.760406,0.625410,0.192642,1.556428,0.990757,0.768221,0.463132,0.732899,...,0.917437,0.891509,0.489700,0.385516,0.598218,0.632434,0.521093,0.773253,0.948451,0.646342
3,0.878417,0.844907,0.899245,0.739429,0.535815,1.273995,1.068675,1.050653,0.679175,0.762477,...,1.193929,1.113115,0.592077,0.666651,0.522094,0.667015,0.570333,1.055686,1.113559,0.872662
4,0.842129,0.435907,0.582363,0.702891,0.713763,1.516742,1.143598,1.020163,0.643284,1.149766,...,1.171737,1.125522,0.642291,0.429622,0.736153,0.356073,0.533795,1.019272,1.429996,0.414444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.485971,0.335341,0.523864,0.639755,0.470430,1.969244,0.811415,0.618136,0.590122,1.022311,...,0.862015,0.873571,0.354226,0.314782,0.694854,0.577076,0.627375,0.514140,1.252411,0.439452
5610,0.390243,0.524893,0.924433,0.485313,0.560138,1.939821,0.537112,0.475017,0.368659,0.681840,...,0.732904,0.580005,0.480318,0.573732,0.470804,0.932087,0.467782,0.391485,0.867169,0.901042
5611,0.264755,0.681932,0.646953,0.528006,0.518523,2.143764,0.833834,0.445264,0.522957,0.998218,...,0.670027,0.849478,0.375041,0.499047,0.674747,0.779826,0.584010,0.240277,1.228443,0.605955
5612,0.294345,0.792771,0.864554,0.585339,0.247177,1.872418,0.751975,0.564879,0.562780,0.534300,...,0.702446,0.633068,0.498148,0.685376,0.574497,1.005143,0.677714,0.470354,0.764400,0.883634


In [179]:
numeric = pd.DataFrame(manhattan_min_max_df.iloc[0,:],lsuffix="c")
numeric.rename(columns={0:"Numeric"},inplace=True)

binary = pd.DataFrame(binary_df.iloc[0,:])
binary.rename(columns={0:"Binary"},inplace=True)

nominal = pd.DataFrame(nominal_df.iloc[0,:])
nominal.rename(columns={0:"Nominal"},inplace=True)

In [277]:
# MSE calculation function
def MSE(y_true,y_pred):
    m = y_true.shape[0]
    return (1/(2*m)) * (np.sum(((y_true-y_pred)**2)))

In [282]:
def KNN(numeric_data, binary_data, nominal_data, price_train, price_test, k):
    predictions=[]
    for i in range(numeric_data.shape[0]):
        numeric = pd.DataFrame(numeric_data.iloc[i,:])
        binary = pd.DataFrame(binary_data.iloc[i,:])
        nominal = pd.DataFrame(nominal_data.iloc[i,:])
        
        #result = numeric.join([binary,nominal,price_train])
        result = pd.concat([numeric,binary,nominal,price_train],ignore_index = True,axis=1)
        index=(np.sum(result,axis=1)).argsort()[:k]
        pred = np.mean(result.iloc[index.tolist(),:].iloc[:,-1])
        #print(pred, price_test[i])        
        predictions.append(pred)
    mean_squared_error = MSE(price_test,predictions)
    
    return mean_squared_error, predictions

In [283]:
error,prediction = KNN(manhattan_min_max_df,binary_df,nominal_df,df_train_min_max_price,df_test_min_max_price.price,3)

In [284]:
error

0.00034918872171042786

In [285]:
numeric = pd.DataFrame(manhattan_min_max_df.iloc[0,:])
#numeric.rename(columns={0:"Numeric"},inplace=True)

binary = pd.DataFrame(binary_df.iloc[0,:])
#binary.rename(columns={0:"Binary"},inplace=True)

nominal = pd.DataFrame(nominal_df.iloc[0,:])
#nominal.rename(columns={0:"Nominal"},inplace=True)

result = numeric.join([binary,nominal,df_train_min_max_price],ignore_index = True)
result = pd.concat([numeric,binary,nominal,df_train_min_max_price],ignore_index = True,axis=1)
index=(np.sum(result,axis=1)).argsort()[:3]
np.mean(result.iloc[index.tolist(),:].price)

#### Euclidean Distance

In [78]:
euclidean_distance_min_max = Euclidean_Distance(df_train_numeric_min_max,df_test_numeric_min_max)
euclidean_min_max_df = pd.DataFrame(euclidean_distance_min_max)
euclidean_min_max_df.rename(columns={"sqft_living":"Numeric"},inplace=True)
euclidean_min_max_df = euclidean_min_max_df.join(nominal_df)
euclidean_min_max_df = euclidean_min_max_df.join(binary_df)
euclidean_min_max_df

Unnamed: 0,Numeric,Nominal,Binary
0,0.338170,1.00,1.0
1,0.306475,0.75,1.0
2,0.331680,0.50,1.0
3,0.359851,0.50,1.0
4,0.344186,0.75,1.0
...,...,...,...
5609,0.183435,0.75,1.0
5610,0.416753,1.00,1.0
5611,0.284804,1.00,1.0
5612,0.164134,0.25,1.0


## Z-Score Normalization

In [80]:
df_train_numeric_zscore = df_train_zscore.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_test_numeric_zscore = df_test_zscore.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)

#### Manhattan Distance

In [81]:
manhattan_distance_zscore = Manhattan_Distance(df_train_numeric_zscore,df_test_numeric_zscore)
manhattan_zscore_df = pd.DataFrame(manhattan_distance_zscore)
manhattan_zscore_df.rename(columns={"sqft_living":"Numeric"},inplace=True)
manhattan_zscore_df = manhattan_zscore_df.join(nominal_df)
manhattan_zscore_df = manhattan_zscore_df.join(binary_df)
manhattan_zscore_df

Unnamed: 0,Numeric,Nominal,Binary
0,6.185938,1.00,1.0
1,5.452292,0.75,1.0
2,6.621687,0.50,1.0
3,7.736066,0.50,1.0
4,6.445811,0.75,1.0
...,...,...,...
5609,3.746246,0.75,1.0
5610,6.721329,1.00,1.0
5611,2.460004,1.00,1.0
5612,2.255764,0.25,1.0


#### Euclidean Distance

In [82]:
euclidean_distance_zscore = Euclidean_Distance(df_train_numeric_zscore,df_test_numeric_zscore)
euclidean_zscore_df = pd.DataFrame(euclidean_distance_zscore)
euclidean_zscore_df.rename(columns={"sqft_living":"Numeric"},inplace=True)
euclidean_zscore_df = euclidean_zscore_df.join(nominal_df)
euclidean_zscore_df = euclidean_zscore_df.join(binary_df)
euclidean_zscore_df

Unnamed: 0,Numeric,Nominal,Binary
0,3.191931,1.00,1.0
1,2.576601,0.75,1.0
2,2.751861,0.50,1.0
3,3.436370,0.50,1.0
4,3.217716,0.75,1.0
...,...,...,...
5609,1.844404,0.75,1.0
5610,3.067318,1.00,1.0
5611,1.479427,1.00,1.0
5612,1.428808,0.25,1.0


## Decimal Scalling Normalization

In [83]:
df_train_numeric_decimal = df_train_decimal.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)
df_test_numeric_decimal = df_test_decimal.filter(['sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','view','condition','grade','yr_built','yr_renovated'], axis=1)

#### Manhattan Distance

In [84]:
manhattan_distance_decimal = Manhattan_Distance(df_train_numeric_decimal,df_test_numeric_decimal)
manhattan_decimal_df = pd.DataFrame(manhattan_distance_decimal)
manhattan_decimal_df.rename(columns={"sqft_living":"Numeric"},inplace=True)
manhattan_decimal_df = manhattan_decimal_df.join(nominal_df)
manhattan_decimal_df = manhattan_decimal_df.join(binary_df)
manhattan_decimal_df

Unnamed: 0,Numeric,Nominal,Binary
0,0.292725,1.00,1.0
1,0.137849,0.75,1.0
2,0.210725,0.50,1.0
3,0.210529,0.50,1.0
4,0.201070,0.75,1.0
...,...,...,...
5609,0.129173,0.75,1.0
5610,0.198096,1.00,1.0
5611,0.068341,1.00,1.0
5612,0.116443,0.25,1.0


#### Euclidean Distance

In [85]:
euclidean_distance_decimal = Euclidean_Distance(df_train_numeric_decimal,df_test_numeric_decimal)
euclidean_decimal_df = pd.DataFrame(euclidean_distance_decimal)
euclidean_decimal_df.rename(columns={"sqft_living":"Numeric"},inplace=True)
euclidean_decimal_df = euclidean_decimal_df.join(nominal_df)
euclidean_decimal_df = euclidean_decimal_df.join(binary_df)
euclidean_decimal_df

Unnamed: 0,Numeric,Nominal,Binary
0,0.198799,1.00,1.0
1,0.098329,0.75,1.0
2,0.118903,0.50,1.0
3,0.124123,0.50,1.0
4,0.131487,0.75,1.0
...,...,...,...
5609,0.078782,0.75,1.0
5610,0.143621,1.00,1.0
5611,0.057419,1.00,1.0
5612,0.093803,0.25,1.0


In [1323]:
#np.abs(df_train_min_max.sqft_living-df_test_min_max.sqft_living)+np.abs(df_train_min_max.sqft_lot-df_test_min_max.sqft_lot)+np.abs(df_train_min_max.sqft_above-df_test_min_max.sqft_above)+np.abs(df_train_min_max.sqft_basement-df_test_min_max.sqft_basement)+np.abs(df_train_min_max.lat-df_test_min_max.lat)+np.abs(df_train_min_max.long-df_test_min_max.long)+np.abs(df_train_min_max.sqft_living15-df_test_min_max.sqft_living15)+np.abs(df_train_min_max.sqft_lot15-df_test_min_max.sqft_lot15)
#np.sqrt(np.square(df_train_min_max.sqft_living-df_test_min_max.sqft_living)+np.square(df_train_min_max.sqft_lot-df_test_min_max.sqft_lot)+np.square(df_train_min_max.sqft_above-df_test_min_max.sqft_above)+np.square(df_train_min_max.sqft_basement-df_test_min_max.sqft_basement)+np.square(df_train_min_max.lat-df_test_min_max.lat)+np.square(df_train_min_max.long-df_test_min_max.long)+np.square(df_train_min_max.sqft_living15-df_test_min_max.sqft_living15)+np.square(df_train_min_max.sqft_lot15-df_test_min_max.sqft_lot15))

In [None]:
numeric = pd.DataFrame(manhattan_min_max_df.iloc[0,:],lsuffix="c")
numeric.rename(columns={0:"Numeric"},inplace=True)

binary = pd.DataFrame(binary_df.iloc[0,:])
binary.rename(columns={0:"Binary"},inplace=True)

nominal = pd.DataFrame(nominal_df.iloc[0,:])
nominal.rename(columns={0:"Nominal"},inplace=True)
##########################


numeric = pd.DataFrame(manhattan_min_max_df.iloc[0,:])
#numeric.rename(columns={0:"Numeric"},inplace=True)

binary = pd.DataFrame(binary_df.iloc[0,:])
#binary.rename(columns={0:"Binary"},inplace=True)

nominal = pd.DataFrame(nominal_df.iloc[0,:])
#nominal.rename(columns={0:"Nominal"},inplace=True)

result = numeric.join([binary,nominal,df_train_min_max_price],ignore_index = True)
result = pd.concat([numeric,binary,nominal,df_train_min_max_price],ignore_index = True,axis=1)
index=(np.sum(result,axis=1)).argsort()[:3]
np.mean(result.iloc[index.tolist(),:].price)