In [571]:
import pandas as pd
import numpy as np

In [572]:
df_train = pd.read_csv('trainHome_data.csv')
df_test = pd.read_csv('testingHome_data.csv')

In [573]:
df_train.drop(['id', 'date'], axis = 1, inplace = True)
df_test.drop(['id', 'date'], axis = 1, inplace = True)

In [574]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          15999 non-null  int64  
 1   bedrooms       15999 non-null  int64  
 2   bathrooms      15999 non-null  float64
 3   sqft_living    15999 non-null  int64  
 4   sqft_lot       15999 non-null  int64  
 5   floors         15999 non-null  float64
 6   waterfront     15999 non-null  int64  
 7   view           15999 non-null  int64  
 8   condition      15999 non-null  int64  
 9   grade          15999 non-null  int64  
 10  sqft_above     15999 non-null  int64  
 11  sqft_basement  15999 non-null  int64  
 12  yr_built       15999 non-null  int64  
 13  yr_renovated   15999 non-null  int64  
 14  zipcode        15999 non-null  int64  
 15  lat            15999 non-null  float64
 16  long           15999 non-null  float64
 17  sqft_living15  15999 non-null  int64  
 18  sqft_l

In [575]:
df_train = df_train.astype({"bathrooms": int})
df_test = df_test.astype({"bathrooms": int})

In [576]:
df_train.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900,3,1,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000,3,2,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000,2,1,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000,4,3,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000,3,2,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


Nominal:    
- bedroom
- bathrooms
- floors
- zip code

Binary:
- waterfront(asymmetric)
    
Ordinal
- view
- condition    
- grade
- yr_built
- yr_renovated
    
Numeric:
- sqft_living (discrete)
- sqft_lot (discrete)
- sqft_above (discrete)
- sqft_basement (discrete)
- lat (continous)
- long (continous)
- sqft_living15 (discrete)
- sqft_lot15 (discrete)

## Pre-processing

In [577]:
df_train_normalized = df_train.copy()
df_train_numeric = df_train.filter(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1)
df_train_normalized.drop(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1,inplace=True)
df_train_normalized.head()

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1.0,0,0,3,7,1955,0,98178
1,3,2,2.0,0,0,3,7,1951,1991,98125
2,2,1,1.0,0,0,3,6,1933,0,98028
3,4,3,1.0,0,0,5,7,1965,0,98136
4,3,2,1.0,0,0,3,8,1987,0,98074


In [578]:
df_test_normalized = df_test.copy()
df_test_numeric = df_test.filter(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1)
df_test_normalized.drop(['price','sqft_living','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15'], axis=1,inplace=True)
df_test_normalized.head()

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2.0,0,0,3,7,2001,0,98059
1,3,1,1.0,0,0,3,7,1979,0,98074
2,3,1,1.0,0,0,3,7,1979,0,98074
3,4,2,1.0,0,0,3,8,1978,0,98075
4,4,3,1.0,0,0,5,7,1969,0,98034


In [579]:
df_train_numeric.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15
0,221900,1180,5650,1180,0,47.5112,-122.257,1340,5650
1,538000,2570,7242,2170,400,47.721,-122.319,1690,7639
2,180000,770,10000,770,0,47.7379,-122.233,2720,8062
3,604000,1960,5000,1050,910,47.5208,-122.393,1360,5000
4,510000,1680,8080,1680,0,47.6168,-122.045,1800,7503


In [580]:
df_test_numeric.head(5)

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15
0,465950,2340,6896,2340,0,47.4896,-122.14,2950,6775
1,325000,1780,11096,1210,570,47.617,-122.051,1780,10640
2,500000,1780,11096,1210,570,47.617,-122.051,1780,10640
3,450000,2450,20348,1410,1040,47.5887,-122.064,2450,50094
4,475000,2410,8284,1210,1200,47.7202,-122.22,2050,7940


In [581]:
def Min_Max_Normalize(x, data, new_min, new_max):
    minimum = np.min(data)
    maximum = np.max(data)
    normalized = ((x - minimum) / (maximum - minimum)) * (new_max - new_min) + new_min
    return normalized

In [582]:
def Z_Score_Normalize(x, data):
    std = np.std(data)
    mean = np.mean(data)
    z_normalized = ((x-mean)/std)
    return z_normalized

In [583]:
def Decimal_Scalling(x, data):
    maximum = np.max(data)
    length = len(str(abs(maximum)))
    d_normalized = x/10**length
    return d_normalized

In [584]:
def rank_by_interval(x, data):
    z= (x-1)/((data.shape[0]+1)-1)
    return z

In [585]:
#pd.cut(df.yr_built, bins=25,precision=0)

### Ordinal Normalization

#### Train Data

In [546]:
df_train_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1.0,0,0,3,7,1955,0,98178
1,3,2,2.0,0,0,3,7,1951,1991,98125
2,2,1,1.0,0,0,3,6,1933,0,98028
3,4,3,1.0,0,0,5,7,1965,0,98136
4,3,2,1.0,0,0,3,8,1987,0,98074
...,...,...,...,...,...,...,...,...,...,...
15994,5,1,1.0,0,2,3,7,1955,0,98155
15995,3,1,1.0,0,1,5,7,1910,0,98136
15996,2,1,1.0,0,0,4,6,1942,0,98146
15997,3,2,1.5,0,2,5,8,1986,0,98022


In [586]:
#view to rank [1...M]
df_train_normalized.view = df_train_normalized["view"]+1

In [587]:
#year built to rank [1...M]
train_yr_built_sorted_unique = sorted(set(df_train.yr_built))
train_yr_built_ordinal_map = {val: i for i, val in enumerate(train_yr_built_sorted_unique, 1)}
train_yr_built_ordinals = [train_yr_built_ordinal_map[val] for val in df_train.yr_built]
train_yr_built_ordinals = pd.Series(train_yr_built_ordinals)

In [588]:
#year renovated to rank [1...M]
train_yr_renovated_sorted_unique = sorted(set(df_train.yr_renovated))
train_yr_renovated_ordinal_map = {val: i for i, val in enumerate(train_yr_renovated_sorted_unique, 1)}
train_yr_renovated_ordinals = [train_yr_renovated_ordinal_map[val] for val in df_train.yr_renovated]
train_yr_renovated_ordinals = pd.Series(train_yr_renovated_ordinals)

In [589]:
#map ordinals to [0,1]
df_train_normalized.view = rank_by_interval(df_train_normalized["view"], df_train["view"])
df_train_normalized.condition = rank_by_interval(df_train_normalized["condition"], df_train["condition"])
df_train_normalized.grade = rank_by_interval(df_train_normalized["grade"], df_train["grade"])
df_train_normalized.yr_built = rank_by_interval(train_yr_built_ordinals, df_train["yr_built"])
df_train_normalized.yr_renovated = rank_by_interval(train_yr_renovated_ordinals, df_train["yr_renovated"])

In [590]:
df_train_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,3,1,1.0,0,0.000000,0.000125,0.000375,0.003438,0.000000,98178
1,3,2,2.0,0,0.000000,0.000125,0.000375,0.003188,0.002813,98125
2,2,1,1.0,0,0.000000,0.000125,0.000313,0.002063,0.000000,98028
3,4,3,1.0,0,0.000000,0.000250,0.000375,0.004063,0.000000,98136
4,3,2,1.0,0,0.000000,0.000125,0.000438,0.005438,0.000000,98074
...,...,...,...,...,...,...,...,...,...,...
15994,5,1,1.0,0,0.000125,0.000125,0.000375,0.003438,0.000000,98155
15995,3,1,1.0,0,0.000063,0.000250,0.000375,0.000625,0.000000,98136
15996,2,1,1.0,0,0.000000,0.000188,0.000313,0.002625,0.000000,98146
15997,3,2,1.5,0,0.000125,0.000250,0.000438,0.005375,0.000000,98022


#### Test Data

In [591]:
df_test_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2.0,0,0,3,7,2001,0,98059
1,3,1,1.0,0,0,3,7,1979,0,98074
2,3,1,1.0,0,0,3,7,1979,0,98074
3,4,2,1.0,0,0,3,8,1978,0,98075
4,4,3,1.0,0,0,5,7,1969,0,98034
...,...,...,...,...,...,...,...,...,...,...
5609,3,2,3.0,0,0,3,8,2009,0,98103
5610,4,2,2.0,0,0,3,8,2014,0,98146
5611,2,0,2.0,0,0,3,7,2009,0,98144
5612,3,2,2.0,0,0,3,8,2004,0,98027


In [592]:
#view to rank [1...M]
df_test_normalized.view = df_test_normalized["view"]+1

In [593]:
#year built to rank [1...M]
yr_built_sorted_unique = sorted(set(df_test.yr_built))
yr_built_ordinal_map = {val: i for i, val in enumerate(yr_built_sorted_unique, 1)}
yr_built_ordinals = [yr_built_ordinal_map[val] for val in df_test.yr_built]
yr_built_ordinals = pd.Series(yr_built_ordinals)

In [594]:
#year renovated to rank [1...M]
yr_renovated_sorted_unique = sorted(set(df_test.yr_renovated))
yr_renovated_ordinal_map = {val: i for i, val in enumerate(yr_renovated_sorted_unique, 1)}
yr_renovated_ordinals = [yr_renovated_ordinal_map[val] for val in df_test.yr_renovated]
yr_renovated_ordinals = pd.Series(yr_renovated_ordinals)

In [595]:
#map ordinals to [0,1]
df_test_normalized.view = rank_by_interval(df_test_normalized["view"], df_train["view"])
df_test_normalized.condition = rank_by_interval(df_test_normalized["condition"], df_train["condition"])
df_test_normalized.grade = rank_by_interval(df_test_normalized["grade"], df_train["grade"])
df_test_normalized.yr_built = rank_by_interval(yr_built_ordinals, df_train["yr_built"])
df_test_normalized.yr_renovated = rank_by_interval(yr_renovated_ordinals, df_train["yr_renovated"])

In [596]:
df_test_normalized

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,4,2,2.0,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,4,2,1.0,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,4,3,1.0,0,0.0,0.000250,0.000375,0.004313,0.0,98034
...,...,...,...,...,...,...,...,...,...,...
5609,3,2,3.0,0,0.0,0.000125,0.000438,0.006813,0.0,98103
5610,4,2,2.0,0,0.0,0.000125,0.000438,0.007125,0.0,98146
5611,2,0,2.0,0,0.0,0.000125,0.000375,0.006813,0.0,98144
5612,3,2,2.0,0,0.0,0.000125,0.000438,0.006500,0.0,98027


### Numeric Normalization

### Train Data

#### Min Max Normalization

In [568]:
df_train_numeric_min_max = df_train_numeric.copy()

In [569]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_min_max.iloc[:,i] = Min_Max_Normalize(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i],0,1)

In [570]:
df_train_min_max = df_train_numeric_min_max.join(df_train_normalized)
df_train_min_max

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.019266,0.061503,0.003108,0.089602,0.000000,0.571498,0.217608,0.164770,0.005732,3,1,1.0,0,0.000000,0.000125,0.000375,0.003438,0.000000,98178
1,0.060721,0.167046,0.004072,0.199115,0.082988,0.908959,0.166113,0.226055,0.008017,3,2,2.0,0,0.000000,0.000125,0.000375,0.003188,0.002813,98125
2,0.013770,0.030372,0.005743,0.044248,0.000000,0.936143,0.237542,0.406409,0.008503,2,1,1.0,0,0.000000,0.000125,0.000313,0.002063,0.000000,98028
3,0.069377,0.120729,0.002714,0.075221,0.188797,0.586939,0.104651,0.168272,0.004985,4,3,1.0,0,0.000000,0.000250,0.000375,0.004063,0.000000,98136
4,0.057049,0.099468,0.004579,0.144912,0.000000,0.741354,0.393688,0.245316,0.007861,3,2,1.0,0,0.000000,0.000125,0.000438,0.005438,0.000000,98074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15994,0.051148,0.110858,0.005137,0.073009,0.165975,0.953675,0.188538,0.387148,0.012079,5,1,1.0,0,0.000125,0.000125,0.000375,0.003438,0.000000,98155
15995,0.064131,0.122248,0.003471,0.079646,0.184647,0.640341,0.111296,0.276834,0.006421,3,1,1.0,0,0.000063,0.000250,0.000375,0.000625,0.000000,98136
15996,0.040590,0.057707,0.002857,0.084071,0.000000,0.570050,0.110465,0.106987,0.005353,2,1,1.0,0,0.000000,0.000188,0.000313,0.002625,0.000000,98146
15997,0.044590,0.124525,0.019729,0.181416,0.000000,0.092327,0.425249,0.252320,0.025226,3,2,1.5,0,0.000125,0.000250,0.000438,0.005375,0.000000,98022


In [565]:
df_train_numeric_min_max

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15
0,0.019266,0.061503,0.003108,0.089602,0.000000,0.571498,0.217608,0.164770,0.005732
1,0.060721,0.167046,0.004072,0.199115,0.082988,0.908959,0.166113,0.226055,0.008017
2,0.013770,0.030372,0.005743,0.044248,0.000000,0.936143,0.237542,0.406409,0.008503
3,0.069377,0.120729,0.002714,0.075221,0.188797,0.586939,0.104651,0.168272,0.004985
4,0.057049,0.099468,0.004579,0.144912,0.000000,0.741354,0.393688,0.245316,0.007861
...,...,...,...,...,...,...,...,...,...
15994,0.051148,0.110858,0.005137,0.073009,0.165975,0.953675,0.188538,0.387148,0.012079
15995,0.064131,0.122248,0.003471,0.079646,0.184647,0.640341,0.111296,0.276834,0.006421
15996,0.040590,0.057707,0.002857,0.084071,0.000000,0.570050,0.110465,0.106987,0.005353
15997,0.044590,0.124525,0.019729,0.181416,0.000000,0.092327,0.425249,0.252320,0.025226


#### Z-score Normalization

In [598]:
df_train_numeric_zscore= df_train_numeric.copy()

In [601]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_zscore.iloc[:,i] = Z_Score_Normalize(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [602]:
df_train_zscore = df_train_numeric_zscore.join(df_train_normalized)
df_train_zscore

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,-0.848292,-0.963710,-0.241246,-0.706075,-0.674535,-0.348228,-0.299702,-0.943190,-0.278186,3,1,1.0,0,0.000000,0.000125,0.000375,0.003438,0.000000,98178
1,0.013119,0.571009,-0.203638,0.522771,0.213019,1.152804,-0.746161,-0.422419,-0.206756,3,2,2.0,0,0.000000,0.000125,0.000375,0.003188,0.002813,98125
2,-0.962474,-1.416396,-0.138486,-1.214991,-0.674535,1.273716,-0.126879,1.110134,-0.191565,2,1,1.0,0,0.000000,0.000125,0.000313,0.002063,0.000000,98028
3,0.192977,-0.102501,-0.256601,-0.867438,1.344650,-0.279544,-1.279031,-0.913432,-0.301530,4,3,1.0,0,0.000000,0.000250,0.000375,0.004063,0.000000,98136
4,-0.063184,-0.411652,-0.183842,-0.085445,-0.674535,0.407296,1.226899,-0.258748,-0.211641,3,2,1.0,0,0.000000,0.000125,0.000438,0.005438,0.000000,98074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15994,-0.185815,-0.246035,-0.162109,-0.892263,1.100573,1.351701,-0.551735,0.946463,-0.079770,5,1,1.0,0,0.000125,0.000125,0.000375,0.003438,0.000000,98155
15995,0.083972,-0.080418,-0.227072,-0.817788,1.300272,-0.042012,-1.221423,0.009076,-0.256639,3,1,1.0,0,0.000063,0.000250,0.000375,0.000625,0.000000,98136
15996,-0.405187,-1.018915,-0.251026,-0.768138,-0.674535,-0.354667,-1.228624,-1.434202,-0.290038,2,1,1.0,0,0.000000,0.000188,0.000313,0.002625,0.000000,98146
15997,-0.322071,-0.047295,0.406970,0.324170,-0.674535,-2.479579,1.500535,-0.199232,0.331249,3,2,1.5,0,0.000125,0.000250,0.000438,0.005375,0.000000,98022


#### Decimal Scalling

In [603]:
df_train_numeric_decimal = df_train_numeric.copy()

In [605]:
for i in range(df_train_numeric.shape[1]):
    df_train_numeric_decimal.iloc[:,i] = Decimal_Scalling(df_train_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [606]:
df_train_decimal = df_train_numeric_decimal.join(df_train_normalized)
df_train_decimal

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.02219,0.0118,0.000565,0.118,0.000,0.000005,-0.000012,0.134,0.005650,3,1,1.0,0,0.000000,0.000125,0.000375,0.003438,0.000000,98178
1,0.05380,0.0257,0.000724,0.217,0.040,0.000005,-0.000012,0.169,0.007639,3,2,2.0,0,0.000000,0.000125,0.000375,0.003188,0.002813,98125
2,0.01800,0.0077,0.001000,0.077,0.000,0.000005,-0.000012,0.272,0.008062,2,1,1.0,0,0.000000,0.000125,0.000313,0.002063,0.000000,98028
3,0.06040,0.0196,0.000500,0.105,0.091,0.000005,-0.000012,0.136,0.005000,4,3,1.0,0,0.000000,0.000250,0.000375,0.004063,0.000000,98136
4,0.05100,0.0168,0.000808,0.168,0.000,0.000005,-0.000012,0.180,0.007503,3,2,1.0,0,0.000000,0.000125,0.000438,0.005438,0.000000,98074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15994,0.04650,0.0183,0.000900,0.103,0.080,0.000005,-0.000012,0.261,0.011175,5,1,1.0,0,0.000125,0.000125,0.000375,0.003438,0.000000,98155
15995,0.05640,0.0198,0.000625,0.109,0.089,0.000005,-0.000012,0.198,0.006250,3,1,1.0,0,0.000063,0.000250,0.000375,0.000625,0.000000,98136
15996,0.03845,0.0113,0.000524,0.113,0.000,0.000005,-0.000012,0.101,0.005320,2,1,1.0,0,0.000000,0.000188,0.000313,0.002625,0.000000,98146
15997,0.04150,0.0201,0.003309,0.201,0.000,0.000005,-0.000012,0.184,0.022620,3,2,1.5,0,0.000125,0.000250,0.000438,0.005375,0.000000,98022


### Test Data

#### Min Max Normalization

In [608]:
df_test_numeric_min_max = df_test_numeric.copy()

In [609]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_min_max.iloc[:,i] = Min_Max_Normalize(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i],0,1)

In [610]:
df_test_min_max = df_test_numeric_min_max.join(df_test_normalized)
df_test_min_max

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.051272,0.149582,0.003862,0.217920,0.000000,0.536754,0.314784,0.446682,0.007024,4,2,2.0,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,0.032787,0.107062,0.006406,0.092920,0.118257,0.741676,0.388704,0.241814,0.011464,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,0.055738,0.107062,0.006406,0.092920,0.118257,0.741676,0.388704,0.241814,0.011464,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,0.049180,0.157935,0.012011,0.115044,0.215768,0.696156,0.377907,0.359132,0.056785,4,2,1.0,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,0.052459,0.154897,0.004703,0.092920,0.248963,0.907673,0.248339,0.289091,0.008363,4,3,1.0,0,0.0,0.000250,0.000375,0.004313,0.0,98034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.037377,0.088079,0.000370,0.128319,0.000000,0.874055,0.143688,0.198039,0.000975,3,2,3.0,0,0.0,0.000125,0.000438,0.006813,0.0,98103
5610,0.042623,0.147304,0.003206,0.214602,0.000000,0.570693,0.130399,0.250569,0.007513,4,2,2.0,0,0.0,0.000125,0.000438,0.007125,0.0,98146
5611,0.042898,0.049355,0.000503,0.071903,0.000000,0.705324,0.182724,0.108738,0.001547,2,0,2.0,0,0.0,0.000125,0.000375,0.006813,0.0,98144
5612,0.042623,0.093394,0.001132,0.136062,0.000000,0.608975,0.373754,0.177027,0.000720,3,2,2.0,0,0.0,0.000125,0.000438,0.006500,0.0,98027


#### Z-score Normalization

In [611]:
df_test_numeric_zscore= df_test_numeric.copy()

In [612]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_zscore.iloc[:,i] = Z_Score_Normalize(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [613]:
df_test_zscore = df_test_numeric_zscore.join(df_test_normalized)
df_test_zscore

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,-0.183226,0.317063,-0.211811,0.733785,-0.674535,-0.502767,0.542809,1.452355,-0.237785,4,2,2.0,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,-0.567332,-0.301241,-0.112595,-0.668837,0.590229,0.408727,1.183694,-0.288507,-0.098983,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,-0.090436,-0.301241,-0.112595,-0.668837,0.590229,0.408727,1.183694,-0.288507,-0.098983,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,-0.226692,0.438515,0.105966,-0.420585,1.633105,0.206252,1.090081,0.708397,1.317909,4,2,1.0,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,-0.158564,0.394351,-0.179023,-0.668837,1.988127,1.147080,-0.033267,0.113231,-0.195947,4,3,1.0,0,0.0,0.000250,0.000375,0.004313,0.0,98034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,-0.471953,-0.577270,-0.347998,-0.271634,-0.674535,0.997549,-0.940586,-0.660486,-0.426900,3,2,3.0,0,0.0,0.000125,0.000438,0.006813,0.0,98103
5610,-0.362948,0.283939,-0.237395,0.696547,-0.674535,-0.351806,-1.055802,-0.214111,-0.222522,4,2,2.0,0,0.0,0.000125,0.000438,0.007125,0.0,98146
5611,-0.357222,-1.140368,-0.342825,-0.904676,-0.674535,0.247033,-0.602142,-1.419323,-0.409016,2,0,2.0,0,0.0,0.000125,0.000375,0.006813,0.0,98144
5612,-0.362948,-0.499982,-0.318304,-0.184746,-0.674535,-0.181526,1.054076,-0.839036,-0.434873,3,2,2.0,0,0.0,0.000125,0.000438,0.006500,0.0,98027


#### Decimal Scalling

In [614]:
df_test_numeric_decimal = df_test_numeric.copy()

In [619]:
for i in range(df_test_numeric.shape[1]):
    df_test_numeric_decimal.iloc[:,i] = Decimal_Scalling(df_test_numeric.iloc[:,i],df_train_numeric.iloc[:,i])

In [620]:
df_test_decimal = df_test_numeric_decimal.join(df_test_normalized)
df_test_decimal

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,long,sqft_living15,sqft_lot15,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode
0,0.046595,0.0234,0.000690,0.234,0.000,0.000005,-0.000012,0.295,0.006775,4,2,2.0,0,0.0,0.000125,0.000375,0.006313,0.0,98059
1,0.032500,0.0178,0.001110,0.121,0.057,0.000005,-0.000012,0.178,0.010640,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
2,0.050000,0.0178,0.001110,0.121,0.057,0.000005,-0.000012,0.178,0.010640,3,1,1.0,0,0.0,0.000125,0.000375,0.004938,0.0,98074
3,0.045000,0.0245,0.002035,0.141,0.104,0.000005,-0.000012,0.245,0.050094,4,2,1.0,0,0.0,0.000125,0.000438,0.004875,0.0,98075
4,0.047500,0.0241,0.000828,0.121,0.120,0.000005,-0.000012,0.205,0.007940,4,3,1.0,0,0.0,0.000250,0.000375,0.004313,0.0,98034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.036000,0.0153,0.000113,0.153,0.000,0.000005,-0.000012,0.153,0.001509,3,2,3.0,0,0.0,0.000125,0.000438,0.006813,0.0,98103
5610,0.040000,0.0231,0.000581,0.231,0.000,0.000005,-0.000012,0.183,0.007200,4,2,2.0,0,0.0,0.000125,0.000438,0.007125,0.0,98146
5611,0.040210,0.0102,0.000135,0.102,0.000,0.000005,-0.000012,0.102,0.002007,2,0,2.0,0,0.0,0.000125,0.000375,0.006813,0.0,98144
5612,0.040000,0.0160,0.000239,0.160,0.000,0.000005,-0.000012,0.141,0.001287,3,2,2.0,0,0.0,0.000125,0.000438,0.006500,0.0,98027


In [447]:
df_test_min_max.price = Min_Max_Normalize(df_test.price, df_train.price, 0, 1)
df_test_min_max.sqft_living = Min_Max_Normalize(df_test.sqft_living, df_train.sqft_living, 0, 1)
df_test_min_max.sqft_lot = Min_Max_Normalize(df_test.sqft_lot, df_train.sqft_lot, 0, 1)
df_test_min_max.sqft_above = Min_Max_Normalize(df_test.sqft_above, df_train.sqft_above, 0, 1)
df_test_min_max.sqft_basement = Min_Max_Normalize(df_test.sqft_basement, df_train.sqft_basement, 0, 1)
df_test_min_max.lat = Min_Max_Normalize(df_test.lat, df_train.lat, 0, 1)
df_test_min_max.long = Min_Max_Normalize(df_test.long, df_train.long, 0, 1)
df_test_min_max.sqft_living15 = Min_Max_Normalize(df_test.sqft_living15, df_train.sqft_living15, 0, 1)
df_test_min_max.sqft_lot15 = Min_Max_Normalize(df_test.sqft_lot15, df_train.sqft_lot15, 0, 1)

In [448]:
df_test_min_max

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0.051272,4,2,0.149582,0.003862,2.0,0,0.0,0.000125,0.000375,0.217920,0.000000,0.006313,0.0,98059,0.536754,0.314784,0.446682,0.007024
1,0.032787,3,1,0.107062,0.006406,1.0,0,0.0,0.000125,0.000375,0.092920,0.118257,0.004938,0.0,98074,0.741676,0.388704,0.241814,0.011464
2,0.055738,3,1,0.107062,0.006406,1.0,0,0.0,0.000125,0.000375,0.092920,0.118257,0.004938,0.0,98074,0.741676,0.388704,0.241814,0.011464
3,0.049180,4,2,0.157935,0.012011,1.0,0,0.0,0.000125,0.000438,0.115044,0.215768,0.004875,0.0,98075,0.696156,0.377907,0.359132,0.056785
4,0.052459,4,3,0.154897,0.004703,1.0,0,0.0,0.000250,0.000375,0.092920,0.248963,0.004313,0.0,98034,0.907673,0.248339,0.289091,0.008363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.037377,3,2,0.088079,0.000370,3.0,0,0.0,0.000125,0.000438,0.128319,0.000000,0.006813,0.0,98103,0.874055,0.143688,0.198039,0.000975
5610,0.042623,4,2,0.147304,0.003206,2.0,0,0.0,0.000125,0.000438,0.214602,0.000000,0.007125,0.0,98146,0.570693,0.130399,0.250569,0.007513
5611,0.042898,2,0,0.049355,0.000503,2.0,0,0.0,0.000125,0.000375,0.071903,0.000000,0.006813,0.0,98144,0.705324,0.182724,0.108738,0.001547
5612,0.042623,3,2,0.093394,0.001132,2.0,0,0.0,0.000125,0.000438,0.136062,0.000000,0.006500,0.0,98027,0.608975,0.373754,0.177027,0.000720


#### Z-score normalize

In [449]:
df_test_zscore = df_test_normalized.copy()

In [450]:
df_test_zscore.price = Z_Score_Normalize(df_test.price, df_train.price)
df_test_zscore.sqft_living = Z_Score_Normalize(df_test.sqft_living, df_train.sqft_living)
df_test_zscore.sqft_lot = Z_Score_Normalize(df_test.sqft_lot, df_train.sqft_lot)
df_test_zscore.sqft_above = Z_Score_Normalize(df_test.sqft_above, df_train.sqft_above)
df_test_zscore.sqft_basement = Z_Score_Normalize(df_test.sqft_basement, df_train.sqft_basement)
df_test_zscore.lat = Z_Score_Normalize(df_test.lat, df_train.lat)
df_test_zscore.long = Z_Score_Normalize(df_test.long, df_train.long)
df_test_zscore.sqft_living15 = Z_Score_Normalize(df_test.sqft_living15, df_train.sqft_living15)
df_test_zscore.sqft_lot15 = Z_Score_Normalize(df_test.sqft_lot15, df_train.sqft_lot15)

In [451]:
df_test_zscore

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,-0.183226,4,2,0.317063,-0.211811,2.0,0,0.0,0.000125,0.000375,0.733785,-0.674535,0.006313,0.0,98059,-0.502767,0.542809,1.452355,-0.237785
1,-0.567332,3,1,-0.301241,-0.112595,1.0,0,0.0,0.000125,0.000375,-0.668837,0.590229,0.004938,0.0,98074,0.408727,1.183694,-0.288507,-0.098983
2,-0.090436,3,1,-0.301241,-0.112595,1.0,0,0.0,0.000125,0.000375,-0.668837,0.590229,0.004938,0.0,98074,0.408727,1.183694,-0.288507,-0.098983
3,-0.226692,4,2,0.438515,0.105966,1.0,0,0.0,0.000125,0.000438,-0.420585,1.633105,0.004875,0.0,98075,0.206252,1.090081,0.708397,1.317909
4,-0.158564,4,3,0.394351,-0.179023,1.0,0,0.0,0.000250,0.000375,-0.668837,1.988127,0.004313,0.0,98034,1.147080,-0.033267,0.113231,-0.195947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,-0.471953,3,2,-0.577270,-0.347998,3.0,0,0.0,0.000125,0.000438,-0.271634,-0.674535,0.006813,0.0,98103,0.997549,-0.940586,-0.660486,-0.426900
5610,-0.362948,4,2,0.283939,-0.237395,2.0,0,0.0,0.000125,0.000438,0.696547,-0.674535,0.007125,0.0,98146,-0.351806,-1.055802,-0.214111,-0.222522
5611,-0.357222,2,0,-1.140368,-0.342825,2.0,0,0.0,0.000125,0.000375,-0.904676,-0.674535,0.006813,0.0,98144,0.247033,-0.602142,-1.419323,-0.409016
5612,-0.362948,3,2,-0.499982,-0.318304,2.0,0,0.0,0.000125,0.000438,-0.184746,-0.674535,0.006500,0.0,98027,-0.181526,1.054076,-0.839036,-0.434873


#### Normalize by decimal scaling

In [452]:
df_test_decimal_scaling = df_test_normalized.copy()

In [453]:
df_test_decimal_scaling.price = Decimal_Scalling(df_test.price, df_train.price)
df_test_decimal_scaling.sqft_living = Decimal_Scalling(df_test.sqft_living, df_train.sqft_living)
df_test_decimal_scaling.sqft_lot = Decimal_Scalling(df_test.sqft_lot, df_train.sqft_lot)
df_test_decimal_scaling.sqft_above = Decimal_Scalling(df_test.sqft_above, df_train.sqft_above)
df_test_decimal_scaling.sqft_basement = Decimal_Scalling(df_test.sqft_basement, df_train.sqft_basement)
df_test_decimal_scaling.lat = Decimal_Scalling(df_test.lat, df_train.lat)
df_test_decimal_scaling.long = Decimal_Scalling(df_test.long, df_train.long)
df_test_decimal_scaling.sqft_living15 = Decimal_Scalling(df_test.sqft_living15, df_train.sqft_living15)
df_test_decimal_scaling.sqft_lot15 = Decimal_Scalling(df_test.sqft_lot15, df_train.sqft_lot15)

In [454]:
df_test_decimal_scaling

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0.046595,4,2,0.0234,0.000690,2.0,0,0.0,0.000125,0.000375,0.234,0.000,0.006313,0.0,98059,0.000005,-0.000012,0.295,0.006775
1,0.032500,3,1,0.0178,0.001110,1.0,0,0.0,0.000125,0.000375,0.121,0.057,0.004938,0.0,98074,0.000005,-0.000012,0.178,0.010640
2,0.050000,3,1,0.0178,0.001110,1.0,0,0.0,0.000125,0.000375,0.121,0.057,0.004938,0.0,98074,0.000005,-0.000012,0.178,0.010640
3,0.045000,4,2,0.0245,0.002035,1.0,0,0.0,0.000125,0.000438,0.141,0.104,0.004875,0.0,98075,0.000005,-0.000012,0.245,0.050094
4,0.047500,4,3,0.0241,0.000828,1.0,0,0.0,0.000250,0.000375,0.121,0.120,0.004313,0.0,98034,0.000005,-0.000012,0.205,0.007940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,0.036000,3,2,0.0153,0.000113,3.0,0,0.0,0.000125,0.000438,0.153,0.000,0.006813,0.0,98103,0.000005,-0.000012,0.153,0.001509
5610,0.040000,4,2,0.0231,0.000581,2.0,0,0.0,0.000125,0.000438,0.231,0.000,0.007125,0.0,98146,0.000005,-0.000012,0.183,0.007200
5611,0.040210,2,0,0.0102,0.000135,2.0,0,0.0,0.000125,0.000375,0.102,0.000,0.006813,0.0,98144,0.000005,-0.000012,0.102,0.002007
5612,0.040000,3,2,0.0160,0.000239,2.0,0,0.0,0.000125,0.000438,0.160,0.000,0.006500,0.0,98027,0.000005,-0.000012,0.141,0.001287


## Testing Min Max Normalization

In [455]:
def Proximity_Nominal():
    

SyntaxError: unexpected EOF while parsing (<ipython-input-455-2daddd64b056>, line 1)

#### k=5
#### Manhattan Distance

In [None]:
df_test_min_max