In [128]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt
%matplotlib inline
# Explicit dtype for every column of the kc_house_data CSV files.
# 'floors', 'zipcode', 'date' and 'id' are read as strings here; clean_features
# later converts the first two to float and drops the remaining string columns.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Load the train / test / validation splits.
khouse_train = pd.read_csv('knn data/kc_house_data_small_train.csv', dtype=dtype_dict)
khouse_test = pd.read_csv('knn data/kc_house_data_small_test.csv', dtype=dtype_dict)
khouse_valid = pd.read_csv('knn data/kc_house_data_validation.csv', dtype=dtype_dict)

# Split each frame into a feature matrix and its 'price' target.
# NOTE: clean_features and normalize_features are defined in a later cell of
# this notebook; that cell must be executed before this one.
khouse_train, train_y = clean_features(khouse_train)
khouse_test, test_y = clean_features(khouse_test)
khouse_valid, valid_y = clean_features(khouse_valid)

# Scale every training column by its 2-norm and keep the norms ...
khouse_train, norms = normalize_features(khouse_train)

# ... so the test and validation features are divided by the *training* norms.
khouse_test = khouse_test / norms
khouse_valid = khouse_valid / norms

In [156]:
def RSS(y, y_est):
    """Return the residual sum of squares between ``y`` and ``y_est``.

    Both arguments must support elementwise subtraction and the result must
    expose ``.sum()`` (e.g. numpy arrays or pandas Series).
    """
    residuals = y - y_est
    squared_residuals = residuals ** 2
    return squared_residuals.sum()


def set_data(features, output):
    """Append a constant column to ``features`` and relabel its columns 0..n-1.

    ``features`` is modified in place (pass a copy to keep the original) and
    the same object is returned.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix; gains a trailing column of ones.
    output : any
        Target values, returned unchanged.

    Returns
    -------
    tuple
        ``(features, output)``.
    """
    features['constant'] = 1
    # Assign the labels directly: set_axis(..., inplace=True) raises TypeError
    # on pandas >= 2.0, where the ``inplace`` keyword was removed.
    features.columns = list(range(features.shape[1]))
    return (features, output)


# def predict_output(feature_matrix, weights):
#     return (feature_matrix*weights).sum(axis=1)


def normalize_features(features):
    """Scale each column of ``features`` to unit 2-norm.

    Parameters
    ----------
    features : pandas.DataFrame or 2-D array
        Feature matrix with one column per feature.

    Returns
    -------
    tuple
        ``(normalized_features, norms)`` where ``norms`` holds the divisor
        used for each column, so other data sets can be scaled identically
        with ``other / norms``.
    """
    norms = np.linalg.norm(features, axis=0)
    # An all-zero column has norm 0; dividing by it would produce NaN/inf.
    # Use 1 as the divisor for such columns so they simply stay zero.
    zero_norm = norms == 0
    if np.any(zero_norm):
        norms = np.where(zero_norm, 1.0, norms)
    return (features / norms, norms)


def clean_features(df):
    """Split a raw house-data frame into non-object features and the price target.

    'floors' and 'zipcode' are converted to float64, every remaining
    object-dtype column is dropped, and 'price' is separated out as the
    target. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'floors', 'zipcode' and 'price' columns.

    Returns
    -------
    tuple
        ``(features, y)``: the feature frame without 'price', and the 'price'
        Series.
    """
    # assign() works on a copy, so the caller's frame is not mutated.
    converted = df.assign(
        floors=df['floors'].astype('float64'),
        zipcode=df['zipcode'].astype('float64'),
    )
    # select_dtypes avoids positional lookups such as df.dtypes[i] on a
    # label-indexed Series, which are deprecated in pandas 2.x.
    non_object = converted.select_dtypes(exclude='object')
    y = non_object['price']
    features = non_object.drop(columns=['price'])
    return (features, y)


def distance(node1, node2):
    """Return the Euclidean distance between two feature vectors."""
    delta = node1 - node2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)


def compute_distances(features_instances: pd.DataFrame, features_query: pd.Series) -> pd.Series:
    """Return the Euclidean distance from ``features_query`` to every row.

    Parameters
    ----------
    features_instances : pandas.DataFrame
        One row per instance, one column per feature.
    features_query : pandas.Series
        A single query point whose index matches the columns of
        ``features_instances``.

    Returns
    -------
    pandas.Series
        One distance per row, indexed like ``features_instances``.
    """
    # DataFrame - Series aligns the Series index with the frame's columns,
    # so the query is subtracted from every row.
    squared_diff = (features_instances - features_query) ** 2
    return np.sqrt(squared_diff.sum(axis=1))


def k_nearest_neighbors(k, features_train, features_query):
    """Return the argsort positions of the ``k`` training rows closest to the query.

    Distances come from ``compute_distances``; the result is the first ``k``
    entries of ``np.argsort`` over those distances, i.e. row positions in
    ``features_train`` ordered from nearest to farthest.
    """
    distances = compute_distances(features_train, features_query)
    ranking = np.argsort(distances)
    return ranking[:k]


def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean output of its k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours to average over.
    features_train : pandas.DataFrame
        Training feature matrix.
    output_train : pandas.Series or array-like
        Training outputs, in the same row order as ``features_train``.
    features_query : pandas.Series
        Feature vector of the query point.

    Returns
    -------
    float
        Sum of the neighbours' outputs divided by ``k``.
    """
    nearest_idx = k_nearest_neighbors(k, features_train, features_query)
    # k_nearest_neighbors yields argsort *positions*, so select by position.
    # Label-based ``output_train[nearest_idx]`` is only correct when
    # output_train happens to carry a default 0..n-1 index.
    positions = np.asarray(nearest_idx)
    if hasattr(output_train, 'iloc'):
        neighbor_outputs = output_train.iloc[positions]
    else:
        neighbor_outputs = np.asarray(output_train)[positions]
    return neighbor_outputs.sum() / k

def predict_output(k, features_train, output_train, features_query):
    """Predict the output for every row of ``features_query`` with k-NN.

    Parameters
    ----------
    k : int
        Number of neighbours to average over.
    features_train : pandas.DataFrame
        Training feature matrix.
    output_train : pandas.Series or array-like
        Training outputs, in the same row order as ``features_train``.
    features_query : pandas.DataFrame
        One row per query point.

    Returns
    -------
    numpy.ndarray
        Float array with one prediction per query row, in row order.
    """
    # Iterate by position (.iloc): .loc[i] would treat i as an index label and
    # fail for any query frame whose index is not exactly 0..n-1.
    # Collecting into a list also avoids re-copying the array on every
    # np.append call.
    predictions = [
        predict_output_of_query(k, features_train, output_train, features_query.iloc[row])
        for row in range(features_query.shape[0])
    ]
    return np.array(predictions, dtype=float)

In [122]:
# Candidate two-feature subset; only referenced by the disabled calls below.
simple_features = ['sqft_living','bedrooms']
# NOTE(review): the three disabled set_data calls below all assign to the same
# names and all read khouse_train['price']; clean_features drops 'price' from
# the feature frames, so they would need rework before being re-enabled.
# (simple_feature_matrix, train_y) = set_data(khouse_train[simple_features].copy(), khouse_train['price'])
# (simple_feature_matrix, train_y) = set_data(khouse_test[simple_features].copy(), khouse_train['price'])
# (simple_feature_matrix, train_y) = set_data(khouse_valid[simple_features].copy(), khouse_train['price'])

# Euclidean distance between the test row labelled 0 and the training row
# labelled 9, over all feature columns (same formula as distance()).
dis = np.sqrt(np.sum((khouse_test.loc[0]-khouse_train.loc[9])**2))
print('Distance: ',dis)

Distance:  0.05972359371398078


In [123]:
# Distance from the test row labelled 0 to each training row labelled 0..9.
for i in range(10):
    print(i, distance(khouse_test.loc[0],khouse_train.loc[i]))

0 0.06027470916295592
1 0.08546881147643746
2 0.06149946435279315
3 0.05340273979294363
4 0.05844484060170442
5 0.059879215098128345
6 0.0546314049677546
7 0.05543108323614607
8 0.052383627840220305
9 0.05972359371398078


In [129]:
# 1-nearest-neighbour prediction for the test row labelled 2:
# m holds the distance from that row to every training row.
m = compute_distances(khouse_train,khouse_test.loc[2])
# idxmin() is the index label of the closest training row; min() is its distance.
print(m.idxmin())
print(m.min())
# The closest row's price is the 1-NN prediction.
print('Predict: ',train_y[m.idxmin()])

382
0.0028605187556923974
Predict:  249000.0


In [142]:
# The 4 nearest training rows to the test row labelled 2, nearest first.
m = k_nearest_neighbors(4,khouse_train,khouse_test.loc[2])
print(m)
# Last expression of the cell: the 4-NN prediction (average price of those rows).
predict_output_of_query(4,khouse_train,train_y,khouse_test.loc[2])

0     382
1    1149
2    4087
3    3142
dtype: int64


413987.5

In [157]:
# 10-NN predictions for the test rows labelled 0 through 9
# (.loc slicing includes both endpoints, hence ten rows).
predict_output(10, khouse_train,train_y,khouse_test.loc[0:9])

array([881300. , 431860. , 460595. , 430200. , 766750. , 667420. ,
       350032. , 512800.7, 484000. , 457235. ])

In [159]:
# Use validation set to choose k: for each k in 1..15, predict every
# validation row and print the validation RSS followed by k.
# NOTE: every iteration recomputes all query-to-training distances from scratch.
for i in range(1,16):
    p = predict_output(i,khouse_train,train_y,khouse_valid)
    print(RSS(valid_y,p),i)

105453830251561.0 1
83445073504025.5 2
72692096019202.56 3
71946721652091.69 4
69846517419718.6 5
68899544353180.836 6
68341973450051.09 7
67361678735491.5 8
68372727958976.09 9
69335048668556.74 10
69523855215598.83 11
69048535243496.17 12
70011254508263.69 13
70908698869034.34 14
71106928385945.16 15


In [161]:
# Test error of KNN: RSS on the test set with k=8, the k that gave the lowest
# validation RSS in the sweep above.
p = predict_output(8,khouse_train,train_y,khouse_test)
print('{:e}'.format(RSS(test_y,p)))

1.331188e+14
