In [1]:
import pandas as pd

In [2]:
import numpy as np


In [3]:
# Column name -> Python type, passed as `dtype=` when the CSV files are read.
# 'zipcode', 'date' and 'id' are read as strings rather than numbers.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [4]:
# Complete "small" data set. NOTE: the path is absolute and specific to one
# Windows account, so it must be edited before running on another machine.
# `data` is not referenced again in the cells visible below.
data = pd.read_csv(
    'C:\\Documents and Settings\\G. Kannan\\My Documents\\Downloads\\kc_house_data_small.csv',
    dtype=dtype_dict
)

In [5]:
# Training split; its rows are what the nearest-neighbour search looks through.
# NOTE: absolute, machine-specific path.
train = pd.read_csv(
    'C:\\Documents and Settings\\G. Kannan\\My Documents\\Downloads\\kc_house_data_small_train.csv',
    dtype=dtype_dict
)

In [6]:
# Test split; supplies the query rows and the final RSS figure.
# NOTE: absolute, machine-specific path.
test = pd.read_csv(
    'C:\\Documents and Settings\\G. Kannan\\My Documents\\Downloads\\kc_house_data_small_test.csv',
    dtype=dtype_dict
)

In [7]:
# Validation split; used further down to choose k.
# NOTE: absolute, machine-specific path.
valid = pd.read_csv(
    'C:\\Documents and Settings\\G. Kannan\\My Documents\\Downloads\\kc_house_data_validation.csv',
    dtype=dtype_dict
)

In [8]:
def get_numpy_data(data_frame, features, output):
    """Build the feature matrix and target array for a regression problem.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source table. It is left unmodified.
    features : list of str
        Names of the input columns, in the order they should appear.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array)
        ``feature_matrix`` is a 2-D array whose first column is the constant 1
        (intercept term) followed by the ``features`` columns.
        ``output_array`` is a 2-D array with a single column holding
        ``output``.
    """
    # Select into a fresh frame so the caller's DataFrame does not silently
    # gain a 'constant' column as a side effect of this call.
    feature_frame = data_frame[list(features)].copy()
    feature_frame.insert(0, 'constant', 1)
    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` exists in
    # both old and current releases.
    feature_matrix = feature_frame.values
    # Double brackets keep the (n_rows, 1) shape callers already rely on.
    output_array = data_frame[[output]].values
    return (feature_matrix, output_array)

In [9]:
def normalize_features(features):
    """Scale each column of ``features`` by its Euclidean (2-) norm.

    Returns a ``(normalized_features, norms)`` tuple. ``norms`` holds the
    per-column norm of the input, so the same scaling can be applied to
    other matrices by dividing them by ``norms``.
    """
    column_norms = np.linalg.norm(features, axis=0)
    return (features / column_norms, column_norms)

In [10]:
# Input columns for the model. get_numpy_data places a constant column in
# front of these, so the matrices below have len(feature_list) + 1 columns.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

# Feature matrix / 'price' target pair for each of the three splits.
features_train, output_train = get_numpy_data(train, feature_list, 'price')
features_test, output_test = get_numpy_data(test, feature_list, 'price')
features_valid, output_valid = get_numpy_data(valid, feature_list, 'price')

In [11]:
# The column norms are computed from the training matrix only; the test and
# validation matrices are then rescaled by those same training norms.
features_train, norms = normalize_features(features_train)
features_test = np.divide(features_test, norms)
features_valid = np.divide(features_valid, norms)

In [12]:
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(features_test[0])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]


In [13]:
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(features_train[9])

[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [14]:
np.sqrt(np.sum((features_train[9]-features_test[0])**2))

0.059723593713980783

In [15]:
# Euclidean distance from test row 0 to each of the first ten training rows.
first10 = [
    np.sqrt(sum((features_train[i] - features_test[0]) ** 2))
    for i in range(10)
]

In [16]:
first10.index(min(first10))

8

In [17]:
# Broadcasting check: subtracting one query row from a block of training rows
# must give the same result as subtracting it from each row individually.
results = features_train[0:3] - features_test[0]
for row in range(3):
    # Should print all 0's if results[row] == (features_train[row]-features_test[0]).
    # The call form of print works on both Python 2 and Python 3.
    print(results[row] - (features_train[row] - features_test[0]))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [18]:
diff=features_train[:]-features_test[0]

In [19]:
diff[-1].sum()

-0.093433998746546426

In [20]:
distance=np.sqrt(np.sum(diff**2,axis=1))

In [21]:
distance[100]

0.023708232416678195

In [22]:
def compute_distances(features_instances, features_query):
    """Euclidean distance from ``features_query`` to every row of ``features_instances``.

    ``features_query`` is broadcast against the rows of
    ``features_instances``; the result has one distance per row.
    """
    offsets = features_instances - features_query
    return np.sqrt(np.square(offsets).sum(axis=1))

In [23]:
d=compute_distances(features_train,features_test[2])

In [24]:
# Position(s) in `d` whose distance equals the minimum.
np.flatnonzero(d == min(d))

array([382])

In [25]:
min(d)

0.0028604955575117085

In [26]:
d[382]

0.0028604955575117085

In [27]:
# 382 is a row *position* (it comes from the distance array), so look the
# price up positionally instead of by index label.
train['price'].iloc[382]

249000.0

In [28]:
np.argsort(d)

array([ 382, 1149, 4087, ..., 1107, 5226, 2486])

In [29]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Row indices of the ``k`` rows of ``feature_train`` closest to ``features_query``.

    Closeness is Euclidean distance. The returned indices are ordered from
    nearest to farthest.
    """
    offsets = feature_train - features_query
    squared_sums = np.square(offsets).sum(axis=1)
    ranking = np.argsort(np.sqrt(squared_sums))
    return ranking[:k]

In [112]:
[k_nearest_neighbors(4, features_train, features_test[2])]

[array([ 382, 1149, 4087, 3142])]

In [31]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean over its k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours to average; must be at least 1.
    features_train : 2-D array
        One row per training example.
    output_train : array
        Training outputs, indexed by the same row positions as
        ``features_train``.
    features_query : 1-D array
        Feature vector of the query point.

    Returns
    -------
    The average of ``output_train`` over the nearest rows (Euclidean
    distance). When ``output_train`` has shape ``(n, 1)`` the result is a
    length-1 array.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or there are no training rows.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    offsets = features_train - features_query
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))
    nearest = np.argsort(distances)[:k]
    if len(nearest) == 0:
        raise ValueError("features_train contains no rows")
    neighbor_outputs = output_train[nearest]
    # Divide by the number of neighbours actually found: when k exceeds the
    # number of training rows, dividing by k would understate the average.
    return sum(neighbor_outputs) / len(nearest)

In [32]:
predict_output_of_query(4, features_train, output_train, features_test[2])

array([ 413987.5])

In [62]:
def predict_output(k, features_train, output_train, features_query):
    """Predict an output for every query row by k-nearest-neighbour averaging.

    Parameters
    ----------
    k : int
        Number of neighbours to average; must be at least 1.
    features_train : 2-D array
        One row per training example.
    output_train : array
        Training outputs, indexed by the same row positions as
        ``features_train``.
    features_query : 2-D array
        One row per query point.

    Returns
    -------
    list of float
        One prediction per row of ``features_query``, in order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or a query is made against an empty
        training set.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    predictions = []
    for i in range(features_query.shape[0]):
        offsets = features_train - features_query[i]
        distances = np.sqrt(np.sum(offsets ** 2, axis=1))
        nearest = np.argsort(distances)[:k]
        if len(nearest) == 0:
            raise ValueError("features_train contains no rows")
        neighbor_outputs = np.reshape(output_train[nearest], -1).tolist()
        # Divide by the number of neighbours actually found (not k), so a k
        # larger than the training set still yields a true average.
        # append() also avoids rebuilding the whole list on every iteration.
        predictions.append(sum(neighbor_outputs) / len(nearest))
    return predictions

In [63]:
result = predict_output(10, features_train, output_train, features_test[0:10])
result

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.7,
 484000.0,
 457235.0]

In [64]:
result.index(min(result))

6

In [65]:
result

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.7,
 484000.0,
 457235.0]

In [66]:
result[6]

350032.0

In [67]:
min(result)

350032.0

In [107]:
# Flatten the validation targets to a plain list, then to a 1-D array, so they
# line up element-wise with the list of predictions returned by predict_output.
output_valid = np.reshape(output_valid, -1).tolist()
valid_out = np.array(output_valid)

# Residual sum of squares on the validation set for k = 1 .. 15;
# rss[j] corresponds to k = j + 1.
rss = []
for k in range(1, 16):
    predicted = predict_output(k, features_train, output_train, features_valid)
    predict = np.array(predicted)
    rss.append(np.sum(np.square(valid_out - predict)))

In [109]:
min(rss)

67361678735491.5

In [126]:
rss.index(min(rss))+1

8

In [122]:
predicted=predict_output(8, features_train, output_train, features_test)

In [123]:
predict=np.array(predicted)

In [124]:
test_out=np.array(np.reshape(output_test,-1).tolist())

In [125]:
np.sum((test_out-predict)**2)

133118823551516.81

In [117]:
rss

[105453830251561.0,
 83445073504025.5,
 72692096019202.562,
 71946721652091.687,
 69846517419718.602,
 68899544353180.836,
 68341973450051.094,
 67361678735491.5,
 68372727958976.094,
 69335048668556.742,
 69523855215598.828,
 69049969587246.172,
 70011254508263.687,
 70908698869034.344,
 71106928385945.156]