# VALIDATION

In [1]:
import numpy as np
import pandas as pd

home_listings = pd.read_csv("listings.csv")
home_listings['price'] = home_listings['price'].str.replace(",","")
home_listings['price'] = home_listings['price'].str.replace("$","")
home_listings['price'] = home_listings['price'].astype('float')

home_listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,7087327,https://www.airbnb.com/rooms/7087327,20151002231825,2015-10-03,Historic DC Condo-Walk to Capitol!,Professional pictures coming soon! Welcome to ...,,Professional pictures coming soon! Welcome to ...,none,,...,,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,flexible,f,f,18,
1,975833,https://www.airbnb.com/rooms/975833,20151002231825,2015-10-03,Spacious Capitol Hill Townhouse,,Beautifully renovated Capitol Hill townhouse. ...,Beautifully renovated Capitol Hill townhouse. ...,none,,...,9.0,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,strict,f,f,1,2.11
2,8249488,https://www.airbnb.com/rooms/8249488,20151002231825,2015-10-03,Spacious/private room for single,This is an ideal room for a single traveler th...,,This is an ideal room for a single traveler th...,none,,...,,f,,,f,flexible,f,f,1,1.0
3,8409022,https://www.airbnb.com/rooms/8409022,20151002231825,2015-10-03,A wonderful bedroom with library,Prime location right on the Potomac River in W...,,Prime location right on the Potomac River in W...,none,,...,,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,flexible,f,f,1,
4,8411173,https://www.airbnb.com/rooms/8411173,20151002231825,2015-10-03,Downtown Silver Spring,"Hi travellers! I live in this peaceful spot, b...",This is a 750 sq ft 1 bedroom 1 bathroom. Whi...,"Hi travellers! I live in this peaceful spot, b...",none,Silver Spring is booming. You can walk to a n...,...,,f,,,f,flexible,f,f,1,


In [2]:
shuffled_index = np.random.permutation(home_listings.index)
home_listings = home_listings.reindex(shuffled_index)

split_one = home_listings[0:1862]
split_two = home_listings[1862:]


### Holdout Validation

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

train_one = split_one
test_one = split_two

train_two = split_two
test_two = split_one


In [9]:
test_one_copy = test_one.copy()
model = KNeighborsRegressor()
model.fit(train_one[['accommodates']],train_one['price'])
test_one_copy['predicted_price'] = model.predict(test_one[['accommodates']])
rmse_one = mean_squared_error(test_one['price'],test_one_copy['predicted_price'])**1/2
rmse_one

9729.718463191834

In [10]:
test_two_copy = test_two.copy()
model = KNeighborsRegressor()
model.fit(train_two[['accommodates']],train_two['price'])
test_two_copy['predicted_price'] = model.predict(test_two[['accommodates']])
rmse_two = mean_squared_error(test_two['price'],test_two_copy['predicted_price'])**1/2
rmse_two

9294.807486573576

In [14]:
average_rmse = np.mean([rmse_one,rmse_two])
print(rmse_one)
print(rmse_two)
print(average_rmse)

9729.718463191834
9294.807486573576
9512.262974882706


### K-Fold Cross Validation

In [None]:
home_listings.set_value(home_listings.index[0:744],"fold",1)
home_listings.set_value(home_listings.index[744:1488],"fold",2)
home_listings.set_value(home_listings.index[1488:2232],"fold",3)
home_listings.set_value(home_listings.index[2232:2976],"fold",4)
home_listings.set_value(home_listings.index[2976:3723],"fold",5)

In [17]:
model = KNeighborsRegressor()
train_one = home_listings[home_listings["fold"]!= 1]
test_one = home_listings[home_listings["fold"]==1]
model.fit(train_one[['accommodates']],train_one['price'])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [19]:
labels = model.predict(test_one[['accommodates']])
test_one['predicted_price'] = labels
mse_one = mean_squared_error(test_one['price'],test_one['predicted_price'])
mse_one = mse_one ** (1/2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Generalizing

In [26]:
folds = [1,2,3,4,5]

def train_and_validate(df, fold):
    model = KNeighborsRegressor()
    train_one = home_listings[home_listings["fold"] != fold]
    test_one = home_listings[home_listings["fold"]==fold]
    model.fit(train_one[['accommodates']],train_one['price'])
    labels = model.predict(test_one[['accommodates']])
    test_one['predicted_price'] = labels
    mse_one = mean_squared_error(test_one['price'],test_one['predicted_price'])
    rmse = mse_one ** (1/2)
    return rmse

rmses = []
for r in folds:
    rmse1 = train_and_validate(home_listings, r)
    rmses.append(rmse1)
    
average_rmse = np.mean(rmses)

print(rmses)
print("------------------------------------------------")
print(average_rmse)

[113.2497988319797, 135.07326267741732, 145.11073617306258, 127.56061725810524, 134.29093294791505]
------------------------------------------------
131.05706957769598


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


### K-Fold using scikit-learn

In [30]:
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
kf = KFold(len(home_listings), 5, shuffle=True, random_state=1)
model = KNeighborsRegressor()
mses = cross_val_score(model, home_listings[["accommodates"]], home_listings["price"], scoring="mean_squared_error", cv=kf)
rmses = [np.sqrt(np.absolute(mse)) for mse in mses]
avg_rmse = np.mean(rmses)

print(rmses)
print(avg_rmse)


[132.4026541229138, 128.23489248643176, 145.7892609812179, 110.2320311838838, 150.6778062301233]
133.46732900091413


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


### Varying K-Values

In [34]:
folds = [3,6,7,8,9,11]
for fold in folds:
    kf = KFold(len(home_listings), fold, shuffle=True, random_state=1)
    model = KNeighborsRegressor()
    mses = cross_val_score(model, home_listings[["accommodates"]], home_listings["price"], scoring="mean_squared_error", cv=kf)
    rmses = [np.sqrt(np.absolute(mse)) for mse in mses]
    avg_rmse = np.mean(rmses)
    std_rmse = np.std(rmses)
    print(str(fold), "folds: ", "avg RMSE: ", str(avg_rmse), "std RMSE: ", str(std_rmse))


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


3 folds:  avg RMSE:  137.2610197687211 std RMSE:  10.010160282764632
6 folds:  avg RMSE:  134.24245041786506 std RMSE:  15.989312885070891
7 folds:  avg RMSE:  128.29543045315103 std RMSE:  21.348210629664127


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


8 folds:  avg RMSE:  127.76069996880271 std RMSE:  18.81679341484647
9 folds:  avg RMSE:  132.41586829984766 std RMSE:  25.184651521707394
11 folds:  avg RMSE:  131.7249351159235 std RMSE:  24.3432095657993


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [None]:
# Bias Variance Tradeoff