# Improving the Model and Scikit-learn

In [1]:
# reading and cleaning the dataset
import pandas as pd
import numpy as np
np.random.seed(1)
home_listings = pd.read_csv("listings.csv")
home_listings = home_listings.loc[np.random.permutation(len(home_listings))]
home_listings['price'] = home_listings['price'].str.replace(',', '')
home_listings['price'] = home_listings['price'].str.replace('$', '')
home_listings['price'] = home_listings['price'].astype('float')
home_listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
574,741202,https://www.airbnb.com/rooms/741202,20151002231825,2015-10-03,"Gorgeous unit, historic building",Come stay with former diplomats in our 110yr o...,Why stay in a stuffy impersonal hotel room whe...,Come stay with former diplomats in our 110yr o...,none,,...,10.0,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,strict,t,t,1,4.15
1593,3920830,https://www.airbnb.com/rooms/3920830,20151002231825,2015-10-03,Bedroom in Historic DC Rowhouse!,Sunny bedroom in upper Petworth with your own ...,Welcome to our corner rowhouse in DC! About t...,Sunny bedroom in upper Petworth with your own ...,none,Our home is located in upper Petworth only ste...,...,9.0,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,strict,f,f,2,3.68
3091,6403405,https://www.airbnb.com/rooms/6403405,20151002231825,2015-10-03,"Spacious, airy and bright bedroom","Comfortable, spacious, fully functional room, ...","Comfortable, spacious, fully functional room, ...","Comfortable, spacious, fully functional room, ...",none,,...,10.0,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,flexible,f,f,1,0.23
420,17378,https://www.airbnb.com/rooms/17378,20151002231825,2015-10-03,[1720-6/8] Deluxe 1BR-Foggy Bottom,"Nicely furnished, economical 1 Bedroom apartme...",Take the guesswork out of finding accommodatio...,"Nicely furnished, economical 1 Bedroom apartme...",none,,...,9.0,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,super_strict_30,f,t,5,0.31
808,106741,https://www.airbnb.com/rooms/106741,20151002231825,2015-10-03,4.5 Bedroom Capitol hill phase II,,The house has 5 rooms in total: One large room...,The house has 5 rooms in total: One large room...,none,,...,8.0,f,,"DISTRICT OF COLUMBIA, WASHINGTON",f,strict,f,f,3,0.64


### Removing the columns
that cannot be used as features as we cannot apply distance formulas on them

In [2]:
home_listings.drop(home_listings.columns[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,54,55,56,58,59,60,61,62,63,66,67,68,69,70,71,72,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91]], axis=1, inplace=True) 

In [5]:
home_listings.drop(home_listings.columns[[0,1]],axis=1,inplace=True)
len(home_listings.columns)

8

### Handling Missing Values

In [7]:
home_listings = home_listings.dropna(axis=0)
home_listings.isnull().sum()


accommodates         0
bathrooms            0
bedrooms             0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64

In [8]:
home_listings.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
574,2,1.0,1.0,1.0,125.0,1,4,149
1593,2,1.5,1.0,1.0,85.0,1,30,49
3091,1,0.5,1.0,1.0,50.0,1,1125,1
420,2,1.0,1.0,1.0,209.0,4,730,2
808,12,2.0,5.0,5.0,215.0,2,1825,34


### Normalize columns

In [10]:
normalized_df = (home_listings - home_listings.mean())/(home_listings.std())

In [11]:
normalized_df['price'] = home_listings['price']
normalized_df.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
574,-0.596544,-0.439151,-0.249467,-0.546858,125.0,-0.341375,-0.016604,4.57965
1593,-0.596544,0.412923,-0.249467,-0.546858,85.0,-0.341375,-0.016603,1.159275
3091,-1.095499,-1.291226,-0.249467,-0.546858,50.0,-0.341375,-0.016573,-0.482505
420,-0.596544,-0.439151,-0.249467,-0.546858,209.0,0.487635,-0.016584,-0.448301
808,4.393004,1.264998,4.507903,2.829956,215.0,-0.065038,-0.016553,0.646219


### Euclidean distance multivariate case

In [16]:
from scipy.spatial import distance
first_listing = normalized_df.iloc[0][['accommodates', 'bathrooms']]
second_listing = normalized_df.iloc[1][['accommodates', 'bathrooms']]
distance_first_second = distance.euclidean(first_listing,second_listing)
distance_first_second

0.8520745409669226

### Scikit-learn

In [18]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn = KNeighborsRegressor(algorithm='brute')

In [20]:
train_df = normalized_df.sample(frac=0.75,random_state=1)
test_df = normalized_df.drop(train_df.index)

train_features = train_df[['accommodates', 'bathrooms']]
train_target = train_df['price']
knn.fit(train_features, train_target)

predictions = knn.predict(test_df[['accommodates', 'bathrooms']])
print(predictions)


[ 83.2  82.6  82.6 205.8  83.2 205.8  82.6  82.6  82.6 395.8  82.6 210.6
  90.  107.2 307.6 132.4  82.6 395.8  90.   82.6  83.2  82.6 107.2 205.8
  90.   82.6 415.  217.8 131.4  83.2 395.8  83.2  82.6 395.8 223.  131.4
 417.   82.6  82.6 203.8  83.   82.6  82.6 395.8 141.6 104.   82.6 362.2
 141.6 168.6  82.6 395.8 141.6 395.8  82.6 395.8  82.6  90.   82.6 121.4
 104.   82.6  82.6 121.4 141.6 395.8  82.6  82.6  82.6  82.6 141.6  82.6
 141.6 141.6 168.6 168.6  82.6 395.8  82.6 395.8  82.6 104.  395.8  82.6
  82.6 104.  141.6  82.6  83.2  82.6 141.6 141.6  82.6  82.6 395.8 257.4
  83.2 104.  395.8  82.6  82.6 141.6  82.6 141.6  83.2  82.6  90.  395.8
 141.6 141.6 141.6 141.6  90.  395.8  83.2  83.2  82.6 395.8  82.6 141.6
 131.4  82.6 205.8  82.6 257.4  82.6  82.6  82.6 132.4  82.6  82.6 395.8
  82.6 141.6 322.8  82.6 141.6 395.8 395.8 141.6  82.6 257.4 395.8 433.4
 131.4  82.6 395.8 395.8 107.2  80.6  82.6  83.2  82.6 395.8  82.6 322.8
 141.6  80.6 132.4 395.8 395.8 141.6 415.  203.8  8

### MSE

In [21]:
from sklearn.metrics import mean_squared_error

two_features_mse = mean_squared_error(test_df["price"], predictions)
two_features_rmse = (two_features_mse) ** (1/2)

print(two_features_mse)
print(two_features_rmse)

21648.360174291938
147.13381723550822


### Increasing features

In [22]:
train_features = train_df[['accommodates', 'bathrooms','bedrooms','number_of_reviews']]
train_target = train_df['price']
knn.fit(train_features, train_target)

predictions = knn.predict(test_df[['accommodates', 'bathrooms','bedrooms','number_of_reviews']])
print(predictions)

[  94.8  115.    80.8  212.4   58.6  266.    81.6  253.2  117.   119.6
  253.2  415.8   82.6  113.2  238.4  199.   110.8  130.6  124.2  116.
  108.8  124.8  113.2  285.8   76.8  121.6  313.4  244.8  173.4   93.4
  119.6  108.8  111.4  120.6  255.   182.4  418.   115.    83.6  209.2
  113.   111.    99.2  137.8  248.6   94.   253.2  424.   105.4   99.
   89.8  122.8  232.6  140.4  253.2  130.6  124.8  103.8  253.2  190.6
   94.   123.   121.6  207.2  119.8  130.6   99.4  117.    80.8  101.2
  119.8  124.8  136.4  170.4  138.6  107.4  124.8  140.4  105.8  138.
   91.6   94.   139.4  117.   117.    94.   134.2   96.6   89.6  115.
  138.6  158.   118.4  253.2  171.   261.    94.8   88.8  159.8   99.4
   96.   125.4  118.2  108.   108.8   93.4   82.6  130.6  116.8  141.8
  232.6  125.6   77.8  152.   108.8   89.6  118.4  128.6   91.6  104.6
   90.6  253.2  219.4  124.8  245.8   98.2  104.2  253.2   99.6   98.8
  101.8  101.8   99.4  134.6  238.8  144.4  104.6  139.8  140.4  232.6
   99.4  3

In [23]:
mse = mean_squared_error(test_df["price"], predictions)
rmse = (mse) ** (1/2)

print(mse)
print(rmse)

14191.914684095862
119.12982281568232


### All features

In [30]:
train_features = train_df[train_df.columns.tolist()]
train_target = train_df['price']
knn.fit(train_features, train_target)

predictions = knn.predict(test_df[train_df.columns.tolist()])
print(predictions)

[  55.    74.8   42.   299.    75.   114.4  120.   120.    90.   105.
  135.   199.4   71.8   69.   159.8  299.2   75.    89.    99.    59.4
   60.   100.   168.2  100.    72.   250.   299.8  175.   250.    79.8
  105.    43.2   90.   189.   125.   149.8  482.8   99.    75.   220.
  110.   100.    65.   139.   145.   150.   125.   249.8   95.   138.6
  145.   139.4  160.   129.   120.   109.   111.    70.   100.   139.6
   75.4  125.   100.   169.4  119.   288.8   85.    45.    75.   105.
   85.    85.   140.   124.8   79.4  295.6   85.   125.   150.   165.
  105.    50.   208.2   89.    80.    43.2  125.    95.    99.8   70.
  105.   159.    99.   110.   140.   164.8   53.2   74.8   60.   100.
  111.   159.4   85.    95.   275.    55.    65.   246.8  175.   129.
  399.2  115.   140.    90.6  125.    85.   189.4   95.   100.    90.
  718.6  119.   299.    80.   299.8   50.2   71.2   60.    94.6  125.
  100.    89.   130.    98.8  175.   150.   175.    75.2  180.   129.
  100.   516.   

In [31]:
mse = mean_squared_error(test_df["price"], predictions)
rmse = (mse) ** (1/2)

print(mse)
print(rmse)

178.95128540305012
13.37726748641329
