## Recap

In [19]:
import pandas as pd
import numpy as np
np.random.seed(1)

dc_listings = pd.read_csv('dc_airbnb.csv')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
stripped_commas = dc_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')
dc_listings['price'] = stripped_dollars.astype('float')

In [20]:
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3723 entries, 574 to 1061
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   host_response_rate    3289 non-null   object 
 1   host_acceptance_rate  3109 non-null   object 
 2   host_listings_count   3723 non-null   int64  
 3   accommodates          3723 non-null   int64  
 4   room_type             3723 non-null   object 
 5   bedrooms              3702 non-null   float64
 6   bathrooms             3696 non-null   float64
 7   beds                  3712 non-null   float64
 8   price                 3723 non-null   float64
 9   cleaning_fee          2335 non-null   object 
 10  security_deposit      1426 non-null   object 
 11  minimum_nights        3723 non-null   int64  
 12  maximum_nights        3723 non-null   int64  
 13  number_of_reviews     3723 non-null   int64  
 14  latitude              3723 non-null   float64
 15  longitude          

## Removing features

non-numerical values:

* room_type: e.g. Private room
* city: e.g. Washington
* state: e.g. DC

numerical but non-ordinal values:

* latitude: e.g. 38.913458
* longitude: e.g. -77.031
* zipcode: e.g. 20009

columns that don\'t directly describe the living space or the listing itself:

* host_response_rate
* host_acceptance_rate
* host_listings_count

In [21]:
dc_listings = dc_listings.drop(['room_type', 'city', 'state', 
                                'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 
                                'latitude', 'longitude', 'zipcode'], axis=1)

In [22]:
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3723 entries, 574 to 1061
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accommodates       3723 non-null   int64  
 1   bedrooms           3702 non-null   float64
 2   bathrooms          3696 non-null   float64
 3   beds               3712 non-null   float64
 4   price              3723 non-null   float64
 5   cleaning_fee       2335 non-null   object 
 6   security_deposit   1426 non-null   object 
 7   minimum_nights     3723 non-null   int64  
 8   maximum_nights     3723 non-null   int64  
 9   number_of_reviews  3723 non-null   int64  
dtypes: float64(4), int64(4), object(2)
memory usage: 319.9+ KB


## Handling missing values

There are also 2 columns that have a large number of missing values:

* cleaning_fee - 37.3% of the rows
* security_deposit - 61.7% of the rows

remove these 2 columns entirely from consideration.

In [23]:
dc_listings.drop(['cleaning_fee', 'security_deposit'], axis=1, inplace=True)

In [24]:
dc_listings.dropna(subset=['bedrooms', 'bathrooms', 'beds'], axis=0, how='any', inplace=True)

In [25]:
dc_listings.isnull().sum()

accommodates         0
bedrooms             0
bathrooms            0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64

## Normalize columns

In [27]:
normalized_listings = (normalized_listings - normalized_listings.mean())/normalized_listings.std()
normalized_listings['price'] = dc_listings['price']
normalized_listings.head(3)

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,minimum_nights,maximum_nights,number_of_reviews,price
574,-0.596544,-0.249467,-0.439151,-0.546858,-0.341375,-0.016604,4.57965,125.0
1593,-0.596544,-0.249467,0.412923,-0.546858,-0.341375,-0.016603,1.159275,85.0
3091,-1.095499,-0.249467,-1.291226,-0.546858,-0.341375,-0.016573,-0.482505,50.0


## Euclidean distance for multivariate case

In [30]:
from scipy.spatial import distance

normalized_listings.reset_index(inplace=True)
first = normalized_listings.loc[0:0, ['accommodates', 'bathrooms']]
fifth = normalized_listings.loc[4:4, ['accommodates', 'bathrooms']]
first_fifth_distance = distance.euclidean(first, fifth)

In [31]:
print(first_fifth_distance)

5.272543124668522


## Introduction to scikit-learn

The scikit-learn workflow consists of 4 main steps:

* instantiate the specific machine learning model you want to use
* fit the model to the training data
* use the model to make predictions
* evaluate the accuracy of the predictions

## Fitting a model and making predictions

In [32]:
from sklearn.neighbors import KNeighborsRegressor

train_df = normalized_listings.iloc[0:2792]
test_df = normalized_listings.iloc[2792:]

knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')

train_features = train_df[['accommodates', 'bathrooms']]
# List-like object, containing just the target column, `price`.
train_target = train_df['price']
# Pass everything into the fit method.
knn.fit(train_features, train_target)

predictions = knn.predict(test_df[['accommodates', 'bathrooms']])

## Calculating MSE using Scikit-Learn

In [34]:
from sklearn.metrics import mean_squared_error

train_columns = ['accommodates', 'bathrooms']
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute', metric='euclidean')
knn.fit(train_df[train_columns], train_df['price'])
predictions = knn.predict(test_df[train_columns])

two_features_mse = mean_squared_error(predictions, test_df['price'])

two_features_rmse = two_features_mse**0.5

print(two_features_mse)
print(two_features_rmse)

15439.800409556317
124.2569934030126


## Using more features

In [36]:
features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')

knn.fit(train_df[features], train_df['price'])

four_predictions = knn.predict(test_df[features])

four_mse = mean_squared_error(four_predictions, test_df['price'])
four_rmse = four_mse**0.5

print(four_mse)
print(four_rmse)

13707.009647326508
117.07693900733187


## Using all features

In [38]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')

train_x = train_df.drop('price', axis=1)
train_y = train_df['price']
knn.fit(train_x, train_y)

test_x = test_df.drop('price', axis=1)
test_y = test_df['price']
all_features_predictions = knn.predict(test_x)

all_features_mse  = mean_squared_error(all_features_predictions, test_y)
all_features_rmse  = all_features_mse**0.5

print(all_features_mse)
print(all_features_rmse)

23509.204050056884
153.32711452987328
