## Overview: Property price prediction model from scraped Airbnb data

* Combine the data sets from 3 cities, adding a 'City' feature column
* Make a baseline model by predicting each city's average price and calculating its RMSE
* Improve on the baseline by adding features to a linear regression model and calculate the improvement
* Calculate the model's RMSE over different training-set sizes to see how it improves

In [1]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def load_df(df_file, basepath='data/'):
    """Load one processed listings CSV into a DataFrame.

    Parameters
    ----------
    df_file : str
        File name of the CSV, relative to ``basepath``.
    basepath : str, optional
        Directory that holds the CSV files. Defaults to ``'data/'``.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV, with its first column used as the index.
    """
    import os

    filepath = os.path.join(basepath, df_file)
    # DataFrame.from_csv was deprecated and later removed from pandas; the
    # documented replacement is read_csv with index_col=0, parse_dates=True,
    # which reproduces from_csv's defaults.
    return pd.read_csv(filepath, index_col=0, parse_dates=True)

In [3]:
# Read the three per-city processed datasets (from case-study-explore),
# in the order NY, LA, SF.
ny_df, la_df, sf_df = map(
    load_df,
    ('processed_ny_df.csv', 'processed_la_df.csv', 'processed_sf_df.csv')
)

In [4]:
# Preview the first five NY rows to check the loaded columns.
ny_df.head(n=5)

Unnamed: 0,Bedrooms,Capacity,Id,Price,Property_Type,Review_Count,Room_Type,Star_Rating
0,1,2,6882689,198,Apartment,17,2,5
1,2,7,12022627,2200,Apartment,0,2,0
2,1,2,8940740,149,Apartment,6,2,5
3,1,4,12035970,55,Apartment,0,2,0
4,1,2,11642165,65,Apartment,3,1,5


In [18]:
# Mean NY price for each star rating.
gb = ny_df.groupby('Star_Rating')['Price'].mean()
gb

Star_Rating
0.0    125.722892
4.0     66.500000
4.5     93.458824
5.0     99.825758
Name: Price, dtype: float64

In [20]:
# Mean NY price for each guest capacity.
gb2 = ny_df.groupby('Capacity')['Price'].mean()
gb2

Capacity
1      58.303571
2      94.967033
3     138.827586
4     130.848485
5     102.000000
6     135.000000
7    2200.000000
8     245.000000
Name: Price, dtype: float64

In [22]:
# Mean NY price for each (integer-coded) room type.
gb3 = ny_df.groupby('Room_Type')['Price'].mean()
gb3

Room_Type
0     49.333333
1     70.538462
2    143.437500
Name: Price, dtype: float64

In [23]:
# Tag every NY row with its city so the frames can be told apart once combined.
# This adds the column to ny_df in place.
ny_df['City'] = 'NY'

In [24]:
# Confirm the new City column is present on the NY frame.
ny_df.head(n=5)

Unnamed: 0,Bedrooms,Capacity,Id,Price,Property_Type,Review_Count,Room_Type,Star_Rating,City
0,1,2,6882689,198,Apartment,17,2,5,NY
1,2,7,12022627,2200,Apartment,0,2,0,NY
2,1,2,8940740,149,Apartment,6,2,5,NY
3,1,4,12035970,55,Apartment,0,2,0,NY
4,1,2,11642165,65,Apartment,3,1,5,NY


In [25]:
# Same city tag for the other two frames (added in place).
la_df['City'] = 'LA'
sf_df['City'] = 'SF'

In [45]:
# Broadcast the NY mean price to every NY row; this is the baseline prediction.
# NOTE(review): the mean is taken over the unfiltered frame, so any rows removed
# by the later price filter still contribute to it -- confirm this is intended.
ny_df['Avg_Price'] = ny_df['Price'].mean()

In [47]:
# Per-city mean price for LA and SF, broadcast to every row of each frame.
# NOTE(review): computed before the later price filter on the combined frame.
la_df['Avg_Price'] = la_df['Price'].mean()
sf_df['Avg_Price'] = sf_df['Price'].mean()

In [30]:
def combine_dfs(df_list):
    """Stack several DataFrames row-wise into a single DataFrame.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to stack, in order.

    Returns
    -------
    pandas.DataFrame or None
        The frames concatenated top to bottom, keeping each frame's own
        index labels (so labels may repeat). ``None`` when ``df_list`` is
        empty. The result is always a new object, even for a single frame.
    """
    frames = list(df_list)
    if not frames:
        # pd.concat raises on an empty sequence, so return None explicitly
        # to keep the empty-input result callers already get.
        return None
    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and re-copied the accumulated rows on every iteration.
    return pd.concat(frames)

In [48]:
# Stack the three city frames (NY, LA, SF order) into one modelling frame.
df_list = [ny_df, la_df, sf_df]
combined_df = combine_dfs(df_list)

In [49]:
# Preview the combined frame, which now carries City and Avg_Price.
combined_df.head(n=5)

Unnamed: 0,Bedrooms,Capacity,Id,Price,Property_Type,Review_Count,Room_Type,Star_Rating,City,Avg_Price
0,1,2,6882689,198,Apartment,17,2,5,NY,104.428105
1,2,7,12022627,2200,Apartment,0,2,0,NY,104.428105
2,1,2,8940740,149,Apartment,6,2,5,NY,104.428105
3,1,4,12035970,55,Apartment,0,2,0,NY,104.428105
4,1,2,11642165,65,Apartment,3,1,5,NY,104.428105


In [50]:
# Remove the outlier: keep only listings priced below 2000.
# NOTE(review): Avg_Price was assigned per city before this filter, so it is
# not recomputed for the rows that remain -- confirm the baseline should use
# the pre-filter mean.
combined_df = combined_df.loc[combined_df['Price'] < 2000]

In [51]:
# Preview the combined frame after the price filter.
combined_df.head(n=5)

Unnamed: 0,Bedrooms,Capacity,Id,Price,Property_Type,Review_Count,Room_Type,Star_Rating,City,Avg_Price
0,1,2,6882689,198,Apartment,17,2,5.0,NY,104.428105
2,1,2,8940740,149,Apartment,6,2,5.0,NY,104.428105
3,1,4,12035970,55,Apartment,0,2,0.0,NY,104.428105
4,1,2,11642165,65,Apartment,3,1,5.0,NY,104.428105
5,1,1,648047,61,Apartment,66,1,4.5,NY,104.428105


# Baseline model where prediction = Average Price for a given city

In [58]:
from sklearn import metrics

In [56]:
# Baseline prediction: natural log of the per-city average price on each row.
base_predictions = np.log(combined_df['Avg_Price'])

In [59]:
# Target: natural log of the listing price.
y = np.log(combined_df['Price'])

In [60]:
# Per-row residual of the baseline in log space (target minus prediction).
error = y.sub(base_predictions)

In [61]:
# Root-mean-squared error of the baseline, measured on log prices.
base_RMSE = np.sqrt(
    metrics.mean_squared_error(y_true=y, y_pred=base_predictions)
)
base_RMSE

0.47735659478362091

### Baseline RMSE is 0.4774; we will build a linear regression model that beats this value

In [63]:
from sklearn.linear_model import Ridge
# train_test_split moved to sklearn.model_selection in scikit-learn 0.18, and
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back so older installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split