### Imports

In [111]:
# Note to self -> Use numpy somewhere
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [112]:
# Importing data into pandas dataframes
listings = pd.read_csv('./listings_sea.csv')
calendar = pd.read_csv('./calendar_sea.csv')
reviews = pd.read_csv('./reviews_sea.csv')

**Data Preparation**

In [113]:
# Select the columns that are needed - Listings
listings_data = listings.loc[:,('id',
                                'review_scores_rating',
                                'amenities',
                                'price',
                                'property_type',
                                'neighbourhood_group_cleansed')]

In [114]:
# Impute for null values
listings_data.at[listings_data['property_type'].isnull(), 'property_type'] = 'Other'
listings_data.at[listings_data['review_scores_rating'].isnull(), 'review_scores_rating'] = 0

In [115]:
# Create categorical columns
listings_data['TV'] = listings_data.amenities.str.contains("TV", case=False)
listings_data['Internet'] = listings_data.amenities.str.contains('Internet|Wireless', case=False)
listings_data['Parking'] = listings_data.amenities.str.contains('Parking', case=False)
listings_data['Washer'] = listings_data.amenities.str.contains('Washer', case=False)
listings_data['Dryer'] = listings_data.amenities.str.contains('Dryer', case=False)
listings_data['AirCon'] = listings_data.amenities.str.contains('Air Condition|Air Conditioning', case=False)
listings_data['PetFriendly'] = listings_data.amenities.str.contains('Pets Allowed|Pets live on this property|Dog|Cat', case=False)
listings_data['Kitchen'] = listings_data.amenities.str.contains('Kitchen', case=False)
listings_data['FamilyFriendly'] = listings_data.amenities.str.contains('Family/Kid Friendly', case=False)

In [116]:
# Fix Data Types
listings_data['price'] = listings_data['price'].str.replace("[$, ]", "").astype("float")

In [117]:
# Check dtypes
listings_data.dtypes

id                                int64
review_scores_rating            float64
amenities                        object
price                           float64
property_type                    object
neighbourhood_group_cleansed     object
TV                                 bool
Internet                           bool
Parking                            bool
Washer                             bool
Dryer                              bool
AirCon                             bool
PetFriendly                        bool
Kitchen                            bool
FamilyFriendly                     bool
dtype: object

In [118]:
listings_data_categorical = pd.get_dummies(listings_data, drop_first=True);

In [119]:
# Check dtypes
listings_data_categorical.dtypes

id                                                    int64
review_scores_rating                                float64
price                                               float64
TV                                                     bool
Internet                                               bool
                                                     ...   
neighbourhood_group_cleansed_Queen Anne               uint8
neighbourhood_group_cleansed_Rainier Valley           uint8
neighbourhood_group_cleansed_Seward Park              uint8
neighbourhood_group_cleansed_University District      uint8
neighbourhood_group_cleansed_West Seattle             uint8
Length: 3326, dtype: object

In [120]:
# Check dataframe for null values
listings_data_categorical.isnull().sum()

id                                                  0
review_scores_rating                                0
price                                               0
TV                                                  0
Internet                                            0
                                                   ..
neighbourhood_group_cleansed_Queen Anne             0
neighbourhood_group_cleansed_Rainier Valley         0
neighbourhood_group_cleansed_Seward Park            0
neighbourhood_group_cleansed_University District    0
neighbourhood_group_cleansed_West Seattle           0
Length: 3326, dtype: int64

In [121]:
# Check dataframe for null values
listings_data_categorical.head(3)

Unnamed: 0,id,review_scores_rating,price,TV,Internet,Parking,Washer,Dryer,AirCon,PetFriendly,...,neighbourhood_group_cleansed_Interbay,neighbourhood_group_cleansed_Lake City,neighbourhood_group_cleansed_Magnolia,neighbourhood_group_cleansed_Northgate,neighbourhood_group_cleansed_Other neighborhoods,neighbourhood_group_cleansed_Queen Anne,neighbourhood_group_cleansed_Rainier Valley,neighbourhood_group_cleansed_Seward Park,neighbourhood_group_cleansed_University District,neighbourhood_group_cleansed_West Seattle
0,241032,95.0,85.0,True,True,False,True,True,True,False,...,0,0,0,0,0,1,0,0,0,0
1,953595,96.0,150.0,True,True,True,True,True,False,False,...,0,0,0,0,0,1,0,0,0,0
2,3308979,97.0,975.0,True,True,True,True,True,True,True,...,0,0,0,0,0,1,0,0,0,0


In [123]:
# Look for anomalies
listings_data_categorical['price'].unique()

array([  85.,  150.,  975.,  100.,  450.,  120.,   80.,   60.,   90.,
         95.,   99.,  245.,  165.,  461.,  109.,   66.,  200.,  700.,
        110.,   75.,  600.,   45.,  300.,  175.,  325.,  222.,  159.,
        125.,  348.,  148.,  350.,  349.,  160.,  130.,  137.,   20.,
        145.,  400.,   70.,  170.,  465.,   50.,   59.,   47.,   55.,
         65.,   25.,   40.,  149.,  129.,  105.,  218.,  126.,  115.,
        225.,   89.,  134.,  375.,   97.,  197.,  135.,  180.,   69.,
        195.,  224.,  338.,   79.,   49.,   57.,   96.,  295.,   53.,
         35.,  133.,   61.,   52.,  275.,  199.,  435.,  250.,  116.,
         39.,   98.,   73.,  190.,   68.,  196.,  209.,  749.,  285.,
        235.,  103.,  143.,  142.,  335.,  499.,  156.,   94.,  219.,
         91.,   74.,  140.,  104.,  122.,  158.,   42.,  185.,  680.,
        119.,  575.,  139.,  259.,  166.,  215.,  249.,  210.,  439.,
        155.,   54.,  254.,  310.,  490.,  255.,  144.,   46.,  445.,
        395.,   88.,

**Start Model**

In [75]:
# Consider only numerical variables
X = listings_data_categorical[[columns go in here]]
y = listings_data_categorical['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42)

**Four steps:**

In [76]:
#Instantiate
lm_model = LinearRegression(normalize=True) 

In [77]:
#Fit
lm_model.fit(X_train, y_train) 

ValueError: could not convert string to float: 'House'

In [None]:
#Predict

In [None]:
#Score