In [135]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline



class PredictPrice:
    """
    Estimates a price for a listing described by a web form.

    The flow in predict_price() is: download the listings table for the form's
    location, keep the available and recently reviewed rows, fit a gradient
    boosting model on log-price, and predict for the form's feature values.
    """

    def __init__(self):
        pass

    def predict_price(self, form):
        """
        Predict a price for the listing described by `form`.

        :param form: A dict-like with a 'location' key plus one key per feature
                     column. 'name' and 'csrf_token' are ignored if present.
        :return: The predicted price (un-logged)
        :raises KeyError: if `form` has no 'location'
        """
        # Work on a copy: popping from the caller's dict made a second call with
        # the same form fail with KeyError.
        features = dict(form)
        features.pop('name', None)
        features.pop('csrf_token', None)
        location = features.pop('location')

        listings = self.get_listings(location)

        X, y, my_listing = self.to_Xy(listings, features)

        model = self.make_model(X, y)

        price = self.predict(model, my_listing)
        return price

    def get_listings(self, location):
        """
        Download the 'data/listings.csv.gz' table for a location.

        :param location: Part of a place name, as accepted by InsideAirBnB.lookup()
        :return: The object stored under the 'listings-dat' key by InsideAirBnB.get_data()
        :raises ValueError: if no dates are indexed for `location`
        """
        airbnb = InsideAirBnB()
        listing_file = ['data/listings.csv.gz']

        # lookup() returns None for an unknown place; fail with a clear message
        # instead of "'NoneType' object is not subscriptable".
        dates = airbnb.lookup('dates', for_=location)
        if not dates:
            raise ValueError(f"No InsideAirBnB dates found for location {location!r}")
        latest_date = dates[0]  # the first listed date is treated as the latest

        data = airbnb.get_data(location_name=location, date=latest_date, data_files=listing_file)
        listings = data['listings-dat']

        # Can speed up local debugging
        # listings.to_csv('listings.gz')  # save to a file (~20 MB)
        # listings = pd.read_csv('listings.gz')  # load from file instead of downloading every time
        return listings

    def to_Xy(self, listings, form, min_availability=0, active_since='2020-02-01'):
        """
        Build the training matrix, the log-price target and the row to predict.

        :param listings: DataFrame with 'availability_365', 'last_review', 'price'
                         and one column per key of `form`
        :param form: Mapping of feature column name -> value for the new listing
        :param min_availability: Keep rows whose 'availability_365' is above this
        :param active_since: Keep rows whose 'last_review' is after this date
        :return: (X, y, my_listing) where X and y share the same row labels and
                 my_listing is a one-row DataFrame with the same columns as X
        """
        # Only keep listings which are available and active (30% in New York City).
        # Boolean masks are combined directly: the previous version intersected
        # two Index objects with `&` (deprecated) and then used the resulting
        # labels positionally with .iloc, which is only right for a default index.
        available = listings['availability_365'] > min_availability
        active = pd.to_datetime(listings['last_review']) > pd.to_datetime(active_since)
        good_rows = available & active  # rows without a review compare as False

        # Crop X from listings
        columns = list(form.keys())
        X = listings.loc[good_rows, columns]

        # Prep the target: '$1,234.00' -> 1234.0 -> log(1234.0); non-positive prices map to 0
        y = listings.loc[good_rows, 'price'].apply(
            lambda x: float(x.replace('$', '').replace(',', '')))
        y = y.apply(lambda x: np.log(x) if x > 0 else 0)

        # Format the form dictionary to a DataFrame
        my_listing = pd.DataFrame(form, index=[0])
        return X, y, my_listing

    def make_model(self, X, y):
        """Fit and return a 500-tree gradient boosting regressor on (X, y)."""
        # 'squared_error' is the current scikit-learn (>= 1.0) name for the
        # criterion formerly spelled 'mse'; the old spelling is rejected by
        # scikit-learn >= 1.2.
        model = GradientBoostingRegressor(n_estimators=500, criterion='squared_error')
        return model.fit(X, y)

    def predict(self, model, listing):
        """Return the predicted price for the first row of `listing`."""
        pred = model.predict(listing)
        return np.exp(pred[0])  # the model actually predicts log price, so np.exp() un-logs it

    # These methods were useful in the Colab notebook, but aren't currently integrated here
    def baseline(self, y_train, y_test):
        """MSE on y_test of always predicting the median of y_train."""
        y_baseline = np.full(y_test.shape, y_train.median())
        return mse(y_test, y_baseline)

    def over_base(self, y_train, y_test, y_pred):
        """Fractional MSE improvement of y_pred over the median baseline (1 - mse_gb / mse_base)."""
        mse_base = self.baseline(y_train, y_test)
        mse_gb = mse(y_test, y_pred)
        improvement = 1 - (mse_gb / mse_base)
        return improvement

    def save_model(self, model):
        """
        Pickle `model` to cache/model.pkl, creating the cache folder if needed.

        GradientBoostingRegressor has no save_model()/load_model() methods, so
        the estimator is serialised with the standard library instead; the file
        extension reflects that format.
        """
        import pickle

        cache_folder, model_filename = 'cache', 'model.pkl'
        os.makedirs(cache_folder, exist_ok=True)
        model_path = os.path.join(cache_folder, model_filename)
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    def load_model(self):
        """
        Load the object previously written by save_model().

        :return: The unpickled model
        :raises FileNotFoundError: if nothing has been saved yet
        """
        import pickle

        cache_folder, model_filename = 'cache', 'model.pkl'
        model_path = os.path.join(cache_folder, model_filename)
        # SECURITY: unpickling executes arbitrary code; only load cache files
        # this application wrote itself.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        return model


In [136]:
import os
import json
import pandas as pd
from urllib.request import urlopen


class InsideAirBnB:
    """
    Index of insideairbnb.com locations/dates plus a downloader for their data files.

    There are 7 data files available for every location and date
    These numbers are for New York City, 2021-02-04
                                                Size   ,   Shape            Key
    'data/listings.csv.gz',                     21.2 MB,   (18291, 74)      listings-dat
    'data/calendar.csv.gz',                     34.8 MB,   (13464021, 7)    calendar-dat
    'data/reviews.csv.gz',                      99.3 MB,   (847727, 6)      reviews-dat
    'visualisations/listings.csv',              5.12 MB,   (37012, 16)      listings-vis
    'visualisations/reviews.csv',               15.7 MB,   (847727, 2)      reviews-vis
    'visualisations/neighbourhoods.csv',       0.005 MB,   (230, 2)         neighborhoods-vis
    'visualisations/neighbourhoods.geojson',   0.604 MB,   (233, 3)         neighborhoods-geo
    """

    # The files from the table above, relative to a <location>/<date> folder.
    # get_data() falls back to the first entry when no files are requested.
    data_files = (
        'data/listings.csv.gz',
        'data/calendar.csv.gz',
        'data/reviews.csv.gz',
        'visualisations/listings.csv',
        'visualisations/reviews.csv',
        'visualisations/neighbourhoods.csv',
        'visualisations/neighbourhoods.geojson',
    )

    def __init__(self):
        self.locations = self.load_locations()

    def load_locations(self):
        """
        Load the cached location index, regenerating it when the cache file is missing.

        :return: A pandas DataFrame of ['name', 'url', 'dates']
        """
        cache_folder, index_filename = 'cache', 'inside-airbnb-locations.bz2'
        locations_path = os.path.join(cache_folder, index_filename)

        try:
            locations = pd.read_csv(locations_path)  # attempt to load from file
        except FileNotFoundError:
            print("Couldn't find the cached locations for insideairbnb.com, regenerating it...")

            # generate the cache folder if it's missing (no error if it already exists)
            os.makedirs(cache_folder, exist_ok=True)

            locations = self.regenerate_locations(save_path=locations_path)
        return locations

    def regenerate_locations(self, save_path=None):
        """
        Compiles a new index of locations and dates from insideairbnb.com
        :param save_path: Where to write the index as CSV; nothing is written when None
        :return: A pandas DataFrame of ['name', 'url', 'dates']
        TODO: This is slow because get_dates() searches all 148,121 lines of html * 109 locations.
        It would be more efficient to take note of line numbers between each location
        in the first pass. Then for urls and dates, pass in starting and ending line numbers
        to regextract() to only search the sections of the html for that location.
        Currently, this function takes ~17 seconds to finish.
        """
        import re
        import requests

        # 148,121 of html as a list of strings
        html = requests.get('http://insideairbnb.com/get-the-data.html').text.splitlines()

        def regextract(pattern):
            # Searches 'html' and returns whatever is in the first () of pattern as a unique list of strings
            matches = (re.search(pattern, line) for line in html)  # non-matches are None
            found = [match.group(1) for match in matches if match]
            return list(dict.fromkeys(found))  # unique values, original order preserved

        def get_dates(location_url):
            """
            Finds all dates that insideairbnb.com has data for, given a location
            :param location_url: The place to get dates for in url form
            :return: A JSON-encoded list of date strings in 'YYYY-MM-DD' form
            """
            # Raw string so \d reaches the regex engine untouched; the location is
            # escaped so it is matched literally.
            pattern = rf'http://data.insideairbnb.com/{re.escape(location_url)}/([\d\-]+)/'
            dates = regextract(pattern=pattern)
            return json.dumps(dates)

        # Construct the index dataframe
        locations = pd.DataFrame({
            'name': regextract(pattern=r'<h2>(.+)</h2>'),
            'url': regextract(pattern=r'insideairbnb.com/(.+)/[\d\-]+/data/listings')})
        locations['dates'] = locations['url'].apply(get_dates)

        # save index to a compressed form (to 14% the original size)
        # BZ2 compression performed better than ZIP, GZ and XZ in my trial
        # (to_csv(None) would only build a throwaway string, so skip it)
        if save_path is not None:
            locations.to_csv(save_path, index=False)
        return locations

    def get_location_names(self):
        """Return every location name in the index as a list."""
        return self.locations['name'].to_list()

    def get_location_urls(self):
        """Return every location url fragment in the index as a list."""
        return self.locations['url'].to_list()

    def lookup(self, column, for_):
        """
        Given part of a place name, return the value in another column for that place
        e.g. lookup(column='dates', for_='Francisco')

        :param column: Column of the index to read; 'dates' is decoded from JSON to a list
        :param for_: Substring of the location name, matched literally
        :return: The value for the first matching location, or None when nothing matches
        """
        # regex=False: names are plain text, so characters such as '(' or '.'
        # must not be interpreted as a pattern. na=False ignores missing names.
        hits = self.locations['name'].str.contains(for_, regex=False, na=False)
        if not hits.any():
            print(f"Could not find {for_} in locations['name']")
            return None

        value = self.locations.at[hits.idxmax(), column]  # idxmax -> first True row
        if column == 'dates':
            value = json.loads(value)
        return value

    def get_data(self, location_name=None, date=None, data_files=None):
        """
        Download data files for one location and date.

        :param location_name: Part of a place name; defaults to New York City
        :param date: 'YYYY-MM-DD' string; defaults to the first date listed for the location
        :param data_files: Iterable of file paths as in the class docstring;
                           defaults to ['data/listings.csv.gz']
        :return: A dict keyed like 'listings-dat' (see the class docstring). Values are
                 DataFrames for csv files, the parsed JSON for geojson, None otherwise
        :raises ValueError: if the location (or any date for it) is not in the index
        """
        # import geopandas as gpd

        if location_name:
            location_url = self.lookup(column='url', for_=location_name)
            if location_url is None:
                raise ValueError(f"Unknown location: {location_name!r}")
        else:
            location_name = 'New York City'
            location_url = 'united-states/ny/new-york-city'

        if date is None:  # if no date is provided, assume the most recent date (first in the list)
            dates = self.lookup(column='dates', for_=location_name)
            if not dates:
                raise ValueError(f"No dates available for location: {location_name!r}")
            date = dates[0]

        if data_files is None:
            data_files = [self.data_files[0]]

        domain = 'http://data.insideairbnb.com'
        place_date = '/'.join([domain, location_url, date])
        urls = ['/'.join([place_date, file]) for file in data_files]

        def load_df(url):
            filetype = url.split('.')[-1]

            if filetype in ['csv', 'gz']:
                df = pd.read_csv(url)  # compression format is inferred from the filename
            elif filetype == 'geojson':
                with urlopen(url) as response:
                    df = json.load(response)
            else:
                print(f"Can't load {url} as a DataFrame")
                df = None
            return df

        def df_name(url):
            # '.../data/listings.csv.gz' -> 'listings-dat'; geojson files get a '-geo' suffix
            filetype = url.split('.')[-1]
            table, mode = url.split('/')[-1:-3:-1]  # file name, then its parent folder
            table = table.split('.')[0]
            mode = mode[:3]

            if filetype == 'geojson':
                return f'{table}-{filetype[:3]}'
            else:
                return f'{table}-{mode}'

        dataframes = {df_name(url): load_df(url) for url in urls}
        return dataframes


In [141]:
# Create the predictor; PredictPrice.__init__ takes no arguments and does no work.
predict = PredictPrice()

In [142]:
# Example form payload: 'location' selects the dataset, 'name' and 'csrf_token'
# are discarded by predict_price(), and the remaining keys are feature columns.
form = {
    'location': 'Australia',
    'name': "the outback",
    'latitude': 49,
    'longitude': 35,
    'csrf_token': 'blah',
}


In [143]:
# Pass a copy so this cell can be re-run: predict_price() consumes keys from the
# dict it is given, which would otherwise leave `form` without 'location'.
predict.predict_price(form=dict(form))

  good_rows = available & active


199.4442322419655

In [139]:
# Fetch the raw listings table for 'Australia' on its own, for inspection.
# NOTE(review): `alpha` is not read by any later visible cell — consider a descriptive name.
alpha = predict.get_listings(location='Australia')

In [110]:
# First date listed for `location` in the InsideAirBnB index.
# NOTE(review): `airbnb` is not assigned in any visible cell, and `location` is only
# assigned in a cell with a later execution count (In [114] vs. this In [110]), so this
# cell relies on leftover kernel state and will not survive Restart & Run All.
latest_date = airbnb.lookup('dates', for_=location)[0]

In [112]:
latest_date

'2021-03-09'

In [113]:
# Request only the detailed listings table (same file PredictPrice.get_listings asks for).
listing_file = ['data/listings.csv.gz']

In [114]:
# Place name fragment passed to lookup()/get_data().
# NOTE(review): an earlier-executed cell (In [110]) already reads `location`; define it before first use.
location = 'Australia'

In [115]:
# Download the requested files; get_data() returns a dict keyed by names such as 'listings-dat'.
# NOTE(review): depends on `airbnb`, which no visible cell creates.
data = airbnb.get_data(location_name=location, date=latest_date, data_files=listing_file)

In [118]:
# 'listings-dat' is the key get_data() builds for 'data/listings.csv.gz'.
listing = data['listings-dat']

In [119]:
listing.head()

Unnamed: 0,id,listing_url,scrape_id,last_searched,last_scraped,name,description,neighborhood_overview,picture_url,host_id,...,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,region_id,region_name,region_parent_id,region_parent_name,region_parent_parent_id,region_parent_parent_name,reviews_per_month
0,419714,https://www.airbnb.com/rooms/419714,20210309214200,2021-03-10,2021-03-24,Unit #2- Downtown Suites,This is a fully furnished apartment located co...,The area and location: <br />The apartments ha...,https://a0.muscache.com/pictures/ec07875a-6e8c...,1485023,...,6,0,0,16150,Orange,1,New South Wales,,,0.82
1,38826228,https://www.airbnb.com/rooms/38826228,20210309214200,2021-03-10,2021-03-24,'Wendouree' is a centrally located heritage home,Treat yourself to a wonderful experience in a ...,'Wendouree' is located in the centre of Orange...,https://a0.muscache.com/pictures/42e270b6-e8bb...,258660009,...,1,0,0,16150,Orange,1,New South Wales,,,1.36
2,30823894,https://www.airbnb.com/rooms/30823894,20210309214200,2021-03-10,2021-03-19,Carawatha by Beach Stays,Basking in a sunny north aspect with vistas sp...,"Palm Beach, affectionately known as 'Palmy', i...",https://a0.muscache.com/pictures/4852c458-2b6a...,24721535,...,43,0,0,16370,Pittwater,1,New South Wales,,,0.07
3,43055038,https://www.airbnb.com/rooms/43055038,20210309214200,2021-03-09,2021-03-22,Mt View Charleston,The Mt View Charleston is situated on a workin...,We are location on a working farm outside of B...,https://a0.muscache.com/pictures/ca04114b-70a5...,95077201,...,2,0,0,16180,Palerang,1,New South Wales,,,
4,39366261,https://www.airbnb.com/rooms/39366261,20210309214200,2021-03-10,2021-03-22,"Cozy One Bedroom, WiFi, Foxtel, Parking,",Feel right at home in a One Bedroom Apartment ...,Nesuto apartment -hotel is well located on the...,https://a0.muscache.com/pictures/5f68e0c8-5a74...,279514563,...,2,1,0,16250,Parramatta,1,New South Wales,,,1.19
