In [None]:
# Obviously
import pandas as pd
import numpy as np

# Plotting set-up
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rc
# Command to reset plot styles to default: mpl.rcParams.update(mpl.rcParamsDefault)
plt.style.use('seaborn-poster')
mpl.rcParams['font.family'] = 'serif'
% matplotlib inline

# Options
pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 500)
data_folder = '../data/'

In [None]:
listings = pd.read_csv(data_folder + 'listings.csv')
venues = pd.read_csv(data_folder + 'venues.csv.gz')
real_estate = pd.read_csv(data_folder + 'real_estate.csv.gz')
# calendar = pd.read_csv(data_folder + 'calendar.csv.gz', parse_dates=True)
demographics = pd.read_csv(data_folder + 'demographics.csv')
econ_state = pd.read_csv(data_folder + 'econ_state.csv')

In [None]:
demographics

In [None]:
real_estate['zipcode'] = real_estate['zipcode'].apply(lambda x: str(x).zfill(5))
demographics['zipcode'] = demographics['zipcode'].apply(lambda x: str(x).zfill(5))

In [None]:
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype
price_columns = ['price', 'weekly_price']
for col in price_columns:
    if not is_numeric_dtype(listings[col]):
        listings[col] = listings[col].str.replace('$', '')
        listings[col] = listings[col].str.replace(',', '')
        listings[col] = listings[col].astype(np.float64)

In [None]:
import holoviews as hv
import geoviews as gv
import geoviews.feature as gf
import cartopy
from cartopy import crs as ccrs
from bokeh.tile_providers import STAMEN_TONER
from bokeh.models import WMTSTileSource

hv.notebook_extension('bokeh')
tiles = {'OpenMap': WMTSTileSource(url='http://c.tile.openstreetmap.org/{Z}/{X}/{Y}.png'),
         'ESRI': WMTSTileSource(url='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{Z}/{Y}/{X}.jpg'),
         'Wikipedia': WMTSTileSource(url='https://maps.wikimedia.org/osm-intl/{Z}/{X}/{Y}@2x.png'),
         'Stamen Toner': STAMEN_TONER}

In [None]:
type_counts = dict()
for t in venues.types:
    t2 = t[1:-1].split(', ')
    for t3 in t2:
        type_name = t3[1:-1]
        if type_name in type_counts:
            type_counts[type_name] += 1
        else:
            type_counts[type_name] = 1
type_counts

In [None]:
cap = 300
listings['price_cap'] = listings['price'].apply(lambda x: x if x < cap else cap)

In [None]:
%%opts Overlay [width=700 height=600] 
%%opts Points (size=0.1 cmap='viridis') [tools=['hover'] size_index=2 color_index=2 xaxis=None yaxis=None]
listings_ds = gv.Dataset(listings[listings['metropolitan'] == 'austin'], kdims=['price_cap', 'accommodates', 'zipcode'])
(gv.WMTS(tiles['Wikipedia']) *\
listings_ds.to(gv.Points, kdims=['longitude', 'latitude'],
              vdims=['price_cap', 'accommodates', 'zipcode'], crs=ccrs.PlateCarree()))

In [None]:
listings_cities = {'asheville': 'Asheville',
                   'austin': 'Austin',
                   'nashville': 'Nashville',
                   'new_orleans': 'New Orleans',
                   'new orleans': 'New Orleans',
                   'LA': 'Los Angeles',
                   'los angeles': 'Los Angeles'}
listings['metropolitan'] = listings['metropolitan'].apply(lambda x: listings_cities[x] if x in listings_cities else x)

In [None]:
listings['zipcode'] = listings['zipcode'].apply(lambda x: str(x).zfill(5))
airbnb_counts_per_zip = listings.groupby('zipcode').size()
airbnb_counts_per_zip = pd.DataFrame(airbnb_counts_per_zip).reset_index()
airbnb_counts_per_zip.columns = ['zipcode', 'airbnb_count']
airbnb_counts_per_zip.merge(demographics, on='zipcode')

In [None]:
%%opts Overlay [width=700 height=600] 
%%opts Points (size=0.1 cmap='viridis') [tools=['hover'] size_index=2 color_index=2 xaxis=None yaxis=None]
listings_ds = gv.Dataset(venues[listings['metropolitan'] == 'austin'], kdims=['price', 'accommodates', 'zipcode'])
(gv.WMTS(tiles['Wikipedia']) *\
listings_ds.to(gv.Points, kdims=['longitude', 'latitude'],
              vdims=['price', 'accommodates', 'zipcode'], crs=ccrs.PlateCarree()))

In [None]:
real_estate

In [None]:
listings['zipgroup'] = listings['zipcode'].apply(lambda x: str(x[:3]))

In [None]:
demographics.sort_values('population', ascending=False)

In [None]:
demographics['zipcode'] = demographics['zipcode'].apply(lambda x: str(x).zfill(5))

In [None]:
#demographics = demographics[~(demographics['$9,999_or_less'] == '-')]
print(len(demographics))

income_cols = {'$9,999_or_less': 5000,
               '$10,000-$14,999': 12500,
               '$15,000-$24,999': 20000,
               '$25,000-$34,999': 30000,
               '$35,000-$49,999': 42500,
               '$50,000-$74,999': 62500,
               '$75,000-$99,999': 82500,
               '$100,000-$150,000': 125000,
               '$150,000-$199,999': 175000,
               '$200,000_or_more': 225000}
for col, income in income_cols.items():
    demographics[col] = demographics[col].astype(float)

In [None]:
def gini_row(x):
    mu = sum(x[col]*income*0.01 for col, income in income_cols.items())
    from itertools import combinations
    gini_sum = 0
    for i,j in combinations(income_cols.keys(), 2):
        gini_sum += x[i]*x[j]*0.0001*np.abs(income_cols[i]-income_cols[j])
    gini = gini_sum*(0.5*(1/mu))
    return gini
demographics['gini'] = demographics.apply(gini_row, axis=1)

In [None]:
listing_zips = listings['zipcode'].unique()
listing_demographics = demographics[demographics['zipcode'].isin(listing_zips)]
listing_demographics[['zipcode', 'gini']].sort_values('gini', ascending=False)

In [None]:
real_estate_gini = real_estate[['zipcode', '2017-06']].merge(listing_demographics[['zipcode','gini']])
la_zips = listings[listings['metropolitan'] == 'Los Angeles']['zipcode'].unique()
real_estate_gini = real_estate_gini[real_estate_gini]
airbnb_zip_gini = airbnb_counts_per_zip.merge(listing_demographics[['zipcode','gini']])
plt.scatter(airbnb_zip_gini['airbnb_count'], airbnb_zip_gini['gini'])
plt.show()
plt.scatter(real_estate_gini['2017-06'], real_estate_gini['gini'])
plt.show()

In [None]:
venues['city'] = venues['city'].apply(lambda x: listings_cities[x] if x in listings_cities else x)

In [None]:
la_venues = venues[venues['city'] == 'Los Angeles']
la_listings = listings[listings['metropolitan'] == 'Los Angeles']

In [None]:
from math import radians, cos, sin, asin, sqrt
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

In [None]:
def split_venue_types(venue):
    venuetypes = venue['types']
    new_types = list()
    splits = venuetypes[1:-1].split(', ')
    for type_name in splits:
        new_types.append(type_name[1:-1])
    return new_types
venues['split_types'] = venues.apply(split_venue_types, axis=1)

In [None]:
def count_near_venues(listing, type_venues, r=1):
    lat = listing['latitude']
    lon = listing['longitude']
    count = 0
    type_venues = type_venues[type_venues['city'] == listing['metropolitan']]
    for i, v in type_venues.iterrows():
        distance = haversine(lat, lon, v['latitude'], v['longitude'])
        if distance < r:
            count += 1
    return count

In [None]:
from tqdm import tqdm, tqdm_notebook, tqdm_pandas
tqdm.pandas(tqdm_notebook)

In [None]:
venues['split_types'].head()

In [None]:
def venues_by_type(venue, venuetypes):
    venuetypes = set(venuetypes)
    venuetypes2 = set(venue['split_types'])
    return len(venuetypes.intersection(venuetypes2)) > 0

In [None]:
nightlife = ['bar', 'casino', 'night_club']
nightlife_venues = venues.loc[venues.apply(lambda v: venues_by_type(v, nightlife), axis=1)]

In [None]:
transit = ['bus_station', 'airport', 'car_rental', 'light_rail_station', 'subway_station', 'train_station', 'transit_station']
transit_venues = venues.loc[venues.apply(lambda v: venues_by_type(v, transit), axis=1)]
amusement = ['amusement_parks', 'art_gallery', 'aquarium', 'bowling_alley', \
             'casino', 'movie_theater', 'museum', 'zoo', 'stadium', 'shopping_mall']
amusement_venues = venues.loc[venues.apply(lambda v: venues_by_type(v, amusement), axis=1)]

In [None]:
la_listings = listings[listings['metropolitan'] == 'Los Angeles']
la_nightlife_venues = nightlife_venues[nightlife_venues['city'] == 'Los Angeles']
la_listings['nightlife_count'] = la_listings.progress_apply(lambda x: count_near_venues(x, la_nightlife_venues, r=1), axis=1)

In [None]:
listings['amusement_count'] = listings.progress_apply(lambda x: count_near_venues(x, amusement_venues, r=1), axis=1)

In [None]:
listings['nightlife_count'].describe()

In [341]:
zip_lat_lon = pd.read_csv(data_folder + 'zip_lat_lon.csv')
zip_lat_lon.columns = ['zipcode', 'latitude', 'longitude']

In [342]:
interesting_zips = listings['zipcode'].unique()
zip_lat_lon[zip_lat_lon['zipcode'].isin(interesting_zips)]

Unnamed: 0,zipcode,latitude,longitude


In [334]:
def zip_for_lat_lon(row):
    my_lat = row['latitude']
    my_lon = row['longitude']
    min_dist = 1000
    min_zip = None
    for i, ziprow in zip_lat_lon.iterrows():
        dist = haversine(ziprow['latitude'], ziprow['longitude'], my_lat, my_lon)
        if dist < min_dist:
            min_zip = ziprow['zipcode']
            min_dist = dist
    return str(int(min_zip)).zfill(5)

In [337]:
venues['zipcode'] = venues.progress_apply(zip_for_lat_lon, axis=1)














  0%|          | 0/121007 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 2/121007 [00:01<28:49:42,  1.17it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 3/121007 [00:03<37:29:56,  1.12s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 4/121007 [00:05<42:29:26,  1.26s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 5/121007 [00:06<44:58:33,  1.34s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 6/121007 [00:08<46:38:57,  1.39s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 7/121007 [00:09<47:47:36,  1.42s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 8/121007 [00:11<51:13:34,  1.52s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 9/121007 [00:13<55:44:34,  1.66s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 10/121007 [00:14<56:03:33,  1.67s/it

KeyboardInterrupt: 