In [71]:
import ast
import pandas as pd 
import timeit
import numpy as np 
from statistics import mean
import prophet
import pydeck as pdk

# Imports

In [None]:
#Import two data-sets
starbucks_df = pd.read_csv('./Clean_Data/Starbucks_Data.csv')
airbnb_df = pd.read_csv('./Clean_Data/AirBNB_Data.csv')
#Convert List columns stored as strings to lists
starbucks_df['airbnb_distances'] = starbucks_df['airbnb_distances'].apply(ast.literal_eval)
airbnb_df['starbucks_distances'] = airbnb_df['starbucks_distances'].apply(ast.literal_eval)
#Convert Date columns to Datetime
airbnb_df['first_review'] = pd.to_datetime(airbnb_df['first_review'])
airbnb_df['last_review'] = pd.to_datetime(airbnb_df['last_review'])
#Preview to verify
display(starbucks_df.head(1))
display(starbucks_df.info())

display(airbnb_df.head(1))
display(airbnb_df.info())

# Data Analytics

In [None]:
#Manually assigned distances of interest
bins = [.1, .25, .5, 1, 2, 5]
#Define a function that loops through each 'bin' (not technically correct)
#and counts starbucks within that range, then populate a column with that count
#for future plots
def airbnb_to_starbucks():
    for bin in bins:
        def nested_bin(row):
            dist_to_sb_list = row['starbucks_distances']
            sb_within_range = [x for x in dist_to_sb_list if x < bin]
            return len(sb_within_range)
        column_name = f"SB_within_{bin}_miles"
        airbnb_df[column_name] = airbnb_df.apply(nested_bin, axis=1)
# Run the function
airbnb_to_starbucks()
# Do it again for starbucks
def starbucks_to_airbnb():
    for bin in bins:
        def nested_bin(row):
            dist_to_bnb_list = row['airbnb_distances']
            bnb_within_range = [x for x in dist_to_bnb_list if x < bin]
            return len(bnb_within_range)
        column_name = f"Airbnb_within_{bin}_miles"
        starbucks_df[column_name] = starbucks_df.apply(nested_bin, axis=1)
starbucks_to_airbnb()

In [None]:
#calculate the average distance to the nearest 5 starbucks or bnbs
def mean_dist_nearest_5_sb(row):
    dist_list = row['starbucks_distances']
    dist_list.sort()
    return round(mean(dist_list[:5]),2)

def mean_dist_nearest_5_bnb(row):
    dist_list = row['airbnb_distances']
    dist_list.sort()
    return round(mean(dist_list[:5]),2)

airbnb_df['nearest_5_avg'] = airbnb_df.apply(mean_dist_nearest_5_sb, axis=1)
starbucks_df['nearest_5_avg'] = starbucks_df.apply(mean_dist_nearest_5_bnb, axis=1)

In [None]:
#Add a column for "price per occupant" 
airbnb_df['price_per_occupant'] = round(airbnb_df['price']/airbnb_df['accommodates'],2)

In [None]:
first_review_df = airbnb_df.copy()
first_review_df.dropna(how='any', inplace=True)
first_review_df.set_index('first_review',inplace=True)
first_review_df_agg = first_review_df.groupby([first_review_df.index.year, first_review_df.index.month])
first_review_df_agg = first_review_df_agg['id'].count().to_frame()
first_review_df_agg.columns = ['Total_New_Airbnbs']
first_review_df_agg['dateseries'] = pd.to_datetime(first_review_df_agg.index.get_level_values(0).astype(str)+'-'+first_review_df_agg.index.get_level_values(1).astype(str))
first_review_df_agg.set_index('dateseries',inplace=True)
first_review_df_agg.reset_index(inplace=True)
first_review_df_agg.columns = ['ds','y']
# first_review_df_agg.to_csv('./Clean_Data/DateSeries_Prophet.csv')


In [None]:
def total_bnbs(row):
    date = row['ds']
    total = first_review_df_agg.loc[first_review_df_agg['ds'] <= date]['y'].sum()
    return total

first_review_df_agg['total_airbnbs'] = first_review_df_agg.apply(total_bnbs, axis=1)

total_bnb_df = first_review_df_agg[['ds','total_airbnbs']].rename(columns={'total_airbnbs':'y'})
new_bnb_df = first_review_df_agg[['ds','y']]

In [None]:
#model and plot for new airbnbs
m = prophet.Prophet()
m.fit(new_bnb_df)
future_df = m.make_future_dataframe(periods=100, freq='M')
forecast = m.predict(future_df)
m.plot_components(forecast)

In [None]:
#model and plot for total airbnbs
m2 = prophet.Prophet()
m2.fit(total_bnb_df)
future_df2 = m2.make_future_dataframe(periods=100, freq='M')
forecast2 = m2.predict(future_df2)
m2.plot_components(forecast2)

# Visualizations

### Reviews with respect to the number of starbucks within 0.5 miles (distance may change based on group-by line)

In [None]:
#Aggregate reviews by the number of SB within X miles
airbnb_df_reviewed_only = airbnb_df.loc[airbnb_df['review_scores_rating'] != -1]
aggregate_bnb_df = airbnb_df_reviewed_only.groupby(by=['SB_within_0.5_miles'])
agg_bnb_df_mean = aggregate_bnb_df[['price', 'review_scores_rating','review_scores_location']].mean()
display(aggregate_bnb_df['review_scores_rating'].count())
agg_bnb_df_mean[['review_scores_rating','review_scores_location']].plot(figsize=(12,8), ylim=(4.6,5))

### Not very helpful scatter-plot of reviews vs average distance to nearest 5 SB

In [None]:
airbnb_df.loc[(airbnb_df['review_scores_rating'] != -1) & (airbnb_df['number_of_reviews'] > 10)]\
    .plot(x='nearest_5_avg', y='review_scores_rating', kind='scatter', ylim=(4.1, 5.1), figsize=(12,8), s=1)

### Chart Depicting Reviews vs mileage to the nearest 5 starbucks

In [None]:
#actual binning of the data by nearest_5_avg
nearest_5_bins = [0, 0.25, 0.5, 1, 2, 5]
labels = [1,2,3,4,5]
airbnb_df['nearest_5_distance_bins']=pd.cut(airbnb_df['nearest_5_avg'], bins=nearest_5_bins, labels=labels)
grouped_near5_airbnb = airbnb_df.loc[airbnb_df['review_scores_rating'] != -1].groupby('nearest_5_distance_bins')
plt = grouped_near5_airbnb[['review_scores_rating', 'review_scores_location']].mean().plot()
plt.set_xticks([0,1,2,3,4])
plt.set_xlim((-.1,4.1))
plt.set_xticklabels(['<.25','0.25 - 0.5','0.5 - 1.0','1.0 - 2.0','2.0 - 5.0'])
plt.set_xlabel("Distance Bins (miles)")
plt.set_ylabel("Average Review")

### Average Price with respect to distance to nearest 5 starbucks (same bins as above)

In [None]:
plt2 = grouped_near5_airbnb[['price_per_occupant']].mean().plot()
plt2.set_xticks([0,1,2,3,4])
plt2.set_xticklabels(['<.25','0.25 - 0.5','0.5 - 1.0','1.0 - 2.0','2.0 - 5.0'])
plt2.set_xlabel("Distance Bins (miles)")
plt2.set_ylabel("Average Price ($ Per Accomodated Guest)")

# Testing that supported decisions while writing

In [None]:
#speed testing list comprehension vs. numpy eval:
def numpy_speed():
    data_list = starbucks_df['airbnb_distances'][0]
    numpy_array = np.array(data_list)
    nearby_distances = numpy_array[numpy_array < 1]
    nearby_distances = np.sort(nearby_distances)
    nearby_count = len(nearby_distances)
    return(nearby_count,nearby_distances)

time_taken_numpy = timeit.timeit(numpy_speed, number=5000)

def list_speed():
    data_list = starbucks_df['airbnb_distances'][0]
    keep_list = [x for x in data_list if x < 1]
    keep_list.sort()
    nearby_count = len(keep_list)
    return(nearby_count,keep_list)

time_taken_list = timeit.timeit(list_speed, number=1000)

print(f"numpy: {time_taken_numpy}")
print(f"list comprehension: {time_taken_list}")

#list comprehension was consistently ~5-10% faster

In [135]:
#Prepping Data into lat and long bins for geographical plotting
lat_min = airbnb_df['latitude'].min()
lat_max = airbnb_df['latitude'].max()
long_min = airbnb_df['longitude'].min()
long_max = airbnb_df['longitude'].max()
bin_count = 20
lat_step = round((lat_max-lat_min)/bin_count,3)
long_step = round((long_max-long_min)/bin_count,3)
lat_bins = np.arange(lat_min,lat_max,lat_step)
long_bins = np.arange(long_min,long_max,long_step)
lat_bins[0] -= .02
lat_bins[-1] += .02
long_bins[0] -= .02
long_bins[-1] += .02
airbnb_df['lat_bins'] = pd.cut(airbnb_df['latitude'], bins=lat_bins, labels=range(len(lat_bins)-1))
airbnb_df['long_bins'] = pd.cut(airbnb_df['longitude'], bins=long_bins, labels=range(len(long_bins)-1))

In [136]:
airbnb_df

Unnamed: 0,id,description,neighborhood_overview,host_neighbourhood,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,SB_within_0.25_miles,SB_within_0.5_miles,SB_within_1_miles,SB_within_2_miles,SB_within_5_miles,nearest_5_avg,price_per_occupant,nearest_5_distance_bins,lat_bins,long_bins
0,360,Enjoy the famous Colorado weather and unplug i...,The cottage is located in the center of Lower ...,Highland,Highland,39.766415,-105.002098,Entire guesthouse,Entire home/apt,3,...,0,1,5,21,49,0.74,30.00,3,12,4
1,364,"Modern 1,000 square foot loft in the heart of ...","Ten brewpubs within walking distance, two grea...",Five Points,Five Points,39.766720,-104.979060,Entire loft,Entire home/apt,3,...,0,0,1,16,54,1.20,59.67,4,12,5
2,590,"Large guest room in my home, where I also live...",I love the diversity of my neighborhood and it...,North Park Hill,North Park Hill,39.755110,-104.911090,Private room in home,Private room,3,...,0,1,2,7,48,0.93,21.33,3,11,8
3,592,This room is in the basement. It does not hav...,-1,North Park Hill,North Park Hill,39.754810,-104.911060,Private room in home,Private room,2,...,0,0,2,6,49,0.92,28.50,3,11,8
4,686,Thanks for visiting my Queen Bed Room site for...,"I love my Uptown neighborhood, which is within...",North Capitol Hill,North Capitol Hill,39.746950,-104.978380,Private room in home,Private room,2,...,1,1,9,23,60,0.48,16.50,2,10,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5362,985655038490529659,"Blake's house is a captivating haven, blending...",-1,Highland,Highland,39.765313,-105.013120,Entire home,Entire home/apt,14,...,0,0,5,18,49,0.91,45.14,3,12,3
5363,985699485774546946,This is a fully restored property in one of th...,-1,Washington Park,Elyria Swansea,39.783094,-104.964334,Entire home,Entire home/apt,3,...,0,0,0,1,46,2.17,31.67,5,13,6
5364,986480371259464482,Enjoy an industrial experience at this central...,"Nicknamed “RiNo,” the trendy River North Art D...",Five Points,Five Points,39.756581,-104.988746,Entire rental unit,Entire home/apt,2,...,0,1,15,21,54,0.54,70.00,3,11,5
5365,986558663097275426,Beautiful/private one bedroom with private mod...,-1,South Denver,Rosedale,39.677558,-104.983539,Private room in home,Private room,2,...,0,0,2,7,57,1.03,35.00,4,5,5


In [None]:
#Learning how to use pydeck
# Build the map layer:    
layer = pdk.Layer(type='ColumnLayer',
                  data=airbnb_df,
                  get_position=['lng', 'lat'],
                  get_elevation='population',
                  auto_highlight=True,
                  elevation_scale=0.03,
                  pickable=True,
                  get_fill_color=['population', 255],
                  coverage=5)

# Set the view parameters:
view_state = pdk.ViewState(longitude=-95, 
                           latitude=36,
                           zoom=3.8,
                           min_zoom=3,
                           max_zoom=15,
                           pitch=45.0,
                           bearing=0)

# Render the map:
r = pdk.Deck(layers=[layer], initial_view_state=view_state)
r.to_html('usa_popl.html')