In [1]:
import plotly.graph_objects as go
import plotly.express as px

import pandas as pd

In [2]:
l = pd.read_csv(r"../data/airbnb/listings.csv.gz", compression='gzip', low_memory = False)

In [7]:
l.groupby("host_location").size().sort_values()

host_location
AU                                                          1
Mexico City, Federal District, Mexico                       1
Middletown, Maryland, United States                         1
Mifflintown, Pennsylvania, United States                    1
Milford, Connecticut, United States                         1
Minnesota, United States                                    1
Montclair, Virginia, United States                          1
Montpellier, Occitanie, France                              1
Montreal, Québec, Canada                                    1
Morrisville, Pennsylvania, United States                    1
Moscow, Moscow, Russian Federation                          1
Mountain View, California, United States                    1
Mumbai, Maharashtra, India                                  1
Muscatine, Iowa, United States                              1
My family and I reside on the top two floors.               1
Mysuru, Karnataka, India                                

In [4]:
for col in l.columns:
    print(col)
    

id
listing_url
scrape_id
last_scraped
name
summary
space
description
experiences_offered
neighborhood_overview
notes
transit
access
interaction
house_rules
thumbnail_url
medium_url
picture_url
xl_picture_url
host_id
host_url
host_name
host_since
host_location
host_about
host_response_time
host_response_rate
host_acceptance_rate
host_is_superhost
host_thumbnail_url
host_picture_url
host_neighbourhood
host_listings_count
host_total_listings_count
host_verifications
host_has_profile_pic
host_identity_verified
street
neighbourhood
neighbourhood_cleansed
neighbourhood_group_cleansed
city
state
zipcode
market
smart_location
country_code
country
latitude
longitude
is_location_exact
property_type
room_type
accommodates
bathrooms
bedrooms
beds
bed_type
amenities
square_feet
price
weekly_price
monthly_price
security_deposit
cleaning_fee
guests_included
extra_people
minimum_nights
maximum_nights
minimum_minimum_nights
maximum_minimum_nights
minimum_maximum_nights
maximum_maximum_nights
minimum_ni

In [3]:
#subset the relevant variables

listings = l[['id', 'neighbourhood','state', 'host_id',  'host_since', 'first_review' , 'host_neighbourhood', 'property_type', 'room_type', 'host_listings_count', 'monthly_price']].copy()

In [4]:
listings.shape

(9126, 11)

In [5]:
#remove any listings for VA or MD

listings = listings[(listings['state'] == 'DC')]

In [12]:
listings.head()

Unnamed: 0,id,neighbourhood,state,host_id,host_since,first_review,host_neighbourhood,property_type,room_type,host_listings_count,monthly_price
0,3344,Downtown/Penn Quarter,DC,4957,2008-12-10,2009-05-09,Downtown/Penn Quarter,Condominium,Private room,2.0,"$1,495.00"
1,3362,Shaw,DC,2798,2008-09-07,2009-01-21,Shaw,Townhouse,Entire home/apt,5.0,
2,3662,Buena Vista,DC,4645,2008-11-26,2011-04-13,Anacostia,House,Private room,3.0,"$1,175.00"
3,3670,U Street Corridor,DC,4630,2008-11-25,2015-03-18,,Townhouse,Private room,1.0,
4,3686,Anacostia,DC,4645,2008-11-26,2010-11-01,Anacostia,House,Private room,3.0,$975.00


In [13]:
#check for nulls

listings.isnull().sum()

id                        0
neighbourhood             0
state                     0
host_id                   0
host_since                4
first_review           1807
host_neighbourhood      926
property_type             0
room_type                 0
host_listings_count       4
monthly_price          8171
dtype: int64

In [6]:
#change dtype of host_since and first_review to datetime and create year columns

listings['first_review'] = pd.to_datetime(listings['first_review'], errors='coerce')
listings['firstreview_year'] = listings['first_review'].dt.strftime('%Y')


listings['host_since'] = pd.to_datetime(listings['host_since'], errors='coerce')
listings['hostsince_year'] = listings['host_since'].dt.strftime('%Y')

In [15]:
listings.to_csv("../data/airbnb/listings_subset.csv", index = False)

In [94]:
ll = listings[['neighbourhood', 'firstreview_year', 'hostsince_year']]

In [97]:
ll = ll.groupby(['neighbourhood', 'firstreview_year']).firstreview_year.agg('count').to_frame('listcount').reset_index()

In [98]:
ll['listings_cumulative'] = ll.groupby(['neighbourhood'])['listcount'].apply(lambda x: x.cumsum())

In [99]:
#drop duplicate hosts from hosts df

hostdata = listings.drop_duplicates("host_id")
hostdata.shape

(4890, 11)

In [100]:
hh = hostdata.groupby(['neighbourhood', 'firstreview_year']).firstreview_year.agg('count').to_frame('hostcount').reset_index()

In [101]:
hh['hosts_cumulative'] = hh.groupby(['neighbourhood'])['hostcount'].apply(lambda x: x.cumsum())

In [102]:
ll.head()

Unnamed: 0,neighbourhood,firstreview_year,listcount,listings_cumulative
0,16th Street Heights,2009,1,1
1,16th Street Heights,2013,3,4
2,16th Street Heights,2014,8,12
3,16th Street Heights,2015,13,25
4,16th Street Heights,2016,24,49


In [103]:
ll.shape

(604, 4)

In [104]:
hh.head()

Unnamed: 0,neighbourhood,firstreview_year,hostcount,hosts_cumulative
0,16th Street Heights,2009,1,1
1,16th Street Heights,2013,3,4
2,16th Street Heights,2014,8,12
3,16th Street Heights,2015,12,24
4,16th Street Heights,2016,24,48


In [46]:
data_list = []

for l in labels:
    df = ll[ll['neighbourhood']==l]
    data_list.append(df)
    

In [67]:
data_host = []

for l in labels:
    df = hh[hh['neighbourhood']==l]
    data_host.append(df)

In [83]:
data_host[0].neighbourhood.unique()[0]

'16th Street Heights'

In [24]:
import plotly
import plotly.express as px

In [49]:
labels = hh.neighbourhood.unique().tolist()

In [50]:
len(labels)

110

In [104]:

fig = go.Figure()


for i, d in enumerate(data_list):
    
    fig.add_trace(go.Scatter( x=d["firstreview_modified_year"], 
                                y=d["listings_cumulative"], 
                                mode = "lines+markers",
                                marker_color="blue",
                                text=d['neighbourhood'],
                                showlegend= True, 
                                legendgroup = labels[i],
                            name = "listings")
                  
                                )

for i, d in enumerate(data_host):
    
    fig.add_trace(go.Scatter( x=d["hostsince_year"], 
                                y=d["hosts_cumulative"], 
                                mode = "lines+markers",
                                #line=dict(color=hh['neighbourhood']),
                                marker_color="black",
                                text=d['neighbourhood'], 
                            legendgroup = labels[i],
                            name = "hosts")
                                
                                )
    

    
### Create buttons for drop down menu
buttons = []
for i, label in enumerate(labels):
    visibility = [i==j for j in range(len(labels))]
    button = dict(
                 label =  label,
                 method = 'update',
                 args = [{'visible': visibility},
                     {'title': label}])
    buttons.append(button)

updatemenus = list([
    dict(x = 1,
        y = 1.15,
        #xref = 'paper',
        #yref = 'paper',
        yanchor = 'top',
        active = 1,
        showactive = False,
        buttons = buttons
    )
])

#fig['layout']['title'] = 'Number of Listings vs Hosts by Neighbourhood'
#fig['layout']['showlegend'] = True
#fig['layout']['updatemenus'] = updatemenus

fig['layout']['title'] = '<b>Number of Listings vs Hosts by Neighbourhood</b>'
fig['layout']['xaxis']['title'] = ''
fig['layout']['xaxis']['type'] = 'date'
fig['layout']['xaxis']['gridcolor'] = '#d3d3d3'
fig['layout']['yaxis']['title'] = '<b>Count</b>'
fig['layout']['yaxis']['type'] = 'linear'
fig['layout']['yaxis']['range'] = [0,1200]
fig['layout']['yaxis']['gridcolor'] = '#d3d3d3'
fig['layout']["plot_bgcolor"]= 'rgb(240, 240, 240)'
fig['layout']["width"]= 800
fig['layout']["height"] = 400
fig['layout']['showlegend'] = False

fig['layout']['updatemenus'] = updatemenus


plotly.offline.plot(fig, "listings.html")
fig.show()
    

In [None]:
- legend labels
- title

In [None]:
fig['layout']['title'] = '<b>Number of Listings vs Hosts by Neighbourhood</b>'
fig['layout']['xaxis']['title'] = '<b>Year</b>'
fig['layout']['xaxis']['gridcolor'] = '#d3d3d3'
fig['layout']['yaxis']['title'] = '<b>Count</b>'
fig['layout']['yaxis']['range'] = [0,1200]
fig['layout']['yaxis']['gridcolor'] = '#d3d3d3'
fig['layout']["plot_bgcolor"]= "white"
fig['layout']["width"]= 800
fig['layout']["height"] = 400



layout = list([
    dict(
        paper_bgcolor = 'rgb(240, 240, 240)',
        plot_bgcolor = 'rgb(240, 240, 240)',
        title = '<b>Number of Listings vs Hosts by Neighbourhood</b>'
        xaxis = dict(
            title = '',
            type = 'date'),
        yaxis = dict(
    title = 'Count',
    type = 'linear'))])
        

In [115]:
#create new df for modified listings aggregrate and new column for cumsum

listdata = listings.groupby(['firstreview_modified_year']).firstreview_modified_year.agg('count').to_frame('list_count').reset_index()
listdata['list_total'] = listdata.list_count.cumsum()

In [116]:
listdata

Unnamed: 0,firstreview_modified_year,list_count,list_total
0,2008,4,4
1,2009,30,34
2,2010,36,70
3,2011,69,139
4,2012,154,293
5,2013,353,646
6,2014,595,1241
7,2015,1178,2419
8,2016,1657,4076
9,2017,1931,6007


In [118]:
hostdata = hostdata.dropna()
hostdata = hostdata.groupby(['hostsince_year']).hostsince_year.agg('count').to_frame('host_count').reset_index()
hostdata['host_total'] = hostdata.host_count.cumsum()

hostdata

Unnamed: 0,hostsince_year,host_count,host_total
0,2008,24,24
1,2009,36,60
2,2010,73,133
3,2011,248,381
4,2012,469,850
5,2013,796,1646
6,2014,1095,2741
7,2015,1143,3884
8,2016,1105,4989
9,2017,526,5515


In [119]:
df = pd.concat([listdata, hostdata], axis=1)

In [121]:
df = df.drop(['hostsince_year'], axis=1)


Unnamed: 0,firstreview_modified_year,list_count,list_total,host_count,host_total
0,2008,4,4,24,24
1,2009,30,34,36,60
2,2010,36,70,73,133
3,2011,69,139,248,381
4,2012,154,293,469,850
5,2013,353,646,796,1646
6,2014,595,1241,1095,2741
7,2015,1178,2419,1143,3884
8,2016,1657,4076,1105,4989
9,2017,1931,6007,526,5515


In [122]:
df = df.rename({'firstreview_modified_year':'year'}, axis=1)

In [123]:
df

Unnamed: 0,year,list_count,list_total,host_count,host_total
0,2008,4,4,24,24
1,2009,30,34,36,60
2,2010,36,70,73,133
3,2011,69,139,248,381
4,2012,154,293,469,850
5,2013,353,646,796,1646
6,2014,595,1241,1095,2741
7,2015,1178,2419,1143,3884
8,2016,1657,4076,1105,4989
9,2017,1931,6007,526,5515


In [124]:
df.to_csv("list_host_yearly_countdata.csv", index=False)

In [137]:
[df['year'][k] for k in range(12)]

['2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019']

In [148]:
# Create figure
fig = go.Figure(
    data=[go.Scatter(x=df['year'], y=df['list_count'],
                     mode="lines",
                     line=dict(width=2, color="blue")),    
          
          go.Scatter(x=df['year'], y=df['list_total'],
                     mode="lines",
                     line=dict(width=2, color="black")),
          
          go.Scatter(x=df['year'], y=df['list_count'],
                     mode="lines", name = "change in listings",
                     line=dict(width=2, color="blue")),
          
          go.Scatter(x=df['year'], y=df['list_total'],
                     mode="lines", name= "listings total",
                     line=dict(width=2, color="black"))
          ],
    layout=go.Layout(
        xaxis=dict(range=[2008,2020], autorange=False, zeroline=False),
        yaxis=dict(range=[0,10000], autorange=False, zeroline=False),
        title_text="Kinematic Generation of a Planar Curve", hovermode="closest",
        updatemenus=[dict(type="buttons",
                          buttons=[dict(label="Play",
                                        method="animate",
                                        args=[None])])]),
    


    frames=[go.Frame(
        data=[go.Scatter(
            x=[df['year'][k]],
            y=[df['list_count'][k]],
            mode="markers",
            name = "listings",
            marker=dict(color="red", size=10)),
              
            go.Scatter(
            x=[df['year'][k]],
            y=[df['list_total'][k]],
            mode="markers",
                name = "hosts",
            marker=dict(color="red", size=10))])

        for k in range(12)])

fig.show()

In [None]:
- add annotations
- add host data
- can use host info w/ dups for listings
- for host: keep first while dropping dups, arranged asc

In [162]:
listings.head()


Unnamed: 0,id,neighbourhood,state,host_id,host_since,property_type,first_review_modified,firstreview_modified_year,hostsince_year
0,3344,Downtown/Penn Quarter,DC,4957,2008-12-10,Condominium,2009-05-09,2009,2008
1,3362,Shaw,DC,2798,2008-09-07,Townhouse,2009-01-21,2009,2008
2,3662,Buena Vista,DC,4645,2008-11-26,House,2011-04-13,2011,2008
3,3670,U Street Corridor,DC,4630,2008-11-25,Townhouse,2015-03-18,2015,2008
4,3686,Anacostia,DC,4645,2008-11-26,House,2010-11-01,2010,2008


In [18]:
prop = listings[['room_type', 'firstreview_year'] ].copy()

In [30]:
k.isnull().sum()

room_type           0
firstreview_year    0
listcount           0
cumu_count          0
dtype: int64

In [24]:
prop = prop.groupby(['room_type', 'firstreview_year']).firstreview_year.agg('count').to_frame('listcount').reset_index()

In [25]:
prop['cumu_count'] = prop.groupby(['room_type'])['listcount'].apply(lambda x: x.cumsum())

In [27]:
prop.room_type.unique()

array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)

In [28]:
types = ['Entire home/apt', 'Private room', 'Shared room']

k = []

for t in types:
    k.append(prop[prop['room_type'] == t])
    
k = pd.concat(k)

In [29]:
k = k.reset_index()

k = k.drop(['index'], axis=1)

k.head()

Unnamed: 0,room_type,firstreview_year,listcount,cumu_count
0,Entire home/apt,2009,3,3
1,Entire home/apt,2010,10,13
2,Entire home/apt,2011,15,28
3,Entire home/apt,2012,46,74
4,Entire home/apt,2013,91,165


In [41]:
k = k.drop([23, 34], axis=0).reset_index()

k = k.drop(['index'] ,axis=1)

In [34]:
k.groupby('room_type').size()

room_type
Entire home/apt    11
Private room       12
Shared room        11
dtype: int64

In [35]:
import plotly

In [63]:
#area chart for listing type

fig = px.area(k, x="firstreview_year", y="cumu_count", color="room_type",
	      line_group="room_type", labels = {"room_type": "Listing Type", "firstreview_year": "Year" ,"cumu_count": "Count"})



fig.update_layout(
    
    xaxis = dict(gridcolor= '#d3d3d3'),
    yaxis = dict(title = 'Count', gridcolor= '#d3d3d3'),
    #width = 900,
    #height = 500,
    showlegend = True,
    title = dict(text = '<b>Airbnb Listing Type Trend</b>', xref = 'container', yref = 'container'), 
    plot_bgcolor = "#FDFEFE")
    
    
    


plotly.offline.plot(fig, "types.html")
fig.show()

