# Airbnb Visualization Project

In this notebook, I'll be exploring the Airbnb dataset to gain insights into how popular NYC Airbnbs are.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Load the raw listings file and preview the first rows.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns with mixed content get one consistent
# dtype. NOTE(review): the recorded output of this cell echoes the read_csv
# line, presumably a DtypeWarning about mixed types - confirm it is gone.
df = pd.read_csv('./data/Airbnb_Open_Data.csv', low_memory=False)
df.head()

  df = pd.read_csv('./data/Airbnb_Open_Data.csv')


Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,...,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [3]:
# Column labels of the loaded frame, for reference when selecting fields.
column_labels = df.columns
column_labels

Index(['id', 'NAME', 'host id', 'host_identity_verified', 'host name',
       'neighbourhood group', 'neighbourhood', 'lat', 'long', 'country',
       'country code', 'instant_bookable', 'cancellation_policy', 'room type',
       'Construction year', 'price', 'service fee', 'minimum nights',
       'number of reviews', 'last review', 'reviews per month',
       'review rate number', 'calculated host listings count',
       'availability 365', 'house_rules', 'license'],
      dtype='object')

In [4]:
# Non-null entries per column; a lower count means more missing values.
non_null_per_column = df.count()
print(non_null_per_column)

id                                102599
NAME                              102349
host id                           102599
host_identity_verified            102310
host name                         102193
neighbourhood group               102570
neighbourhood                     102583
lat                               102591
long                              102591
country                           102067
country code                      102468
instant_bookable                  102494
cancellation_policy               102523
room type                         102599
Construction year                 102385
price                             102352
service fee                       102326
minimum nights                    102190
number of reviews                 102416
last review                        86706
reviews per month                  86720
review rate number                102273
calculated host listings count    102280
availability 365                  102151
house_rules     

In [5]:
# Number of missing (NaN) entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

id                                     0
NAME                                 250
host id                                0
host_identity_verified               289
host name                            406
neighbourhood group                   29
neighbourhood                         16
lat                                    8
long                                   8
country                              532
country code                         131
instant_bookable                     105
cancellation_policy                   76
room type                              0
Construction year                    214
price                                247
service fee                          273
minimum nights                       409
number of reviews                    183
last review                        15893
reviews per month                  15879
review rate number                   326
calculated host listings count       319
availability 365                     448
house_rules     

In [6]:
# Remove the 'license' column from the working frame and show what is left.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a plain
# drop would raise KeyError on a second execution.
df = df.drop(columns=['license'], errors='ignore')
df.columns

Index(['id', 'NAME', 'host id', 'host_identity_verified', 'host name',
       'neighbourhood group', 'neighbourhood', 'lat', 'long', 'country',
       'country code', 'instant_bookable', 'cancellation_policy', 'room type',
       'Construction year', 'price', 'service fee', 'minimum nights',
       'number of reviews', 'last review', 'reviews per month',
       'review rate number', 'calculated host listings count',
       'availability 365', 'house_rules'],
      dtype='object')

In [7]:
# Listings per borough; also surfaces misspelled borough labels.
borough_counts = df['neighbourhood group'].value_counts()
borough_counts

neighbourhood group
Manhattan        43792
Brooklyn         41842
Queens           13267
Bronx             2712
Staten Island      955
brookln              1
manhatan             1
Name: count, dtype: int64

In [8]:
# Misspelled borough labels and the spelling each one is rewritten to.
borough_typos = {
    "brookln": "Brooklyn",
    "manhatan": "Manhattan",
}

boroughs = df['neighbourhood group']
for misspelled, corrected in borough_typos.items():
    boroughs = boroughs.str.replace(misspelled, corrected)
df['neighbourhood group'] = boroughs

# Re-count to confirm only the five expected labels remain.
df['neighbourhood group'].value_counts()

neighbourhood group
Manhattan        43793
Brooklyn         41843
Queens           13267
Bronx             2712
Staten Island      955
Name: count, dtype: int64

In [9]:
# Discard every row that is missing its borough or its neighbourhood.
location_columns = ['neighbourhood group', 'neighbourhood']
df = df.dropna(axis=0, subset=location_columns)

In [10]:
# Re-check missing values per column after dropping rows without a location.
missing_per_column = df.isna().sum()
print(missing_per_column)

id                                    0
NAME                                248
host id                               0
host_identity_verified              281
host name                           405
neighbourhood group                   0
neighbourhood                         0
lat                                   8
long                                  8
country                             529
country code                        128
instant_bookable                    102
cancellation_policy                  73
room type                             0
Construction year                   209
price                               245
service fee                         273
minimum nights                      407
number of reviews                   183
last review                       15889
reviews per month                 15877
review rate number                  324
calculated host listings count      319
availability 365                    436
house_rules                       52118


In [11]:
# Listings per neighbourhood, restricted to rows whose borough is Manhattan.
in_manhattan = df['neighbourhood group'] == "Manhattan"
df.loc[in_manhattan, 'neighbourhood'].value_counts()

neighbourhood
Harlem                 5463
Hell's Kitchen         3965
Upper West Side        3859
Upper East Side        3679
East Village           3489
Midtown                3390
East Harlem            2339
Chelsea                2284
Lower East Side        1948
Washington Heights     1778
Financial District     1490
West Village           1484
Murray Hill             960
Kips Bay                920
Chinatown               789
Greenwich Village       744
Gramercy                693
SoHo                    685
Morningside Heights     635
Theater District        565
Inwood                  548
Nolita                  530
Tribeca                 345
Little Italy            278
Roosevelt Island        159
Flatiron District       158
NoHo                    140
Two Bridges             132
Battery Park City       118
Civic Center            107
Stuyvesant Town          83
Marble Hill              28
Name: count, dtype: int64

In [12]:
# Distinct values in 'country' and how often each occurs.
country_counts = df['country'].value_counts()
country_counts

country
United States    102025
Name: count, dtype: int64

In [13]:
# Distinct values in 'country code' and how often each occurs.
country_code_counts = df['country code'].value_counts()
country_code_counts

country code
US    102426
Name: count, dtype: int64

In [14]:
# Largest and smallest latitude in the data, one per line.
latitudes = df['lat']
print(latitudes.max(), latitudes.min(), sep='\n')

40.91697
40.49979


In [15]:
# Largest and smallest longitude in the data, one per line.
longitudes = df['long']
print(longitudes.max(), longitudes.min(), sep='\n')

-73.70522
-74.24984


In [16]:
# Remove 'country' and 'country code', then show non-null counts per column.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the columns no longer exist.
df = df.drop(columns=['country', 'country code'], errors='ignore')
df.count()

id                                102554
NAME                              102306
host id                           102554
host_identity_verified            102273
host name                         102149
neighbourhood group               102554
neighbourhood                     102554
lat                               102546
long                              102546
instant_bookable                  102452
cancellation_policy               102481
room type                         102554
Construction year                 102345
price                             102309
service fee                       102281
minimum nights                    102147
number of reviews                 102371
last review                        86665
reviews per month                  86677
review rate number                102230
calculated host listings count    102235
availability 365                  102118
house_rules                        50436
dtype: int64

In [17]:
# Share of listings at each review rating (proportions instead of raw counts).
rating_shares = df['review rate number'].value_counts(normalize=True)
rating_shares

review rate number
5.0    0.228495
4.0    0.228084
3.0    0.227497
2.0    0.225873
1.0    0.090052
Name: proportion, dtype: float64

In [18]:
# Keep only rows that have a review rating, then show non-null counts.
rating_column = ['review rate number']
df = df.dropna(axis=0, subset=rating_column)
df.count()

id                                102230
NAME                              101990
host id                           102230
host_identity_verified            101962
host name                         101832
neighbourhood group               102230
neighbourhood                     102230
lat                               102222
long                              102222
instant_bookable                  102142
cancellation_policy               102157
room type                         102230
Construction year                 102029
price                             101985
service fee                       101957
minimum nights                    101843
number of reviews                 102048
last review                        86378
reviews per month                  86391
review rate number                102230
calculated host listings count    101979
availability 365                  101808
house_rules                        50278
dtype: int64

In [19]:
# Data type of each column; 'object' columns hold non-numeric values.
column_dtypes = df.dtypes
column_dtypes

id                                  int64
NAME                               object
host id                             int64
host_identity_verified             object
host name                          object
neighbourhood group                object
neighbourhood                      object
lat                               float64
long                              float64
instant_bookable                   object
cancellation_policy                object
room type                          object
Construction year                 float64
price                              object
service fee                        object
minimum nights                    float64
number of reviews                 float64
last review                        object
reviews per month                 float64
review rate number                float64
calculated host listings count    float64
availability 365                  float64
house_rules                        object
dtype: object

In [20]:
# Look at the raw price values (everything after the first entry) to see
# how they are formatted before cleaning.
raw_prices = df['price'].values
raw_prices[1:]

array(['$142 ', '$620 ', '$368 ', ..., '$988 ', '$546 ', '$1,032 '],
      shape=(102229,), dtype=object)

In [21]:
# Drop rows lacking a price or a service fee, then show non-null counts.
money_columns = ['price', 'service fee']
df = df.dropna(axis=0, subset=money_columns)
df.count()

id                                101746
NAME                              101509
host id                           101746
host_identity_verified            101483
host name                         101353
neighbourhood group               101746
neighbourhood                     101746
lat                               101738
long                              101738
instant_bookable                  101666
cancellation_policy               101677
room type                         101746
Construction year                 101555
price                             101746
service fee                       101746
minimum nights                    101359
number of reviews                 101564
last review                        85944
reviews per month                  85958
review rate number                101746
calculated host listings count    101495
availability 365                  101324
house_rules                        50072
dtype: int64

In [22]:
# Convert price strings such as '$1,032 ' into integers.
# - astype(str) keeps the cell re-runnable: after a first run the column is
#   already integer and the .str accessor would otherwise raise.
# - A single explicit regex pass strips '$', ',' and spaces. Inside a
#   character class '$' is a literal, so this does not depend on the default
#   value of the `regex` argument.
df['price'] = (
    df['price']
    .astype(str)
    .str.replace(r'[$, ]', '', regex=True)
    .astype('int')
)

In [26]:
# Listings per room type.
room_type_counts = df['room type'].value_counts()
room_type_counts

room type
Entire home/apt    53277
Private room       46147
Shared room         2208
Hotel room           114
Name: count, dtype: int64

In [27]:
# How many listings share each 'last review' date string.
last_review_counts = df['last review'].value_counts()
last_review_counts

last review
6/23/2019     2427
6/30/2019     2216
7/1/2019      2199
6/24/2019     1484
7/7/2019      1143
              ... 
12/22/2014       1
7/10/2014        1
8/15/2014        1
8/17/2014        1
5/21/2022        1
Name: count, Length: 2451, dtype: int64

In [28]:
# Convert service-fee strings such as '$193' into integers.
# - astype(str) keeps the cell re-runnable: after a first run the column is
#   already integer and the .str accessor would otherwise raise.
# - A single explicit regex pass strips '$', ',' and spaces. Inside a
#   character class '$' is a literal, so this does not depend on the default
#   value of the `regex` argument.
df['service fee'] = (
    df['service fee']
    .astype(str)
    .str.replace(r'[$, ]', '', regex=True)
    .astype('int')
)

In [30]:
# Remove 'last review' and 'reviews per month' from the working frame.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the columns no longer exist.
df = df.drop(columns=['last review', 'reviews per month'], errors='ignore')

In [33]:
# Show the rows whose 'number of reviews' is missing.
review_count_missing = df['number of reviews'].isna()
df[review_count_missing]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,...,room type,Construction year,price,service fee,minimum nights,number of reviews,review rate number,calculated host listings count,availability 365,house_rules
141,1079210,"Modern Greenpoint, Brooklyn Apt",63891709973,verified,Martin,Brooklyn,Greenpoint,40.73409,-73.95348,False,...,Entire home/apt,2016.0,488,98,,,4.0,1.0,325.0,
566,1313938,East Village House -- Unique!,77437300795,unconfirmed,Alisa,Manhattan,East Village,40.72956,-73.97903,False,...,Entire home/apt,2007.0,844,169,1.0,,3.0,1.0,85.0,
1066,1590088,"No Inq,Read it, 1 BR, Rt of Subway,",16471166561,verified,Sharma,Queens,Jackson Heights,40.74906,-73.89377,True,...,Private room,,399,80,7.0,,5.0,3.0,126.0,House Rules The house rules are an important p...
1591,1880045,Space! Light! Charm! 1BR close to subways & park,5336294653,verified,Ellen,Manhattan,Upper West Side,40.79241,-73.97111,False,...,Entire home/apt,2021.0,564,113,3.0,,5.0,1.0,194.0,
2141,2183810,Manhattan's Best Deal!,52753082069,verified,Adrianne,Manhattan,East Harlem,40.80626,-73.94009,False,...,Entire home/apt,2015.0,976,195,3.0,,1.0,2.0,287.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91027,51275548,"Large, Sunny Studio in the heart of Chelsea.",2477107287,unconfirmed,Eunice,Manhattan,Chelsea,40.74089,-74.00002,False,...,Entire home/apt,2021.0,695,139,2.0,,1.0,1.0,0.0,
91831,51719597,Upper East Side Cozy Apartment,93404114754,unconfirmed,Vladimir,Manhattan,Upper East Side,40.76830,-73.95919,False,...,Entire home/apt,2003.0,151,30,4.0,,1.0,1.0,4.0,1. No parties. 2. Respect the neighbors. Nois...
99736,56085529,Bright and Charming Private Room in Williamburg!,32648903471,unconfirmed,Naveen,Brooklyn,Williamsburg,40.71365,-73.96232,True,...,Private room,2022.0,546,109,3.0,,1.0,1.0,188.0,"- cleaning, quiet, friendly, no drug, no smoking,"
100590,56557193,Room at Home in Lower East Side,95585296622,unconfirmed,Andrea,Manhattan,Lower East Side,40.71833,-73.98556,True,...,Private room,2015.0,671,134,4.0,,4.0,2.0,244.0,I spent a lot of time and effort renovating my...


In [34]:
# Replace a missing review count with 0.
review_counts = df['number of reviews']
df['number of reviews'] = review_counts.fillna(value=0)

In [35]:
# Missing values per column after the cleaning steps above.
missing_per_column = df.isna().sum()
print(missing_per_column)

id                                    0
NAME                                237
host id                               0
host_identity_verified              263
host name                           393
neighbourhood group                   0
neighbourhood                         0
lat                                   8
long                                  8
instant_bookable                     80
cancellation_policy                  69
room type                             0
Construction year                   191
price                                 0
service fee                           0
minimum nights                      387
number of reviews                     0
review rate number                    0
calculated host listings count      251
availability 365                    422
house_rules                       51674
dtype: int64


In [24]:
# One filtered frame per borough, unpacked in the same order as the labels.
manhattan_df, brooklyn_df, queens_df, bronx_df, statenI_df = (
    df[df['neighbourhood group'] == borough]
    for borough in ("Manhattan", "Brooklyn", "Queens", "Bronx", "Staten Island")
)

In [25]:
# Mean review rating and mean price for each Manhattan neighbourhood.
by_neighbourhood = manhattan_df.groupby('neighbourhood')
manhattan_group = by_neighbourhood[['review rate number', 'price']].agg('mean')

In [36]:
# Save the cleaned frame. index=False leaves the row index out of the file;
# with the default, it is written as an extra unnamed first column that shows
# up as 'Unnamed: 0' when the CSV is read back with a plain read_csv.
# NOTE(review): if a downstream reader uses index_col=0, update it to match.
df.to_csv('./data/clean_airbnb_data.csv', index=False)