Airbnb scraper by APIFY
https://apify.com/dtrungtin/airbnb-scraper

## Dataset structure

Here we are dealing with a dataset that has only a few columns (we selected the Excel-friendly export format). Presumably a richer, more complex dataset can be obtained by changing that option.

This dataset was extracted on **08-June-2023** and the scrape period was **6 months**

`dataset_airbnb-scraper_2023-06-08_14-57-41-189.json`

|Column|Type|Description|
|------|----|-----------|
|url|str|URL of the listing|
|name|str|Name of the listing|
|stars|float|Star rating of the listing|
|numberOfGuests|int|Max number of occupants|
|address|str|City, State, Country|
|roomType|str|Type of the listing (e.g., Full apartment, house)|
|location|dict|`{lat, lng}`|
|reviews|list||
|pricing|dict|Dictionary containing currency, rate, etc|
|photos|list|URLs of the listing's photos|
|primaryHost|dict|Details of the host|
|additionalHosts|list|Details of any additional hosts|
|isHostedBySuperhost|bool|Is the host SuperHost?|
|isAvailable|bool|Is the listing Available?|
|calendar|list|List with pairs of values `{Available, Date}`|
|occupancyPercentage|float|Percentage of occupancy in the specified period|

* Table made with the information extracted from the `preliminar.ipynb` notebook 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Apify Airbnb-scraper JSON export into a DataFrame and list its columns.
DATASET_PATH = 'dataset_airbnb-scraper_2023-06-08_14-57-41-189.json'
data = pd.read_json(DATASET_PATH)
data.columns

Index(['url', 'name', 'stars', 'numberOfGuests', 'address', 'roomType',
       'location', 'reviews', 'pricing', 'photos', 'primaryHost',
       'additionalHosts', 'isHostedBySuperhost', 'isAvailable', 'calendar',
       'occupancyPercentage'],
      dtype='object')

In [7]:
# Keep only the columns used in the analysis; copy so later edits never touch `data`.
selected_columns = [
    'stars',
    'numberOfGuests',
    'roomType',
    'location',
    'pricing',
    'isHostedBySuperhost',
    'occupancyPercentage',
]
df = data.loc[:, selected_columns].copy()
df.head()

Unnamed: 0,stars,numberOfGuests,roomType,location,pricing,isHostedBySuperhost,occupancyPercentage
0,4.88,4,Alojamiento entero: apto. residencial,"{'lat': 6.204, 'lng': -75.564}","{'rate': {'amount': 159, 'amountFormatted': '$...",False,0.97
1,,3,Alojamiento entero: apto. residencial,"{'lat': 6.24723, 'lng': -75.59545}","{'rate': {'amount': 237, 'amountFormatted': '$...",False,3.4
2,4.69,5,Alojamiento entero: apartamento con servicios,"{'lat': 6.201, 'lng': -75.574}","{'rate': {'amount': 188, 'amountFormatted': '$...",True,16.99
3,4.19,6,Alojamiento entero: piso,"{'lat': 6.21169, 'lng': -75.57166}","{'rate': {'amount': 180, 'amountFormatted': '$...",False,3.4
4,,2,Habitación privada en: bed and breakfast,"{'lat': 6.20033, 'lng': -75.56914}","{'rate': {'amount': 176, 'amountFormatted': '$...",True,13.11


In [8]:
# Peek at one `location` entry (row label 0) to see how coordinates are stored.
data.loc[0, 'location']

{'lat': 6.204, 'lng': -75.564}

In [9]:
def typeofproperty(p):
    """Collapse an Airbnb ``roomType`` label into a coarse property category.

    The label is matched case-insensitively against Spanish keywords, checked
    in priority order: hotel first, then apartment-like, then house-like.

    Parameters
    ----------
    p : str
        Room-type label, e.g. ``'Alojamiento entero: apto. residencial'``.

    Returns
    -------
    str or None
        ``'hotel'``, ``'apartamento'`` or ``'casa'``; ``None`` when ``p`` is
        not a string (e.g. NaN) or contains none of the known keywords.
    """
    # Missing values arrive as NaN/None; `in` on them would raise TypeError.
    if not isinstance(p, str):
        return None

    # Labels are capitalised inconsistently ("Habitación ...", "Alojamiento
    # entero: casa"), so compare in lower case; otherwise 'habit' can never
    # match a label that starts with "Habitación".
    label = p.lower()

    if 'hotel' in label:
        return 'hotel'

    apartment_keywords = ('habit', 'piso', 'loft', 'privada',
                          'apartamento', 'apto. residencial')
    if any(keyword in label for keyword in apartment_keywords):
        return 'apartamento'

    house_keywords = ('casa', 'granja', 'villa', 'adosado')
    if any(keyword in label for keyword in house_keywords):
        return 'casa'

    # "Alojamiento entero: vivienda ..." (an entire dwelling) is treated as a house.
    if 'entero' in label and 'vivienda' in label:
        return 'casa'

    # Unknown label: make the fall-through explicit.
    return None



# Build the flat analysis frame straight from the raw `data` frame instead of
# mutating `df` in place. The previous in-place `drop` removed the source
# columns, so running the cell a second time failed with a KeyError; deriving
# everything from `data` makes the cell safe to re-run.
df = data[['stars', 'numberOfGuests', 'occupancyPercentage']].assign(
    # Nightly rate is nested under pricing -> rate -> amount.
    pricepernight=data['pricing'].apply(lambda pricing: pricing['rate']['amount']),
    # Coarse category derived from the free-text room type.
    propertyType=data['roomType'].apply(typeofproperty),
    # 1 for superhosts, 0 otherwise (anything that is not True maps to 0).
    superhost=data['isHostedBySuperhost'].eq(True).astype('int64'),
    # Split the {'lat': ..., 'lng': ...} dict into two numeric columns.
    latitude=data['location'].apply(lambda loc: loc['lat']),
    longitude=data['location'].apply(lambda loc: loc['lng']),
)
df

Unnamed: 0,stars,numberOfGuests,occupancyPercentage,pricepernight,propertyType,superhost,latitude,longitude
0,4.88,4,0.97,159,apartamento,0,6.20400,-75.56400
1,,3,3.40,237,apartamento,0,6.24723,-75.59545
2,4.69,5,16.99,188,apartamento,1,6.20100,-75.57400
3,4.19,6,3.40,180,apartamento,0,6.21169,-75.57166
4,,2,13.11,176,apartamento,1,6.20033,-75.56914
...,...,...,...,...,...,...,...,...
1234,4.96,5,21.36,54,apartamento,1,6.15719,-75.60836
1235,4.78,4,6.80,54,apartamento,1,6.19552,-75.57851
1236,4.50,6,22.33,57,apartamento,0,6.20688,-75.56564
1237,4.82,2,0.97,50,apartamento,1,6.20928,-75.55877


In [54]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `df`.
df.describe()
    

Unnamed: 0,stars,numberOfGuests,occupancyPercentage,pricepernight,superhost,latitude,longitude
count,1115.0,1239.0,1239.0,1239.0,1239.0,1239.0,1239.0
mean,4.78017,4.168684,25.568475,126.456013,0.493947,6.207146,-75.570225
std,0.197935,2.516143,25.119167,93.726613,0.500165,0.025904,0.015205
min,3.5,1.0,0.0,50.0,0.0,5.87329,-75.69834
25%,4.7,2.0,4.85,64.0,0.0,6.2,-75.573685
50%,4.82,4.0,17.96,92.0,0.0,6.20818,-75.568
75%,4.92,5.0,39.32,144.5,1.0,6.21206,-75.564175
max,5.0,16.0,100.0,500.0,1.0,6.2689,-75.48079


In [6]:
# Pairwise Pearson correlation between the numeric columns.
# `df` also holds the string column `propertyType`; without numeric_only=True
# pandas >= 2.0 raises on it (and pandas 1.5 emits a FutureWarning).
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,stars,numberOfGuests,occupancyPercentage,pricepernight,superhost,latitude,longitude
stars,1.0,-0.032801,0.06964,0.050697,0.414078,0.003538,0.044176
numberOfGuests,-0.032801,1.0,0.045062,0.593515,-0.156761,-0.276945,0.001434
occupancyPercentage,0.06964,0.045062,1.0,-0.130454,0.047826,-0.017641,-0.011657
pricepernight,0.050697,0.593515,-0.130454,1.0,-0.117911,-0.18906,0.127666
superhost,0.414078,-0.156761,0.047826,-0.117911,1.0,0.157378,-0.034236
latitude,0.003538,-0.276945,-0.017641,-0.18906,0.157378,1.0,0.025626
longitude,0.044176,0.001434,-0.011657,0.127666,-0.034236,0.025626,1.0


In [53]:
import folium
import branca

# Base map centred on a fixed starting coordinate.
# NOTE(review): the name `map` shadows Python's built-in `map`; it is kept
# because other cells may refer to it.
map = folium.Map(location=[6.225, -75.5812], zoom_start=14)

# Linear yellow-to-red colour scale spanning the observed nightly-price range.
price_low, price_high = df.pricepernight.min(), df.pricepernight.max()
colormap = branca.colormap.linear.YlOrRd_09.scale(price_low, price_high)

# One circle per listing, coloured (outline and fill) by its nightly price.
for lat, lng, price in zip(df['latitude'], df['longitude'], df['pricepernight']):
    marker_colour = colormap(price)
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7,
    ).add_to(map)

map
