# **Airbnb Web Scraping**

**Imports and initializations**

In [1]:
import requests as rq
import bs4
import pandas as pd

In [2]:
# Starting point of the crawl: an Airbnb search-results URL whose query string
# asks for homes in "Bali, Indonesia" with checkin=2020-12-29 and
# checkout=2021-01-03. It is passed to get_page() to fetch the first page.
url = "https://www.airbnb.com/s/Bali--Indonesia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Bali%2C%20Indonesia&place_id=ChIJoQ8Q6NNB0S0RkOYkS7EPkSQ&checkin=2020-12-29&checkout=2021-01-03&source=structured_search_input_header&search_type=autocomplete_click"

Targeted get functions

In [3]:
def get_page(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 30 so existing ``get_page(url)`` calls keep working.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within ``timeout`` seconds.
    """
    # requests applies no timeout unless one is given, so a single stalled
    # connection would otherwise block the whole crawl indefinitely.
    response = rq.get(url, timeout=timeout)
    return bs4.BeautifulSoup(response.text, 'html.parser')

In [4]:
def get_listings(soup):
    """Collect the listing containers from a parsed search-results page.

    A listing container is a ``div`` whose ``itemprop`` attribute equals
    ``itemListElement``. The result of ``soup.find_all`` for that query is
    returned unchanged.
    """
    attribute_filter = {"itemprop": "itemListElement"}
    return soup.find_all("div", attribute_filter)

In [5]:
def get_listing_title(listing):
    """Return the title text of a listing, or None when it has no title node.

    The title is read from the first ``div`` with class ``_bzh5lkq`` found
    inside ``listing``.
    """
    # Look the node up once and reuse it instead of searching the tree twice.
    title_tag = listing.find("div", {"class": "_bzh5lkq"})
    if title_tag is None:
        return None
    return title_tag.text

In [6]:
def get_listing_subtitle(listing):
    """Return the subtitle text of a listing, or None when it is missing.

    The subtitle is read from the first ``div`` with class ``_167qordg``
    found inside ``listing``.
    """
    # Look the node up once and reuse it instead of searching the tree twice.
    subtitle_tag = listing.find("div", {"class": "_167qordg"})
    if subtitle_tag is None:
        return None
    return subtitle_tag.text

In [7]:
def get_listing_info(listing):
    """Return the text of the first info line that mentions guests.

    Every ``div`` with class ``_kqh46o`` inside ``listing`` is inspected in
    document order; the text of the first one containing the substring
    ``"guest"`` is returned.

    Returns:
        The matching text, or None when no such line exists or the lookup
        raises an ordinary error (for example ``listing`` is None).
    """
    try:
        for candidate in listing.find_all("div", {"class": "_kqh46o"}):
            if "guest" in candidate.text:
                return candidate.text
        return None
    except Exception:
        # Best effort: a malformed listing yields None. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # still stop a long-running crawl.
        return None

In [8]:
def get_listing_ammenities(listing):
    """Return the text of the second ``_kqh46o`` line of a listing.

    The listing's ``div`` elements with class ``_kqh46o`` are collected and
    the text of the one at index 1 is returned; the rest of the notebook
    treats that text as the amenities line.

    Returns:
        The text of the second matching element, or None when fewer than two
        exist or the lookup raises an ordinary error.
    """
    try:
        detail_lines = listing.find_all("div", {"class": "_kqh46o"})
        if len(detail_lines) > 1:
            return detail_lines[1].text
        return None
    except Exception:
        # Best effort: a malformed listing yields None. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # still stop a long-running crawl.
        return None

In [9]:
def get_listing_rating(listing):
    """Return the numeric rating of a listing as a float.

    Scans the ``span`` elements with class ``_krjbj`` in document order. For
    the first one whose text contains ``"Rating"``, the second
    space-separated token is converted with ``float``.

    Returns:
        The rating, or None when no span mentions a rating, the token cannot
        be converted, or the lookup raises an ordinary error.
    """
    try:
        for span in listing.find_all("span", {"class": "_krjbj"}):
            if "Rating" in span.text:
                return float(span.text.split(" ")[1])
        return None
    except Exception:
        # Best effort: unparsable text yields None. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit still
        # stop a long-running crawl.
        return None

In [10]:
def get_listing_reviews(listing):
    """Return the number of reviews of a listing as an int.

    Scans the ``span`` elements with class ``_krjbj`` in document order. For
    the first one whose text contains ``" review"``, the first
    space-separated token is converted with ``int``.

    Returns:
        The review count, or None when no span mentions reviews, the token
        cannot be converted, or the lookup raises an ordinary error.
    """
    try:
        for span in listing.find_all("span", {"class": "_krjbj"}):
            if " review" in span.text:
                # Drop thousands separators so "1,234 reviews" parses to 1234
                # instead of failing int() and being reported as missing.
                return int(span.text.split(" ")[0].replace(",", ""))
        return None
    except Exception:
        # Best effort: unparsable text yields None. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit still
        # stop a long-running crawl.
        return None

In [11]:
def get_listing_price_per_night(listing):
    """Return the nightly price of a listing as a digit string.

    Scans the ``div`` elements with class ``_1fwiw8gv`` in document order and
    uses the first one whose text contains ``"Previous"`` or ``"Price"``:

    * ``"Previous"`` present: the text between the first ``$`` and the next
      capital ``D`` is returned.
    * otherwise, ``"Price"`` present: the text between the first ``$`` and
      the next space is returned.

    Commas are removed from the result in both cases.

    Returns:
        The price as a string (not converted to a number), or None when no
        element matches, the text has no ``$``, or the lookup raises an
        ordinary error.
    """
    try:
        for block in listing.find_all("div", {"class": "_1fwiw8gv"}):
            price_text = block.text
            if "Previous" in price_text:
                return price_text.split("$")[1].split("D")[0].replace(",", "")
            elif "Price" in price_text:
                return price_text.split("$")[1].split(" ")[0].replace(",", "")
        return None
    except Exception:
        # Best effort: unparsable text yields None. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit still
        # stop a long-running crawl.
        return None

In [12]:
def get_listing_total_price(listing):
    """Return the total stay price of a listing as a digit string.

    The price is read from the first ``button`` with class ``_ebe4pze``: the
    first space-separated token of its text, with ``$`` and ``,`` removed.

    Returns:
        The price as a string (not converted to a number), or None when the
        listing has no such button.
    """
    # Look the node up once and reuse it instead of searching the tree twice.
    total_button = listing.find("button", {"class": "_ebe4pze"})
    if total_button is None:
        return None
    first_token = total_button.text.split(" ")[0]
    return first_token.replace("$", "").replace(",", "")

In [13]:
# Prefix prepended to the href of the "Next" pagination link.
base_url = "https://airbnb.com"

def find_next_page(page):
    """Return the absolute URL of the next results page, or None.

    Looks for the first ``a`` element whose ``aria-label`` is ``"Next"`` in
    ``page`` and prepends ``base_url`` to its ``href`` attribute.

    Returns:
        The next page's URL, or None when there is no such link, the link
        has no ``href``, or the lookup raises an ordinary error.
    """
    try:
        next_link = page.find("a", {"aria-label": "Next"})
        return base_url + next_link['href']
    except Exception:
        # A missing link (None is not subscriptable) or a missing href both
        # mean "no further page". Catching Exception rather than using a bare
        # except lets KeyboardInterrupt/SystemExit still stop the crawl.
        return None

Retrieving info with get functions

In [14]:
# One list per scraped field; index k of every list describes the same listing.
title, subtitle, info, ammenities = [], [], [], []
rating, reviews, price_per_night, total_price = [], [], [], []

# Each output list paired with the getter that fills it. The order matches
# the order in which the fields are extracted for every listing.
collectors = [
    (title, get_listing_title),
    (subtitle, get_listing_subtitle),
    (info, get_listing_info),
    (ammenities, get_listing_ammenities),
    (rating, get_listing_rating),
    (reviews, get_listing_reviews),
    (price_per_night, get_listing_price_per_night),
    (total_price, get_listing_total_price),
]

page = get_page(url)
morePages = True

# Walk the result pages until the current page has no "Next" link.
while morePages:
    listings = get_listings(page)

    for listing in listings:
        for column, getter in collectors:
            column.append(getter(listing))

    next_url = find_next_page(page)
    morePages = next_url is not None
    if morePages:
        page = get_page(next_url)

Getting more granular data

In [15]:
def _parse_info(text):
    """Extract (guests, bedrooms, baths) from one "·"-separated info string.

    Each segment is stripped and classified by substring, checked in the
    order "guest", "bedroom", "bath". The segment's first space-separated
    token is converted with ``int`` for guests and bedrooms and with
    ``float`` for baths. A bath segment whose first token is not numeric
    counts as 0.5 (presumably text such as "Half-bath" - confirm against
    real pages).

    Only the first segment of each kind is used, so the caller always
    receives exactly one value per column. A field with no segment is None.

    Raises:
        ValueError: If a guest or bedroom segment does not start with an
            integer.
    """
    guest_count = bedroom_count = bath_count = None
    for segment in text.split("·"):
        segment = segment.strip()
        leading_token = segment.split(" ")[0]
        if "guest" in segment:
            if guest_count is None:
                guest_count = int(leading_token)
        elif "bedroom" in segment:
            if bedroom_count is None:
                bedroom_count = int(leading_token)
        elif "bath" in segment:
            if bath_count is None:
                try:
                    bath_count = float(leading_token)
                except ValueError:
                    bath_count = 0.5
    return guest_count, bedroom_count, bath_count


guests = []
bedrooms = []
baths = []

# Append exactly once per listing so the three lists stay the same length as
# the other scraped columns. Appending inside the segment loop would add
# extra entries whenever a keyword occurred twice in one info string, and the
# DataFrame could then no longer be built.
for info_text in info:
    if info_text is None:
        guest_count, bedroom_count, bath_count = None, None, None
    else:
        guest_count, bedroom_count, bath_count = _parse_info(info_text)
    guests.append(guest_count)
    bedrooms.append(bedroom_count)
    baths.append(bath_count)

In [16]:
wifi = []
kitchen = []
air_conditioning = []
pool = []

# Every listing contributes exactly one entry to each flag list: None when no
# amenities text was scraped, otherwise 1 if the amenity name appears as a
# "·"-separated item and 0 if it does not. Testing membership in a set
# (instead of appending inside the item loop) keeps the lists aligned even if
# an amenity name is repeated within one listing.
for amm in ammenities:
    if amm is None:
        has_wifi = has_kitchen = has_ac = has_pool = None
    else:
        items = {item.strip() for item in amm.split("·")}
        has_wifi = 1 if 'Wifi' in items else 0
        has_kitchen = 1 if 'Kitchen' in items else 0
        has_ac = 1 if 'Air conditioning' in items else 0
        has_pool = 1 if 'Pool' in items else 0
    wifi.append(has_wifi)
    kitchen.append(has_kitchen)
    air_conditioning.append(has_ac)
    pool.append(has_pool)

Saving as pandas dataframe

In [17]:
# Map each output column name to the list that holds its values. The raw
# info and amenities strings are not included; only the fields derived from
# them are.
columns = {
    "title": title,
    "subtitle": subtitle,
    "rating": rating,
    "reviews": reviews,
    "price_per_night": price_per_night,
    "total_price": total_price,
    "guests": guests,
    "bedrooms": bedrooms,
    "baths": baths,
    "wifi": wifi,
    "kitchen": kitchen,
    "air_conditioning": air_conditioning,
    "pool": pool,
}
df = pd.DataFrame(data=columns)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,title,subtitle,rating,reviews,price_per_night,total_price,guests,bedrooms,baths,wifi,kitchen,air_conditioning,pool
0,Beautiful villa on the edge of BLUE LAGOON,Entire villa in Nusa Ceningan,4.82,310.0,92,350,2,1.0,1.0,1.0,0.0,1.0,1.0
1,Bali Bustle Co-Living and Co-working space V,Hotel room in Kuta,4.57,7.0,25,153,2,,1.0,1.0,1.0,1.0,1.0
2,Cozy Bobo Hostel and Working Space,Hotel room in Bali,4.8,45.0,9,34,1,1.0,0.5,1.0,1.0,1.0,0.0
3,Seminyak Beach Private Villa 3 Rooms W/ Pool. ...,Entire villa in Kuta,4.95,42.0,97,555,8,3.0,3.0,1.0,1.0,1.0,1.0
4,Lovely 3 bedroom home directly on Bingin Beach,Entire villa in Bingin Beach,4.92,88.0,550,3138,6,3.0,1.5,1.0,1.0,1.0,0.0


Exporting to csv file

In [18]:
# Write the assembled table to "airbnb.csv" in the current working directory.
# No index argument is passed, so pandas' default index handling applies.
df.to_csv("airbnb.csv")