# Part 1: Get All Listings

Import every package used in this notebook:

In [1]:
# Standard library
import copy
import os
import time

# Part 1: Get All Listings
import requests
from bs4 import BeautifulSoup

# Part 2: Get All Features
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

### Functions

Function for obtaining HTML content of a page:

In [2]:
def scrape_page(page_url, timeout=30):
    """Download a page with ``requests`` and parse it with BeautifulSoup.

    The request is sent with a desktop Firefox ``User-agent`` header and a
    hard-coded proxy mapping.

    Args:
        page_url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive host.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request times out, cannot
            be completed, or the server answers with an HTTP error status.
    """
    # requests picks a proxy by URL scheme, so this entry applies to http:// URLs only.
    proxy = {"http" : "http://138.197.148.215:80"}
    headers = {'User-agent':'Mozilla/5.0 (X11; Linux i686; rv:100.0) Gecko/20100101 Firefox/100.0.'}
    r = requests.get(page_url, proxies=proxy, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it held results.
    r.raise_for_status()
    return BeautifulSoup(r.text, features="html.parser")

Function for obtaining the URLs of each search page (total 15)

In [3]:
# url = the SECOND page of the search on AirBnB
def get_page_urls(url, n_pages=15, page_size=20):
    """Build the URL of every search-result page from one sample page URL.

    The value of the ``items_offset`` query parameter in ``url`` is replaced
    by ``0, page_size, 2 * page_size, ...``; everything else in the URL is
    kept as is. The offset already present in ``url`` may be any value, and
    the parameter may be the last one in the query string.

    Args:
        url: Search-page URL that contains ``items_offset=<number>``.
        n_pages: How many page URLs to generate.
        page_size: Number of results per page (step between offsets).

    Returns:
        A list of ``n_pages`` URLs, ordered from the first page onwards.

    Raises:
        ValueError: If ``url`` has no ``items_offset=`` parameter.
    """
    prefix, marker, rest = url.partition("items_offset=")
    if not marker:
        raise ValueError("url must contain an 'items_offset=' query parameter")

    # Drop the old offset value: it runs up to the next "&" (or to the end of the URL).
    _old_offset, separator, suffix = rest.partition("&")

    return [
        f"{prefix}items_offset={page * page_size}{separator}{suffix}"
        for page in range(n_pages)
    ]

Function that takes a set of search parameters (tag, attribute filter, and what to extract) and pulls the matching data out of a parsed page:

In [4]:
def extract_element_data(soup, params):
    """Look up elements in ``soup`` and return their text or an attribute.

    ``params`` keys:
        'tag'      -- tag name to search for (required).
        'itemprop' -- if present, only tags with this ``itemprop`` value match.
        'class'    -- otherwise, if present, only tags with this class match.
        'get'      -- if present, read this attribute instead of the tag text.
        'order'    -- index of the match to return (default 0); -1 joins all
                      matches with the separator '**__**'.
    """
    tag_name = params['tag']

    # 1. Collect the matching tags; 'itemprop' takes priority over 'class'.
    for attr_name in ('itemprop', 'class'):
        if attr_name in params:
            matches = soup.find_all(tag_name, {attr_name: params[attr_name]})
            break
    else:
        matches = soup.find_all(tag_name)

    # 2. Turn every match into a string: an attribute value or the tag text.
    if 'get' in params:
        attribute = params['get']
        values = [node.get(attribute) for node in matches]
    else:
        values = [node.get_text() for node in matches]

    # 3. Return one value by position, or all of them joined together.
    position = params.get('order', 0)
    return '**__**'.join(values) if position == -1 else values[position]

Define parameters. Obtain the name and url of each listing on a page.

In [5]:
# Extraction rules for one search-result entry. Both fields are read from the
# 'content' attribute of a <meta> tag whose itemprop equals the field name.
RULES_SEARCH_PAGE = {
    field: {'tag': 'meta', 'itemprop': field, 'get': 'content'}
    for field in ('name', 'url')
}

Function for adding 'https://' to the beginning of every link: the scraped URLs have no scheme, and they cannot be opened without one.

In [6]:
def add_https(listing_list):
    """Prefix every listing URL with 'https://' unless it already has a scheme.

    The dicts in ``listing_list`` are modified in place and the same list is
    returned. URLs that already start with 'http://' or 'https://' are left
    alone, so running the function more than once is harmless.

    Args:
        listing_list: List of dicts, each with a 'url' key holding a string.

    Returns:
        ``listing_list`` itself, with the 'url' values updated.
    """
    for listing in listing_list:
        link = listing['url']
        if not link.startswith(("http://", "https://")):
            listing['url'] = "https://" + link
    return listing_list

### Main Program

The AirBnB search page contains 15 pages. Each page contains 20 listings, including the last page.

Enter the url of the **SECOND** search page here:

In [7]:
# Search-results URL copied from the browser. It must contain the
# "items_offset=" query parameter, which get_page_urls() rewrites for each page.
url = "https://www.airbnb.com/s/Kyoto--Japan/homes?adults=2&place_id=ChIJ8cM8zdaoAWARPR27azYdlsA&refinement_paths%5B%5D=%2Fhomes&tab_id=home_tab&query=Kyoto%2C%20Japan&flexible_trip_lengths%5B%5D=one_week&date_picker_type=flexible_dates&flexible_trip_dates%5B%5D=august&source=structured_search_input_header&search_type=filter_change&federated_search_session_id=6dcf8468-1875-467f-a28f-a14b26a721d9&pagination_search=true&items_offset=20&section_offset=2"

First step is to obtain the URLs of all 15 search pages, then the HTML of each page.

In [8]:
page_urls = get_page_urls(url)

# Show the first 5. Check if these URLs work.
page_urls[:5]

['https://www.airbnb.com/s/Kyoto--Japan/homes?adults=2&place_id=ChIJ8cM8zdaoAWARPR27azYdlsA&refinement_paths%5B%5D=%2Fhomes&tab_id=home_tab&query=Kyoto%2C%20Japan&flexible_trip_lengths%5B%5D=one_week&date_picker_type=flexible_dates&flexible_trip_dates%5B%5D=august&source=structured_search_input_header&search_type=filter_change&federated_search_session_id=6dcf8468-1875-467f-a28f-a14b26a721d9&pagination_search=true&items_offset=0&section_offset=2',
 'https://www.airbnb.com/s/Kyoto--Japan/homes?adults=2&place_id=ChIJ8cM8zdaoAWARPR27azYdlsA&refinement_paths%5B%5D=%2Fhomes&tab_id=home_tab&query=Kyoto%2C%20Japan&flexible_trip_lengths%5B%5D=one_week&date_picker_type=flexible_dates&flexible_trip_dates%5B%5D=august&source=structured_search_input_header&search_type=filter_change&federated_search_session_id=6dcf8468-1875-467f-a28f-a14b26a721d9&pagination_search=true&items_offset=20&section_offset=2',
 'https://www.airbnb.com/s/Kyoto--Japan/homes?adults=2&place_id=ChIJ8cM8zdaoAWARPR27azYdlsA&refin

Now for each search page, obtain 20 listing names and URLs (total 300 listings)

In [9]:
# Collect one {'name': ..., 'url': ...} record per search result, page by page.
listing_urls = []

for page in page_urls:
    search_html = scrape_page(page)
    # Each search result is looked up as a <div itemprop="itemListElement"> container.
    listing_html = search_html.find_all("div", attrs={"itemprop":"itemListElement"})

    for listing in listing_html:
        # Apply every rule in RULES_SEARCH_PAGE to this listing's container.
        features_dict = {
            feature: extract_element_data(listing, rule)
            for feature, rule in RULES_SEARCH_PAGE.items()
        }
        listing_urls.append(features_dict)

In [10]:
# Sanity check: 15 search pages x 20 results each should give 300 records.
# If you have 300 listings, the listings have been saved correctly.
len(listing_urls)

300

The scraped URLs have no scheme (they start with 'www.'), so they cannot be opened as they are. Check them first, then add 'https://' with the add_https() function.

In [11]:
# Check the first three listings (should be almost the same as shown on the first search page).
# Note that the scraped URLs do not start with a scheme such as 'https://'.
listing_urls[:3]

[{'name': 'Wonderful river view, splendid Kyoto house',
  'url': 'www.airbnb.com/rooms/30221877?adults=2&children=0&infants=0&check_in=2022-08-01&check_out=2022-08-08&previous_page_section_name=1000'},
 {'name': 'Apt. for  family&friends w/Cycle Port&WiF',
  'url': 'www.airbnb.com/rooms/41567231?adults=2&children=0&infants=0&check_in=2022-08-09&check_out=2022-08-16&previous_page_section_name=1000'},
 {'name': 'Super location best host w/Cycle Port & wifi',
  'url': 'www.airbnb.com/rooms/6249629?adults=2&children=0&infants=0&check_in=2022-08-01&check_out=2022-08-08&previous_page_section_name=1000'}]

In [12]:
# add_https() edits the dicts in place and returns the list it was given, so
# `listings` and `listing_urls` are two names for the same list.
listings = add_https(listing_urls)
listings[:3]

[{'name': 'Wonderful river view, splendid Kyoto house',
  'url': 'https://www.airbnb.com/rooms/30221877?adults=2&children=0&infants=0&check_in=2022-08-01&check_out=2022-08-08&previous_page_section_name=1000'},
 {'name': 'Apt. for  family&friends w/Cycle Port&WiF',
  'url': 'https://www.airbnb.com/rooms/41567231?adults=2&children=0&infants=0&check_in=2022-08-09&check_out=2022-08-16&previous_page_section_name=1000'},
 {'name': 'Super location best host w/Cycle Port & wifi',
  'url': 'https://www.airbnb.com/rooms/6249629?adults=2&children=0&infants=0&check_in=2022-08-01&check_out=2022-08-08&previous_page_section_name=1000'}]

The AirBnB search occasionally shows the same listing on more than one page. Let's remove the duplicates:

In [13]:
# Pair every listing's position in `listings` with its URL stripped of the
# query string: the part before "?" stays the same for a room even when the
# query parameters (dates, guests) differ.
listing_id = [
    (position, listing['url'].split("?")[0])
    for position, listing in enumerate(listings)
]

In [14]:
# Split the (position, base_url) pairs into first occurrences and repeats.
clean_id = []   # first time a base URL is seen
dupe_id = []    # every later occurrence of an already-seen base URL
seen_id = set() # base URLs encountered so far (a set keeps the lookup O(1))

for entry in listing_id:
    base_url = entry[1]
    if base_url in seen_id:
        dupe_id.append(entry)
    else:
        seen_id.add(base_url)
        clean_id.append(entry)

In [15]:
# Look the unique listings up by their original position in `listings`.
clean_listings = [listings[original_index] for original_index, _ in clean_id]

# Same records paired with their new 0-based position in `clean_listings`.
with_numbering = list(enumerate(clean_listings))

In [16]:
# Number of unique listings left after de-duplication.
len(clean_listings)

221

There were 79 duplicates. The list in with_numbering helps me keep track of progress when scraping.

In [17]:
# Full list on purpose: it is used to look up a listing's position by eye.
with_numbering

[(0,
  {'name': 'Wonderful river view, splendid Kyoto house',
   'url': 'https://www.airbnb.com/rooms/30221877?adults=2&children=0&infants=0&check_in=2022-08-01&check_out=2022-08-08&previous_page_section_name=1000'}),
 (1,
  {'name': 'Apt. for  family&friends w/Cycle Port&WiF',
   'url': 'https://www.airbnb.com/rooms/41567231?adults=2&children=0&infants=0&check_in=2022-08-09&check_out=2022-08-16&previous_page_section_name=1000'}),
 (2,
  {'name': 'Super location best host w/Cycle Port & wifi',
   'url': 'https://www.airbnb.com/rooms/6249629?adults=2&children=0&infants=0&check_in=2022-08-01&check_out=2022-08-08&previous_page_section_name=1000'}),
 (3,
  {'name': 'Traditional house with a garden ＆ open-air bath',
   'url': 'https://www.airbnb.com/rooms/42572392?adults=2&children=0&infants=0&check_in=2022-08-19&check_out=2022-08-26&previous_page_section_name=1000'}),
 (4,
  {'name': 'KumoMachiya KOTAKO',
   'url': 'https://www.airbnb.com/rooms/30075745?adults=2&children=0&infants=0&check_

# Part 2: Get All Features

Imports used in this part (these repeat the imports already made in the first cell):

In [18]:
import time 
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

In [19]:
# Location of the ChromeDriver executable that Selenium starts.
# Set the CHROMEDRIVER_PATH environment variable to point at your own copy;
# the fallback below is the path on the original author's machine.
driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:/Users/parkj/Downloads/chromedriver_win32/chromedriver.exe",
)

### Functions

This function extracts a listing's HTML with Selenium. scrape_page() does not work for the listing pages, presumably because their content is rendered by JavaScript, which a plain `requests` download does not execute:

In [20]:
def extract_listing_html(url):
    """Open ``url`` in a Selenium-driven Chrome and return the page as soup.

    A new browser is started for every call (using the module-level
    ``driver_path``) and is always closed again, even if loading fails, so
    that repeated calls do not pile up Chrome processes.

    Args:
        url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` object built from the browser's page source with
        the built-in ``html.parser``.
    """
    opts = Options()
    userAgent = "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36"
    opts.add_argument(f'user-agent={userAgent}')
    driver = webdriver.Chrome(service = Service(driver_path), options = opts)
    try:
        time.sleep(5)
        driver.get(url)
        # Fixed pause before reading the source, giving the page time to finish loading.
        time.sleep(5)
        html = driver.page_source
    finally:
        # Release the browser and its driver process no matter what happened above.
        driver.quit()

    return BeautifulSoup(html, "html.parser")

This function grabs the features of the listing and returns them as a string.

In [21]:
def extract_feature(soup, tag, class_):
    """Return the text of a listing feature, or the string 'None' if absent.

    For most features this is the text of the first ``tag`` element with the
    given class. The guests/rooms line (``li`` elements with class
    'len26si dir dir-ltr') is handled separately: the last matching element is
    dropped, at most the first four of the remaining ones are kept, and their
    combined text is returned stripped of surrounding whitespace.

    Args:
        soup: Parsed page to search.
        tag: Tag name to look for.
        class_: Class attribute value to match.

    Returns:
        The extracted text, or 'None' (a string) when nothing is found.
    """
    first_match = soup.find(tag, class_= class_)

    # Nothing to extract.
    if first_match is None:
        return 'None'

    # Guests/rooms line: the markup differs, so gather the <li> items instead.
    if tag == 'li' and class_ == 'len26si dir dir-ltr':
        items = soup.find_all('li', attrs={"class":"len26si dir dir-ltr"})
        room_items = items[0:-1][:4]
        if not room_items:
            return "None"
        markup = ''.join(str(item) for item in room_items)
        return BeautifulSoup(markup, 'html5lib').text.strip()

    # Ordinary feature: text of the first matching element.
    return first_match.getText()

The pricing details can only be accessed after clicking on the "Reserve" button. This function clicks the button and returns the HTML of the page that opens.

In [22]:
def get_price_soup(url):
    """Click through from a listing page and return the resulting page as soup.

    The listing is opened in Chrome, the element with class ``_qqb2vcb`` is
    clicked once it becomes visible, and the address the browser ends up on is
    then loaded and parsed by ``extract_listing_html``. The browser started
    here is always closed before returning.

    Args:
        url: Address of the listing page.

    Returns:
        The ``BeautifulSoup`` object produced by ``extract_listing_html`` for
        the page reached after the click.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element to click
            does not become visible within 5 seconds.
    """
    opts = Options()
    userAgent = "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36"
    opts.add_argument(f'user-agent={userAgent}')
    driver = webdriver.Chrome(service = Service(driver_path), options = opts)
    try:
        driver.get(url)
        time.sleep(5)
        WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "_qqb2vcb"))).click()
        # Only the address is needed from this session; the page itself is re-loaded below.
        new_url = driver.current_url
    finally:
        # Close this browser even when the wait or the click fails.
        driver.quit()
    return extract_listing_html(new_url)

These are the parameters for each feature:

In [28]:
# Set parameters for each feature: every entry maps an output column to the
# HTML tag and class that extract_feature() searches for.

FEATURES_PARAMS = {
    feature: {"tag": tag, "class": css_class}
    for feature, tag, css_class in (
        ("name", "h1", "_fecoyn4"),
        ("header", "div", "_cv5qq4"),
        ("superhost", "span", "_1mhorg9"),
        ("guests_rooms", "li", "len26si dir dir-ltr"),
        ("rating", "span", "_17p6nbba"),
        ("reviews", "button", "_11eqlma4"),
        ("location", "span", "_9xiloll"),
        ("rarity", "h2", "_mtn6h9x"),
        ("rarity_desc", "span", "_1hgrncy"),
        ("house_rules", "div", "c1lue5su dir dir-ltr"),
        ("health_safety", "div", "c3bnsmm dir dir-ltr"),
        ("cancellation", "div", "t1dlbb60 dir dir-ltr"),
        ("host_response", "ul", "fhhmddr dir dir-ltr"),
    )
}

# These parameters require Selenium to click on a button and access a new page

CLICK_PARAMS = {
    feature: {"tag": tag, "class": css_class}
    for feature, tag, css_class in (
        ("per night", "div", "_12hv04d"),
        ("total price", "div", "_j1143kl"),
    )
}

Putting all the previous functions together in this function, it will return a list of the features for each listing.

Comment out `print(counter)` if you do not want to print a number for every time a listing's features were successfully extracted.

In [29]:
def get_features(listing_list):
    """Scrape every feature in FEATURES_PARAMS for each listing.

    Args:
        listing_list: List of dicts, each with a 'url' key for a listing page.

    Returns:
        A list with one dict per listing, in the same order as
        ``listing_list``, mapping each FEATURES_PARAMS key to the text returned
        by ``extract_feature``.
    """
    features_list = []

    # `counter` is the 1-based number of listings processed so far.
    for counter, listing in enumerate(listing_list, start=1):
        soup = extract_listing_html(listing['url'])
        features_list.append({
            feature: extract_feature(soup, rule['tag'], rule['class'])
            for feature, rule in FEATURES_PARAMS.items()
        })
        #print(counter)

    return features_list

Similar to the last function, but specifically used for getting the prices. It might be a little slow to run.

In [25]:
def get_price_feats(listing_list, features_list):
    """Add the CLICK_PARAMS price fields to already-scraped feature dicts.

    The two lists are matched by position: the prices scraped for
    ``listing_list[k]`` are written into ``features_list[k]``.

    NOTE: the dicts inside ``features_list`` are modified in place. Passing a
    slice (``features[:5]``) or a plain second name for the list does not
    protect the originals, because both still hold the same dict objects.

    Args:
        listing_list: List of dicts, each with a 'url' key for a listing page.
        features_list: List of feature dicts, at least as long as
            ``listing_list``.

    Returns:
        ``features_list`` itself, with the price fields added.

    Raises:
        IndexError: If ``features_list`` is shorter than ``listing_list``.
    """
    for position, listing in enumerate(listing_list):
        price_soup = get_price_soup(listing['url'])
        for feature, rule in CLICK_PARAMS.items():
            features_list[position][feature] = extract_feature(price_soup, rule['tag'], rule['class'])

    return features_list

### Main Program

Get the features without the prices first. This may take a long time, so the list will be run in multiple parts in case there is an error:

In [26]:
# Testing the waters
# Smoke test on the first 10 listings before starting the full run.
features_list1 = get_features(clean_listings[0:10])

In [30]:
# Full run over every de-duplicated listing (one browser session per listing).
features_no_price = get_features(clean_listings)

Just in case I mess up after this:

In [32]:
# Work on an independent deep copy: a plain assignment would only create a
# second name for the same dicts, and get_price_feats() modifies them in place.
features_no_price_extra = copy.deepcopy(features_no_price)

### Adding Prices

In [33]:
# Testing the waters
# Trial run on the first five listings. The slice is a new list but holds the
# same dict objects, so these five records in features_no_price_extra get the
# price fields too.
features_test = get_price_feats(clean_listings[:5], features_no_price_extra[:5])

In [36]:
# Full price run. get_price_feats() returns the list it was given, so
# full_features is the same list object as features_no_price_extra.
full_features = get_price_feats(clean_listings, features_no_price_extra)

Only one listing was missing info on prices. It was the only listing that required us to select a room type, like booking a hotel:

In [39]:
# Print every record whose total price is missing
# (extract_feature() stores the string 'None' when an element is not found).
missing_price = (record for record in full_features if record['total price'] == 'None')
for record in missing_price:
    print(record)

{'name': 'The OneFive Kyoto Shijo', 'header': 'This is a hotel', 'superhost': 'Superhost', 'guests_rooms': 'None', 'rating': '4.79 ·', 'reviews': '138 reviews', 'location': 'Kyoto, Japan', 'rarity': 'None', 'rarity_desc': 'None', 'house_rules': 'Check-in: 3:00 PM - 12:00 AMCheckout: 10:00 AMPhoto ID and credit card required at check-inShow all', 'health_safety': 'None', 'cancellation': 'None', 'host_response': 'Policy number: Hotels and Inns Business Act | 京都市長\u3000門川\u3000大作\u3000 | 京都市指令保医セ第\u3000175\u3000号', 'per night': 'None', 'total price': 'None'}


In [41]:
# Position 154 is the hotel listing printed above; the index was obtained from looking at with_numbering.
full_features[154]

{'name': 'The OneFive Kyoto Shijo',
 'header': 'This is a hotel',
 'superhost': 'Superhost',
 'guests_rooms': 'None',
 'rating': '4.79 ·',
 'reviews': '138 reviews',
 'location': 'Kyoto, Japan',
 'rarity': 'None',
 'rarity_desc': 'None',
 'house_rules': 'Check-in: 3:00 PM - 12:00 AMCheckout: 10:00 AMPhoto ID and credit card required at check-inShow all',
 'health_safety': 'None',
 'cancellation': 'None',
 'host_response': 'Policy number: Hotels and Inns Business Act | 京都市長\u3000門川\u3000大作\u3000 | 京都市指令保医セ第\u3000175\u3000号',
 'per night': 'None',
 'total price': 'None'}

In [42]:
# Booking-page URL of the hotel listing at position 154, whose prices were not
# captured automatically. A separate name is used so the search URL stored in
# `url` is not overwritten.
hotel_booking_url = "https://www.airbnb.com/book/stays/44779722?numberOfAdults=2&checkin=2022-08-14&numberOfGuests=1&checkout=2022-08-20&guestCurrency=USD&productId=44779722&isWorkTrip=false&numberOfChildren=0&numberOfInfants=0&numberOfPets=0"

# extract_listing_html() starts its own browser, so no extra driver is needed here.
price_soup = extract_listing_html(hotel_booking_url)
for feature in CLICK_PARAMS:
    full_features[154][feature] = extract_feature(price_soup, CLICK_PARAMS[feature]['tag'], CLICK_PARAMS[feature]['class'])

In [48]:
# Confirm that the price fields of the hotel listing are now filled in.
full_features[154]

{'name': 'The OneFive Kyoto Shijo',
 'header': 'This is a hotel',
 'superhost': 'Superhost',
 'guests_rooms': 'None',
 'rating': '4.79 ·',
 'reviews': '138 reviews',
 'location': 'Kyoto, Japan',
 'rarity': 'None',
 'rarity_desc': 'None',
 'house_rules': 'Check-in: 3:00 PM - 12:00 AMCheckout: 10:00 AMPhoto ID and credit card required at check-inShow all',
 'health_safety': 'None',
 'cancellation': 'None',
 'host_response': 'Policy number: Hotels and Inns Business Act | 京都市長\u3000門川\u3000大作\u3000 | 京都市指令保医セ第\u3000175\u3000号',
 'per night': '$71.58 x 6 nights',
 'total price': '$429.50'}

In [44]:
# Number of records after the price pass.
len(full_features)

221

# Part 3: Cleaning the Data

Just in case I mess up after this:

In [77]:
# Take an independent deep copy so that the cleaning steps below leave
# full_features untouched; a plain assignment would share the same dicts.
all_features = copy.deepcopy(full_features)

### Add listing URLs

In [78]:
# Attach each listing's page URL to its feature record. The records were
# scraped in the order of clean_listings, so the two are matched by position.
for position, features in enumerate(all_features):
    features['url'] = clean_listings[position]['url']

In [86]:
# Spot-check one record to confirm that the 'url' key was added.
all_features[100]

{'name': 'DISCOUNT！1min to sta.｜NishikiMarket｜WIFI｜Kyoto',
 'header': 'Entire rental unit',
 'superhost': 'None',
 'guests_rooms': '2 guests · 1 bedroom · 2 beds · 1 bath',
 'rating': '4.79 ·',
 'reviews': '90 reviews',
 'location': '京都市中京区梅忠町, 京都府, Japan',
 'rarity': 'None',
 'rarity_desc': 'None',
 'house_rules': 'House rulesCheck-in: After 3:00 PMCheckout: 10:00 AMNo smokingNo petsNo parties or eventsShow more',
 'health_safety': "Health & safety\U000f9001Airbnb's COVID-19 safety practices applyCarbon monoxide alarmSmoke alarm\U000f0402Security Deposit - if you damage the home, you may be charged up to $236Show more",
 'cancellation': 'Free cancellation before Jul 31.',
 'host_response': 'Policy number: Hotels and Inns Business Act | 京都市 | 京都市指令保医セ第46号Response rate: 100%Response time: within an hour',
 'per night': '$88.18 x 7 nights',
 'total price': '$308.61',
 'url': 'https://www.airbnb.com/rooms/24528310?adults=2&children=0&infants=0&check_in=2022-08-05&check_out=2022-08-12&prev

### Clean 'header' and add a column for hosts

The first listing has the header cut off by a letter because I used it earlier to test out the code.

In [88]:
# Manual repair of the first record's header, which had lost its last letter.
all_features[0]['header'] = "Entire Home"

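Note that changes made to `all_features` can also show up in `full_features`: a plain assignment such as `all_features = full_features` does not copy anything, so both names refer to the same list of dicts. An independent backup requires `copy.deepcopy`.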

So be careful doing this part, only run ONCE or else it will continue to split the header and cut the end off. You will end up having to fix each listing's header or re-extract everything.

# Part 4: Exporting the Data

In [91]:
import csv

# Records to export; the CSV columns are the keys of the first record.
to_csv = all_features
keys = to_csv[0].keys()

# newline='' lets the csv module control line endings itself.
with open('kyotolistings.csv', mode='w', newline='', encoding="utf-8") as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    for record in to_csv:
        dict_writer.writerow(record)