# Airbnb Data Scraping 

In [1]:
# Airbnb UK search-results URL for homes in Italy; the entry page for the scraping below.
url = 'https://www.airbnb.co.uk/s/Italy/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&query=Italy&place_id=ChIJA9KNRIL-1BIRb15jJFz1LOI&date_picker_type=calendar&source=structured_search_input_header&search_type=autocomplete_click'

In [2]:
import requests 
from bs4 import BeautifulSoup 

In [3]:
def open_url(url, timeout=30):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: On a network failure,
            including a timeout.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

# Fetch and parse the search page; the scrape_* helpers take this parsed page as input.
page = open_url(url)


In [4]:
def scrape_package_title(page):
    """Return the title text of every listing found on *page*.

    Title elements are located through the CSS class ``t1jojoys``; the
    result keeps the order in which the elements appear in the document.
    """
    return [
        title_tag.get_text()
        for title_tag in page.find_all(attrs={'class': 't1jojoys'})
    ]



In [5]:
# Display the listing titles scraped from the current page.
scrape_package_title(page)


['Farm stay in Tremosine sul Garda',
 'Home in Bolzano',
 'Flat in Rome',
 'Holiday home in Varenna',
 'Farm stay in Castelfranco Veneto',
 'Villa in Campo',
 'Flat in Vicenza',
 'Home in Pari',
 'Castle in Tuscania (VT)',
 'Flat in Rome',
 'Villa in Pollina',
 'Flat in Firenze',
 'Home in Volterra',
 'Home in Anghiari',
 'Yurt in Catignano',
 'Private room in Cadeo',
 'Farm stay in Maruggio',
 'Tent in Castiglione della Pescaia']

In [6]:
def scrape_package_description(page):
    """Return the description line of every listing found on *page*.

    Description elements are the ones whose ``class`` attribute is
    ``t6mzqp7 dir dir-ltr``; their text is returned in document order.
    """
    description_tags = page.find_all(attrs={'class': 't6mzqp7 dir dir-ltr'})
    descriptions = []
    for description_tag in description_tags:
        descriptions.append(description_tag.get_text())
    return descriptions

In [7]:
# Display the listing descriptions scraped from the current page.
scrape_package_description(page)

['Nature House-Bondo Valley Nature Reserve',
 'Mirror House Sud',
 'Stylish Roman Loft with Piano | Steps to Colosseum',
 'Small natural house on the lake',
 "Accommodation at Ca' Amedeo Farm",
 'Peaceful country villa 10  far from Pisa town',
 'Oleandri apartment with one bedroom and one bathroom on the ground floor with wheelchair access (apartment 2)',
 'Podere Vignali',
 'San Giusto Abbey { medieval Tower }',
 'Royal Piazza di Spagna',
 'Romantic and one-of-a-kind Beachouse',
 'Rooftop with Breathtaking Views. Short Walk to The Duomo.',
 'Hill Pool',
 'Casa Rosmarino Eco-Wellness Country Home',
 'Glamping Abruzzo - The Yurt',
 'The Castle rooms',
 'Casa Pindini in agri resort with infinity pool',
 'Pigna Felice Boutique Superior']

In [31]:
def scrape_package_rating(page):
    """Return the rating text (e.g. ``'4.88 (17)'``) of every listing on *page*.

    The text of a listing card ends with ``...Show price breakdown<rating>``,
    so the rating is whatever follows the last ``'price breakdown'`` marker.

    Returns:
        One string per card, in document order. The string is empty when the
        card shows no rating after the marker, and also when the marker is
        absent altogether (rather than leaking the whole card text into the
        rating column).
    """
    marker = 'price breakdown'
    cards = page.find_all(attrs={'class': 'g1qv1ctd cb4nyux dir dir-ltr'})
    ratings = []
    for card in cards:
        # rpartition splits on the LAST marker; `found` is '' when it is missing.
        _, found, rating = card.get_text().rpartition(marker)
        ratings.append(rating if found else '')
    return ratings

In [32]:
# Re-fetch the search page, then show the scraped ratings and how many there are.
page = open_url(url)
ratings = scrape_package_rating(page)  # parse once and reuse for both outputs
print(ratings)
len(ratings)

['4.88 (17)', '4.93 (27)', '4.96 (111)', '4.94 (259)', '4.91 (357)', '4.91 (137)', '4.89 (89)', '4.92 (13)', '4.78 (9)', '', '4.81 (16)', '4.96 (23)', '4.98 (448)', '4.64 (97)', '4.98 (55)', '4.9 (107)', '4.89 (57)', '4.94 (123)']


18

In [10]:
def scrape_package_price(page):
    """Return the nightly price text (e.g. ``'£159'``) of every listing on *page*.

    Price elements are located through the CSS class ``_1jo4hgw``. For each
    one the ``'\\xa0night'`` suffix is removed and only the last piece
    separated by a non-breaking space is kept.
    """
    prices = []
    for price_tag in page.find_all(attrs={'class': '_1jo4hgw'}):
        without_suffix = price_tag.get_text().replace('\xa0night', '')
        # Presumably the last piece is the current price when a card also
        # shows a crossed-out original price -- TODO confirm.
        prices.append(without_suffix.split('\xa0')[-1])
    return prices

In [11]:
# Display the nightly prices scraped from the current page.
scrape_package_price(page)

['£159',
 '£339',
 '£103',
 '£112',
 '£1,171',
 '£144',
 '£150',
 '£206',
 '£51',
 '£171',
 '£316',
 '£97',
 '£339',
 '£60',
 '£178',
 '£175',
 '£154',
 '£119']

In [30]:
def scrape_package_duration(page):
    """Return the host type and the available dates of every listing on *page*.

    The subtitle elements of all cards come back as one flat list that
    normally alternates ``host type, dates``. A card without a host label
    instead contributes ``'', <extra line>, dates`` (for example
    ``'', '1 queen bed', '28 Feb – 5 Mar'``); dropping its empty placeholder
    restores the alternation, with the extra line landing in the host slot.

    Returns:
        A ``(type_of_host, available_dates)`` tuple of two equally long
        lists. A host entry is ``"information missing"`` whenever the text in
        the host slot does not mention "host".

    Raises:
        IndexError: If, after dropping the placeholders, the items cannot be
            split into (host, dates) pairs.
    """
    tags = page.find_all(attrs={'class': 'f15liw5s s1cjsi4j dir dir-ltr'})
    # Drop EVERY empty placeholder, not only the first one, so that several
    # host-less cards on the same page do not shift the pairing.
    texts = [text for text in (tag.get_text() for tag in tags) if text]
    if len(texts) % 2:
        raise IndexError(
            "cannot pair host types with dates: odd number of subtitle "
            f"items ({len(texts)})"
        )

    type_of_host, available_dates = [], []
    for label, dates in zip(texts[::2], texts[1::2]):
        if 'host' in label.lower():
            type_of_host.append(label)
        else:
            type_of_host.append("information missing")
        available_dates.append(dates)
    return type_of_host, available_dates

In [25]:
# Display the (host types, available dates) lists scraped from the current page.
scrape_package_duration(page)

['Professional Host', '5–10 Feb', 'Individual Host', '1–8 Apr', 'Professional Host', '1–6 Feb', 'Professional Host', '25 Feb – 2 Mar', 'Professional Host', '1–6 Feb', 'Professional Host', '28 Oct – 2 Nov', 'Professional Host', '30 Mar – 4 Apr', 'Individual Host', '1–6 May', 'Individual Host', '1–6 Mar', 'Professional Host', '3–8 Feb', 'Professional Host', '6–11 Jul', 'Individual Host', '5–10 Feb', 'Individual Host', '1–6 Feb', 'Individual Host', '12–17 Feb', 'Individual Host', '5–10 Feb', 'Individual Host', '1–6 Feb', 'Individual Host', '4–9 Feb', 'Professional Host', '1–6 Feb']


(['Professional Host',
  'Individual Host',
  'Professional Host',
  'Professional Host',
  'Professional Host',
  'Professional Host',
  'Professional Host',
  'Individual Host',
  'Individual Host',
  'Professional Host',
  'Professional Host',
  'Individual Host',
  'Individual Host',
  'Individual Host',
  'Individual Host',
  'Individual Host',
  'Individual Host',
  'Professional Host'],
 ['5–10 Feb',
  '1–8 Apr',
  '1–6 Feb',
  '25 Feb – 2 Mar',
  '1–6 Feb',
  '28 Oct – 2 Nov',
  '30 Mar – 4 Apr',
  '1–6 May',
  '1–6 Mar',
  '3–8 Feb',
  '6–11 Jul',
  '5–10 Feb',
  '1–6 Feb',
  '12–17 Feb',
  '5–10 Feb',
  '1–6 Feb',
  '4–9 Feb',
  '1–6 Feb'])

In [37]:
# Take the last element labelled "Next" on the page and display it.
# NOTE: despite the plural name, `urls` holds that single element, not a list of URLs.
urls = page.find_all(attrs={'aria-label':'Next'})[-1]
urls

<a aria-label="Next" class="_1bfat5l" href="/s/Italy/homes?tab_id=home_tab&amp;refinement_paths%5B%5D=%2Fhomes&amp;flexible_trip_lengths%5B%5D=one_week&amp;price_filter_input_type=0&amp;price_filter_num_nights=5&amp;query=Italy&amp;place_id=ChIJA9KNRIL-1BIRb15jJFz1LOI&amp;date_picker_type=calendar&amp;source=structured_search_input_header&amp;search_type=autocomplete_click&amp;federated_search_session_id=a5a1a1e7-ab44-4c1f-a521-26fb34c903ca&amp;pagination_search=true&amp;cursor=eyJzZWN0aW9uX29mZnNldCI6MiwiaXRlbXNfb2Zmc2V0IjoxOCwidmVyc2lvbiI6MX0%3D"><svg aria-hidden="true" focusable="false" role="presentation" style="display:block;fill:none;height:16px;width:16px;stroke:currentColor;stroke-width:3;overflow:visible" viewbox="0 0 32 32" xmlns="http://www.w3.org/2000/svg"><g fill="none"><path d="m12 4 11.2928932 11.2928932c.3905243.3905243.3905243 1.0236893 0 1.4142136l-11.2928932 11.2928932"></path></g></svg></a>

In [40]:
# Prefix the site host to the "Next" element's href to form the URL of the following results page.
next_url = 'https://www.airbnb.co.uk' + page.find_all(attrs={'aria-label':'Next'})[-1].attrs['href'] 
next_url

'https://www.airbnb.co.uk/s/Italy/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&query=Italy&place_id=ChIJA9KNRIL-1BIRb15jJFz1LOI&date_picker_type=calendar&source=structured_search_input_header&search_type=autocomplete_click&federated_search_session_id=a5a1a1e7-ab44-4c1f-a521-26fb34c903ca&pagination_search=true&cursor=eyJzZWN0aW9uX29mZnNldCI6MiwiaXRlbXNfb2Zmc2V0IjoxOCwidmVyc2lvbiI6MX0%3D'

In [53]:
import pandas as pd 

# Number of result pages passed to scrape_airbnb below.
pages = 2

def _scrape_listing_page(page):
    """Collect every scraped field of one parsed results page into a DataFrame."""
    frame = pd.DataFrame()
    frame["Title"] = scrape_package_title(page)
    frame["Description"] = scrape_package_description(page)
    frame["Rating"] = scrape_package_rating(page)
    frame["Price"] = scrape_package_price(page)
    frame["Type of Host"], frame["Available Dates"] = scrape_package_duration(page)
    return frame


def scrape_airbnb(url, pages=1):
    """Scrape up to *pages* consecutive result pages, starting at *url*.

    Args:
        url: Address of the first search-results page.
        pages: Maximum number of result pages to collect (defaults to 1).
            Scraping stops early when a page offers no usable "Next" link.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``Description``,
        ``Rating``, ``Price``, ``Type of Host`` and ``Available Dates``, one
        row per listing, indexed 0..n-1 across all collected pages.
    """
    from urllib.parse import urljoin

    current_url = url
    page = open_url(current_url)
    frames = [_scrape_listing_page(page)]

    for _ in range(pages - 1):
        next_links = page.find_all(attrs={'aria-label': 'Next'})
        href = next_links[-1].attrs.get('href') if next_links else None
        if not href:
            # No further page to follow: return what has been collected
            # instead of failing with an IndexError/KeyError.
            break
        # Resolve the link against the page it came from, so the same code
        # works for site-relative and absolute hrefs on any host.
        current_url = urljoin(current_url, href)
        page = open_url(current_url)
        frames.append(_scrape_listing_page(page))

    # Concatenate once and renumber the rows instead of growing the frame
    # inside the loop.
    return pd.concat(frames, axis=0, ignore_index=True)

# Scrape `pages` result pages starting at `url` and display the combined table.
data_frame = scrape_airbnb(url, pages)
data_frame
    


Unnamed: 0,Title,Description,Rating,Price,Type of Host,Available Dates
0,Farm stay in Tremosine sul Garda,Nature House-Bondo Valley Nature Reserve,4.98 (64),£149,Professional Host,1–6 Feb
1,Castle in Tuscania (VT),San Giusto Abbey { medieval Tower },4.99 (318),£203,Professional Host,3–8 Feb
2,Farm stay in Sillico,Romantic stay where Tuscany meets the sky!,4.96 (111),£103,Professional Host,1–6 Feb
3,Farm stay in Greve in Chianti,Typical Tuscan country house,4.99 (236),£144,Professional Host,28 Oct – 2 Nov
4,Villa in Cefalù,VILLA NORMANNO_infinity pool_,5.0 (23),£422,Individual Host,9–14 Mar
5,Cottage in San Giovanni Valdarno,First Barn - Private Pool,4.88 (134),£126,Individual Host,1–6 Feb
6,Trullo in Locorotondo,Trulli Suite 2P with Private Jacuzzi,4.86 (182),£124,Professional Host,1–6 Apr
7,Home in Miazzina,"Baita Vrei (2)""Titta"" - Hut Lago Maggiore",4.94 (51),£59,Individual Host,1–6 Mar
8,Flat in Rome,Designer Apartment near the Vatican,4.85 (315),£107,Individual Host,5–10 Feb
9,Villa in Capoville,"Casa Melograno 11+2, Emma Villas",,£274,Professional Host,1–6 Apr


In [15]:
# scrape_airbnb takes a page count; request one page explicitly.
# NOTE(review): presumably this cell's recorded output came from a single-page run -- confirm.
data_frame = scrape_airbnb(url, pages=1)
data_frame


['Individual Host', '13–18 Feb', 'Professional Host', '31 Mar – 5 Apr', 'Individual Host', '1–8 Apr', 'Professional Host', '23–28 Feb', 'Individual Host', '9–14 Mar', 'Professional Host', '3–8 Apr', 'Individual Host', '1–6 Feb', 'Professional Host', '1–6 Apr', 'Individual Host', '1–6 Mar', 'Individual Host', '7–12 May', 'Individual Host', '5–10 Feb', 'Individual Host', '12–17 Feb', 'Professional Host', '9–14 Apr', 'Individual Host', '15–20 Apr', 'Individual Host', '2–9 May', 'Individual Host', '8–13 Feb', 'Professional Host', '12–17 Feb', 'Professional Host', '19–24 Mar']


Unnamed: 0,Title,Description,Rating,Price,Type of Host,Available Dates
0,Home in Florence,Splendida Casa sull'Albero a Pochi Minuti da F...,Home in FlorenceSplendida Casa sull'Albero a P...,£263,Individual Host,13–18 Feb
1,Villa in Paestum-Giungano,"Entire Villa, Cilento Paestum for 28 p!","Villa in Paestum-GiunganoEntire Villa, Cilento...",£66,Professional Host,31 Mar – 5 Apr
2,Trullo in Cisternino,Trullo of Light. Luxurious Trullo retreat,Trullo in CisterninoTrullo of Light. Luxurious...,£339,Individual Host,1–8 Apr
3,Cottage in Castellabate,CILENTO PANORAMIC COTTAGE: NATURE & SEA,Cottage in CastellabateCILENTO PANORAMIC COTTA...,£75,Professional Host,23–28 Feb
4,Villa in Cefalù,VILLA NORMANNO_infinity pool_,Villa in CefalùVILLA NORMANNO_infinity pool_In...,£422,Individual Host,9–14 Mar
5,Home in Locorotondo,Trullo Tulou relax in Valle d'Itria,Home in LocorotondoTrullo Tulou relax in Valle...,£163,Professional Host,3–8 Apr
6,Cottage in San Giovanni Valdarno,First Barn - Private Pool,Cottage in San Giovanni ValdarnoFirst Barn - P...,£126,Individual Host,1–6 Feb
7,Trullo in Locorotondo,Trulli Suite 2P with Private Jacuzzi,Trullo in LocorotondoTrulli Suite 2P with Priv...,£124,Professional Host,1–6 Apr
8,Home in Miazzina,"Baita Vrei (2)""Titta"" - Hut Lago Maggiore","Home in MiazzinaBaita Vrei (2)""Titta"" - Hut La...",£51,Individual Host,1–6 Mar
9,Private room in Palermo,Triple room- Villa Megna Green Paradise B&B,Private room in PalermoTriple room- Villa Megn...,£70,Individual Host,7–12 May


In [16]:
# Repeat the scrape; scrape_airbnb takes a page count, so request one page explicitly.
# NOTE(review): presumably this cell's recorded output came from a single-page run -- confirm.
data_frame = scrape_airbnb(url, pages=1)
data_frame


['Individual Host', '1–6 Feb', 'Professional Host', '5–10 Feb', 'Professional Host', '29 May – 3 Jun', 'Individual Host', '1–8 Apr', 'Professional Host', '1–6 Feb', 'Professional Host', '23–28 Feb', 'Individual Host', '5–10 Feb', 'Individual Host', '1–6 May', 'Professional Host', '5–10 Feb', 'Individual Host', '1–6 Mar', 'Individual Host', '22–27 Jun', 'Professional Host', '1–6 Feb', 'Individual Host', '5–10 Feb', 'Professional Host', '13–18 Feb', 'Individual Host', '1–6 Feb', 'Individual Host', '5–10 Feb', 'Individual Host', '30 Oct – 4 Nov', 'Individual Host', '18–23 Jun']


Unnamed: 0,Title,Description,Rating,Price,Type of Host,Available Dates
0,Farm stay in Pierantonio,La Stalla - Casa San Gabriel,Farm stay in PierantonioLa Stalla - Casa San G...,£115,Individual Host,1–6 Feb
1,Private room in Vigo di Fassa,Room Stella Alpina in Agriturismo Ecogreen,Private room in Vigo di FassaRoom Stella Alpin...,£159,Professional Host,5–10 Feb
2,Flat in Vicenza,Oleandri apartment with one bedroom and one ba...,Flat in VicenzaOleandri apartment with one bed...,£72,Professional Host,29 May – 3 Jun
3,Trullo in Cisternino,Trullo of Light. Luxurious Trullo retreat,Trullo in CisterninoTrullo of Light. Luxurious...,£339,Individual Host,1–8 Apr
4,Trullo in Locorotondo,Verdeacqua Suite - Trullo with Indoor Pool,Trullo in LocorotondoVerdeacqua Suite - Trullo...,£347,Professional Host,1–6 Feb
5,Cottage in Castellabate,CILENTO PANORAMIC COTTAGE: NATURE & SEA,Cottage in CastellabateCILENTO PANORAMIC COTTA...,£75,Professional Host,23–28 Feb
6,Farm stay in Campofiorito,"""Corleone Drink""... Nature, Pool, Good Food","Farm stay in Campofiorito""Corleone Drink""... N...",£74,Individual Host,5–10 Feb
7,Guest suite in Torri del Benaco,Lake Garda luxury studio with pool,Guest suite in Torri del BenacoLake Garda luxu...,£206,Individual Host,1–6 May
8,Villa in Castiglion Fiorentino,Tuscan charm of villa - countryside,Villa in Castiglion FiorentinoTuscan charm of ...,£343,Professional Host,5–10 Feb
9,Home in Miazzina,"Baita Vrei (2)""Titta"" - Hut Lago Maggiore","Home in MiazzinaBaita Vrei (2)""Titta"" - Hut La...",£51,Individual Host,1–6 Mar


In [17]:
# Repeat the scrape once more; scrape_airbnb takes a page count, so request one page explicitly.
# NOTE(review): presumably this cell's recorded output came from a single-page run -- confirm.
data_frame = scrape_airbnb(url, pages=1)
data_frame

['Professional Host', '1–6 Mar', 'Professional Host', '6–13 Apr', 'Professional Host', '5–10 Feb', 'Professional Host', '6–13 Apr', 'Professional Host', '1–6 Feb', 'Professional Host', '25 Feb – 2 Mar', 'Individual Host', '19–24 Feb', 'Individual Host', '1–6 Mar', 'Individual Host', '1–6 Feb', 'Individual Host', '12–17 Feb', 'Individual Host', '5–10 Feb', 'Individual Host', '1–6 Feb', 'Individual Host', '13–18 Aug', 'Professional Host', '20–25 Mar', 'Individual Host', '24–29 Apr', '', '1 queen bed', '28 Feb – 5 Mar', 'Professional Host', '8–13 Apr', 'Individual Host', '27 Feb – 4 Mar']


IndexError: list index out of range

In [27]:
# Run the scrape ten times, keeping every result so that individual runs can
# be inspected afterwards; the run number is printed as a progress marker.
dataframes = []
for i in range(10):
    print(i)
    # scrape_airbnb takes a page count; request one page explicitly.
    data_frame = scrape_airbnb(url, pages=1)
    dataframes.append(data_frame)

data_frame

0
['Individual Host', '6–12 Feb', 'Professional Host', '1–6 Mar', 'Professional Host', '6–13 Apr', 'Professional Host', '29 May – 3 Jun', 'Professional Host', '30 Mar – 4 Apr', 'Individual Host', '1–6 May', 'Professional Host', '1–7 Apr', 'Individual Host', '12–18 Mar', 'Individual Host', '5–10 Feb', 'Individual Host', '18–23 Jun', 'Professional Host', '9–14 May', 'Individual Host', '1–6 Apr', 'Individual Host', '1–6 Feb', 'Professional Host', '1–6 Apr', 'Individual Host', '1–6 Mar', 'Individual Host', '13–18 Aug', 'Professional Host', '1–6 Feb', 'Individual Host', '20–25 Mar']
1
['Professional Host', '1–6 Feb', 'Individual Host', '1–6 Mar', 'Professional Host', '5–10 Feb', 'Individual Host', '13–18 Feb', 'Individual Host', '1–8 Apr', 'Professional Host', '25 Feb – 2 Mar', 'Professional Host', '1–6 Feb', 'Individual Host', '1–6 Feb', 'Individual Host', '1–6 May', 'Individual Host', '7–12 May', 'Individual Host', '18–24 Mar', 'Individual Host', '5–10 Feb', 'Individual Host', '5–10 Feb',

Unnamed: 0,Title,Description,Rating,Price,Type of Host,Available Dates
0,Earthen home in Terrasini,Container Suite: suspended between land and sea,Earthen home in TerrasiniContainer Suite: susp...,£155,Individual Host,6–11 Feb
1,Home in Bolzano,Mirror House Sud,Home in BolzanoMirror House SudIndividual Host...,£226,Individual Host,6–12 Feb
2,Flat in Rome,Stylish Roman Loft with Piano | Steps to Colos...,Flat in RomeStylish Roman Loft with Piano | St...,£200,Individual Host,28 Mar – 2 Apr
3,Farm stay in Acquapendente,Rome | Farmhouse with pool | Agriturismo Cerqueto,Farm stay in AcquapendenteRome | Farmhouse wit...,£142,Professional Host,6–13 Apr
4,Private room in Vigo di Fassa,Room Stella Alpina in Agriturismo Ecogreen,Private room in Vigo di FassaRoom Stella Alpin...,£159,Professional Host,5–10 Feb
5,Farm stay in Montepulciano,Rural Tuscany | Winefarm with pool | Casa Viol...,Farm stay in MontepulcianoRural Tuscany | Wine...,£98,Professional Host,1–6 Apr
6,Farm stay in Voltaggio,FARMHOUSE WITH SWIMMING POOL: CASCINA RONCO FANTI,Farm stay in VoltaggioFARMHOUSE WITH SWIMMING ...,£204,Professional Host,15–20 Mar
7,Trullo in Cisternino,Trullo of Light. Luxurious Trullo retreat,Trullo in CisterninoTrullo of Light. Luxurious...,£339,Individual Host,1–8 Apr
8,Flat in Kaukana,"Amarillis, beachfront accommodation in Caucana","Flat in KaukanaAmarillis, beachfront accommoda...",£101,Professional Host,6–13 Apr
9,Farm stay in Sillico,Romantic stay where Tuscany meets the sky!,Farm stay in SillicoRomantic stay where Tuscan...,£103,Professional Host,1–6 Feb


In [23]:
# Print every value of the "Rating" column of the first collected run, one per line.
faulty_dataframe = dataframes[0]

for rating_text in faulty_dataframe["Rating"]:
    print(rating_text)

Farm stay in Tremosine sul GardaNature House-Bondo Valley Nature ReserveProfessional Host1–6 Feb£149 night£149 per night · £743 total£743 totalShow price breakdown4.98 (64)
Holiday home in VarennaSmall natural house on the lakeIndividual Host9–14 Feb£244 night£244 per night · £1,218 total£1,218 totalShow price breakdown4.94 (373)
Trullo in CisterninoTrullo of Light. Luxurious Trullo retreatIndividual Host1–8 Apr£339 night£339 per night · £2,373 total£2,373 totalShow price breakdown5.0 (12)
Castle in Tuscania (VT)San Giusto Abbey { medieval Tower }Professional Host3–8 Feb£203 night£203 per night · £1,012 total£1,012 totalShow price breakdown4.99 (318)
Farm stay in SillicoRomantic stay where Tuscany meets the sky!Professional Host1–6 Feb£103 night£103 per night · £513 total£513 totalShow price breakdown4.96 (111)
Farm stay in Greve in ChiantiTypical Tuscan country houseProfessional Host28 Oct – 2 Nov£144 night£144 per night · £719 total£719 totalShow price breakdown4.99 (236)
Villa in Ce