# This example demonstrates how to scrape hotel prices from hotels.com

Adapted from [Tutorial: Web Scraping Hotel Prices using Selenium and Python](https://www.scrapehero.com/tutorial-web-scraping-hotel-prices-using-selenium-and-python/)

The special feature of this code is that it actually opens a web browser and drives it as if a human were navigating the page to complete the interaction, such as entering the city and the check-in and check-out dates.

This is just a demonstration and **requires installing selenium and a webdriver (geckodriver, the Firefox driver)**; here is a link on how to install the driver: https://pypi.python.org/pypi/selenium

### You don't have to complete this notebook; it is provided just for your information.

In [1]:
from re import findall,sub
from lxml import html
from time import sleep
from selenium import webdriver
import pandas as pd

# Location of the geckodriver executable that parse() hands to webdriver.Firefox.
path_to_fireboxdriver = 'C:\\Program Files (x86)\\firefoxdriver\\geckodriver.exe' # change path as needed

# Search parameters that parse() types into the search form; searchKey is also
# embedded in the name of the CSV file that parse() writes.
searchKey = "Madrid,spain" # Change this to your city 
checkInDate = '10/19/2017' # written month/day/year (%m/%d/%Y) -- 19 cannot be a month
checkOutDate = '10/22/2017' # written month/day/year (%m/%d/%Y)

def parse(url):
    """Search hotels via a real Firefox session and save the results as CSV.

    Opens *url* in Firefox, fills the search form with the module-level
    ``searchKey``, ``checkInDate`` and ``checkOutDate``, tries to sort the
    results by price (low to high), scrolls until the page stops growing and
    then extracts one row per ``div.hotel-wrap`` element.

    The rows are written to ``..\\data\\hotel prices_<searchKey>.csv`` with the
    columns hotelName, price, rating, address, locality, region, postalCode
    and countryName. Nothing is returned.

    Args:
        url: Address of the page that hosts the search form.
    """
    columns = ["hotelName",
               "price",
               "rating",
               "address",
               "locality",
               "region",
               "postalCode",
               "countryName"]
    # Scrolls to the bottom and reports the resulting document height.
    scroll_js = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

    # NOTE(review): newer Selenium 4 releases expect a Service object instead
    # of executable_path -- confirm against the installed Selenium version.
    response = webdriver.Firefox(executable_path = path_to_fireboxdriver)
    try:
        def find(xpath):
            # The generic find_elements(by, value) form is used because the
            # find_elements_by_xpath shortcut no longer exists in recent
            # Selenium releases, while this form works in old ones as well.
            return response.find_elements("xpath", xpath)

        response.get(url)
        searchKeyElement = find('//input[contains(@id,"destination")]')
        checkInElement = find('//input[contains(@class,"check-in")]')
        checkOutElement = find('//input[contains(@class,"check-out")]')
        submitButton = find('//button[@type="submit"]')
        # The submit button is part of the guard so that a page without one
        # skips the form instead of failing with IndexError.
        if searchKeyElement and checkInElement and checkOutElement and submitButton:
            searchKeyElement[0].send_keys(searchKey)
            checkInElement[0].clear()
            checkInElement[0].send_keys(checkInDate)
            checkOutElement[0].clear()
            checkOutElement[0].send_keys(checkOutDate)
            # Click the page heading (if any) before submitting, presumably to
            # dismiss pop-ups opened by the inputs -- TODO confirm.
            randomClick = find('//h1')
            if randomClick:
                randomClick[0].click()
            submitButton[0].click()
            sleep(15)  # wait for the result page to load
            dropDownButton = find('//fieldset[contains(@id,"dropdown")]')
            if dropDownButton:
                dropDownButton[0].click()
                priceLowtoHigh = find('//li[contains(text(),"low to high")]')
                if priceLowtoHigh:
                    priceLowtoHigh[0].click()
                    sleep(10)  # wait for the re-sorted results

        # Keep scrolling to the bottom until the document height stops
        # changing, i.e. no further results are being appended.
        page_height = response.execute_script(scroll_js)
        while True:
            sleep(3)
            new_height = response.execute_script(scroll_js)
            if new_height == page_height:
                break
            page_height = new_height

        page_source = response.page_source
        page_url = response.current_url
    finally:
        # Always shut the browser down, even when the scrape fails part-way.
        response.quit()

    parser = html.fromstring(page_source, page_url)
    hotels = parser.xpath('//div[@class="hotel-wrap"]')
    # Build the table in one go; appending row by row copies the frame each
    # time and DataFrame.append is not available in current pandas.
    rows = [_extract_hotel(hotel) for hotel in hotels]
    savefile = pd.DataFrame(rows, columns=columns)
    savefile.to_csv('..\\data\\hotel prices_'+searchKey+'.csv',index=False)


def _first_text(node, xpath, clean=True):
    """Return the text of the first match of *xpath* under *node*, or None.

    When *clean* is true, commas are removed and surrounding whitespace is
    stripped; otherwise the text is returned unchanged.
    """
    found = node.xpath(xpath)
    if not found:
        return None
    text = found[0].text_content()
    return text.replace(",", "").strip() if clean else text


def _extract_hotel(hotel):
    """Build one result row (a dict keyed by column name) from a hotel element."""
    # Prefer the <ins> price; fall back to the whole price link only when no
    # <ins> element exists.
    price_text = _first_text(hotel, './/div[@class="price"]/a//ins')
    if price_text is None:
        price_text = _first_text(hotel, './/div[@class="price"]/a')
    # Keep only the first run of digits/dots, dropping any currency symbol.
    amounts = findall(r'([\d\.]+)', price_text) if price_text else []

    street_parts = hotel.xpath('.//span[contains(@class,"street-address")]')
    address = "".join(part.text_content() for part in street_parts) if street_parts else None

    return {
        "hotelName": _first_text(hotel, './/h3/a', clean=False),
        "price": amounts[0] if amounts else None,
        "rating": _first_text(hotel, './/span[contains(@class,"guest-rating-value")]', clean=False),
        "address": address,
        "locality": _first_text(hotel, './/span[contains(@class,"locality")]'),
        "region": _first_text(hotel, './/span[contains(@class,"region")]'),
        "postalCode": _first_text(hotel, './/span[contains(@class,"postal-code")]'),
        "countryName": _first_text(hotel, './/span[contains(@class,"country-name")]'),
    }



In [2]:
# Run the scraper against the hotels.com home page; the results are written to a CSV file.
parse('http://www.hotels.com')

In [3]:
# Load the CSV that parse() wrote for the current searchKey.
csv_path = '..\\data\\hotel prices_' + searchKey + '.csv'
f = pd.read_csv(csv_path, encoding='latin1')

In [4]:
# (number of rows, number of columns) of the loaded table.
f.shape

(153, 8)

In [5]:
# Preview the first 10 rows of the loaded table.
f.head(10)

Unnamed: 0,hotelName,price,rating,address,locality,region,postalCode,countryName
0,Gran Melia Fenix,248,4.6 / 5,Hermosilla 2,Madrid,Madrid,28001,Spain
1,Hostal Felipe V,83,4.2 / 5,"Calle Gran Via, 15",Madrid,Madrid,28013,Spain
2,Puerta del Sol Rooms,55,3.7 / 5,Plaza Puerta del Sol 14-4,Madrid,Madrid,28013,Spain
3,Hostal Salamanca,54,3.7 / 5,C/ José Ortega y Gasset 89,Madrid,Madrid,28006,Spain
4,Hostal Hispano,72,4.2 / 5,"C/ Hortaleza, Nº38 - 2º",Madrid,Madrid,28004,Spain
5,WooTravelling Plaza de Oriente HOMTELS,126,4.4 / 5,Calle Cuesta de San Vicente 10-12,Madrid,Madrid,28008,Spain
6,Flat5Madrid,78,4.0 / 5,"Calle de San Bernardo, 55",Madrid,Madrid,28015,Spain
7,Hostal Sonsoles,74,4.1 / 5,"Calle de Fuencarral, 18",Madrid,Madrid,28004,Spain
8,Hostal Gallardo,61,3.6 / 5,"Fuencarral,95-4",Madrid,Madrid,28004,Spain
9,VP El Madroño,131,4.6 / 5,C/ General Díaz Porlier 101,Madrid,Madrid,28006,Spain


In [6]:
# Order the hotels by the price column (ascending is the sort_values default).
sf = f.sort_values(by=['price'])

In [8]:
# First 10 rows of the price-sorted table.
sf.head(10)

Unnamed: 0,hotelName,price,rating,address,locality,region,postalCode,countryName
99,Hostal Jacinto,37,4.0 / 5,"Paseo Estación, 2, 1a planta",Alcala de Henares,,28807,Spain
90,Pensión Venecia,38,4.5 / 5,Calle Egido De La Fuente 21,Pinto,,28320,Spain
64,Hostal El Arco,41,3.6 / 5,"Plaza de España, 3",Mejorada del Campo,Madrid,28840,Spain
35,Posadas de España Pinto,41,3.9 / 5,"Calle Sierra Nevada, 3",Pinto,Madrid,28320,Spain
42,Hostal Juan XXIII,43,3.7 / 5,"Silvio Abad, 12",San Sebastian de los Reyes,Madrid,28703,Spain
79,Hostal Carabanchel,43,3.8 / 5,Calle Petirrojo 34,Madrid,,28047,Spain
26,Hotel Cisneros,48,3.8 / 5,"Paseo de Pastrana, 32",Alcala De Henares,Madrid,28803,Spain
149,Hostal Pacios,51,3.1 / 5,"Calle de Atocha, 28",Madrid,Madrid,28012,Spain
144,Hostal Carlos III,52,3.9 / 5,"Velasco, 7",Getafe,Madrid,28901,Spain
110,Hotel Madrid Las Rozas,53,3.9 / 5,Carretera La Coruña (N-VI),Las Rozas de Madrid,Madrid,28230,Spain


In [9]:
# Last rows of the unsorted table (tail() with its default row count).
f.tail()

Unnamed: 0,hotelName,price,rating,address,locality,region,postalCode,countryName
148,Apartamento Estación de Atocha,95,,Calle Atocha 84,Madrid,España,28012,Spain
149,Hostal Pacios,51,3.1 / 5,"Calle de Atocha, 28",Madrid,Madrid,28012,Spain
150,Roisa Centro,104,4.2 / 5,Calle San Bernardo 87,Madrid,Madrid,28015,Spain
151,Hotel Meninas,177,4.0 / 5,"Calle de Campomanes, 7",Madrid,Madrid,28013,Spain
152,Dobo Rooms - Ronda de Segovia Apartments,79,,"Ronda de Segovia, 33",Madrid,,28005,Spain
