9. Write a python program to scrape a data for all available Hostels from https://www.hostelworld.com/ in “London” location. You have to scrape hostel name, distance from city centre, ratings, total reviews, overall reviews, privates from price, dorms from price, facilities and property description. 

In [19]:
# Importing libraries 
import selenium
import pandas as pd
from selenium import webdriver
import warnings
warnings.filterwarnings('ignore')
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException

import time

In [160]:
def get_attribute_el(webelement, attribute):
    return (webelement.get_attribute(attribute))

def text_el(webelement):
    return(webelement.text)

def load_items_by_xpath(driver, n=None, **kwargs):
    ''' Load items on the page with the attributes sent in kwargs.
        n: number of products required
        kwargs: dictionary with key, value = label, XPATH to retrieve 
        Return: a DataFrame with the info of the products
    '''
    data_retrieved = {}
    lenght_ref = -1
    
    for key, value in kwargs.items():
        tags = driver.find_elements(By.XPATH, value[0] )  # scraping
        
        if len(value)>1:
            items = [ value[1](tag) for tag in tags] # If lambda is specified it applies
        else:
            items = [ tag.text for tag in tags]  # Default: text function
            
        if(lenght_ref <0):
            lenght_ref = len(items)
        
        if(lenght_ref == len(items)):
            data_retrieved[key] = items
            print(f"{key} retrieved and included:", len(items))
        else:
            data_retrieved[key] = [None for _ in range(lenght_ref)]
            print(f"{key} retrieved NOT COMPATIBLE:", len(items))

    df = pd.DataFrame(data_retrieved)
        
    if ( bool(n) and (n < len(df))):
        return (df.iloc[0:n,:])
    
    return df

In [258]:
def load_data_by_group(driver, xpath_grp, length_ref, not_found_value='-', **kwargs):
        '''Retrieves the data by group, in case any attribute is not 
           defined it returns None for that specific item.  
           xpath_grp = XPATH to retrieve the group
           not_found_value = 
           kwargs = attributes to find in each group, with corresponding xpath config
               e.g.'ratings': [By.CLASS_NAME, 'number', lambda wel: text_el(wel)]
           Return: a data frame with the data recovered'''
        data_retrieved = {}
        # Scraping the group of elements
        tags = driver.find_elements(By.XPATH, xpath_grp)
        
        # Scraping all the atributes in each group
        for tag_name, xpath_config in kwargs.items():
            items = []

            # Example of xpath_config: 'ratings': [By.CLASS_NAME, 'number', lambda wel: text_el(wel)]
            by = xpath_config[0]
            xpath_param = xpath_config[1] 
            fn = xpath_config[2]
            
            for i, tag in enumerate(tags):
                try:
                    tag_item = tag.find_elements(by, xpath_param )
                    if len(tag_item) > 1:
                        # adding attributes as a list
                        items.append([fn(tag) for tag in tag_item])
                    elif len(tag_item) > 0:
                        # adding attribute individually
                        items.append(fn(tag_item[0]))
                    else:
                        # If there is not attribute '-' : not_found_param
                        items.append(not_found_value)

                except NoSuchElementException:
                    print("Element not found")
                    items.append(not_found_value)
            
            if ( not(length_ref) or length_ref == len(items)):
                data_retrieved[tag_name] = items
                print(f"GRP: {tag_name} retrieved and included: {len(items)}" )
                
        df = pd.DataFrame(data_retrieved)
        return df

In [275]:
def load_items_by_page(driver, n=None, **kwargs):
    ''' Load the items on the page base on a configuration dictionary <<kwargs>> .
        n: number of products required
        Return: a DataFrame with the info of the items
    '''
    # scraping each general attribute
    if 'ATTR' in kwargs:
        attributes = kwargs['ATTR']
    else:
        attributes = kwargs
    
    df = load_items_by_xpath(driver, None, **attributes) 
    length_ref = df.shape[0]
    
    if 'GROUP' in kwargs:
        xpath_grp = kwargs['GROUP']['XPATH_GRP'][0]
        attributes_grp =  kwargs['GROUP']['ATTR_GRP']
        df2 = load_data_by_group(driver, xpath_grp, length_ref,'-', **attributes_grp)
        df_result = pd.concat([df,df2], axis=1)

    if (bool(n) and length_ref > n):
        return df_result[:n] 

    return df_result


def scrape_item_detail(driver, **kwargs):
    '''Retrieve all the attributes sent in kwargs.
       Return: a Data frame with the data.'''
    data = {}
    
    for key, xpath_config in kwargs.items():
        items = []
        
        by = xpath_config[0]
        xpath_param = xpath_config[1] 
        fn = xpath_config[2]
        is_list = xpath_config[3]
        
        print(xpath_config)
        try:
            tag_item = tag.find_elements(by, xpath_param )
            if (not(is_list) and len(tag_item) > 0): 
                # adding attribute individually -Description 
                items.append(fn(tag_item[0]))
            elif (is_list and len(tag_item) > 0): 
                # adding attributes as a list
                items.append([fn(tag) for tag in tag_item])
            else:
                # If there is not attribute '-' : not_found_param
                items.append(not_found_value)
        except NoSuchElementException:
            print("Element not found")
            items.append(not_found_value)

        data[key] = items

    return pd.DataFrame(data)

In [272]:
not(True)

False

In [271]:
driver.quit()

In [276]:
url = 'https://www.hostelworld.com/'
location = 'London'

# Set up the WebDriver 
driver = webdriver.Chrome()
driver.get(url)

wait_time = 3
time.sleep(wait_time)

try:
    loc_input = driver.find_element(By.XPATH, '//input[@type="text"]')
    loc_input.send_keys(location)
    
    time.sleep(wait_time)
    loc_item = driver.find_element(By.XPATH, f"//*[@aria-label='{location}']")
    loc_item.click()

    time.sleep(wait_time)
    btn = driver.find_element(By.XPATH, '//*[@class="btn-content large-button icon-only"]')
    btn.click()
    
except Exception as e:
    print(f"Element not available. {e.msg}")
    driver.get("https://www.hostelworld.com/pwa/wds/s?q=London,%20England&country=London&city=London&type=city&id=3&from=2023-08-11&to=2023-08-14&guests=2&page=1")

In [277]:
#  hostel name, distance from city centre, ratings, total reviews, overall reviews, 
#  privates from price, dorms from price, facilities and property description. 
# 'facilities': ['']
# 'description': ['']
attributes = {'ATTR': {'Hostel Name':['//div[@class="property-name"]'],
                       'URL': ['//a[@rel="noreferrer noopener"]', lambda wel: get_attribute_el(wel, 'href')]},
              'GROUP': {
                  'XPATH_GRP': ['//*[@class="property-info-container"]'],
                  'ATTR_GRP':{'Distance from city centre': [By.CLASS_NAME, 'property-name', lambda wel: text_el(wel) ],
                              'ratings': [By.CLASS_NAME, 'number', lambda wel: text_el(wel)],
                              'total reviews': [By.CLASS_NAME, 'review', lambda wel: text_el(wel)],
                              'overal reviews': [By.CLASS_NAME, 'keyword', lambda wel: text_el(wel)],
                              'price': [By.CLASS_NAME,'property-accommodation-price', lambda wel: text_el(wel)]}
              }}

In [278]:
df_hotels = load_items_by_page(driver, None, **attributes)

Hostel Name retrieved and included: 32
URL retrieved and included: 32
GRP: Distance from city centre retrieved and included: 32
GRP: ratings retrieved and included: 32
GRP: total reviews retrieved and included: 32
GRP: overal reviews retrieved and included: 32
GRP: price retrieved and included: 32


In [265]:
df_hotels['URL'].iloc[0:2]

0    https://www.hostelworld.com/pwa/hosteldetails....
1    https://www.hostelworld.com/pwa/hosteldetails....
Name: URL, dtype: object

In [279]:
attributes_det = {
    'Description': [By.XPATH, '//div[@class="title"]/following::div[@class="content collapse-content"]', lambda wel: text_el(wel), False],
    'Facilities': [By.XPATH, '//*[@class="facilities"]', lambda wel: text_el(wel), True]}

# Initializing dataframe
df_det = pd.DataFrame(columns=[*attributes.keys()])

for url in df_hotels['URL'].iloc[0:2]:
    print(url)
    driver.get(url)

    wait_time = 3
    time.sleep(wait_time)

    # Scrape the data in each url
    df2 = scrape_item_detail(driver, **attributes_det)
#     df2 = scrape_product(driver, **attributes_det)
#     df_det = pd.concat([df, df2], ignore_index=True)

    display(df2)

https://www.hostelworld.com/pwa/hosteldetails.php/St%20Christopher's%20Village/London/502?from=2023-08-11&to=2023-08-14&guests=2
['xpath', '//div[@class="title"]/following::div[@class="content collapse-content"]', <function <lambda> at 0x000002D0488DA550>, False]


MaxRetryError: HTTPConnectionPool(host='localhost', port=61232): Max retries exceeded with url: /session/6e1a4ecd1923482b7fcfe5398588090a/element/64C395168547CD131843B2E2EC5BD66F_element_97/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002D0488DCA90>: Failed to establish a new connection: [WinError 10061] No se puede establecer una conexión ya que el equipo de destino denegó expresamente dicha conexión'))

In [None]:
df2