9. Write a Python program to scrape data for all available hostels in the “London” location from https://www.hostelworld.com/. You have to scrape the hostel name, distance from city centre, ratings, total reviews, overall reviews, privates-from price, dorms-from price, facilities and property description.

In [175]:
# Importing libraries 
import selenium
import pandas as pd
from selenium import webdriver
import warnings
warnings.filterwarnings('ignore')
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException

import time
import regex as re

In [177]:
def get_attribute_el(webelement, attribute):
    '''Return the value obtained by calling get_attribute(<<attribute>>) on <<webelement>>.'''
    value = webelement.get_attribute(attribute)
    return value

def text_el(webelement):
    '''Return the <<text>> attribute of <<webelement>>.'''
    content = webelement.text
    return content

def load_items_by_xpath(driver, n=None, **kwargs):
    ''' Build one DataFrame column per entry in kwargs by querying the page with XPath.
        driver: object exposing find_elements(by, value)
        n: maximum number of rows to return (falsy = return every row)
        kwargs: label -> [XPATH] or [XPATH, extractor]; when no extractor is
                given the element text is used
        Return: a DataFrame with one column per label. The first label fixes the
                expected number of rows; a label that yields a different number
                of elements is filled with None instead.
    '''
    columns = {}
    expected_len = None  # set by the first label processed

    for label, config in kwargs.items():
        elements = driver.find_elements(By.XPATH, config[0])  # scraping

        # Use the supplied extractor when present, otherwise fall back to .text
        extract = config[1] if len(config) > 1 else (lambda element: element.text)
        values = list(map(extract, elements))

        if expected_len is None:
            expected_len = len(values)

        if len(values) != expected_len:
            # Keep the column so the frame stays rectangular, but with no data
            columns[label] = [None] * expected_len
            print(f"{label} retrieved NOT COMPATIBLE:", len(values))
            continue

        columns[label] = values
        print(f"{label} retrieved and included:", len(values))

    frame = pd.DataFrame(columns)

    if n and n < len(frame):
        return frame.iloc[:n, :]

    return frame

def go_next_page(driver, xpath):
    ''' Click the element located by <<xpath>> (the "next page" control).
        The element is located, its text is printed, the pointer is moved onto
        it, and the click happens only after the locator is reported clickable
        (waiting up to 15 seconds).
    '''
    button = driver.find_element(By.XPATH, xpath)
    print(f" Going to {button.text}")

    # Bring the button into view by moving the pointer onto it
    ActionChains(driver).move_to_element(button).perform()

    # Block until the locator is clickable (page finished loading)
    WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, xpath)))

    button.click()

In [3]:
def load_data_by_group(driver, xpath_grp, length_ref, not_found_value='-', **kwargs):
    '''Retrieve attributes group by group, one row per group element.
       xpath_grp = XPATH that selects the group (container) elements
       length_ref = expected number of rows; a column whose length differs is
                    left out of the result (a falsy value disables the check)
       not_found_value = placeholder stored when a group has no match
       kwargs = attributes to find inside each group, as [by, locator, extractor]
           e.g.'ratings': [By.CLASS_NAME, 'number', lambda wel: text_el(wel)]
       Return: a DataFrame with the data recovered. A cell holds the extracted
               value for a single match, a list of values for several matches,
               or not_found_value for none.'''
    groups = driver.find_elements(By.XPATH, xpath_grp)
    columns = {}

    for label, config in kwargs.items():
        by, locator, fn = config[0], config[1], config[2]
        values = []

        for group in groups:
            try:
                found = group.find_elements(by, locator)
                if not found:
                    value = not_found_value
                elif len(found) == 1:
                    value = fn(found[0])
                else:
                    value = [fn(element) for element in found]
            except NoSuchElementException:
                print("Element not found")
                value = not_found_value
            values.append(value)

        # Skip columns that do not line up with the reference length
        if length_ref and length_ref != len(values):
            continue

        columns[label] = values
        print(f"GRP: {label} retrieved and included: {len(values)}")

    return pd.DataFrame(columns)

In [78]:
def load_items_by_page(driver, n=None, **kwargs):
    ''' Load the items on the page based on a configuration dictionary <<kwargs>>.
        driver: WebDriver positioned on the listing page
        n: number of items required (falsy = all items)
        kwargs: either {'ATTR': {...}, 'GROUP': {'XPATH_GRP': [xpath], 'ATTR_GRP': {...}}}
                or the page-wide attribute configuration directly; 'GROUP' is optional
        Return: a DataFrame with the info of the items
    '''
    # Page-wide attributes: taken from 'ATTR' when present, otherwise every
    # entry except 'GROUP' (which has a different shape and is handled below).
    if 'ATTR' in kwargs:
        attributes = kwargs['ATTR']
    else:
        attributes = {key: cfg for key, cfg in kwargs.items() if key != 'GROUP'}

    # Start from the page-wide frame so a result exists even without 'GROUP'
    # (previously df_result was only bound inside the 'GROUP' branch).
    df_result = load_items_by_xpath(driver, None, **attributes)
    length_ref = df_result.shape[0]

    if 'GROUP' in kwargs:
        group_cfg = kwargs['GROUP']
        df_group = load_data_by_group(
            driver,
            group_cfg['XPATH_GRP'][0],
            length_ref,
            '-',
            **group_cfg['ATTR_GRP'],
        )
        df_result = pd.concat([df_result, df_group], axis=1)

    if n and length_ref > n:
        return df_result[:n]

    return df_result


def scrape_item_detail(driver, not_found_value = '-', **kwargs):
    '''Retrieve every attribute described in kwargs from the current page.
       kwargs: label -> [by, locator, extractor, is_list]; with is_list falsy
               only the first match is extracted, otherwise all matches are
               extracted into a list
       Return: a one-row DataFrame with one column per label; not_found_value
               is stored when nothing matches.'''
    record = {}

    for label, config in kwargs.items():
        by, locator, fn, is_list = config[0], config[1], config[2], config[3]

        try:
            found = driver.find_elements(by, locator)

            if len(found) == 0:
                # Nothing matched: use the placeholder
                value = not_found_value
            elif is_list:
                # Every match, kept together as a single list cell
                value = [fn(element) for element in found]
            else:
                # Single-valued attribute (e.g. Description): first match only
                value = fn(found[0])
        except NoSuchElementException:
            print("Element not found")
            value = not_found_value

        # One-element list so each label becomes a single-row column
        record[label] = [value]

    return pd.DataFrame(record)

def scrape_data_by_urls(urls, attributes_det):
    ''' Scrape the data in attributes_det from each url in urls.
    urls: a sized iterable (e.g. a pandas Series) of URLs
    attributes_det: attributes configuration passed to scrape_item_detail
    Return: a DataFrame with one row per URL and one column per key of
            attributes_det (empty, with those columns, when urls is empty).
    NOTE: navigation uses the module-level <<driver>>; it is not a parameter.'''
    total = len(urls)  # loop-invariant, computed once
    wait_time = 3      # seconds given to each detail page to load
    frames = []

    for i, url in enumerate(urls):
        driver.get(url)

        print(f"Retrieving Details of record: {i+1}/{total}")

        time.sleep(wait_time)

        # Scrape the data in each url
        frames.append(scrape_item_detail(driver, **attributes_det))

    if not frames:
        return pd.DataFrame(columns=[*attributes_det.keys()])

    # Concatenate once instead of re-copying the growing frame on every
    # iteration (and instead of concatenating onto an empty frame).
    return pd.concat(frames, ignore_index=True)

def extract_min_numeric_value(string):
    '''Extract the minimum whole-dollar amount from a string.
       Only integers immediately preceded by "$" are considered.
       string: text to search; any non-string value (None, NaN, ...) is
               treated as containing no price
       Return: the smallest amount as a string, or '-' when none is found.'''
    if not isinstance(string, str):
        # DataFrame cells may hold None/NaN; report "no price" instead of raising
        return '-'

    amounts = re.findall(r'\$(\d+)', string)
    if not amounts:
        return '-'

    return str(min(map(int, amounts)))

In [217]:
# Quit the WebDriver currently bound to `driver`.
# NOTE: `driver` must already exist when this runs; in this file it is first
# assigned further down, so running this statement first raises NameError.
driver.quit()

In [218]:
# Open the site and search for <<location>> through the UI; if any step of
# the UI search fails, load a pre-built London search URL instead.
url = 'https://www.hostelworld.com/'
location = 'London'

# Set up the WebDriver
driver = webdriver.Chrome()
driver.get(url)

wait_time = 3
time.sleep(wait_time)

try:
    # Type the location into the search box
    loc_input = driver.find_element(By.XPATH, '//input[@type="text"]')
    loc_input.send_keys(location)
    print(f"Sending location: {location}")

    # Pick the suggestion whose aria-label equals the location
    time.sleep(wait_time)
    loc_item = driver.find_element(By.XPATH, f"//*[@aria-label='{location}']")
    loc_item.click()
    print(f"Selecting location: {location}")

    # Submit the search
    time.sleep(wait_time)
    btn = driver.find_element(By.XPATH, '//*[@class="btn-content large-button icon-only"]')
    btn.click()
    print("Searching...")

except Exception as e:
    # Only Selenium exceptions carry `.msg`; fall back to the exception itself
    # so that reporting the failure cannot raise AttributeError and skip the
    # fallback navigation below.
    print(f"Element not available. {getattr(e, 'msg', e)}")
    driver.get("https://www.hostelworld.com/pwa/wds/s?q=London,%20England&country=London&city=London&type=city&id=3&from=2023-08-11&to=2023-08-14&guests=2&page=1")


# Scraping configuration
#   attributes['ATTR']  : page-wide columns  -> [XPATH] or [XPATH, extractor]
#   attributes['GROUP'] : per-listing columns -> [By strategy, locator, extractor],
#                         looked up inside each element matched by XPATH_GRP
attributes = {
    'ATTR': {
        'Hostel Name': ['//div[@class="property-name"]'],
        'URL': ['//a[@rel="noreferrer noopener"]', lambda wel: get_attribute_el(wel, 'href')],
    },
    'GROUP': {
        'XPATH_GRP': ['//*[@class="property-info-container"]'],
        'ATTR_GRP': {
            'Distance from city centre': [By.CLASS_NAME, 'distance-description', text_el],
            'ratings': [By.CLASS_NAME, 'number', text_el],
            'total reviews': [By.CLASS_NAME, 'review', text_el],
            'overal reviews': [By.CLASS_NAME, 'keyword', text_el],
            'price': [By.CLASS_NAME, 'property-accommodation-price', text_el],
        },
    },
}

# Detail-page configuration: label -> [By strategy, locator, extractor, is_list]
attributes_det = {
    'Description': [
        By.XPATH,
        '//div[@class="title"]/following::div[@class="content collapse-content"]',
        text_el,
        False,
    ],
    'Facilities': [By.XPATH, '//*[@class="facilities"]', text_el, True],
}

# Walk through the result pages, scraping each one into df_hotels.
wait_time = 8
xpath_nextbtn = '//*[@class="pill-content page-nav nav-right icon-only"]'
df_hotels = pd.DataFrame()

# Searching page links
# Default to a single page: find_elements returns an empty list (it does not
# raise) when there are no page-number buttons, so final_page must already be
# bound before the loop below.
final_page = 1
try:
    btn_page = driver.find_elements(By.XPATH, '//*[@class="pill-content page-number"]')
    if len(btn_page) > 0:
        # The last page-number button carries the total number of pages
        final_page = int(btn_page[-1].text)
        print(f"Pages to retrieve: {final_page}")
except (NoSuchElementException, ValueError):
    # Missing element or non-numeric button text: scrape the current page only
    final_page = 1

for i in range(final_page):
    # Scraping Data
    time.sleep(wait_time)
    df_pag = load_items_by_page(driver, None, **attributes)

    # Checking there is data retrieved
    if len(df_pag) <= 0:
        print( f'Page {i + 1}, NO retrieved data.' )
        break

    # Concat the dataframes
    df_hotels = pd.concat([df_hotels,df_pag])
    print( f'Page {i + 1}, {len(df_pag)} records added,  Records retrieved successfully: {len(df_hotels)}' )

    # Next page
    if (i + 1 == final_page ):
        break
    try:
        # Scroll until the element to avoid MoveTargetOutOfBoundsException
        for _ in range(7):
            driver.execute_script("window.scrollBy(0, 1000)")
            time.sleep(2)

        go_next_page(driver, xpath_nextbtn)
    except NoSuchElementException as e:
        print(f"No more data to retrieve: {e.msg}")
        break
    except ElementClickInterceptedException:
        print(f"No more data to retrieve: ElementClickInterceptedException occurred")
        break
    except ElementNotInteractableException:
        print(f"No more data to retrieve: ElementNotInteractableException occurred")
        break

# Resetting index so listing rows and detail rows line up positionally
df_hotels.reset_index(drop=True, inplace=True)

# Scraping detail info: Description and facilities in each URL
wait_time = 5
time.sleep(wait_time)
try:
    df_det = scrape_data_by_urls(df_hotels['URL'], attributes_det)
finally:
    # Cleaning the driver: always close the browser, even when the detail
    # scraping raises, so no browser session is left running.
    driver.quit()

# Checking the dimensions of dataframes: only attach the details when there
# is exactly one detail row per listing row
if len(df_det) == len(df_hotels):
    df = pd.concat([df_hotels, df_det], axis=1)
else:
    df = df_hotels

Sending location: London
Selecting location: London
Searching...


In [221]:
# Cleaning the dataframe
import os

# Split the 'price' column into separate columns. reindex guarantees exactly
# two price columns (missing ones are filled with NaN), so the renaming below
# cannot fail when no row carries two prices.
price_parts = df['price'].apply(pd.Series).reindex(columns=[0, 1])
price_parts.columns = ['Privates From Price', 'Dorms From Price']
df_result = df.join(price_parts, how='outer')

# Setting distance, total reviews
# The decimal part is optional so whole-number distances ("5km") are kept too
df_result['Distance from city centre'] = df_result['Distance from city centre'].str.extract(r'(\d+(?:\.\d+)?km)', expand=False)
df_result['total reviews'] = df_result['total reviews'].str.extract(r'\((\d+)\)', expand=False)

# Filling the NaN with '-'
df_result = df_result.fillna('-')
df_result['Privates From Price'] = (df_result['Privates From Price']).apply(extract_min_numeric_value)
df_result['Dorms From Price'] = (df_result['Dorms From Price']).apply(extract_min_numeric_value)

# Dropping columns no longer needed
df_result.drop(['price'], axis=1, inplace=True)

# Saving the dataframe (create the target folder first so to_csv cannot fail
# on a missing directory)
os.makedirs('datasets', exist_ok=True)
df_result.to_csv('datasets/q9_hotels.csv')

In [222]:
# Bare expression: shows the cleaned DataFrame as the notebook cell's output
df_result

Unnamed: 0,Hostel Name,URL,Distance from city centre,ratings,total reviews,overal reviews,Description,Facilities,Privates From Price,Dorms From Price
0,St Christopher's Village,https://www.hostelworld.com/pwa/hosteldetails....,-,8.1,-,Fabulous,St Christopher's Inn at The Village in London ...,[Linen Included Free WiFi Free Internet Access...,32,-
1,Wombat's City Hostel London,https://www.hostelworld.com/pwa/hosteldetails....,3.6km,9.0,15144,Superb,A safe haven in the middle of the metropolis: ...,[Linen Included Free City Maps Free WiFi Free ...,182,46
2,Onefam Notting Hill by Hostel One,https://www.hostelworld.com/pwa/hosteldetails....,5.5km,9.7,2178,Superb,The perfect place for solo travelers to connec...,[Linen Included Free WiFi Free Internet Access...,173,89
3,St Christopher's Village,https://www.hostelworld.com/pwa/hosteldetails....,1.8km,8.1,12304,Fabulous,St Christopher's Inn at The Village in London ...,[Linen Included Free WiFi Free Internet Access...,-,32
4,Generator London,https://www.hostelworld.com/pwa/hosteldetails....,-,7.6,7637,Very Good,Generator London is a design hotel-hostel loca...,[Linen Included Free City Maps Free WiFi Free ...,138,40
5,NX London Hostel,https://www.hostelworld.com/pwa/hosteldetails....,6.1km,8.2,2024,Fabulous,Welcome to NX London Hostel!\n\nPLEASE READ IN...,[Free Breakfast Linen Included Towels Included...,-,29
6,Urbany Hostel London,https://www.hostelworld.com/pwa/hosteldetails....,5.4km,9.5,851,Superb,"Welcome to Urbany Hostel London, our first int...","[Linen Included Free WiFi, Security Lockers Mi...",178,48
7,Safestay London Elephant & Castle,https://www.hostelworld.com/pwa/hosteldetails....,1.7km,7.3,5041,Very Good,Safestay at Elephant & Castle is ideal if you ...,"[Linen Included Free WiFi, Security Lockers Ke...",-,28
8,Pickwick Hall,https://www.hostelworld.com/pwa/hosteldetails....,2.3km,8.7,2693,Fabulous,Pickwick Hall provides accommodation for touri...,[Free Breakfast Linen Included Free Internet A...,165,-
9,Safestay London Kensington Holland Park,https://www.hostelworld.com/pwa/hosteldetails....,5.8km,6.9,1606,Good,Safestay Holland Park\n\nSafestay Holland Park...,"[Linen Included Free WiFi, Security Lockers Ke...",-,25
