In [1]:
from requests import get
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

import time
import re

In [2]:
url = 'https://www.hostelworld.com/hostels/Berlin'
response = get(url)

In [3]:
# criate soup
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# creating individual containers, on each one there's information about ano hostel
holstel_containers = soup.findAll(class_='fabresult rounded clearfix hwta-property')

In [5]:
# how many hostels on the page
len(holstel_containers)

30

In [6]:
# Analisando os dados do primeiro hostel

first_hostel = holstel_containers[0]

# Hostel name
first_hostel.h2.a.text

'Grand Hostel Berlin Classic'

In [7]:
print(first_hostel.prettify())

<div class="fabresult rounded clearfix hwta-property" data-id="34160" data-name="Grand Hostel Berlin Classic" id="searchResults_34160" url="https://www.hostelworld.com/hosteldetails.php/Grand-Hostel-Berlin-Classic/Berlin/34160">
 <div class="fab-carousel-skeleton carousel-skeleton">
  <div class="fab-carousel-container small-12 medium-5 large-3 columns rounded" data-images="https://a.hwstatic.com/propertyimages/3/34160/f7tnuw4gkhoid2zcmt5a.jpg,https://a.hwstatic.com/propertyimages/3/34160/m79pmoi04kow3ug7gi7j.jpg,https://a.hwstatic.com/propertyimages/3/34160/5011.jpg,https://a.hwstatic.com/propertyimages/3/34160/73.jpg,https://a.hwstatic.com/propertyimages/3/34160/gj6tdtn1owavmvbxsale.jpg,https://a.hwstatic.com/propertyimages/3/34160/frzuauiabnf0hezdwkjs.jpg,https://a.hwstatic.com/propertyimages/3/34160/zlasfl84zjnt8fjz3hev.jpg,https://a.hwstatic.com/propertyimages/3/34160/72.jpg,https://a.hwstatic.com/propertyimages/3/34160/77.jpg,https://a.hwstatic.com/propertyimages/3/34160/76.jpg,h

In [8]:
# first, create the empty lists

hostel_names = []
hostel_links = []
hostel_distance = []
hostel_ratings = []
hostel_reviews = []
hostel_prices = []

# To iterate over the pages and create the conteiners
for page in np.arange(1,4):
    url = 'http://www.hostelworld.com/hostels/Berlin?page=' + str(page)
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    holstel_containers= soup.findAll(class_='fabresult rounded clearfix hwta-property')
    
#to iterate over the results on each page
    for item in range(len(holstel_containers)):
        hostel_names.append(holstel_containers[item].h2.a.text)
        hostel_links.append(holstel_containers[item].h2.a.get('href'))
        hostel_distance.append(holstel_containers[item].find(class_='addressline').text[12:18].replace('k', '').replace('m','').strip())
        hostel_ratings.append(holstel_containers[item].find(class_='hwta-rating-score').text.replace('\n', '').strip())
        hostel_reviews.append(holstel_containers[item].find(class_='hwta-rating-counter').text.replace('\n', '').strip())
        hostel_prices.append(holstel_containers[item].find(class_='price').text.replace('\n', '').strip()[3:])

# this is used to not push too hard on the website
    time.sleep(2)

In [9]:
# create the dictionary to feed the dataframe
hw_berlin = pd.DataFrame({
    'hostel_name': hostel_names,
    'distance_center_km': hostel_distance,
    'average_rating': hostel_ratings,
    'number_reviews': hostel_reviews,
    'average_price_usd': hostel_prices,
    'hw_links': hostel_links
})

In [10]:
hw_berlin.head()

Unnamed: 0,hostel_name,distance_center_km,average_rating,number_reviews,average_price_usd,hw_links
0,Grand Hostel Berlin Classic,3.2,9.2,6718,17.14,https://www.hostelworld.com/hosteldetails.php/...
1,Industriepalast Hostel Berlin,3.3,8.5,1609,14.46,https://www.hostelworld.com/hosteldetails.php/...
2,EastSeven Berlin Hostel,1.2,9.5,6941,19.29,https://www.hostelworld.com/hosteldetails.php/...
3,Pfefferbett Hostel,1 fr,9.4,4557,17.49,https://www.hostelworld.com/hosteldetails.php/...
4,PLUS Berlin,3.4,9.1,16512,12.46,https://www.hostelworld.com/hosteldetails.php/...


In [11]:
# removing non numerical character on the column distance_center_km

hw_berlin.distance_center_km = [re.sub('[^ 0-9.]', '', x) for x in hw_berlin.distance_center_km]

#Usando a função sub(), ou substituir, da biblioteca de operações em regex re e aplicando a expressão regular ‘[^ 0-9.]’, 
    #que exclui tudo o que não seja número ou um ponto.

In [12]:
hw_berlin.head()

Unnamed: 0,hostel_name,distance_center_km,average_rating,number_reviews,average_price_usd,hw_links
0,Grand Hostel Berlin Classic,3.2,9.2,6718,17.14,https://www.hostelworld.com/hosteldetails.php/...
1,Industriepalast Hostel Berlin,3.3,8.5,1609,14.46,https://www.hostelworld.com/hosteldetails.php/...
2,EastSeven Berlin Hostel,1.2,9.5,6941,19.29,https://www.hostelworld.com/hosteldetails.php/...
3,Pfefferbett Hostel,1.0,9.4,4557,17.49,https://www.hostelworld.com/hosteldetails.php/...
4,PLUS Berlin,3.4,9.1,16512,12.46,https://www.hostelworld.com/hosteldetails.php/...
