# Tripadvisor Miami Hotel Reviews ✈️🏨🌴

## Setup

In [1]:
!pip install -q -r requirements.txt

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm
import json
import multiprocessing
import os
import re
import pandasql as ps

In [3]:
def flatten(l):
    return [item for sublist in l if sublist is not None for item in sublist]


def scrape_main(url):
    headers = {'User-Agent': "Mozilla/5.0"}
    url_base = "https://www.tripadvisor.com/"

    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    hotels_data = soup.find("div", id="map_wc_dusty_bridge")
    json_str = hotels_data.find_all("div")[1]["data-hotels-data"]
    data = json.loads(json_str)
    hotel_urls = [url_base + x["detailUrl"] for x in data["hotels"]]

    return hotel_urls


def get_review_count(soup):
    review_count = soup.find("div",  class_="jVDab o W f u w GOdjs").find("span", "biGQs _P pZUbB KxBGd").text
    review_count = int(re.sub("[^0-9]", "", review_count))
    return review_count 


def get_address(soup):
    address = soup.find("div",  class_="gZwVG H3 f u ERCyA").find("span", "biGQs _P pZUbB KxBGd").text    
    return address


def get_rank(soup):
    rank_str = soup.find("div",  class_="biGQs _P pZUbB KxBGd").text
    return rank_str


def get_category(x):
    category = re.findall(r"[a-zA-Z\s]+(?= in Miami)", x)[0].strip()
    return category


def scrape_hotel_info(url):
    headers = {'User-Agent': "Mozilla/5.0"}
    
    try:
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, "html.parser")
        hotel_html = soup.find_all("div",  class_="fIrGe _T")
        top_reviews = [x.text for x in soup.find_all("span",  class_="QewHA H4 _a")]
        description = hotel_html[0].text
        name = soup.find("h1", id="HEADING").text
        review_count = get_review_count(soup)
        address = get_address(soup)
        score = soup.find("span", class_="uwJeR P").text
        rank_str = get_rank(soup)
        category = get_category(rank_str)
        rows = [{
            "name": name,
            "category": category,
            "address": address, 
            "description": description,
            "score": score,
            "rank_str": rank_str,
            "review": x,
            "review_count": review_count,
            "url": url
        } for x in top_reviews]
            
        return rows
        
    except:
        pass

## Scrape hotel listing pages

In [4]:
page_urls = (
    ["https://www.tripadvisor.com/Hotels-g34438-Miami_Florida-Hotels.html"] + \
    [f"https://www.tripadvisor.com/Hotels-g34438-oa{n}-Miami_Florida-Hotels.html" for n in range(30, 1020, 30)]
)

page_urls[:3]

['https://www.tripadvisor.com/Hotels-g34438-Miami_Florida-Hotels.html',
 'https://www.tripadvisor.com/Hotels-g34438-oa30-Miami_Florida-Hotels.html',
 'https://www.tripadvisor.com/Hotels-g34438-oa60-Miami_Florida-Hotels.html']

In [5]:
%%time

with multiprocessing.Pool(os.cpu_count()) as pool:
    hotel_urls = pool.map(scrape_main, page_urls)

hotel_urls = list(set(flatten(hotel_urls)))

CPU times: user 31.3 ms, sys: 179 ms, total: 210 ms
Wall time: 8.07 s


In [6]:
print(f"Found unique {len(hotel_urls)} URLs")
print(f"Example URL: {hotel_urls[0]}")

Found unique 358 URLs
Example URL: https://www.tripadvisor.com//Hotel_Review-g34438-d23657692-Reviews-Opera_Tower_Silvana-Miami_Florida.html


## Scrape hotel information

In [7]:
%%time

with multiprocessing.Pool(os.cpu_count()) as pool:
    hotel_info_list = pool.map(scrape_hotel_info, hotel_urls)

hotel_info_list = flatten(hotel_info_list)

CPU times: user 163 ms, sys: 160 ms, total: 323 ms
Wall time: 48.7 s


In [8]:
print(f"Found {len(hotel_info_list)} reviews")

Found 1713 reviews


## Create dataframe

In [9]:
df = pd.DataFrame.from_dict(hotel_info_list)
df = df.sample(frac=1, random_state=2077).reset_index(drop=True)

print(df.shape)

(1713, 9)


In [10]:
df.head()

Unnamed: 0,name,category,address,description,score,rank_str,review,review_count,url
0,Sentral Wynwood,hotels,"51 NW 26th St, Miami, FL 33127",Sentral is transforming the travel experience ...,4.5,#60 of 155 hotels in Miami,"I loved it here! Close to restaurants, shoppin...",87,https://www.tripadvisor.com//Hotel_Review-g344...
1,The Guild Downtown | X Miami,condos,"230 NE 4th St, Miami, FL 33132-2231",The Guild is the hotel that combines the comfo...,4.0,#3 of 28 condos in Miami,I also received that email yesterday. Was ver...,159,https://www.tripadvisor.com//Hotel_Review-g344...
2,Beautiful Spanish Residence in Coconut Grove,limited service properties,"1757 Wa Kee NA Dr, Miami, FL 33133-2437","No se que tiene este lugar ,pero me siento com...",4.5,#7 of 18 limited service properties in Miami,What a gorgeous find! Our family of 5 stayed f...,4,https://www.tripadvisor.com//Hotel_Review-g344...
3,"EVEN Hotel Miami - Airport, an IHG Hotel",hotels,"3499 N.W. 25th Street, Miami, FL 33142",Modern Pet Friendly Hotel near Miami Internati...,4.0,#34 of 155 hotels in Miami,Everyone was very nice and helpful. The room ...,461,https://www.tripadvisor.com//Hotel_Review-g344...
4,Mr. C Miami Coconut Grove,hotels,"2988 McFarlane Rd, Miami, FL 33133-6011",From the illustrious fourth generation hospita...,4.5,#13 of 155 hotels in Miami,Relatively small hotel with 5-star service and...,413,https://www.tripadvisor.com//Hotel_Review-g344...


In [11]:
df.to_csv("./miami-hotel-reviews.csv", index=False)