# Import libraries

In [340]:
# uncomment to install libraries
# ! pip install numpy pandas matplotlib seaborn requests selenium bs4
# ! pip install jupyter_contrib_nbextensions && jupyter contrib nbextension install 

import time
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import selenium
from bs4 import BeautifulSoup

# Test web scraping escape room reviews

## World of Escapes map page URL

In [341]:
# Site root; hrefs scraped from the pages are appended to this to build full URLs
home_page_url = "https://worldofescapes.com"

# Map page, which sits directly under the site root
map_page_url = home_page_url + "/map"

## Map page response

In [342]:
# Fetch the map page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops here on an HTTP error so that an error
# page is never parsed as if it were the map.
map_page_response = requests.get(map_page_url, timeout = 30)
map_page_response.raise_for_status()

## Map page soup

In [343]:
# Parse the HTML text of the map page response with the "html.parser" backend
map_page_soup = BeautifulSoup(
    markup = map_page_response.text,
    features = "html.parser",
)

## Map states and city href and URLs

In [344]:
# Per-state lists of city hrefs and of the matching full URLs,
# keyed by the lower-cased state name
state_city_href = {}
state_city_urls = {}

# each state on the map page is looked up by this exact class string
state_item_class = "col-lg-3 col-md-4 col-sm-6 col-xs-6 state-item"

for state_div in map_page_soup.find_all("div", {"class": state_item_class}):
    # the text of the <h3> inside the state div becomes the dictionary key
    state_name = state_div.find("h3").get_text().strip().lower()

    # href of the link inside every "city-item" <li> of this state
    city_hrefs = [
        city_li.a["href"]
        for city_li in state_div.find_all("li", {"class": "city-item"})
    ]

    state_city_href[state_name] = city_hrefs
    state_city_urls[state_name] = [home_page_url + href for href in city_hrefs]

# See URLs (which incorporate href)
state_city_urls

{'alabama': ['https://worldofescapes.com/auburn',
  'https://worldofescapes.com/birm',
  'https://worldofescapes.com/daleville',
  'https://worldofescapes.com/dothan',
  'https://worldofescapes.com/florence-al',
  'https://worldofescapes.com/gadsden',
  'https://worldofescapes.com/henagar',
  'https://worldofescapes.com/huntsville',
  'https://worldofescapes.com/mobile',
  'https://worldofescapes.com/montgomery',
  'https://worldofescapes.com/gulf-shores',
  'https://worldofescapes.com/oxford-al',
  'https://worldofescapes.com/tuscaloosa'],
 'alaska': ['https://worldofescapes.com/anchorage',
  'https://worldofescapes.com/fairbanks',
  'https://worldofescapes.com/juneau',
  'https://worldofescapes.com/skagway'],
 'arizona': ['https://worldofescapes.com/cottonwood',
  'https://worldofescapes.com/flagstaff',
  'https://worldofescapes.com/lake-havasu-city',
  'https://worldofescapes.com/phoenix',
  'https://worldofescapes.com/prescott',
  'https://worldofescapes.com/tucson',
  'https://wor

## Map states and city page responses and soups

In [None]:
# containers for states and their city page responses and soups
state_city_responses = {}
state_city_soups = {}

# only these states are scraped for now; extend the tuple to scrape more
states_to_scrape = ("alabama", "alaska", "arizona", "arkansas")

for state_key in states_to_scrape:
    # one response and one soup per city, appended in the same order as
    # the URLs in state_city_urls[state_key]
    state_city_responses[state_key] = []
    state_city_soups[state_key] = []

    for url in state_city_urls[state_key]:
        # get response (the timeout keeps a stalled connection from hanging the loop)
        response_j = requests.get(url, timeout = 30)

        # make soup
        soup_j = BeautifulSoup(response_j.text, "html.parser")

        # append response and soup to state
        state_city_responses[state_key].append(response_j)
        state_city_soups[state_key].append(soup_j)

        # random whole number of seconds (1 to 4) to wait between requests;
        # drawn as a plain int rather than a one-element array so that
        # time.sleep() receives a scalar and the log prints a bare number
        sleep_time_j = int(np.random.randint(low = 1, high = 5))

        # print loop summary before sleeping so the message matches what happens next
        print("Made soup for {} in {}. Sleeping for {} seconds...".format(url, state_key, sleep_time_j))
        time.sleep(sleep_time_j)

## Escape room href and URLs

In [None]:
# containers for escape room hrefs and URLs, nested by state and then by city href
state_city_room_href = {}
state_city_room_urls = {}

# loop through every state in state_city_soups and the city soups within it
for state_key, city_soup in state_city_soups.items():
    # add state key to dictionary containers
    state_city_room_href[state_key] = {}
    state_city_room_urls[state_key] = {}

    for j, soup in enumerate(city_soup):
        # city soups are stored in the same order as the city hrefs,
        # so position j identifies the city this soup belongs to
        city_href_j = state_city_href[state_key][j]

        # nested dictionaries, e.g., {"alabama": {"/auburn": [escape room href, ...]}}
        room_href_j = []
        room_urls_j = []
        state_city_room_href[state_key][city_href_j] = room_href_j
        state_city_room_urls[state_key][city_href_j] = room_urls_j

        # find the room listing once and reuse the result
        other_quests = soup.find("div", {"data-content": "other-quests"})

        if other_quests is None:
            # the city keeps its (empty) lists; say which city had no listing
            print("No escape rooms found for {}, {}.".format(city_href_j, state_key))
            continue

        # loop through a tags with escape room href
        for a in other_quests.find_all("a", {"class": "item-hover quest_tile_hover_link"}):
            room_url = home_page_url + a["href"]

            # add href and url
            room_href_j.append(a["href"])
            room_urls_j.append(room_url)

            # print loop summary
            print("Added {} for {}, {}.".format(room_url, city_href_j, state_key))

## Escape room page responses and soups

In [None]:
# containers for escape room page responses and soups,
# nested dictionaries, e.g., {"alabama": {"/auburn": [room page response or soup, ...]}}
state_city_room_responses = {}
state_city_room_soups = {}

# loop through every state in state_city_room_urls and the city hrefs within it
for state_key, city_href in state_city_room_urls.items():
    # add state key to dictionary containers
    state_city_room_responses[state_key] = {}
    state_city_room_soups[state_key] = {}

    # loop through the room URL lists nested within city href
    for href, room_url in city_href.items():
        # page response and soup containers for city hrefs nested in states
        state_city_room_responses[state_key][href] = []
        state_city_room_soups[state_key][href] = []

        # request each room page and make a soup from it
        for url in room_url:
            # room page response (the timeout keeps a stalled connection from hanging the loop)
            room_response_r = requests.get(url, timeout = 30)

            # soup
            room_soup_r = BeautifulSoup(room_response_r.text, "html.parser")

            # add page response and soup to each container
            state_city_room_responses[state_key][href].append(room_response_r)
            state_city_room_soups[state_key][href].append(room_soup_r)

            # random whole number of seconds (1 to 4) to wait between requests;
            # drawn as a plain int rather than a one-element array so that
            # time.sleep() receives a scalar and the log prints a bare number
            sleep_time_r = int(np.random.randint(low = 1, high = 5))

            # print loop summary with this iteration's own sleep time, before sleeping
            print("Made soup for {0} in {1}, {2}. Sleeping for {3} seconds...".format(url, href, state_key, sleep_time_r))
            time.sleep(sleep_time_r)
        

### title

In [None]:
# Text of the <title> tag on the first room page stored for "/auburn" in "alabama"
(
    state_city_room_soups["alabama"]["/auburn"][0]
    .find("title")
    .get_text()
)

### description

In [None]:
# Text of the div with class "description" on the first room page stored for "/auburn"
(
    state_city_room_soups["alabama"]["/auburn"][0]
    .find("div", {"class": "description"})
    .get_text()
)

### address

In [None]:
# Stripped text of the div marked data-content="address"; note this probe reads
# the room page at index 3, not index 0 like the other probes
(
    state_city_room_soups["alabama"]["/auburn"][3]
    .find("div", {"data-content": "address"})
    .get_text()
    .strip()
)

### tags

In [None]:
# Print the text of each <li> in the "tags-2" list inside the "tags" div
# of the first room page stored for "/auburn"
auburn_room_soup = state_city_room_soups["alabama"]["/auburn"][0]
tags_div = auburn_room_soup.find("div", {"class": "tags"})
tags_list = tags_div.find("ul", {"class": "tags-2"})

for tag_li in tags_list.find_all("li"):
    print(tag_li.get_text())

In [None]:
# Every <li> tag in the "params-ul" list of the first room page stored for "/auburn"
(
    state_city_room_soups["alabama"]["/auburn"][0]
    .find("ul", {"class": "params-ul"})
    .find_all("li")
)

### room attributes

In [None]:
# For each <li> in the "params-ul" list, print the text of its span with class "td"
params_list = state_city_room_soups["alabama"]["/auburn"][0].find("ul", {"class": "params-ul"})

for param_li in params_list.find_all("li"):
    param_span = param_li.find("span", {"class": "td"})
    print(param_span.get_text())

### reviews

In [None]:
# Print the text of every <p class="content"> found inside the "masonry-list"
# of the first room page stored for "/auburn"
reviews_list = state_city_room_soups["alabama"]["/auburn"][0].find("ul", {"class": "masonry-list"})

for review_p in reviews_list.find_all("p", {"class": "content"}):
    print(review_p.get_text())

## Escape room/game titles, descriptions, and addresses

In [None]:
# containers for titles, descriptions, and addresses
# e.g. {"alabama": {"/auburn": {"/auburn/quests/auburn-escape-zones-imprisoned": [title]}}}
state_city_room_titles = {}
state_city_room_descriptions = {}
state_city_room_addresses = {}


def _tag_text(tag, strip = False):
    """Return the text of a BeautifulSoup tag, or None when the tag was not found.

    Parameters
    ----------
    tag : the result of a soup ``find`` call (a tag, or None if nothing matched)
    strip : when True, surrounding whitespace is removed from the text

    Returns
    -------
    str or None
    """
    if tag is None:
        return None
    text = tag.get_text()
    return text.strip() if strip else text


# loop through city href nested within every state key in state_city_room_soups
for state_key, city_href in state_city_room_soups.items():
    # add state key to dictionary containers
    state_city_room_titles[state_key] = {}
    state_city_room_descriptions[state_key] = {}
    state_city_room_addresses[state_key] = {}

    # loop through room soups nested within city href
    for href, room_soup in city_href.items():
        # room containers for city hrefs nested in states
        state_city_room_titles[state_key][href] = {}
        state_city_room_descriptions[state_key][href] = {}
        state_city_room_addresses[state_key][href] = {}

        # loop through soups nested within city href
        for k, room in enumerate(room_soup):
            # the k-th soup was made from the k-th room href of this city
            room_href_k = state_city_room_href[state_key][href][k]

            # title, description, address; a page that lacks one of these
            # elements yields None for it instead of aborting the whole loop
            title_k = _tag_text(room.find("title"))
            description_k = _tag_text(room.find("div", {"class": "description"}))
            address_k = _tag_text(room.find("div", {"data-content": "address"}), strip = True)

            # store each value in a one-element list under the room href
            state_city_room_titles[state_key][href][room_href_k] = [title_k]
            state_city_room_descriptions[state_key][href][room_href_k] = [description_k]
            state_city_room_addresses[state_key][href][room_href_k] = [address_k]

            # print loop summary
            print("{0}: {1} is located at {2}.\n{3}: {4}.".format(k, title_k, address_k, k, description_k))

### Store room/game titles, descriptions, and addresses in a data frame

In [None]:
# Display the nested titles dictionary (state -> city href -> room href -> [title])
state_city_room_titles

In [None]:
def _nested_rooms_to_frame(nested, value_column):
    """Flatten a nested room dictionary into a data frame with one row per value.

    Parameters
    ----------
    nested : dict shaped like {state: {city_href: {room_href: [value, ...]}}}
    value_column : name of the column that receives each value

    Returns
    -------
    pandas.DataFrame with the columns "state", "city_href", "room_href" and
    ``value_column``, in that order, and a default integer index. The frame is
    built in one step from a list of records, so the column order is fixed and
    no row-by-row appending is needed.
    """
    records = [
        {"state": state, "city_href": city, "room_href": room, value_column: value}
        for state, city_href in nested.items()
        for city, room_href in city_href.items()
        for room, values in room_href.items()
        for value in values
    ]
    return pd.DataFrame(records, columns = ["state", "city_href", "room_href", value_column])


# one data frame per piece of room information
escape_room_titles = _nested_rooms_to_frame(state_city_room_titles, "room_title")
escape_room_descriptions = _nested_rooms_to_frame(state_city_room_descriptions, "room_description")
escape_room_addresses = _nested_rooms_to_frame(state_city_room_addresses, "room_address")

# merge data frames
escape_room_info = (escape_room_titles
                    .merge(escape_room_descriptions, how = "left", on = ["state", "city_href", "room_href"])
                    .merge(escape_room_addresses, how = "left", on = ["state", "city_href", "room_href"]))

# See it
escape_room_info

## Escape room/game reviews

In [None]:
# container for reviews nested within room nested within city nested within state
# e.g. {"alabama": {"/auburn": {"/auburn/quests/auburn-escape-zones-imprisoned": [review_1, ...]}}}
state_city_room_reviews = {}

# loop through city href nested within every state key in state_city_room_soups
for state_key, city_href in state_city_room_soups.items():
    # add state key to dictionary container
    state_city_room_reviews[state_key] = {}

    # loop through room soups nested within city href
    for href, room_soup in city_href.items():
        # room container for city hrefs nested in states
        state_city_room_reviews[state_key][href] = {}

        for k, room in enumerate(room_soup):
            # the k-th soup was made from the k-th room href of this city
            room_href_k = state_city_room_href[state_key][href][k]

            room_reviews_k = []
            state_city_room_reviews[state_key][href][room_href_k] = room_reviews_k

            # find the review list once and reuse the result
            review_list = room.find("ul", {"class": "masonry-list"})

            if review_list is not None:
                review_content = review_list.find_all("p", {"class": "content"})

                # loop through review content
                for r, review in enumerate(review_content, start = 1):
                    room_reviews_k.append(review.get_text())

                    # print loop summary
                    print("Scraped {0} out of {1} reviews of {2} in {3}, {4}.".format(r, len(review_content), room_href_k, href, state_key))
            else:
                # a page without a review list gets the literal string "None" as its only entry
                room_reviews_k.append("None")
                print("No reviews for {0} in {1}, {2}.".format(room_href_k, href, state_key))

In [None]:
# Display the nested reviews dictionary (state -> city href -> room href -> [review text, ...])
state_city_room_reviews

### Store reviews in a data frame

In [None]:
# One record per review: walk state -> city href -> room href -> review list.
# review_number is the 0-based position of the review within its room's list.
review_records = [
    {"state": state, "city_href": city, "room_href": room, "review_number": r, "review": review}
    for state, city_href in state_city_room_reviews.items()
    for city, room_href in city_href.items()
    for room, reviews in room_href.items()
    for r, review in enumerate(reviews)
]

# Build the data frame in one step from the collected records; the explicit
# column list fixes the column order and still works when there are no records.
escape_room_reviews = pd.DataFrame(
    review_records,
    columns = ["state", "city_href", "room_href", "review_number", "review"],
)

# See it
escape_room_reviews

### Merge reviews with escape room/game info

In [None]:
# merge data frames
merge_keys = ["state", "city_href", "room_href"]

# This cell overwrites escape_room_reviews with the merged result. To keep it
# safe to re-run, any room info columns left over from a previous run are
# dropped first; otherwise a second merge would add duplicated, suffixed columns.
info_columns = [column for column in escape_room_info.columns if column not in merge_keys]

escape_room_reviews = (escape_room_reviews
                       .drop(columns = info_columns, errors = "ignore")
                       .merge(escape_room_info, how = "left", on = merge_keys))

# See it
escape_room_reviews

### Export escape room reviews data to .csv

In [None]:
# Write the merged reviews to data/escape_room_reviews.csv. The output folder
# is created first so the export does not fail when "data/" does not exist yet.
reviews_csv_path = Path("data/escape_room_reviews.csv")
reviews_csv_path.parent.mkdir(parents = True, exist_ok = True)
escape_room_reviews.to_csv(reviews_csv_path)