# Scraping beer review site 

In [None]:
# Import dependencies for data wrangling  
import pandas as pd 
import numpy as np
import requests 
import json

In [None]:
# Import dependencies for scraping
import pymongo
from time import sleep
from bs4 import BeautifulSoup

#from splinter import Browser 
#from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Open the Chrome Driver Browser
#executable_path = {'executable_path': ChromeDriverManager().install()}
#browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Read in CSV
# The file is gzip-compressed; pandas infers the compression from the ".gz" extension.
# NOTE(review): ISO-8859-1 is presumably needed because the file is not valid UTF-8 - confirm.
beer_df = pd.read_csv("../../Data/beer.gz", encoding="ISO-8859-1")
beer_df.head()

# Transformations 
1. Drop unnecessary columns and duplicate `brewery_id` rows
2. Reset the index 
3. Set the brewery_id as the new index (currently commented out in the code below)

In [None]:
# Remove the leftover "Unnamed: 0" column, then keep only the first row
# encountered for each brewery_id so every brewery appears exactly once
beer_df = (
    beer_df
    .drop(columns = ['Unnamed: 0'])
    .drop_duplicates(subset = "brewery_id", keep = "first")
)

In [None]:
# Replace the index with a fresh default one; drop = True discards the old index
# instead of keeping it as a column.
# Using brewery_id as the index is disabled: the scraping loop further down reads
# it as a regular column (row['brewery_id']).
# beer_df = beer_df.set_index("brewery_id")
beer_df = beer_df.reset_index(drop = True)

In [None]:
# Preview the first rows of the cleaned dataframe
beer_df.head()

# Preparing data for scraping 

(As shown in the images below.) The HTML of the address on each profile page is messy: the individual pieces of text are not wrapped in their own `div` elements, which makes it hard to access the different parts of an address using Beautiful Soup. A function is therefore needed to separate the postal code from the country.

In [None]:
# Create an array of countries that have breweries in our dataset
countries = ["Belgium", "Czech Republic", "Germany", "Denmark", "France", "United Kingdom", "Netherlands", "Sweden",
             "United States", "Russia", "Japan", "Italy"]


# Create a function that formats the address - splitting the postal code from the country
def formatAddress(address):
    """Cut ``address`` off after its country and put one space before the country.

    The scraped text has the postal code fused to the country name
    (e.g. ``"3000Belgium | map"``); everything after the country is dropped.

    Parameters
    ----------
    address : str
        One raw line of text taken from a brewery profile page.

    Returns
    -------
    str or None
        The text before the country, a single space, and the country name.
        ``None`` when none of the names in ``countries`` occurs in ``address``.
    """
    # The country is the last element of an address, so use the name that
    # occurs furthest to the right. Taking the first list entry that matches
    # would truncate addresses such as "100 Germany Rd, ... United States".
    best_country, best_start = None, -1
    for country in countries:
        start = address.rfind(country)
        if start > best_start:
            best_country, best_start = country, start

    if best_country is None:
        return None

    # rstrip() keeps a single space when the input already had one before the country
    return address[:best_start].rstrip() + " " + best_country

In [None]:
# Create a new empty column in our dataframe for the address 
# The empty string marks a row as "not scraped yet"; the scraping loop only
# requests pages for rows whose address is still "".
beer_df["address"] = "" 

In [None]:
# Preview the dataframe to check that the address column is present
beer_df.head()

In [None]:
# Scrape each brewery's profile page and store its formatted address in beer_df
for index, row in beer_df.iterrows():

    # Only scrape rows that still have the "" placeholder, so the cell can be
    # re-run without requesting pages that were already processed
    if row["address"] != "":
        continue

    brewery_id = row["brewery_id"]

    # URL of page to be scraped
    url = "https://www.beeradvocate.com/beer/profile/" + str(brewery_id) + "/"

    # Retrieve page with the requests module. The timeout keeps one stalled
    # connection from hanging the whole run, and error statuses are reported
    # instead of being parsed as if they were a profile page.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f'no address for {brewery_id} ({err})')
        continue

    # Create a bs object and parse with html
    soup = BeautifulSoup(response.text, "html.parser")

    # The address is the line of the info box that also holds the "map" link.
    # Lines that mention "map" but contain no known country yield None and are
    # skipped, so they can no longer overwrite an address that was found.
    candidates = (
        formatAddress(line)
        for info_box in soup.find_all('div', {"id": "info_box"})
        for line in info_box.text.split("\n")
        if "map" in line
    )
    address = next((found for found in candidates if found is not None), None)

    if address is None:
        print(f'no address for {brewery_id}')
        continue

    beer_df.loc[index, "address"] = address
    print(address)

# Scraping Attempt

In [None]:
# Collect the distinct brewery ids from the dataframe
brewery_ids = beer_df["brewery_id"].unique()
# Inspect the container type and the number of ids
# (in a notebook only the last expression of a cell is displayed)
type(brewery_ids)
len(brewery_ids)

In [None]:
# Scraped results: one {"brewery_id": ..., "address": ...} dict per brewery
address_list = list()
# Number of addresses collected so far (printed as a progress indicator)
count = 0

# NOTE(review): neither brewery_ids_2 nor browser is defined in the cells of this
# notebook as shown - the Browser setup cell is commented out and only
# brewery_ids is created above. Confirm where they come from before running.
for brewery_id in brewery_ids_2:

    url = "https://www.beeradvocate.com/beer/profile/" + str(brewery_id) + "/"
    browser.visit(url)
    sleep(1)

    try:
        # Follow the "map" link and switch to the second window it opens
        browser.links.find_by_partial_text('map').click()
        browser.windows[1].is_current = True
        sleep(1)

        html = browser.html
        soup = BeautifulSoup(html, 'lxml')

        # The address is read from the value of the map page's search box
        input_tag = soup.find(id = "searchboxinput")
        output = input_tag['value']

        address_list.append({
            "brewery_id": brewery_id,
            "address": output
        })

        count += 1
        print(count)

    except Exception as err:
        # Best effort: report the brewery and move on. Catching Exception
        # (not a bare except) lets Ctrl-C still interrupt a long run.
        print(f"no address for {brewery_id}: {err!r}")

    finally:
        # Close the map window even when scraping failed part-way, so the next
        # iteration does not pick up a stale second window
        browser.windows[0].close_others()

    sleep(2)

In [None]:
# Turn the scraped {"brewery_id", "address"} records into a two-column table
address_df = pd.DataFrame(address_list, columns = ['brewery_id', 'address'])
address_df

In [None]:
# Export the scraped brewery addresses to csv (without the index column) to load to database
address_df.to_csv('second_set_address.csv', index=False)