In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Make a GET request to the site
response = requests.get("http://whiskyadvocate.com/ratings-reviews/?search=&submit=+&brand_id=0&rating=0&price=0&category=1%2C3%2C4%2C6%2C51&styles_id=0&issue_id=0")

# Get the content of the response
content = response.content

# Initialize the parser, and pass in the content we grabbed earlier
parser = BeautifulSoup(content, 'html.parser')

In [3]:
def fetch_data(parser, *args):
    """
    Iterates through the parser, finds all the specified args,
    returning the pertinent list
    Parameters
    ---------
    parser : a BeautifulSoup object
             Represents the input HTML/XML document used for its creation
    *args  : list
             Passes a variable number of arguments to the function
    Returns
    -------
    temp_list : list
             The list of fetched data
    """
    temp_list = list()
    
    for i in range(len(parser)-1):
        texts = parser[i].find_all(*args)[0].text
        temp_list.append(texts)
    return temp_list

In [4]:
# Get the names
parser_printable = parser.find_all('div', {'class': 'printable-section'})
name_list = fetch_data(parser_printable, 'h1')

In [5]:
# Get the categories
parser_entry_meta = parser.find_all('p', {'class': 'entry-meta'})
args_cat = ['span', {'itemprop': 'category'}]
category_list = fetch_data(parser_entry_meta, args_cat[0], args_cat[1])

In [6]:
# Get the prices - the same parser utilised
args_pr= ['span', {'itemprop': 'price'}]
price_list = fetch_data(parser_entry_meta, args_pr[0], args_pr[1])

In [7]:
# Get the currencies  - same parser utilised
args_cur = ['span', {'itemprop': 'priceCurrency'}]
currency_list = fetch_data(parser_entry_meta, args_cur[0], args_cur[1])

In [8]:
# Get the descriptions
parser_review_text = parser.find_all('div', {'class': 'review-text', 'itemprop': 'description'})
args_desc = ['p']
description_list = fetch_data(parser_review_text, args_desc)

In [9]:
# Get the ratings
parser_review_top = parser.find_all('div', {'class': 'review-top'})
args_rat = ['span', {'itemprop': 'ratingValue'}]
rating_list = fetch_data(parser_review_top, args_rat[0], args_rat[1])

In [10]:
# Check for equal length of list
data = [name_list, category_list, rating_list, price_list, currency_list, description_list]
for data in data:
    print(len(data))

2636
2636
2636
2636
2636
2636


In [11]:
# Combine all the fetched data into a DataFrame
dict_series = {'name': name_list, 'category': category_list, 'rating': rating_list, 'price': price_list, 'currency': currency_list, 'description': description_list}
df = pd.DataFrame(dict_series)

In [18]:
# Write to .csv
df.to_csv('whiskey_data.csv')