In [22]:
import numpy as np
import pandas as pd
from collections import Counter

In [None]:
# Download the latest copy of the review archive and load it into `df`.
# The URL is the spreadsheet's CSV export link (exportFormat=csv), so this
# cell needs network access.
archive_url = r'https://docs.google.com/spreadsheets/d/1X1HTxkI6SqsdpNSkSSivMzpxNT-oeTbjFFDdEkXD30o/export?exportFormat=csv'
df = pd.read_csv(archive_url)

In [2]:
# ...or load the local copy of the archive instead.
# NOTE: this rebinds `df`, so running it after the download cell replaces
# the freshly downloaded data with the contents of archive.csv.
df = pd.read_csv('archive.csv')

In [3]:
# Tally how many reviews each reviewer has written.  Review volume is used
# further down as a rough proxy for a reviewer's experience and reliability.
author_tallies = Counter()
author_tallies.update(df["Reviewer's Reddit Username"])

In [4]:
# Show the ten most prolific reviewers as (username, review count) pairs.
author_tallies.most_common(10)

[('TOModera', 1236),
 ('Unclimbability', 556),
 ('muaddib99', 551),
 ('HawkI84', 527),
 ('Shane_IL', 513),
 ('cake_my_day', 408),
 ('Texacer', 407),
 ('I_SAID_NO_GOLDFISH', 399),
 ('Ethanized', 395),
 ('devoz', 375)]

In [5]:
# Restrict the archive to reviews from the regions I'm interested in.
# `ind` is a plain list of booleans, one per row of `df`.
ind = df['Whisky Region or Style'].isin(
    ['Islay', 'Speyside', 'Highland', 'Island']).tolist()

df2 = df.loc[ind]

In [8]:
# Keep only the most-reviewed whiskies: count the reviews per whisky name
# in the region-filtered frame and take the `nwhiskies` most common names.
tally = Counter(df2['Whisky Name'])

nwhiskies = 200
whiskies = []
for entry in tally.most_common(nwhiskies):
    # each entry is a (name, review count) pair; only the name is kept
    whiskies.append(entry[0])

In [9]:
def get_top_links(whisky, nreviews=5):
    """Return links to the top reviews for ``whisky``.

    "Top" means written by the most prolific reviewers: a reviewer's total
    review count (from the module-level ``author_tallies``) is taken as a
    proxy for review quality.  Reviews are looked up in the module-level
    ``df2`` frame.

    Parameters
    ----------
    whisky : str
        Value to match exactly against the 'Whisky Name' column.
    nreviews : int, optional
        Maximum number of links to return.  Zero or a negative number
        yields an empty list.

    Returns
    -------
    list
        Values of the 'Link To Reddit Review' column for the selected
        reviews, ordered by ascending reviewer tally (the most prolific
        reviewer comes last).  Empty if the whisky has no reviews.
    """
    # Guard: with nreviews == 0 the slice below would be [-0:], i.e. the
    # whole array, and every review would be returned instead of none.
    if nreviews <= 0:
        return []

    # find all reviews for this whisky
    reviews = df2.loc[df2['Whisky Name'] == whisky]

    # for each review of *this* whisky, get the total number of reviews
    # for *all* whiskies by that reviewer
    reviewer_count = np.asarray(
        [author_tallies[author]
         for author in reviews["Reviewer's Reddit Username"]])

    # argsort is ascending, so the last nreviews positions belong to the
    # reviewers with the highest tallies
    keep = reviewer_count.argsort()[-nreviews:]

    # return a list of the urls
    return list(reviews.iloc[keep]["Link To Reddit Review"])

In [10]:
import requests
from bs4 import BeautifulSoup

In [11]:
def download_review(url, timeout=30):
    """Download the reddit review living at ``url`` and return its text.

    This is fairly hacky: it guesses at how reddit structures its HTML,
    but it seems to work for the purposes of this notebook.

    Parameters
    ----------
    url : str
        Link to the reddit thread that contains the review.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long scraping run forever.

    Returns
    -------
    str
        The text of every comment body that has more than one <p> element.
        Paragraphs within a comment are joined by a blank line; comments
        are separated by four newline characters.  If the request itself
        fails (bad link, connection error, timeout), "invalid link" is
        printed and a string of four newline characters is returned.

    Raises
    ------
    AssertionError
        If the server answers with an error status (``r.ok`` is false).
    """
    hdr = {'User-Agent': 'osx:r/Scotch:v0.1 (by /u/db1ac38e)'}
    try:
        r = requests.get(url, headers=hdr, timeout=timeout)
    except Exception:
        # Deliberately best-effort: a broken link should not abort the whole
        # run.  Catching Exception (rather than a bare except) still lets
        # KeyboardInterrupt / SystemExit stop the notebook.
        print("invalid link")
        return "\n\n\n\n"

    if not r.ok:
        raise AssertionError('download request failed with reason ' + r.reason)

    soup = BeautifulSoup(r.text, "html5lib")

    # from here below is hacky, and just guesses at how reddit structures
    # its html... there may be a better way to do this!
    #
    # reddit comments seem to be in html divs with class "entry"
    result = []
    for comment in soup.find_all('div', attrs={"class": "entry"}):
        # not all entries seem to be "real" comments... only those with a
        # div of class "usertext-body" contribute (usually exactly one)
        for body in comment.find_all('div', attrs={"class": "usertext-body"}):
            # keep only comments with more than one <p> element
            ps = body.find_all('p')
            if len(ps) > 1:
                # convert to text
                strings = [p.text for p in ps]
                strings = [s for s in strings if s is not None]
                result.append("\n\n".join(strings))

    return "\n\n\n\n".join(result)

In [137]:
# Sanity check: fetch a single review thread and print the extracted text.
# (Parenthesised single-argument print behaves the same on Python 2 and 3.)
print(download_review('https://www.reddit.com/r/Scotch/comments/44i3ze/reviews_141142_highland_park_1218/'))

Highland Park 12

Highland Park is an Island distillery (from the Isle of Orkney) that’s owned by Edrington, the nice people behind The Macallan, Glenrothes, a little known blend called Famous Grouse, and a few other lesser-known distilleries. This is one of the first bottles I got when I was really getting in to Scotch a few years back, and I quite enjoyed it at the time.

12 years old. 43% abv. Probably filtered and colored.

Nose - Honey, toffee, vanilla, and peppery-gingery spice are front and center, though lurking behind those notes seem to be some sherried ones - it seems like I can pick up some dark dried fruit, gingerbread, and Mexican chocolate notes in the background. Lightly smoky, with some light coastal/briny notes. As is reputed, has a bit of everything.

Taste - Honey, light toffee, vanilla, lemon, seawater, black pepper, ginger, light smoke, dark dried fruit, Mexican chocolate. The notes are listed in order of decreasing prominence - again, the honey and toffee are fro

In [12]:
import time
import os
from textwrap import wrap

In [13]:
def download_corpus(scotch, nreviews=10, delay=15):
    """Download the top reviews for ``scotch`` and join them into one string.

    Parameters
    ----------
    scotch : str
        Whisky name, passed on to ``get_top_links``.
    nreviews : int, optional
        How many review links to request from ``get_top_links``.
    delay : float, optional
        Seconds to pause after each download, to avoid hammering the
        server.  Zero or a negative value disables the pause.

    Returns
    -------
    str
        The downloaded review texts, separated by four newline characters.
    """
    results = []
    for url in get_top_links(scotch, nreviews=nreviews):
        results.append(download_review(url))
        # pause after every download (including the last one) so that
        # back-to-back calls for different whiskies stay rate-limited too
        if delay > 0:
            time.sleep(delay)
    return '\n\n\n\n'.join(results)

In [14]:
def create_corpus(scotch):
    """Download the review corpus for ``scotch`` and save it as a text file.

    The file lives in the 'Reviews-2' directory and is named after the
    whisky, with spaces and slashes replaced by dashes.  An existing file
    is treated as a cache hit and nothing is downloaded.

    Parameters
    ----------
    scotch : str
        Whisky name, passed on to ``download_corpus``.

    Returns
    -------
    bool
        False if the file was already present (nothing done), True if the
        corpus was downloaded and written.
    """
    outdir = 'Reviews-2'
    # a '/' in the name would otherwise be read as a sub-directory
    safe_name = scotch.replace(' ', '-').replace('/', '-')
    fname = outdir + '/' + safe_name + '.txt'

    if os.path.isfile(fname):
        print('%s already present.  skipping.' % scotch)
        return False

    # make sure the target directory exists *before* the slow download,
    # rather than failing on open() after all the work has been done
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    print('downloading reviews for %s ...' % scotch)
    corpus = download_corpus(scotch)

    # non-ASCII characters are replaced by '?'; the result is bytes, so the
    # file is opened in binary mode (works on both Python 2 and 3)
    corpus2 = corpus.encode('ascii', 'replace')
    with open(fname, 'wb') as f:
        f.write(corpus2)

    return True

In [21]:
# Build (or skip, if already on disk) the review corpus for every whisky.
for whisky in whiskies:
    # names such as 'Ledaig 7 /r/Scotch Community Cask' contain a slash,
    # which crashes the file-name handling -- leave those out
    if '/' in whisky:
        continue
    create_corpus(whisky)

Highland Park Dark Origins already present.  skipping.
GlenDronach 18 Allardice already present.  skipping.
Laphroaig Cairdeas 2014 Amontillado already present.  skipping.
Glenfarclas 10 already present.  skipping.
Bruichladdich Black Art 2.2 already present.  skipping.
GlenDronach Cask Strength already present.  skipping.
Aberlour 18 already present.  skipping.
Laphroaig Cairdeas 2012 Origin  already present.  skipping.
Highland Park 25 already present.  skipping.
Highland Park 15 already present.  skipping.
Bowmore 18 already present.  skipping.
Finlaggan Old Reserve already present.  skipping.
Glenmorangie Signet already present.  skipping.
Balvenie 17 Peated Cask already present.  skipping.
Glenfiddich 15 Distillery Edition already present.  skipping.
Tomatin 18 already present.  skipping.
AnCnoc 16 already present.  skipping.
Laphroaig Cairdeas 2015 already present.  skipping.
Deanston 12 already present.  skipping.
Bruichladdich Port Charlotte PC7 Sin an Doigh Ileach already pres