In [181]:
import pyspark as ps
import warnings
import multiprocessing
import numpy as np

import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from pyspark.sql.functions import UserDefinedFunction, udf
from pyspark.sql.types import StringType, ArrayType

from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer

In [2]:
# Create the local SparkContext used by every later cell.
# 'local[8]' hard-codes the master string; multiprocessing.cpu_count() could be
# used to size it to the machine instead.
try:
    sc = ps.SparkContext('local[8]')
    print("Just created a SparkContext")
except ValueError:
    # SparkContext raises ValueError when one is already active. Bind the
    # existing context so `sc` is always defined after this cell, even if the
    # earlier context was never assigned to `sc` in this namespace.
    warnings.warn("SparkContext already exists in this scope")
    sc = ps.SparkContext.getOrCreate()

Just created a SparkContext


In [3]:
# SQL entry point wrapping the SparkContext `sc`; used below to read the JSON file.
sqlContext = ps.SQLContext(sc)

In [4]:
# Load the review records into a DataFrame; the schema is inferred from the JSON.
# The path is relative, so it is resolved against the working directory.
df = sqlContext.read.json('reviews_100.json')

In [134]:
# Show the inferred schema. Note that most rating-like fields (abv, avg_rating,
# look, smell, ...) come back as strings, not numbers.
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- abv: string (nullable = true)
 |-- avg_rating: string (nullable = true)
 |-- ba_score: double (nullable = true)
 |-- beer_name: string (nullable = true)
 |-- beer_style: string (nullable = true)
 |-- brewery_name: string (nullable = true)
 |-- bros: string (nullable = true)
 |-- feel: string (nullable = true)
 |-- look: string (nullable = true)
 |-- num_ratings: string (nullable = true)
 |-- overall: string (nullable = true)
 |-- smell: string (nullable = true)
 |-- state: string (nullable = true)
 |-- taste: string (nullable = true)
 |-- text: string (nullable = true)
 |-- weighted_ba_score: string (nullable = true)



In [185]:
def preprocess_review_text(text):
    """Turn one raw review into a list of stemmed tokens.

    Steps: encode unicode input to a UTF-8 byte string, replace every
    character in ``string.punctuation`` with a space, tokenize with NLTK's
    ``word_tokenize``, drop the last five tokens, lower-case, remove NLTK
    English stop words and stem with the English Snowball stemmer.

    Parameters
    ----------
    text : str, unicode or None
        Raw review text. In the sample data each review ends with a footer of
        the form "<count> characters<username>, <Mon> <day>, <year>".

    Returns
    -------
    list
        Stemmed tokens in their original order. ``[]`` when ``text`` is
        ``None`` or empty. Tokens the stemmer cannot process are skipped.
    """
    # Guard first: null/empty rows should not pay for loading the stop-word
    # list and constructing a stemmer.
    if text is None or len(text) < 1:
        return []

    stopwords_ = set(stopwords.words('english'))
    stemmer_ = SnowballStemmer('english')

    # str.translate with a maketrans table works on byte strings (Python 2),
    # so unicode input is encoded before punctuation is stripped.
    if isinstance(text, unicode):
        text = text.encode('utf-8')

    replace_punctuation = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    unpunctuated_text = text.translate(replace_punctuation)

    # The trailing five tokens are discarded: reviewer username info is lost
    # here via the slice (intended to strip the review footer).
    tokens = word_tokenize(unpunctuated_text)[:-5]
    lowercased_tokens = [token.lower() for token in tokens]
    filtered_tokens = [w for w in lowercased_tokens if w not in stopwords_]

    stemmed = []
    for token in filtered_tokens:
        try:
            stemmed.append(stemmer_.stem(token))
        except Exception:
            # Best effort: skip tokens the stemmer rejects (presumably ones
            # holding non-ASCII bytes after the UTF-8 encode). Catching
            # Exception rather than using a bare except keeps
            # KeyboardInterrupt/SystemExit propagating.
            continue

    return stemmed

# Keep only the identifying columns plus the raw review text.
df_reviews = df.select('brewery_name', 'beer_name', 'state', 'beer_style', 'text')
# df_reviews.show()

# Spark UDF returning an array of strings; the function is passed directly
# rather than through a pass-through lambda.
token_udf = udf(preprocess_review_text, ArrayType(StringType()))

def up(text):
    """Return ``text`` upper-cased.

    ``None`` is passed through unchanged so the function can be applied to a
    nullable column without raising ``AttributeError``.
    """
    if text is None:
        return None
    return text.upper()
# String-returning UDF around `up`; not applied anywhere in the cells shown here.
up_udf = udf(up, StringType())

# df_reviews.select(token_udf('text')).show()
# Add a `tokens` array column computed per review from the `text` column.
df_tokens = df_reviews.withColumn("tokens", token_udf('text'))
# Pull a single row to the driver as a quick sanity check of the UDF.
df_tokens.take(1)
# df_tokens.printSchema()
# df_tokens.show()

[u'pour', u'turbid', u'amber', u'brown', u'thin', u'layer', u'beig', u'color', u'head', u'pretti', u'effervesc', u'never', u'drop', u'bright', u'cakey', u'figgi', u'much', u'els', u'follow', u'nose', u'dryness', u'herbal', u'hop', u'boozey', u'front', u'figgy', u'pear', u'breadi', u'sweet', u'warm', u'plus', u'boozey', u'littl', u'golden', u'delici', u'appl', u'finish', u'littl', u'cakey', u'boozey', u'orchard', u'fruit', 'mf', u'light', u'medium', u'bodi', u'slight', u'carbon', u'sort', u'thin', u'warm', u'okay', u'style', u'someth', u'would', u'recommend', u'littl']


[Row(brewery_name=u'405 Brewing Co.', beer_name=u'Manly Barley Wine Ale', state=u'Oklahoma', beer_style=u'English Barleywine', text=u'Pours a turbid amber/brown with a thin layer of beige colored head.  Pretty effervescent, never drops bright\n\nS:  Cakey, figgy, not much else\n\nT:  Follows the nose, some dryness, herbal hops & boozeyness up front.  Figgyness, some pear & bready sweetness as this warms, plus boozeyness & a little Golden Delicious Apple.    Finishes a little cakey, boozey & with some orchard fruit\n\nMF: Light/medium body, slightly over carbonated, sort of thin as it warms\n\nOkay for the style, not something I would recommend, a little too hot\xa0527 charactersrusspowell, Jan 19, 2016', tokens=[u'pour', u'turbid', u'amber', u'brown', u'thin', u'layer', u'beig', u'color', u'head', u'pretti', u'effervesc', u'never', u'drop', u'bright', u'cakey', u'figgi', u'much', u'els', u'follow', u'nose', u'dryness', u'herbal', u'hop', u'boozey', u'front', u'figgy', u'pear', u'breadi

In [None]:
# Scratch divider: a bare string literal that is not assigned to anything.
'''
BREAKKKKKK
'''

In [6]:
# Same five-column projection as `df_reviews`, bound to a second name; the
# grouped (per-beer) pipeline below starts from this frame.
reviews_df = df.select('brewery_name', 'beer_name', 'state', 'beer_style', 'text')

# Confirm the projected columns and their types.
reviews_df.printSchema()

root
 |-- brewery_name: string (nullable = true)
 |-- beer_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- beer_style: string (nullable = true)
 |-- text: string (nullable = true)



In [23]:
# NOTE(review): import sits mid-notebook; of these names only `collect_list`
# is used in the cells shown here.
from pyspark.sql.functions import count, collect_list, split

# Exploratory only: the GroupedData result is displayed, not stored.
reviews_df.groupBy('brewery_name', 'beer_name', 'state')

<pyspark.sql.group.GroupedData at 0x10bbb0e50>

In [86]:
# One row per (brewery, beer, state), with all of that beer's review texts
# gathered into a `reviews` array column.
grouped_reviews = reviews_df.groupBy('brewery_name', 'beer_name', 'state').agg(collect_list('text').alias('reviews'))
# NOTE(review): collect() brings every grouped row to the driver and prints it;
# fine for this small sample file, but consider take(n) on larger inputs.
grouped_reviews.collect()

[Row(brewery_name=u'10-56 Brewing Company', beer_name=u"Charlie's Cherry - Cheery Chamomile Blonde Bombshell", state=u'Indiana', reviews=[u'Taste: wheat bread with herbal notes of chamomile, subtle cherry, slight tart and yeast nose. Champagne carbonated mouth-feel. Gold, slight unfiltered, large super foamy head. Medium body. The yeast was busy with this batch.\xa0224 charactersBlackBeerPirate, Oct 15, 2016']),
 Row(brewery_name=u'Abbey Beverage Company (Monastery of Christ in the Desert)', beer_name=u"Monks' Dubbel Ale", state=u'New Mexico', reviews=[u'From a 12 ounce bottle, this is an amber/copper color with a slim white head.  The aroma is estery...raisins and apples...with some rich dark malt as well.  The taste has malt, spicy pepper flavors, some hops, and a bit of alcohol warmth.  Full in body and satisfying.\xa0268 charactersSuds, Jan 16, 2016', u'Pours a hazy dark orange with a foamy khaki head that settles to wisps of film on top of the beer. Thin streaks of lace form aroun

In [125]:

def preprocess_reviews(reviews):
    """Turn a list of raw reviews into one flat list of stemmed tokens.

    Each review is encoded to a UTF-8 byte string if it is unicode, stripped
    of ``string.punctuation`` (replaced by spaces), tokenized with NLTK's
    ``word_tokenize``, has its last five tokens dropped, and is then
    lower-cased, filtered against NLTK English stop words and stemmed with
    the English Snowball stemmer. Tokens from all reviews are concatenated in
    input order.

    Parameters
    ----------
    reviews : list of (str or unicode), or None
        Raw review texts, e.g. the ``reviews`` array built with
        ``collect_list('text')``.

    Returns
    -------
    list
        Stemmed tokens. ``[]`` when ``reviews`` is ``None`` or empty. Tokens
        the stemmer cannot process are skipped individually.
    """
    # Guard first so null/empty groups do not pay for loading the stop-word
    # list and constructing a stemmer.
    if reviews is None or len(reviews) < 1:
        return []

    stopwords_ = set(stopwords.words('english'))
    stemmer_ = SnowballStemmer('english')
    # Loop-invariant: build the punctuation -> space table once, not per review.
    replace_punctuation = string.maketrans(string.punctuation, ' ' * len(string.punctuation))

    stemmed = []
    for text in reviews:
        if not text:
            # Nothing to tokenize (None or empty string).
            continue

        # str.translate with a maketrans table works on byte strings (Python 2).
        if isinstance(text, unicode):
            text = text.encode('utf-8')

        unpunctuated_text = text.translate(replace_punctuation)
        # Reviewer username info is lost here via the slice (intended to strip
        # the trailing review footer).
        tokens = word_tokenize(unpunctuated_text)[:-5]

        for token in tokens:
            word = token.lower()
            if word in stopwords_:
                continue
            try:
                stemmed.append(stemmer_.stem(word))
            except Exception:
                # Skip only the token that failed. One known source: when the
                # footer's character count has a thousands separator (e.g.
                # "1,287"), punctuation stripping splits it in two, so one
                # extra footer token survives the slice and presumably trips
                # the stemmer. Handling failures per token also covers
                # offending tokens that are not the last one, which a
                # "retry without the last token" fallback would re-raise on.
                continue
    return stemmed

In [123]:
test = '''Poured from a 12oz bottle ($6.99/four-pack) into a teku:\n\nAppearance: pours a bright copper-orange, with a thick, fluffy head. The body isn't quite translucent, but I wouldn't call it hazy either....somewhere in between. Leaves tons of sticky lacing.\n\nSmell: disappointed in this area, as I didn't get all that much from the hops. I got some dry, crackery scent from the malt, and maybe a little bit of bitterness, but the hops were a disappointment in a style that definitely needs hop aromatics.\n\nTaste: starts off with some nice, mellow fruit flavor. I got a little bit of melon, some pineapple rind.  Orange pith, not so much the flesh. Lots of bitterness, perhaps a little bit over on the bitter balance. The magnum definitely comes through on the finish much more than the El Dorado, though both give this beer a distinctly bitter flavor. Not necessarily a bad thing, but it's not the en vogue flavor right now. Nicely balanced between malt and hops.\n\nFeel: medium-bodied, with light to medium carbonation, and a semi-thick mouthfeel. Leaves a kind of dry, bitter finish.\n\nOverall: a solid first IPA from a new brewery. Combined with the fact that half of all proceeds are donated to different charities, this'll be a solid purchase, especially at the relatively cheap price point.\xa01,287 charactersTheElectricOne, Mar 25, 2016'''

reviews = [u'Solidly amber in color with just a touch of haze.  Small but lasting head which leaves decent latticed lace.\n\nVery nice blended smell of caramel malt (like an English IPA) with a pleasant floral hop (more like an East Coast US IPA).\n\nStarts mild with some toastiness on the tongue.  The hop is straight up bitterness and stronger than I expected based on the smell.  Highly carbonated and neutral in body.  Long aftertaste, but again, the flavor is straight forward.  \n\nOverall, I do like this IPA.  The interplay between the malt and hop is fun, but neither is very complex.\xa0575 characterssmcolw, Apr 22, 2016', u'Pours a slightly hazy and light amber in color with a minimal off white head that quickly faded to only a few bubbles at the edges. It has an almost murky/muddy appearance. Aroma is almost completely devoid of hop presence with a fair amount of caramelly malts. Taste is like the aroma, hop flavor is low with only hints of citrus and a few floral notes. Caramel malt character is offset by a relatively high bitterness that has a touch of astringency at the end. Medium body and moderate carbonation. Overall a very mediocre beer. I found the hop presence lacking, the caramel malts a bit heavy handed and the bitterness out of whack. I do really like the label (which is the reason that I took a chance on an unknown and picked this up).\xa0739 charactersmnj21655, Apr 19, 2016', u'A- Snifter pour from the narrow 12oz bottle with a deep orange-amber body and a foamy off-white cap that rises to just about one finger high. Head falls fairly fast to a medium ring and lacing is in sheets.\n\nS- Really not a whole lot to go by especially for the IPA style. A hint of orange citrus hops is trumped by medium caramelized malts and yeast bread notes. A bit toasty.\n\nT- Fairly malty focus carries over from the aroma with cereal grains, toast, pale caramel and raw base malt notes coming out. 
Hops are more present in the finish with a heavy spicy and earthy presence as well citrus, herbal tea and floral hints.\n\nMF- Texture falls somewhere between silky, frothy and a tad creamy in a medium body that leans just towards the light end. Carbonation is moderate and the finish contains an herbal bitterness.\n\nA moderate malt presence interferes with the brightness of the hops in a fairly forgettable brew in an ultra competitive style. IPAs can be balanced but this one does not excel at that.\xa01,005 characterscbutova, Mar 30, 2016', u"Poured from a 12oz bottle ($6.99/four-pack) into a teku:\n\nAppearance: pours a bright copper-orange, with a thick, fluffy head. The body isn't quite translucent, but I wouldn't call it hazy either....somewhere in between. Leaves tons of sticky lacing.\n\nSmell: disappointed in this area, as I didn't get all that much from the hops. I got some dry, crackery scent from the malt, and maybe a little bit of bitterness, but the hops were a disappointment in a style that definitely needs hop aromatics.\n\nTaste: starts off with some nice, mellow fruit flavor. I got a little bit of melon, some pineapple rind.  Orange pith, not so much the flesh. Lots of bitterness, perhaps a little bit over on the bitter balance. The magnum definitely comes through on the finish much more than the El Dorado, though both give this beer a distinctly bitter flavor. Not necessarily a bad thing, but it's not the en vogue flavor right now. Nicely balanced between malt and hops.\n\nFeel: medium-bodied, with light to medium carbonation, and a semi-thick mouthfeel. Leaves a kind of dry, bitter finish.\n\nOverall: a solid first IPA from a new brewery. Combined with the fact that half of all proceeds are donated to different charities, this'll be a solid purchase, especially at the relatively cheap price point.\xa01,287 charactersTheElectricOne, Mar 25, 2016"]
# reviews

In [120]:
# import string
# replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
# unpunctuated_test = test.translate(replace_punctuation)
# tokens = word_tokenize(unpunctuated_test)
# lowercase_tokens = [token.lower() for token in tokens]
# lowercase_tokens[:-5:]

In [169]:
# preprocess_review_text(test)
# Local sanity check on the in-memory `reviews` list before running on Spark.
tokens = preprocess_reviews(reviews)
print(tokens)

# Spark UDF returning an array of strings; the function is passed directly
# rather than through a pass-through lambda.
tokenizer_udf = udf(preprocess_reviews, ArrayType(StringType()))

# NOTE(review): this rebinds `df_tokens`, which an earlier cell used for the
# per-review token frame; here it holds one token list per grouped beer.
df_tokens = grouped_reviews.withColumn("tokens", tokenizer_udf(grouped_reviews.reviews))
df_tokens.printSchema()


[u'solid', u'amber', u'color', u'touch', u'haze', u'small', u'last', u'head', u'leav', u'decent', u'lattic', u'lace', u'nice', u'blend', u'smell', u'caramel', u'malt', u'like', u'english', u'ipa', u'pleasant', u'floral', u'hop', u'like', u'east', u'coast', 'us', u'ipa', u'start', u'mild', u'toasti', u'tongu', u'hop', u'straight', u'bitter', u'stronger', u'expect', u'base', u'smell', u'high', u'carbon', u'neutral', u'bodi', u'long', u'aftertast', u'flavor', u'straight', u'forward', u'overal', u'like', u'ipa', u'interplay', u'malt', u'hop', u'fun', u'neither', u'complex', u'pour', u'slight', u'hazi', u'light', u'amber', u'color', u'minim', u'white', u'head', u'quick', u'fade', u'bubbl', u'edg', u'almost', u'murki', u'muddi', u'appear', u'aroma', u'almost', u'complet', u'devoid', u'hop', u'presenc', u'fair', u'amount', u'caramelli', u'malt', u'tast', u'like', u'aroma', u'hop', u'flavor', u'low', u'hint', u'citrus', u'floral', u'note', u'caramel', u'malt', u'charact', u'offset', u'relat', 

In [87]:
# Term-count vectorizer reading the `tokens` column and writing `features_tf`.
cv = CountVectorizer(inputCol='tokens', outputCol='features_tf')
# NOTE(review): `tokens` is the plain Python list from preprocess_reviews;
# fit() presumably needs the DataFrame holding the `tokens` column (df_tokens) — confirm.
# cv.fit(tokens)

In [105]:
# Scratch check: '\\xa' appends three literal characters (backslash, x, a),
# not the \xa0 character that precedes the footer in the review texts.
string.punctuation+'\\xa'

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\\xa'