In [1]:
import json
import datetime

# Read the newline-delimited JSON file: one review object per line.
# JSON text is UTF-8, so decode explicitly instead of relying on the platform
# default encoding. The file is streamed line by line, and blank lines (such as
# a trailing empty line) are skipped rather than handed to json.loads.
with open("./data/yelp4.json", encoding="utf-8") as f:
    reviews = [json.loads(line) for line in f if line.strip()]

# Parallel lists: the review body and its star rating, in file order.
texts = [review['text'] for review in reviews]
stars = [review['stars'] for review in reviews]

In [2]:
from collections import Counter

def balance_classes(xs, ys):
    """Downsample so that every label in ``ys`` occurs equally often.

    The size of the rarest class is used as the cap for all classes. Items are
    scanned in their original order and kept until their class has reached the
    cap, so the first occurrences of each class survive and relative order is
    preserved.

    Args:
        xs: Indexable sequence of samples; ``xs[i]`` belongs to ``ys[i]``.
        ys: Sequence of hashable class labels.

    Returns:
        A ``(new_xs, new_ys)`` tuple of lists holding the kept samples and
        their labels. Both lists are empty when ``ys`` is empty.
    """
    freqs = Counter(ys)
    if not freqs:
        # No labels means nothing to balance; there is no "least common" class
        # to look up, so return early instead of failing on an empty ranking.
        return [], []

    # the least common class is the maximum number we want for all classes
    max_allowable = min(freqs.values())
    num_added = dict.fromkeys(freqs, 0)
    new_xs = []
    new_ys = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_xs.append(xs[i])
            new_ys.append(y)
            num_added[y] += 1
    return new_xs, new_ys

In [3]:
# Label histogram of the raw data, for comparison with the balanced set.
print(Counter(stars))

# Cap every star rating at the size of the rarest one, then show the result.
balanced_x, balanced_y = balance_classes(xs=texts, ys=stars)
print(Counter(balanced_y))


Counter({4: 86, 5: 77, 3: 45, 2: 22, 1: 20})
Counter({4: 20, 2: 20, 3: 20, 5: 20, 1: 20})


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The timer covers the imports, unpickling and vectorisation below.
t1 = datetime.datetime.now()

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Try it first so that environments where this cell already worked keep
# using the same loader, and fall back to the standalone `joblib` package
# (a scikit-learn dependency) on newer installs where the old path is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# NOTE(review): joblib.load unpickles its input and can execute arbitrary code;
# only load model files that come from a trusted source.
classifier = joblib.load('classifier.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Turn the balanced review texts into feature vectors with the fitted vectorizer.
vectors = vectorizer.transform(balanced_x)

print(datetime.datetime.now() - t1)

0:00:31.028198


In [7]:
preds = classifier.predict(vectors)

# Predicted labels followed by true labels for the first 80 balanced reviews,
# one list per output line so they can be compared position by position.
print(list(preds[:80]), balanced_y[:80], sep="\n")

[5, 2, 1, 3, 3, 1, 5, 4, 5, 1, 3, 4, 1, 1, 3, 5, 2, 2, 5, 1, 3, 4, 1, 1, 3, 2, 4, 5, 5, 3, 5, 2, 5, 1, 1, 5, 4, 5, 1, 5, 4, 4, 4, 3, 5, 4, 4, 2, 3, 5, 5, 4, 5, 4, 5, 1, 5, 5, 4, 2, 3, 2, 5, 5, 1, 1, 1, 3, 3, 1, 3, 1, 3, 2, 2, 4, 3, 3, 1, 2]
[4, 4, 2, 3, 4, 3, 5, 4, 5, 3, 2, 4, 1, 2, 4, 5, 4, 2, 5, 1, 3, 5, 5, 1, 3, 1, 4, 5, 4, 2, 4, 3, 5, 4, 2, 5, 4, 4, 1, 5, 4, 5, 5, 3, 5, 4, 4, 2, 3, 4, 4, 4, 4, 5, 5, 1, 3, 5, 5, 3, 3, 3, 5, 5, 5, 2, 1, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 1, 2]


In [8]:
# Print the first ten evaluated reviews as: index * predicted * actual ) text.
# `preds` and `balanced_y` are aligned with `balanced_x`, not with `texts`.
# balance_classes drops reviews once a class reaches its cap, so from that point
# on `texts[i]` is no longer the review that `preds[i]` was computed for.
# Iterating `balanced_x` keeps each text paired with its own labels.
for index, (text, pred, actual) in enumerate(zip(balanced_x[:10], preds, balanced_y)):
    print(index, '*', pred, '*', actual, ') ', text, '----\n')

0 * 5 * 4 )  I LOVE this place. I practically lived here after having surgery a few weeks ago.... the shaved ice was one of the only things I could eat. I especially love their large selection of sugar-free syrups. Bahama Buck's made my recovery SO much more pleasant!!!

The only thing I'm not fond of is the location... there always seems to be seedy people hanging out in that parking lot. It's not somewhere I'd feel particularly safe being after dark. ----

1 * 2 * 4 )  I need my healthy food and I need my fresh produce and this place has it. I'm not rich so shopping at a place like Whole Foods with such a huge selection and an even huger price tag isn't going to happen. I feel like they have a wide range of stuff, but then sometimes I feel like they don't. Its been multiple times that I buy something I like and then the next time I come back they don't have it in stock... :( Almond butter, honey, 9 grain bread, ect. Some of the stuff is a little pricey, but its really fresh and they 