In [1]:
import json
import datetime

# Load the reviews from disk. Each line of the file is a separate JSON
# object, so we stream the file line by line instead of reading it into one
# big string first. The encoding is given explicitly so decoding does not
# depend on the platform's default locale.
with open("./data/yelp.json", encoding="utf-8") as f:
    # skip blank lines (e.g. a trailing empty line), which json.loads rejects
    reviews = [json.loads(line) for line in f if line.strip()]

# we're interested in the text of each review
# and the stars rating, so we load these into
# separate lists
texts = [review['text'] for review in reviews]
stars = [review['stars'] for review in reviews]

In [2]:
from collections import Counter

def balance_classes(xs, ys):
    """Down-sample paired samples so every class occurs equally often.

    The rarest class in ``ys`` sets the quota. Samples are scanned in their
    original order and the first ``quota`` samples of each class are kept;
    later ones are dropped.

    Args:
        xs: Indexable sequence of samples, aligned with ``ys``.
        ys: Re-iterable sequence of hashable class labels.

    Returns:
        A ``(new_xs, new_ys)`` tuple of lists holding the kept samples and
        their labels, in original relative order. Empty input yields
        ``([], [])``.
    """
    freqs = Counter(ys)

    # the size of the least common class is the maximum we allow for every
    # class; default=0 keeps empty input from raising
    max_allowable = min(freqs.values(), default=0)
    num_added = dict.fromkeys(freqs, 0)
    new_xs = []
    new_ys = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_xs.append(xs[i])
            new_ys.append(y)
            num_added[y] += 1
    return new_xs, new_ys

In [3]:
# Report the star-rating distribution before and after down-sampling
# every class to the size of the rarest one.
print(Counter(stars))
balanced_x, balanced_y = balance_classes(xs=texts, ys=stars)
print(Counter(balanced_y))


Counter({4: 79878, 5: 76193, 3: 35363, 2: 20957, 1: 17516})
Counter({5: 17516, 4: 17516, 2: 17516, 3: 17516, 1: 17516})


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# This vectorizer breaks text into single words and bi-grams
# and then calculates the TF-IDF representation
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# wall-clock start time, used below to report how long fit_transform took
t1 = datetime.datetime.now()

# the 'fit' builds up the vocabulary from all the reviews
# while the 'transform' step turns each individual text into
# a matrix of numbers.
# NOTE(review): the vectorizer is fitted on all of balanced_x, before the
# train/test split performed in a later cell, so held-out texts contribute
# to the learned vocabulary -- consider fitting on the training texts only.
vectors = vectorizer.fit_transform(balanced_x)
print(datetime.datetime.now() - t1)

0:00:51.731333


In [5]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the balanced reviews for evaluation. A fixed random_state
# makes the shuffle, and therefore every downstream result, reproducible
# across kernel restarts; change the seed to draw a different split.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, balanced_y, test_size=0.33, random_state=42
)

In [6]:
from sklearn.svm import LinearSVC

# initialise the SVM classifier; random_state seeds the solver's pseudo-random
# data shuffling so that repeated runs train the same model
classifier = LinearSVC(random_state=42)

# train the classifier and report the wall-clock time the fit took
t1 = datetime.datetime.now()
classifier.fit(X_train, y_train)
print(datetime.datetime.now() - t1)

0:00:22.477658


In [7]:
# Predict star ratings for the held-out reviews, then print the first 20
# predictions above the first 20 true labels for a quick visual comparison.
preds = classifier.predict(X_test)
# .tolist() yields plain Python ints, so the output stays "[4, 1, ...]";
# list() would keep NumPy scalars, which print as "np.int64(4)" on NumPy >= 2.
print(preds[:20].tolist())
print(y_test[:20])

[4, 1, 1, 5, 4, 3, 2, 1, 4, 5, 2, 1, 1, 1, 5, 4, 4, 1, 4, 2]
[3, 3, 2, 5, 3, 3, 2, 1, 4, 5, 2, 1, 1, 1, 5, 4, 5, 1, 4, 5]


In [14]:
# Show two raw reviews (index 0 and index 4) separated by a dashed rule.
print(texts[0], '------', texts[4], sep='\n')

My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!
------
General Manager Scott Petello is a good egg!!! Not to go into detail, but let me assure you if you have

In [15]:
# import pickle
# with open('yelp-classifier.pkl', 'wb') as picklefile:  
#     pickle.dump(classifier,picklefile)

# # Save tuple
# tuple_objects = (classifier, X_train, y_train)
# pickle.dump(tuple_objects, open("yelp-classifier-tuple.pkl", 'wb'))

# Persist the fitted vectorizer and classifier with joblib.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the vendored
# copy only on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(classifier, 'classifier.pkl')

['classifier.pkl']