In [1]:
import json
import datetime

# read the data from disk and split into lines
# we use .strip() to remove the final (empty) line
with open("./data/yelp.json") as f:
    reviews = f.read().strip().split("\n")

# each line of the file is a separate JSON object
reviews = [json.loads(review) for review in reviews] 

# we're interested in the text of each review 
# and the stars rating, so we load these into 
# separate lists
texts = [review['text'] for review in reviews]
stars = [review['stars'] for review in reviews]

In [2]:
from collections import Counter

def balance_classes(xs, ys):
    freqs = Counter(ys)

    # the least common class is the maximum number we want for all classes
    max_allowable = freqs.most_common()[-1][1]
    num_added = {clss: 0 for clss in freqs.keys()}
    new_ys = []
    new_xs = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_ys.append(y)
            new_xs.append(xs[i])
            num_added[y] += 1
    return new_xs, new_ys

In [3]:
print(Counter(stars))
balanced_x, balanced_y = balance_classes(texts, stars)
print(Counter(balanced_y))


Counter({4: 79878, 5: 76193, 3: 35363, 2: 20957, 1: 17516})
Counter({5: 17516, 4: 17516, 2: 17516, 3: 17516, 1: 17516})


In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer()
X = tf_transformer.fit_transform(balanced_x).toarray()


from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVectorizer = TfidfVectorizer(max_features =2000)
X = tfidfVectorizer.fit_transform(balanced_x).toarray()



# from sklearn.feature_extraction.text import TfidfVectorizer

# # This vectorizer breaks text into single words and bi-grams
# # and then calculates the TF-IDF representation
# vectorizer = TfidfVectorizer(ngram_range=(1,2))
# t1 = datetime.datetime.now()

# # the 'fit' builds up the vocabulary from all the reviews
# # while the 'transform' step turns each indivdual text into
# # a matrix of numbers.
# vectors = vectorizer.fit_transform(balanced_x).toarray()
# print(datetime.datetime.now() - t1)

0:01:57.052837


In [1]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(vectors, balanced_y, test_size=0.20)

NameError: name 'vectors' is not defined

In [None]:
# from sklearn.svm import LinearSVC

# # initialise the SVM classifier
# classifier = LinearSVC()

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

# train the classifier
t1 = datetime.datetime.now()
classifier.fit(X_train, y_train)
print(datetime.datetime.now() - t1)

In [None]:
preds = classifier.predict(X_test)
print(list(preds[:20]))
print(y_test[:20])

# [4, 1, 1, 5, 4, 3, 2, 1, 4, 5, 2, 1, 1, 1, 5, 4, 4, 1, 4, 2]
# [3, 3, 2, 5, 3, 3, 2, 1, 4, 5, 2, 1, 1, 1, 5, 4, 5, 1, 4, 5]

In [None]:
for index,text in enumerate(texts[0:20]):
    print('PREDICTION:',preds[index],', ACTUAL:',balanced_y[index], ') ' ,text , '\n')

In [None]:
# import pickle
# with open('yelp-classifier.pkl', 'wb') as picklefile:  
#     pickle.dump(classifier,picklefile)

# # Save tuple
# tuple_objects = (classifier, X_train, y_train)
# pickle.dump(tuple_objects, open("yelp-classifier-tuple.pkl", 'wb'))

#joblib save
# from sklearn.externals import joblib
# joblib.dump(vectorizer, 'vectorizer.pkl')
# joblib.dump(classifier, 'classifier.pkl')