### Extracting data from the JSON files

In [7]:
import numpy as np
import json 
import pandas

# Map every business id to its star rating. The file holds one JSON object per line.
business_ratings = { }
# The `with` block guarantees the file handle is closed once the loop finishes
# (or if json.loads raises part-way through).
with open("data/yelp_academic_dataset_business.json", "r") as business_file:
    for line in business_file:
        json_data = json.loads(line)
        business_ratings[json_data["business_id"]] = json_data["stars"]
# Draw 2000 distinct business ids at random (NumPy's global random state).
# list(...) hands np.random.choice a real sequence even where dict.keys() is a view.
selected_businesses = set(np.random.choice(list(business_ratings.keys()), size = 2000, replace = False))
    
# Collect [review id, stars, text, business id] for every review that belongs
# to one of the sampled businesses. The file holds one JSON object per line.
review_list = []
# The `with` block guarantees the file handle is closed once the loop finishes
# (or if json.loads raises part-way through).
with open("data/yelp_academic_dataset_review.json", "r") as review_file:
    for line in review_file:
        json_data = json.loads(line)
        if json_data["business_id"] in selected_businesses:
            review_list.append([json_data["review_id"], json_data["stars"], json_data["text"], json_data["business_id"]])
# One row per kept review, in file order.
review_pd = pandas.DataFrame(review_list, columns = ["Review ID", "Stars", "Review Text", "Business ID"])

### Tokenizing the review text

In [17]:
import sys
import string
import unicodedata
from nltk.tokenize import RegexpTokenizer

def _build_punctuation_table():
    """Return a translate() table that deletes every Unicode punctuation character.

    Keys are the code points whose Unicode category starts with 'P'; every
    value is None, which makes ``translate`` drop the character.
    """
    try:
        to_char, code_points = unichr, xrange  # Python 2 builtins
    except NameError:
        to_char, code_points = chr, range      # same roles on Python 3
    # + 1 so that sys.maxunicode itself is examined as well.
    return dict.fromkeys(cp for cp in code_points(sys.maxunicode + 1)
                         if unicodedata.category(to_char(cp)).startswith('P'))

tbl = _build_punctuation_table()

def tokenize(text):
    """Split a unicode string into lowercase tokens with punctuation removed.

    Punctuation characters are deleted (not replaced by a space), the rest is
    lowercased and split on runs of whitespace.

    text: unicode string to tokenize.
    Returns a list of tokens; empty when nothing but punctuation/whitespace remains.
    """
    return text.translate(tbl).lower().split()

# Run tokenize over every review text and keep the token lists in a new column.
review_pd["Tokenized Text"] = review_pd["Review Text"].map(tokenize)

In [18]:
review_pd.head()

Unnamed: 0,Review ID,Stars,Review Text,Business ID,Tokenized Text
0,ICfB48LBct5Q1AU_5bz5WA,4,As a chinese american college student from NYC...,Y2p07YEC8xOsYTHWf0UxiA,"[as, a, chinese, american, college, student, f..."
1,hqoYUZwXt43W3rL0nw7DgA,4,I recently pop in to this restaurant and got a...,Y2p07YEC8xOsYTHWf0UxiA,"[i, recently, pop, in, to, this, restaurant, a..."
2,-SOyFOGiDZRlC1ov0RT52w,3,Pretty solid Chinese takeout. They have a $10...,Y2p07YEC8xOsYTHWf0UxiA,"[pretty, solid, chinese, takeout, they, have, ..."
3,pwm6ZRHqr0ubXu4QBaYpQw,2,Food: There are countless options to choose fr...,Y2p07YEC8xOsYTHWf0UxiA,"[food, there, are, countless, options, to, cho..."
4,Mea6JnjvGDV51Wv6qyDiPQ,2,This place features everything you can expect ...,Y2p07YEC8xOsYTHWf0UxiA,"[this, place, features, everything, you, can, ..."


### Manipulating data into LabeledSentences

In [94]:
from gensim.models.doc2vec import LabeledSentence
import numpy as np

class LabeledSentences:
    """Re-iterable stream of tagged documents built from a pandas Series.

    Every pass over the object visits all rows of the wrapped series in a
    freshly shuffled order (NumPy's global random state) and yields one
    LabeledSentence per row.
    """

    def __init__(self, series):
        # series: the Series whose values are handed over as `words`.
        self.series = series

    def __iter__(self):
        # Shuffle row positions; the tag is built from each row's index label.
        positions = list(np.arange(len(self.series)))
        np.random.shuffle(positions)
        shuffled = self.series.iloc[positions]
        for (label, tokens) in shuffled.iteritems():
            yield LabeledSentence(words = tokens, tags = ['VEC_%s' % label])
            
# Wrap a copy of the tokenized column, limited to its first 12000 rows by position.
sentences = LabeledSentences(review_pd["Tokenized Text"].copy().iloc[:12000])

### Configure the Doc2Vec model and build its vocabulary (word vectors are learned during training below)

In [95]:
import multiprocessing
import gensim
from gensim.models import Doc2Vec

# Stop right away unless gensim reports a usable FAST_VERSION (see the message).
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be slow otherwise"
# One worker per CPU reported by the OS.
cores = multiprocessing.cpu_count()

# alpha and min_alpha start out equal; the training cell lowers both by hand.
# NOTE(review): parameter meanings follow the gensim Doc2Vec docs - confirm for
# the installed gensim version.
model = Doc2Vec(
    dm = 1,
    dm_concat = 0,
    size = 30,
    window = 10,
    min_count = 1,
    sample = 1e-4,
    negative = 5,
    alpha = 0.025,
    min_alpha = 0.025,
    workers = cores,
)
# Scan the tagged reviews once to register the vocabulary before any training.
model.build_vocab(sentences)



### Learn vector representations for each review text via Doc2Vec

In [96]:
# Ten train() calls; after each one the learning rate is lowered by hand.
for epoch in range(10):
    model.train(sentences)
    # Drop alpha by 0.002 and pin min_alpha to the same value.
    model.alpha = model.min_alpha = model.alpha - 0.002

### Concatenate Review Rating with Vector

In [97]:
# Rows are (document vector + the review's star rating); targets come from the
# rating stored for the review's business. Documents are split by position.
design_matrix, labels = [], []
test_matrix, t_labels = [], []

for index in range(len(model.docvecs)):
    key = "VEC_%s" % index
    # Feature row: the learned document vector with the review's stars appended.
    features = np.append(model.docvecs[key], review_pd["Stars"][index])
    # Target source: the rating recorded for the business this review belongs to.
    rating = business_ratings[review_pd["Business ID"][index]]
    # The first 10500 documents form the training split, the rest the test split.
    if index < 10500:
        rows, targets = design_matrix, labels
    else:
        rows, targets = test_matrix, t_labels
    rows.append(features)
    targets.append(rating)

# Binarise the targets: 1 when the business rating is above 3, otherwise 0.
design_matrix = np.asarray(design_matrix)
labels = np.greater(np.asarray(labels), 3).astype(int)

test_matrix = np.asarray(test_matrix)
t_labels = np.greater(np.asarray(t_labels), 3).astype(int)

### Input data into Binomial Model

In [98]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training rows
# (fit() returns the estimator itself, so the result can be bound directly).
logreg = LogisticRegression().fit(design_matrix, labels)

# Accuracy: share of held-out rows whose predicted class equals the label.
preds = logreg.predict(test_matrix)
print((preds == t_labels).mean())

0.962
