### Extracting data from the JSON files

In [1]:
import numpy
import json
import pandas

# Map each business_id to its star rating; the file holds one JSON object per line.
business_ratings = { }
# `with` guarantees the handle is closed even if a line fails to parse.
with open("data/yelp_academic_dataset_business.json", "r") as business_file:
    for line in business_file:
        json_data = json.loads(line)
        business_ratings[json_data["business_id"]] = json_data["stars"]

# Gather the review fields as rows first, then build the DataFrame in one step.
review_list = []
with open("data/yelp_academic_dataset_review.json", "r") as review_file:
    for line in review_file:
        json_data = json.loads(line)
        review_list.append([json_data["review_id"], json_data["stars"], json_data["text"], json_data["business_id"]])
review_pd = pandas.DataFrame(review_list, columns = ["Review ID", "Stars", "Review Text", "Business ID"])

### Tokenizing the review text

In [2]:
from nltk.tokenize import RegexpTokenizer

# Built once and shared by every call, instead of constructing a new
# tokenizer for each review that passes through `tokenize`.
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')

def tokenize(text):
    """Lower-case `text` and split it into tokens matching the regex \\w+.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list
        The tokens produced by the RegexpTokenizer, in order of appearance.
    """
    return _WORD_TOKENIZER.tokenize(text.lower())

# Store the token list for every review in a new "Tokenized Text" column.
review_pd["Tokenized Text"] = review_pd["Review Text"].apply(func = tokenize)

In [3]:
# Preview the first five rows, including the new token column.
review_pd.head(5)

Unnamed: 0,Review ID,Stars,Review Text,Business ID,Tokenized Text
0,15SdjuK7DmYqUAj6rjGowg,5,dr. goldberg offers everything i look for in a...,vcNAWiLM4dR7D2nwwJ7nCA,"[dr, goldberg, offers, everything, i, look, fo..."
1,RF6UnRTtG7tWMcrO2GEoAg,2,"Unfortunately, the frustration of being Dr. Go...",vcNAWiLM4dR7D2nwwJ7nCA,"[unfortunately, the, frustration, of, being, d..."
2,-TsVN230RCkLYKBeLsuz7A,4,Dr. Goldberg has been my doctor for years and ...,vcNAWiLM4dR7D2nwwJ7nCA,"[dr, goldberg, has, been, my, doctor, for, yea..."
3,dNocEAyUucjT371NNND41Q,4,Been going to Dr. Goldberg for over 10 years. ...,vcNAWiLM4dR7D2nwwJ7nCA,"[been, going, to, dr, goldberg, for, over, 10,..."
4,ebcN2aqmNUuYNoyvQErgnA,4,Got a letter in the mail last week that said D...,vcNAWiLM4dR7D2nwwJ7nCA,"[got, a, letter, in, the, mail, last, week, th..."


In [4]:
# Persist the tokenized frame to disk, keyed by row index.
review_pd.to_json(
    "data/pd_tokenized.json",
    orient = "index",
)

In [None]:
# NOTE(review): this cell has no execution count and looks like a leftover
# scratch cell -- consider deleting it. The call form of print runs on both
# Python 2 and Python 3.
print(5)

### Manipulating data into LabeledSentences

In [76]:
from gensim.models.doc2vec import LabeledSentence
import numpy as np

class LabeledSentences:
    """Re-iterable corpus wrapping a pandas Series of token lists.

    Every pass over the object yields one LabeledSentence per row, in a newly
    shuffled order. Each sentence is tagged 'VEC_<index label>' so a document
    vector can be traced back to its row in the Series.
    """

    def __init__(self, series):
        # series: pandas Series whose values are lists of word tokens.
        self.series = series

    def __iter__(self):
        # Draw a fresh random visiting order on every pass.
        order = np.arange(len(self.series))
        np.random.shuffle(order)
        shuffled = self.series.iloc[order]
        # Pair index labels with values directly instead of calling
        # Series.iteritems(), which was removed in pandas 2.0; zip works on
        # every pandas version.
        for (index, wordlist) in zip(shuffled.index, shuffled):
            yield LabeledSentence(words = wordlist, tags = ['VEC_%s' % index])
            
# Only the first 5000 tokenized reviews are used as the training corpus.
sentences = LabeledSentences(review_pd["Tokenized Text"].iloc[:5000])

### Configure the Doc2Vec model and build its vocabulary

In [77]:
import multiprocessing
import gensim
from gensim.models import Doc2Vec

# Stop early when gensim reports FAST_VERSION of -1 or lower, because
# training would be slow in that case.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be slow otherwise"

# Use one worker per CPU reported by the OS.
cores = multiprocessing.cpu_count()

model = Doc2Vec(
    dm = 1,
    dm_concat = 0,
    min_count = 1,
    window = 10,
    size = 40,
    sample = 1e-4,
    negative = 5,
    workers = cores,
    # alpha and min_alpha start out equal; the training cell lowers both by hand.
    alpha = 0.025,
    min_alpha = 0.025,
)

# Scan the corpus once to register its words and document tags.
model.build_vocab(sentences)

### Learn vector representations for each review text via Doc2Vec

In [78]:
# Ten manual passes over the corpus. After each pass the learning rate is
# lowered by 0.002 and min_alpha is pinned to the same value.
for epoch in range(10):
    model.train(sentences)
    model.alpha = model.alpha - 0.002
    model.min_alpha = model.alpha

In [79]:
# Ten document vectors closest to the one tagged VEC_1187.
model.docvecs.most_similar(positive = ["VEC_1187"], topn = 10)

[('VEC_2236', 0.7533506155014038),
 ('VEC_1444', 0.7461391687393188),
 ('VEC_1381', 0.7458664178848267),
 ('VEC_3897', 0.7332332134246826),
 ('VEC_4772', 0.7292354106903076),
 ('VEC_153', 0.7277787327766418),
 ('VEC_2118', 0.7263537049293518),
 ('VEC_4748', 0.7253901362419128),
 ('VEC_3705', 0.7248246669769287),
 ('VEC_1556', 0.7241321206092834)]

In [83]:
# Show the full text of the review at index label 0. The call form of print
# runs on both Python 2 and Python 3.
print(review_pd["Review Text"][0])

Dr. Goldberg has been my doctor for years and I like him.  I've found his office to be fairly efficient.  Today I actually got to see the doctor a few minutes early!  

He seems very engaged with his patients and his demeanor is friendly, yet authoritative.    

I'm glad to have Dr. Goldberg as my doctor.
