In [1]:
import json
import math

import numpy as np
import pandas as pd
import scipy
from sklearn import preprocessing
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

In [2]:
# Helper function to clean text data
def clean_text(s):
    html_esc = ['&lt;', '&gt;', '&amp;']
    sp_symbols = ['\t', '\n', '\r']
    for sym in html_esc + sp_symbols:
        s = s.replace(sym, ' ')
    symbols = '1234567890!"#$%&\'()*+,-./:;?@[]^_`{|}~'
    for sym in symbols:
        s = s.replace(sym, ' ')
    return s

In [3]:
# List of cities to consider
#cities = ['Pittsburgh', 'Charlotte', 'Urbana', 'Phoenix', 'Las Vegas', 'Madison', 'Cleveland']
cities = ['Phoenix']

# Business dataset: the file is read line by line and each line is parsed as
# its own JSON object. A business is kept when it is located in one of the
# selected cities and its categories mention 'Restaurants'.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not UTF-8 everywhere.
ids = []
business_records = []
with open('yelp_academic_dataset_business.json', encoding='utf-8') as business_json:
    for line in business_json:
        business = json.loads(line)
        categories = business['categories']
        if (business['city'] in cities
                and categories is not None
                and 'Restaurants' in categories):
            ids.append(business['business_id'])
            business_records.append({'business_id': business['business_id'],
                                     'rating': business['stars']})

# The review loop below tests membership once per review line; a set makes
# each test O(1) instead of a linear scan through the `ids` list.
restaurant_ids = set(ids)

# Review dataset: keep only reviews that belong to a selected restaurant and
# store the cleaned review text together with the review's star rating.
review_records = []
with open('yelp_academic_dataset_review.json', encoding='utf-8') as review_json:
    for line in review_json:
        review = json.loads(line)
        if review['business_id'] in restaurant_ids:
            review_records.append({'business_id': review['business_id'],
                                   'review': clean_text(review['text']),
                                   'rating': review['stars']})

# Create dataframes (the raw record lists keep their own names so that
# re-running later cells never sees a list where a DataFrame is expected)
business_info = pd.DataFrame(business_records)
review_info = pd.DataFrame(review_records)

In [4]:
# Preview the first rows of the restaurant table (business_id and star rating).
business_info.head()

Unnamed: 0,business_id,rating
0,YPavuOh2XsnRbLfl0DH2lQ,4.0
1,F53MSa5SYzO9BG8c_JhskQ,2.5
2,hEcn9k6ONd5n2mq0lB2aew,3.5
3,Ld2hhA3q3cdkptwS1fsYEg,4.0
4,a48i_DFln5e1oaJKNM6BUg,2.5


In [5]:
# Preview the first rows of the review table (business_id, cleaned review
# text and the review's star rating).
review_info.head()

Unnamed: 0,business_id,rating,review
0,9JTkh1D7u9fRBohglQRL0w,1,I know that this airport strives to be the fri...
1,9JTkh1D7u9fRBohglQRL0w,1,We waited for min at a table while all the ...
2,9JTkh1D7u9fRBohglQRL0w,4,The other review apparently was expecting a fi...
3,9JTkh1D7u9fRBohglQRL0w,2,I guess the nice thing to say would be that th...
4,9JTkh1D7u9fRBohglQRL0w,4,Decent food decent beer fast service what m...


In [None]:
# Save the cleaned review table to 'reviews.csv' (path is relative to the
# current working directory).
# NOTE(review): to_csv also writes the row index as an extra first column
# unless index=False is passed — confirm that is intended.
review_info.to_csv('reviews.csv')

In [6]:
# Report how many rows each table holds.
for label, frame in (('Number of restaurants:', business_info),
                     ('Number of reviews:', review_info)):
    print(label, frame.shape[0])

Number of restaurants: 3353
Number of reviews: 266766


In [7]:
# Convert reviews to a matrix of TF-IDF features.
# English stop words are dropped, unigrams and bigrams are considered, terms
# must occur in at least 10% and at most 80% of the reviews, and the
# vocabulary is capped at 30 features.
tfidf_settings = {
    'stop_words': 'english',
    'min_df': 0.1,
    'max_df': 0.8,
    'max_features': 30,
    'ngram_range': (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# fit_transform yields a sparse matrix; toarray() turns it into a dense array
# with one row per review.
review_texts = review_info.review.values
dtm = vectorizer.fit_transform(review_texts).toarray()

In [9]:
# Use ~80% of data for training and ~20% for testing.
# NOTE(review): the split is positional — the first rows train, the remaining
# rows test — with no shuffling. The review preview shows consecutive rows
# sharing a business_id, so confirm that this ordering does not bias the
# evaluation.
X = dtm
y = review_info.rating.values

# Number of training rows, rounded up so the training part is never the
# smaller share of a fractional row.
train_set_size = math.ceil(X.shape[0] * 0.8)
X_train, X_test = X[:train_set_size], X[train_set_size:]
y_train, y_test = y[:train_set_size], y[train_set_size:]

# Invariants: features and labels stay aligned and no row is lost or duplicated.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)
assert len(X_train) + len(X_test) == len(X)

In [None]:
# Fit a support vector classifier with a degree-6 polynomial kernel on the
# training rows, then predict the ratings of the held-out rows.
svc_poly = svm.SVC(kernel='poly', degree=6, C=10.0)
svc_poly.fit(X_train, y_train)
y_pred_test = svc_poly.predict(X_test)

# Accuracy is the fraction of test rows whose predicted rating equals the true
# one. It is computed from the predictions made above; calling
# svc_poly.score(X_test, y_test) would run the whole prediction pass a second
# time to arrive at the same number.
print("Accuracy:", np.mean(y_pred_test == y_test))


In [None]:
#review_info = pd.DataFrame(review_info.groupby('business_id')['review'].apply(lambda x: x.sum())).reset_index()

#info = pd.merge(business_info, review_info, how='inner', on='business_id')

#info.head()