# Rating Predictor for Yelp Reviews

In [1]:
# Import libraries
import requests
import json
import time
import pandas as pd
from bs4 import BeautifulSoup

import seaborn as sns
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np

import string

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

import pickle
from sklearn.svm import SVC

## Scraper
Fetch the restaurant list from the Yelp API, then scrape each restaurant's reviews from its Yelp web page.

In [2]:
# Load API key from file 
with open('api_key.txt', 'r') as f:
    # strip() rather than replace('\n', ''): a key file saved with Windows
    # line endings or trailing spaces would otherwise leave '\r' / ' ' in
    # the Authorization header built from this value.
    api_key = f.read().strip()

In [3]:
def extract_info_for_business(row):
    """
    Keep only the fields of a business record that this notebook uses.

    Args:
        row (dict): one business entry; must provide the keys
            'name', 'id', 'review_count' and 'url'

    Returns:
        dict: those four keys, with 'review_count' converted to int
    """
    info = {key: row[key] for key in ('name', 'id', 'review_count', 'url')}
    info['review_count'] = int(info['review_count'])
    return info

def scrape_restaurant_list(location):
    """
    Fetch the restaurants around a location from the Yelp business search API.

    Results are requested 20 at a time, with a 0.3 s pause between requests,
    until either the total reported by the API has been collected or the API
    stops returning businesses. Progress is printed while paging.

    Uses the module-level `api_key` for the Authorization header.

    Args:
        location (string): value sent as the API's `location` parameter
            (the notebook passes a ZIP code or a city name)

    Returns:
        list: one dictionary per restaurant, as built by
            extract_info_for_business

    Raises:
        requests.HTTPError: if the first request is rejected (for example a
            bad API key), instead of failing later with a KeyError

    Note:
        In the runs recorded in this notebook the list stops at 1000 entries
        even though the API reported a larger total, so do not assume the
        returned list has `total` elements.
    """
    payload = {'categories': 'restaurants', 'location': location, 'limit': 20, 'offset': 0}
    basic_url = 'https://api.yelp.com/v3/businesses/search'
    headers = {'Authorization': 'Bearer ' + api_key}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    request_timeout = 30

    response = requests.get(basic_url, params=payload, headers=headers, timeout=request_timeout)
    # Fail loudly here: an error payload has no 'total' / 'businesses' keys.
    response.raise_for_status()
    res_json = response.json()
    total = res_json.get('total', 0)
    print("total: ", total)
    businesses_list = [extract_info_for_business(b) for b in res_json.get('businesses', [])]

    while len(businesses_list) < total:
        print(len(businesses_list))
        time.sleep(0.3)
        payload['offset'] = len(businesses_list)
        response = requests.get(basic_url, params=payload, headers=headers, timeout=request_timeout)
        res_json = response.json()
        page = res_json.get('businesses')
        # A missing or empty page means the API will not serve more results;
        # stop and return what has been collected so far.
        if not page:
            break
        businesses_list.extend(extract_info_for_business(b) for b in page)

    return businesses_list
    

In [30]:
# Collect the restaurant list for the 94043 ZIP code and report how many
# entries came back.
blist = scrape_restaurant_list('94043')
# print(blist)
print(len(blist))

total:  4000
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1000


In [31]:
# Peek at the first ten restaurant records to check the extracted fields.
print(blist[:10])

[{'name': "The Sea by Alexander's Steakhouse", 'id': 'P1eEPolk9EDGqVn1Jyncww', 'review_count': 874, 'url': 'https://www.yelp.com/biz/the-sea-by-alexanders-steakhouse-palo-alto?adjust_creative=6RD6nFOw75PxaCjeWnG24Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=6RD6nFOw75PxaCjeWnG24Q'}, {'name': 'So Gong Dong Tofu House', 'id': 'rTUV3mPTGcALQrKgdokACA', 'review_count': 1326, 'url': 'https://www.yelp.com/biz/so-gong-dong-tofu-house-palo-alto?adjust_creative=6RD6nFOw75PxaCjeWnG24Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=6RD6nFOw75PxaCjeWnG24Q'}, {'name': "Zareen's", 'id': 'ud9ocsQHI7h3zNO7FdOFYQ', 'review_count': 1263, 'url': 'https://www.yelp.com/biz/zareens-mountain-view-3?adjust_creative=6RD6nFOw75PxaCjeWnG24Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=6RD6nFOw75PxaCjeWnG24Q'}, {'name': 'Evvia Estiatorio', 'id': '1vMgajRAI3lYwuCeGX58oQ', 'review_count': 2098, 'url': 'https://www.yelp.com/biz/evvia-estiator

Get reviews

In [4]:
def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.

    A review is kept only when all five fields (review id, user id, rating,
    date, text) could be extracted; review blocks with missing or malformed
    pieces are skipped instead of raising.

    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        tuple(list, string): a tuple of two elements
            first element: list of dictionaries corresponding to the extracted review information
            second element: URL for the next page of reviews (or None if it is the last page)
    """
    review_list = []
    soup = BeautifulSoup(html, 'html.parser')
    for review_block in soup.find_all('div', attrs={'class': 'review review--with-sidebar'}):
        user_id = rating = date = text = None

        review_id = review_block.attrs.get('data-review-id')

        # The user id is carried as "user_id:<id>" in this attribute.
        signup_object = review_block.attrs.get('data-signup-object', '')
        if signup_object.startswith('user_id:'):
            user_id = signup_object[len('user_id:'):]

        # The star widget's title starts with the numeric rating; the widget
        # may be absent, so guard before touching its attributes.
        rating_div = review_block.find('div', attrs={'class': 'i-stars'})
        if rating_div is not None and 'title' in rating_div.attrs:
            try:
                rating = float(rating_div['title'].split()[0])
            except (ValueError, IndexError):
                rating = None  # empty or non-numeric title: treat as missing

        date_span = review_block.find('span', attrs={'class': 'rating-qualifier'})
        if date_span is not None:
            date = date_span.getText().strip()

        review_content = review_block.find('div', attrs={'class': 'review-content'})
        if review_content is not None:
            paragraph = review_content.find('p')
            if paragraph is not None:
                text = paragraph.getText()

        if review_id and user_id and rating and date and text:
            review_list.append({
                'review_id': review_id,
                'user_id': user_id,
                'rating': rating,
                'date': date,
                'text': text
            })

    next_link = None
    next_ele = soup.find('a', attrs={'class': 'u-decoration-none next pagination-links_anchor'})
    if next_ele is not None and 'href' in next_ele.attrs:
        next_link = next_ele['href']
    return review_list, next_link

In [5]:
def scrape_all_reviews(restaurants):
    """
    Scrape every review page of each restaurant and append the reviews to
    'reviews.csv'.

    For each restaurant the function follows the "next page" links returned
    by parse_page until there are none left, prints the restaurant's position
    in `restaurants` and the number of reviews collected, and appends them to
    the CSV file. The column header is written only when the file is new or
    empty, so the file does not accumulate header rows between restaurants.
    Restaurants that yield no reviews add nothing to the file.

    Args:
        restaurants (list): dictionaries with at least a 'url' key, as
            returned by scrape_restaurant_list

    Returns:
        None
    """
    import os  # local import: only needed for the header check below

    output_path = 'reviews.csv'
    for i, restaurant in enumerate(restaurants):
        print("restaurant: ", i)
        reviews = []
        url = restaurant['url']
        while url is not None:
            # A timeout keeps one stalled page from hanging the whole scrape.
            response = requests.get(url, timeout=30)
            reviews_in_page, url = parse_page(response.content)
            reviews.extend(reviews_in_page)
        print(len(reviews))
        # NOTE(review): the recorded runs show long stretches of 0 reviews;
        # that may mean the site stopped serving parseable pages. Confirm
        # before trusting the collected data.
        if not reviews:
            continue
        df = pd.DataFrame(reviews)
        write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
        df.to_csv(output_path, mode='a', header=write_header)

In [56]:
# Scrape reviews for every restaurant from position 42 onward.
# NOTE(review): the first 42 entries are skipped, presumably because an
# earlier run already covered them. Confirm before re-running.
scrape_all_reviews(blist[42:])

restaurant:  0
990
restaurant:  1
618
restaurant:  2
440
restaurant:  3
801
restaurant:  4
552
restaurant:  5
1213
restaurant:  6
661
restaurant:  7
482
restaurant:  8
697
restaurant:  9
1183
restaurant:  10
1382
restaurant:  11
1044
restaurant:  12
1423
restaurant:  13
647
restaurant:  14
671
restaurant:  15
1150
restaurant:  16
574
restaurant:  17
150
restaurant:  18
1227
restaurant:  19
442
restaurant:  20
511
restaurant:  21
629
restaurant:  22
703
restaurant:  23
992
restaurant:  24
424
restaurant:  25
193
restaurant:  26
851
restaurant:  27
349
restaurant:  28
1177
restaurant:  29
449
restaurant:  30
1246
restaurant:  31
863
restaurant:  32
1189
restaurant:  33
420
restaurant:  34
623
restaurant:  35
1490
restaurant:  36
612
restaurant:  37
402
restaurant:  38
479
restaurant:  39
913
restaurant:  40
552
restaurant:  41
576
restaurant:  42
931
restaurant:  43
600
restaurant:  44
342
restaurant:  45
721
restaurant:  46
434
restaurant:  47
911
restaurant:  48
286
restaurant:  49
185

0
restaurant:  415
0
restaurant:  416
0
restaurant:  417
0
restaurant:  418
0
restaurant:  419
0
restaurant:  420
0
restaurant:  421
0
restaurant:  422
0
restaurant:  423
0
restaurant:  424
0
restaurant:  425
0
restaurant:  426
0
restaurant:  427
0
restaurant:  428
0
restaurant:  429
0
restaurant:  430
0
restaurant:  431
0
restaurant:  432
0
restaurant:  433
0
restaurant:  434
0
restaurant:  435
0
restaurant:  436
0
restaurant:  437
0
restaurant:  438
0
restaurant:  439
0
restaurant:  440
0
restaurant:  441
0
restaurant:  442
0
restaurant:  443
0
restaurant:  444
0
restaurant:  445
0
restaurant:  446
0
restaurant:  447
0
restaurant:  448
0
restaurant:  449
0
restaurant:  450
0
restaurant:  451
0
restaurant:  452
0
restaurant:  453
0
restaurant:  454
0
restaurant:  455
0
restaurant:  456
0
restaurant:  457
0
restaurant:  458
0
restaurant:  459
0
restaurant:  460
0
restaurant:  461
0
restaurant:  462
0
restaurant:  463
0
restaurant:  464
0
restaurant:  465
0
restaurant:  466
0
restaurant

0
restaurant:  847
0
restaurant:  848
0
restaurant:  849
0
restaurant:  850
0
restaurant:  851
0
restaurant:  852
0
restaurant:  853
0
restaurant:  854
0
restaurant:  855
0
restaurant:  856
0
restaurant:  857
0
restaurant:  858
0
restaurant:  859
0
restaurant:  860
0
restaurant:  861
0
restaurant:  862
0
restaurant:  863
0
restaurant:  864
0
restaurant:  865
0
restaurant:  866
0
restaurant:  867
0
restaurant:  868
0
restaurant:  869
0
restaurant:  870
0
restaurant:  871
0
restaurant:  872
0
restaurant:  873
0
restaurant:  874
0
restaurant:  875
0
restaurant:  876
0
restaurant:  877
0
restaurant:  878
0
restaurant:  879
0
restaurant:  880
0
restaurant:  881
0
restaurant:  882
0
restaurant:  883
0
restaurant:  884
0
restaurant:  885
0
restaurant:  886
0
restaurant:  887
0
restaurant:  888
0
restaurant:  889
0
restaurant:  890
0
restaurant:  891
0
restaurant:  892
0
restaurant:  893
0
restaurant:  894
0
restaurant:  895
0
restaurant:  896
0
restaurant:  897
0
restaurant:  898
0
restaurant

3


## Get Pitt Data

In [6]:
# Rebuild the restaurant list for Pittsburgh; this rebinds `blist`, replacing
# the list collected for the previous location.
blist = scrape_restaurant_list('Pittsburgh')
print(len(blist))

total:  1800
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1000


In [7]:
# Scrape reviews for the Pittsburgh restaurants. The recorded run below was
# stopped by hand (KeyboardInterrupt) around restaurant 440.
scrape_all_reviews(blist)

restaurant:  0
1432
restaurant:  1
1594
restaurant:  2
928
restaurant:  3
936
restaurant:  4
375
restaurant:  5
738
restaurant:  6
603
restaurant:  7
436
restaurant:  8
406
restaurant:  9
527
restaurant:  10
510
restaurant:  11
477
restaurant:  12
678
restaurant:  13
542
restaurant:  14
572
restaurant:  15
400
restaurant:  16
534
restaurant:  17
677
restaurant:  18
533
restaurant:  19
350
restaurant:  20
1135
restaurant:  21
333
restaurant:  22
367
restaurant:  23
252
restaurant:  24
951
restaurant:  25
599
restaurant:  26
189
restaurant:  27
273
restaurant:  28
423
restaurant:  29
324
restaurant:  30
360
restaurant:  31
321
restaurant:  32
375
restaurant:  33
471
restaurant:  34
205
restaurant:  35
372
restaurant:  36
334
restaurant:  37
316
restaurant:  38
263
restaurant:  39
187
restaurant:  40
195
restaurant:  41
417
restaurant:  42
132
restaurant:  43
359
restaurant:  44
161
restaurant:  45
434
restaurant:  46
209
restaurant:  47
359
restaurant:  48
301
restaurant:  49
208
restaur

50
restaurant:  401
94
restaurant:  402
26
restaurant:  403
81
restaurant:  404
90
restaurant:  405
41
restaurant:  406
133
restaurant:  407
100
restaurant:  408
291
restaurant:  409
35
restaurant:  410
25
restaurant:  411
70
restaurant:  412
71
restaurant:  413
73
restaurant:  414
153
restaurant:  415
53
restaurant:  416
80
restaurant:  417
128
restaurant:  418
115
restaurant:  419
72
restaurant:  420
17
restaurant:  421
56
restaurant:  422
68
restaurant:  423
128
restaurant:  424
54
restaurant:  425
74
restaurant:  426
99
restaurant:  427
124
restaurant:  428
52
restaurant:  429
118
restaurant:  430
47
restaurant:  431
35
restaurant:  432
154
restaurant:  433
124
restaurant:  434
27
restaurant:  435
127
restaurant:  436
53
restaurant:  437
47
restaurant:  438
46
restaurant:  439
26
restaurant:  440


KeyboardInterrupt: 

## Feature Extraction

### Pre-process

In [None]:
def pre_process(df):
    """
    Reduce the raw review table to the columns used for modelling.

    Args:
        df (DataFrame): raw reviews; must contain 'text' and 'rating' columns

    Returns:
        DataFrame: a new frame (the input is not modified) with columns
            text: the review text, unchanged
            rating: the rating as a float, 0 where it is missing or unparsable
            length: number of whitespace-separated words in the review
    """
    # Extract reviews and ratings
    new_df = df[['text', 'rating']]

    # Drop repeated CSV header rows, i.e. rows whose rating cell contains the
    # literal column name "rating".
    is_header_row = new_df['rating'].astype(str).str.contains("rating").astype(bool)
    # Explicit copy so the column assignments below never write into a view
    # of the caller's frame.
    new_df = new_df[~is_header_row].copy()

    # Convert ratings to float. Missing or unparsable values become 0, which
    # callers can filter out with `rating >= 1.0`.
    new_df['rating'] = pd.to_numeric(new_df['rating'], errors='coerce').fillna(0)

    # Word count of each review (split on whitespace).
    new_df['length'] = new_df['text'].astype(str).str.split().str.len()

    # Sanity print of the first review's length; skipped for an empty frame.
    if not new_df.empty:
        print(new_df.iloc[0].length)

    return new_df
    
def word_count(str):
    """
    Count the whitespace-separated words in a string.

    Args:
        str (string): the text to measure

    Returns:
        int: number of tokens produced by splitting on whitespace
    """
    tokens = str.split()
    return len(tokens)

# Load the scraped reviews. 'reviews.csv' is the file scrape_all_reviews
# appends to; without this load `df` is undefined on a fresh kernel.
# NOTE(review): confirm this is the dataset intended for training.
raw_df = pd.read_csv('reviews.csv')

df = pre_process(raw_df)
print(len(df))

# Keep only rows with a real star rating (pre_process maps missing ones to 0).
df = df[df.rating >= 1.0]
print(len(df))


### Dataset Visualization
#### Review Length for Each Rating

In [None]:
# One histogram panel per rating value, showing the distribution of review
# length (word count) in 5 bins over the 0-500 range.
g = sns.FacetGrid(data=df, col='rating')
g.map(plt.hist, 'length', bins=5, range = (0,500))
plt.show()

#### Rating = 1.0 - 5.0, Separate Graphs

In [None]:
# Split the reviews by star rating once, then draw one length distribution
# per rating. The rating_1 .. rating_5 frames are kept as notebook variables
# because the "Average Length of Each Rating" cell reads them.
rating_subsets = {rating: df[df.rating == float(rating)] for rating in (1, 2, 3, 4, 5)}
rating_1 = rating_subsets[1]
rating_2 = rating_subsets[2]
rating_3 = rating_subsets[3]
rating_4 = rating_subsets[4]
rating_5 = rating_subsets[5]

for rating, subset in rating_subsets.items():
    # A fresh figure per rating so the distributions are not drawn on top of
    # each other.
    fig, ax = plt.subplots()
    sns.distplot(subset['length'], label='rating=%.1f' % rating,
                 axlabel='Length of the Reviews', color="g", ax=ax)
    ax.set_title('Rating=%.1f' % rating)
    fig.savefig("rating_%d.png" % rating)
    print ('mean =', np.mean(subset['length']))
    print ('std =', np.std(subset['length']))
    plt.show()

### Average Length of Each Rating

In [None]:
# Mean review length (in words) for each star rating, computed directly from
# `df` so the cell does not depend on per-rating frames built elsewhere.
rating_classes = [1,2,3,4,5]
mean_length = [df.loc[df.rating == float(rating), 'length'].mean() for rating in rating_classes]

fig, ax = plt.subplots()

# Horizontal bars: one bar per rating, bar length = average review length.
ax.barh(rating_classes, mean_length)
ax.set_xlabel('Average Length of Reviews')
ax.set_ylabel('Rating')

fig.savefig("average_length.png")

plt.show()

### Number of Reviews for Each Rating

In [None]:
# Bar chart of how many reviews fall in each star rating, with the count
# written inside each bar.
rating_classes = [1, 2, 3, 4, 5]
rating_size = [len(df[df.rating == float(rating)]) for rating in rating_classes]

fig, ax = plt.subplots()

ax.bar(rating_classes, rating_size, color='c')
ax.set_xlabel('Rating')
ax.set_title('Number of Reviews')

# Count labels: placed 5000 below the top of each bar and slightly left of
# the bar centre.
for i, size_a in enumerate(rating_size):
    ax.text(i + 0.65, size_a - 5000, str(size_a), color="w",
            verticalalignment="center", horizontalalignment="left", size=14)

# Hide the frame and the y ticks; the in-bar labels carry the values.
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_ticks([])

fig.savefig("reviews_size.png")
plt.show()


## Pre-process (Cont.)

In [None]:
# Cache for the English stopword set, filled on first use so the corpus is
# read once instead of once per word.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset (loaded once)."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def text_process(text):
    """
    Tokenize one review for the bag-of-words model.

    Removes every character listed in string.punctuation, splits the result
    on whitespace and drops English stopwords. Stopwords are matched
    case-insensitively, but the returned tokens keep their original casing.

    Args:
        text (string): the raw review text

    Returns:
        list: the remaining tokens, in their original order
    """
    nopunc = text.translate(str.maketrans('', '', string.punctuation))
    stop_words = _english_stopwords()
    return [word for word in nopunc.split() if word.lower() not in stop_words]

### 5-Class Training

In [None]:
# 5-class setup: every review with a rating of at least 1.0 is used; the
# review text is the input and the star rating is the label.
# Note: the "Binary Training" cell rebinds X and y, so run only the variant
# you want to train on before fitting the vectorizer.
five = df[(df.rating >= 1.0)]
X = five.text
y = five.rating


### Binary Training

In [None]:
# Binary setup: keep only the 1-star and 5-star reviews. This rebinds X and
# y, replacing whatever the 5-class cell assigned.
binary = df[(df.rating == 1.0) | (df.rating == 5.0)]
# print(binary)
X = binary.text
y = binary.rating

In [None]:
## Takes Long Time to Execute
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the selected reviews, tokenizing each one with
# text_process.
bow_transformer = CountVectorizer(analyzer=text_process).fit(X)

# Persist the fitted vectorizer. The with-block closes the file even if
# pickling fails part-way.
with open('bow_transformer', 'wb') as filehandler:
    pickle.dump(bow_transformer, filehandler)

#### Show the effect of the transformer

In [None]:
# Take the 25th selected review (position 24), print its raw text, then print
# its bag-of-words representation produced by the fitted vectorizer.
review_25 = X.iloc[24]
print(review_25)
bow_25 = bow_transformer.transform([review_25])
print(bow_25)

#### Transform the whole training set

In [None]:
# Vectorize every selected review with the fitted bag-of-words transformer.
X_ = bow_transformer.transform(X)

# Persist the feature matrix. The with-block closes the file even if pickling
# fails part-way.
with open('X_', 'wb') as filehandler:
    pickle.dump(X_, filehandler)

## Train the Model using Naive Bayes

### Split the dataset

In [None]:
# Hold out 30% of the vectorized reviews for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.3, random_state=101)

### Train by Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training
# split.
nb = MultinomialNB()
nb.fit(X_train, y_train)

### Predict

In [None]:
# Predict a rating for every held-out review and report how many predictions
# were produced.
preds = nb.predict(X_test)
print(len(preds))

### Evaluate

In [None]:
# Confusion matrix (true ratings vs. predictions) followed by the per-class
# classification report.
print(confusion_matrix(y_test, preds))
print('\n')
print(classification_report(y_test, preds)) 

### Average Error

In [None]:
# Mean absolute difference between predicted and true ratings. The true
# ratings are converted to a plain array so the subtraction is positional
# and does not involve the Series index.
y_test_ = np.array(y_test.tolist())

print(np.average(np.absolute(preds - y_test_)))

## Result Visualization

### Get average

In [None]:
# For each true rating 1-5: print how many test reviews carry that rating and
# record the mean rating the model predicted for them. A rating with no test
# reviews (e.g. 2-4 in the binary setup) yields nan.
y_true = np.asarray(y_test)

pred_avg = []
for rating in (1.0, 2.0, 3.0, 4.0, 5.0):
    preds_for_rating = preds[y_true == rating]
    print(len(preds_for_rating))
    pred_avg.append(np.mean(preds_for_rating))

print(pred_avg)


### Visualization

In [None]:
# Grouped bar chart: for each true rating, the mean predicted rating next to
# the true rating itself.
fig, ax = plt.subplots()

rating_labels = ('1', '2', '3', '4', '5')
predicted_means = pred_avg
true_ratings = [1, 2, 3, 4, 5]

index = np.arange(len(rating_labels))
bar_width = 0.35

ax.bar(index, predicted_means, bar_width, color='g', label='Prediction')
# Shift the second series by one bar width so the two bars sit side by side.
ax.bar(index + bar_width, true_ratings, bar_width, color='c', label='Ground Truth')

ax.set_xlabel('Groups')
ax.set_ylabel('Ratings')
ax.set_title('Rating comparison between prediction and groundtruth')
# Centre each tick between the two bars of its group.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(rating_labels)
ax.legend()

fig.tight_layout()
fig.savefig("prediction.png")
plt.show()

## SVM (optional)

In [None]:
# Fit a support vector classifier with default settings on the same training
# split used for Naive Bayes.
clf = SVC()
clf.fit(X_train, y_train)

In [None]:
 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

print(clf.predict(X_test))