# Rating Predictor for Yelp Reviews

In [51]:
# Import libraries
import requests
import json
import time
import pandas as pd
from bs4 import BeautifulSoup

## Scraper
Fetch the restaurant list from the Yelp API, then scrape each restaurant's reviews from its Yelp page.

In [4]:
# Load the API key from file.
# All whitespace is dropped, not just '\n', so a trailing newline, a Windows
# "\r\n" line ending, or stray spaces never end up in the Authorization header.
with open('api_key.txt', 'r') as f:
    api_key = ''.join(f.read().split())

In [29]:
def extract_info_for_business(row):
    """
    Reduce one business record to the fields used by the scraper.

    Args:
        row (dict): Business entry providing 'name', 'id', 'review_count' and 'url'

    Returns:
        dict: the same four fields, with 'review_count' converted to int
    """
    name = row['name']
    business_id = row['id']
    review_count = int(row['review_count'])
    url = row['url']
    return dict(name=name, id=business_id, review_count=review_count, url=url)

def scrape_restaurant_list(location, limit=20, pause=0.3, timeout=30):
    """
    Collect restaurants for a location from the Yelp business search endpoint.

    Pages through the results `limit` entries at a time, using the number of
    businesses collected so far as the next offset, until the reported total
    is reached or a response contains no businesses.

    Args:
        location (string): Value sent as the `location` search parameter
        limit (int): Page size sent as the `limit` search parameter
        pause (float): Seconds to sleep before each follow-up request
        timeout (float): Seconds to wait for each HTTP response

    Returns:
        list: dictionaries produced by extract_info_for_business, one per business

    Raises:
        requests.HTTPError: if the first search request returns an error status
    """
    payload = {'categories': 'restaurants', 'location': location, 'limit': limit, 'offset': 0}
    basic_url = 'https://api.yelp.com/v3/businesses/search'
    headers = {'Authorization': 'Bearer ' + api_key}

    response = requests.get(basic_url, params=payload, headers=headers, timeout=timeout)
    # Surface a rejected request (bad key, bad location, ...) with its HTTP
    # status instead of an opaque KeyError on 'total' below.
    response.raise_for_status()
    res_json = response.json()
    total = res_json['total']
    print("total: ", total)
    businesses_list = [extract_info_for_business(row) for row in res_json['businesses']]

    while len(businesses_list) < total:
        print(len(businesses_list))
        time.sleep(pause)
        payload['offset'] = len(businesses_list)
        response = requests.get(basic_url, params=payload, headers=headers, timeout=timeout)
        res_json = response.json()
        # The reported total can be larger than what the API is willing to page
        # through, so a response without businesses ends the crawl (best effort).
        if not res_json.get('businesses'):
            break
        businesses_list.extend(extract_info_for_business(row) for row in res_json['businesses'])

    return businesses_list
    

In [30]:
# Fetch the restaurant list for location '94043' and report how many
# businesses were collected.
blist = scrape_restaurant_list('94043')
# print(blist)
print(len(blist))

total:  4000
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1000


In [31]:
# Show the first ten collected businesses to check the extracted fields.
print(blist[:10])

[{'name': "The Sea by Alexander's Steakhouse", 'id': 'P1eEPolk9EDGqVn1Jyncww', 'review_count': 874, 'url': 'https://www.yelp.com/biz/the-sea-by-alexanders-steakhouse-palo-alto?adjust_creative=6RD6nFOw75PxaCjeWnG24Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=6RD6nFOw75PxaCjeWnG24Q'}, {'name': 'So Gong Dong Tofu House', 'id': 'rTUV3mPTGcALQrKgdokACA', 'review_count': 1326, 'url': 'https://www.yelp.com/biz/so-gong-dong-tofu-house-palo-alto?adjust_creative=6RD6nFOw75PxaCjeWnG24Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=6RD6nFOw75PxaCjeWnG24Q'}, {'name': "Zareen's", 'id': 'ud9ocsQHI7h3zNO7FdOFYQ', 'review_count': 1263, 'url': 'https://www.yelp.com/biz/zareens-mountain-view-3?adjust_creative=6RD6nFOw75PxaCjeWnG24Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=6RD6nFOw75PxaCjeWnG24Q'}, {'name': 'Evvia Estiatorio', 'id': '1vMgajRAI3lYwuCeGX58oQ', 'review_count': 2098, 'url': 'https://www.yelp.com/biz/evvia-estiator

Get reviews: parse each restaurant's Yelp page, following the pagination links, to collect its reviews.

In [32]:
def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.

    A review is kept only when its id, user id, rating, date and text were all
    found and are non-empty; blocks with missing pieces are skipped rather
    than raising.

    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        tuple(list, string): a tuple of two elements
            first element: list of dictionaries corresponding to the extracted review information
            second element: URL for the next page of reviews (or None if it is the last page)
    """
    review_list = []
    soup = BeautifulSoup(html, 'html.parser')
    for review_block in soup.find_all('div', attrs={'class': 'review review--with-sidebar'}):
        rating = date = text = user_id = None

        review_id = review_block.get('data-review-id')

        # The user id is stored as "user_id:<id>" in the data-signup-object attribute.
        signup_object = review_block.get('data-signup-object', '')
        if signup_object.startswith('user_id:'):
            user_id = signup_object[len('user_id:'):]

        # The rating is the leading number of the stars element's title.
        # The element may be missing or its title malformed; such a review is
        # skipped by the completeness check below instead of crashing the scrape.
        rating_div = review_block.find('div', attrs={'class': 'i-stars'})
        if rating_div is not None and 'title' in rating_div.attrs:
            try:
                rating = float(rating_div['title'].split()[0])
            except (IndexError, ValueError):
                rating = None

        date_span = review_block.find('span', attrs={'class': 'rating-qualifier'})
        if date_span is not None:
            date = date_span.getText().strip()

        # The review text is the first <p> inside the review-content element;
        # either of the two may be absent.
        review_content = review_block.find('div', attrs={'class': 'review-content'})
        paragraph = review_content.find('p') if review_content is not None else None
        if paragraph is not None:
            text = paragraph.getText()

        if review_id and user_id and rating and date and text:
            review_list.append({
                'review_id': review_id,
                'user_id': user_id,
                'rating': rating,
                'date': date,
                'text': text
            })

    next_link = None
    next_ele = soup.find('a', attrs={'class': 'u-decoration-none next pagination-links_anchor'})
    if next_ele is not None and 'href' in next_ele.attrs:
        next_link = next_ele['href']
    return review_list, next_link

In [53]:
def scrape_all_reviews(restaurants, output_path='reviews.csv', timeout=30):
    """
    Scrape every review page of each restaurant and append the rows to a CSV.

    For each restaurant the function follows the "next page" links returned by
    parse_page until there are none left, then appends that restaurant's
    reviews to `output_path`. The column header is written only when the file
    is new or empty, so the CSV holds a single header row no matter how many
    restaurants (or separate runs) append to it.

    Args:
        restaurants (list): Dictionaries with a 'url' entry for the restaurant page
        output_path (string): CSV file the reviews are appended to
        timeout (float): Seconds to wait for each HTTP response

    Returns:
        None
    """
    import os  # local import: only needed for the header check below

    for i, restaurant in enumerate(restaurants):
        print("restaurant: ", i)
        reviews = []
        url = restaurant['url']
        while url is not None:
            response = requests.get(url, timeout=timeout)
            reviews_in_page, url = parse_page(response.content)
            reviews.extend(reviews_in_page)
        print(len(reviews))

        # An empty frame has no columns and would only append a bogus header line.
        if not reviews:
            continue

        write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
        df = pd.DataFrame(reviews)
        df.to_csv(output_path, mode='a', header=write_header)

In [None]:
# Scrape reviews for the restaurants from index 42 onward.
# NOTE(review): presumably resuming after an earlier partial run — confirm.
scrape_all_reviews(blist[42:])

restaurant:  0
990
restaurant:  1
618
restaurant:  2
440
restaurant:  3
801
restaurant:  4
552
restaurant:  5
1213
restaurant:  6
661
restaurant:  7
482
restaurant:  8
697
restaurant:  9
1183
restaurant:  10
1382
restaurant:  11
1044
restaurant:  12
1423
restaurant:  13
647
restaurant:  14
671
restaurant:  15
1150
restaurant:  16
574
restaurant:  17
150
restaurant:  18
1227
restaurant:  19
442
restaurant:  20
511
restaurant:  21
629
restaurant:  22
703
restaurant:  23
992
restaurant:  24
424
restaurant:  25
193
restaurant:  26
851
restaurant:  27
349
restaurant:  28
1177
restaurant:  29
449
restaurant:  30
1246
restaurant:  31
863
restaurant:  32
1189
restaurant:  33
420
restaurant:  34
623
restaurant:  35
1490
restaurant:  36
612
restaurant:  37
402
restaurant:  38
479
restaurant:  39
913
restaurant:  40
552
restaurant:  41
576
restaurant:  42
931
restaurant:  43
600
restaurant:  44
342
restaurant:  45
721
restaurant:  46
434
restaurant:  47
911
restaurant:  48
286
restaurant:  49
185

3
