# Basic Yelp Restaurant Data Fetch
Yelp API Guide: https://docs.developer.yelp.com/docs/fusion-intro. Uses the v3/businesses/search endpoint.

In [5]:
import json
import os
import requests
import string
import time
import urllib.request
from urllib.error import HTTPError
import unidecode

In [None]:
# Aggregate Manhattan-specific restaurants by neighborhood for more granular results.
# Each name below is later combined with ", Manhattan, NY" to form the search location.
neighborhoods = [
    "Alphabet City",
    "Battery Park City",
    "Bowery",
    "Bryant Park",
    "Carnegie Hill",
    "Central Park",
    "Chelsea",
    "Chinatown",
    "Civic Center",
    "Clinton",
    "East Harlem",
    "East Village",
    "Financial District",
    "Flatiron",
    "Fort George",
    "Garment District",
    "Gramercy",
    "Greenwich Village",
    "Hamilton Heights",
    "Harlem",
    "Hells Kitchen",
    "Hudson Heights",
    "Hudson Square",
    "Hudson Yards",
    "Inwood",
    "Kips Bay",
    "Lenox Hill",
    "Lincoln Square",
    "Little Italy",
    "Lower East Side",
    "Manhattan Valley",
    "Manhattanville",
    "Meatpacking",
    "Midtown",
    "Midtown East",
    "Midtown South",
    "Midtown West",
    "Morningside Heights",
    "Murray Hill",
    "Noho",
    "Nolita",
    "NoMad",
    "Roosevelt Island",
    "Soho",
    "Stuyvesant Town",
    "Sutton Place",
    "Times Square",
    "Theater District",
    "Tribeca",
    "Tudor City",
    "Turtle Bay",
    "Two Bridges",
    "Union Square",
    "Upper East Side",
    "Upper West Side",
    "Washington Heights",
    "Washington Square Park",
    "West Harlem",
    "West Village",
    "Yorkville",
]

In [None]:
# Yelp allows only 500 API calls per day per key, so we rotate between several
# keys for maximum data collection.
#
# SECURITY: the keys are read from the environment instead of being committed
# with the notebook. Export them before starting Jupyter, comma-separated:
#     export YELP_API_KEYS="key1,key2,key3"
# Any key that was previously hardcoded in this cell should be treated as
# leaked and revoked in the Yelp developer console.
def load_yelp_keys(env=None):
    """Return the list of Yelp API keys found in ``YELP_API_KEYS``.

    Args:
        env: Mapping to read from; defaults to ``os.environ``. Accepting a
            plain dict keeps the parsing testable without touching the
            real environment.

    Returns:
        A list of non-empty, whitespace-stripped keys in the order given.
        The list is empty when the variable is unset or blank.
    """
    source = os.environ if env is None else env
    raw_value = source.get("YELP_API_KEYS", "")
    return [key.strip() for key in raw_value.split(",") if key.strip()]


# Index of the key currently in use; advanced when a key hits its quota.
kKeyIndex = 0
keys = load_yelp_keys()
if not keys:
    print("WARNING: YELP_API_KEYS is not set; Yelp API requests cannot be authenticated.")

In [None]:
# Endpoint and request headers shared by every business-search call.
# The bearer token is whichever key `kKeyIndex` currently points at.
url = 'https://api.yelp.com/v3/businesses/search'
headers = dict(accept="application/json")
headers["Authorization"] = "Bearer {}".format(keys[kKeyIndex])

In [None]:
# Accumulator for fetched businesses, keyed by business alias so duplicates collapse.
alias_to_content = dict()

In [None]:
# Fetch restaurants neighborhood by neighborhood and store them in
# `alias_to_content`, keyed by business alias (so a restaurant returned for
# two neighborhoods is stored once).

# Maximum results per API request.
limit = 50
# Becomes True once every key in `keys` has been rate limited, so we stop
# issuing requests that can only fail.
keys_exhausted = False

for neighborhood in neighborhoods:
    if keys_exhausted:
        break
    print("Fetching data for " + neighborhood)

    # Pass the plain string: requests URL-encodes query parameters itself.
    # Replacing spaces with "+" by hand made the "+" get encoded as "%2B",
    # i.e. the API received a literal "Alphabet+City,+Manhattan,+NY".
    location = neighborhood + ", Manhattan, NY"

    # Get up to 1000 restaurants per neighborhood.
    for i in range(0, 1000, limit):
        url_params = {
            "location": location,
            "term": "Restaurants",
            "limit": limit,
            "offset": i,
            # NOTE(review): this looks like the display form from Yelp's
            # category list rather than a category alias such as
            # "restaurants" -- confirm the API honours it.
            "categories": "(restaurants, All)",
            "sort_by": "distance",
        }

        response = requests.get(url, headers=headers, params=url_params)

        # Max API calls gets a return status == 429! Move to the next key
        # (bounds-checked) until one is accepted or none are left.
        while response.status_code == 429 and kKeyIndex + 1 < len(keys):
            print("Rotating key")
            kKeyIndex += 1
            headers["Authorization"] = "Bearer " + keys[kKeyIndex]
            response = requests.get(url, headers=headers, params=url_params)

        if response.status_code == 429:
            print("All API keys are rate limited; stopping the fetch.")
            keys_exhausted = True
            break

        if response.status_code != 200:
            print(response.status_code)
            continue

        # An empty page means we already got all the businesses in this
        # neighborhood. `.get` guards against a body without "businesses".
        businesses = response.json().get("businesses", [])
        if not businesses:
            break

        for business in businesses:
            alias_to_content[business["alias"]] = business

In [None]:
# Persist everything collected in `alias_to_content` as JSON in the current
# working directory.
# NOTE(review): the later "Read file" cells open "restaurants.json", not this
# filename -- confirm which name is intended.
file_path = os.getcwd() + "/restaurants_michelin_stars.json"
with open(file_path, "w") as out_file:
    json.dump(alias_to_content, out_file)

## Restaurant Reviews Fetch
Make one file per letter so the team can scrape in parallel. Also one file would be too large. 

## Alphabetical Folder Creation

In [None]:
# Load restaurants.json from the current working directory into `alias_to_content`.
path = os.getcwd() + "/restaurants.json"
with open(path, "r") as json_file:
    alias_to_content = json.load(json_file)

In [None]:
# Make one file per letter so we can work in parallel, plus a "0" file for
# aliases starting with a digit or any other character. Every alias maps to an
# empty list that the scraping step fills in later.
#
# WARNING: files are opened with mode 'w', so re-running this cell resets any
# reviews already scraped into reviews/<letter>.json.
reviews_dir = "{}/reviews".format(os.getcwd())
# Create the folder if needed; opening a file in a missing directory fails.
os.makedirs(reviews_dir, exist_ok=True)

# Bucket all aliases in a single pass instead of rescanning them per letter.
# Every bucket is pre-created so that a file is written even when it is empty
# (the merge step reads all 27 files).
buckets = {bucket: {} for bucket in string.ascii_lowercase + "0"}
for alias in alias_to_content.keys():
    first_char = alias[:1]  # slicing tolerates an empty alias
    is_letter = first_char != "" and first_char in string.ascii_lowercase
    buckets[first_char if is_letter else "0"][alias] = []

for bucket, aliases in buckets.items():
    path = "{}/{}.json".format(reviews_dir, bucket)
    with open(path, 'w') as fp:
        json.dump(aliases, fp)

## Scraping Review Data...
There is no API for this, so we call an unofficial (sketchy) endpoint instead.

In [None]:
# Headers for the review-scraping requests: present ourselves as a desktop
# Chrome browser.
browser_user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/119.0.0.0 Safari/537.36"
)
headers = {}
headers["Content-Type"] = "text/html; charset=UTF-8"
headers["User-Agent"] = browser_user_agent

In [None]:
# Which reviews/<LETTER>.json file this run works on; each teammate picks a
# different letter.
LETTER = 'z'

# Read file.
path = "{}/reviews/{}.json".format(os.getcwd(), LETTER)
with open(path, "r") as json_file:
    alias_to_reviews = json.load(json_file)

print("Reviews left to fetch: {} / {}".format(
    sum(1 for reviews in alias_to_reviews.values() if len(reviews) == 0),
    len(alias_to_reviews)
))

try:
    # sorted() materialises a list, so deleting entries from the dict inside
    # the loop is safe.
    for alias, reviews in sorted(alias_to_reviews.items()):
        # If we already scraped reviews, do not do it again.
        if len(reviews) > 0:
            continue

        # Gather up to 5 pages of 10 reviews per restaurant.
        try:
            for i in range(0, 50, 10):
                url = "https://www.yelp.com/biz/{}/props?start={}".format(unidecode.unidecode(alias), i)

                request = urllib.request.Request(url, headers=headers)
                # Close the connection promptly and never hang forever on a
                # stalled request.
                with urllib.request.urlopen(request, timeout=30) as response:
                    data = json.loads(response.read())

                page = data["bizDetailsPageProps"]["reviewFeedQueryProps"]["reviews"]
                if not page:
                    # An empty page means later pages are empty too.
                    break

                for r in page:
                    reviews.append({
                        "photoCount": r["user"]["photoCount"],
                        "reviewCount": r["user"]["reviewCount"],
                        "eliteYear": r["user"]["eliteYear"],
                        "localizedDate": r["localizedDate"],
                        "comment": r["comment"],
                        "rating": r["rating"],
                    })

            print("{}: {}".format(alias, len(reviews)))
            # Restaurants without any reviews are dropped from the file.
            if len(reviews) == 0:
                del alias_to_reviews[alias]

        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the run, and report what actually went wrong.
        except Exception as err:
            print("{}: ERROR ({}: {})".format(alias, type(err).__name__, err))
finally:
    # Write file. Done in `finally` so progress survives an interrupted run.
    with open(path, 'w') as fp:
        json.dump(alias_to_reviews, fp)

## Add Reviews to Main File
So it is easier to use for training data.

In [None]:
# Reload restaurants.json from the current working directory so the reviews
# can be attached to it.
path = os.getcwd() + "/restaurants.json"
with open(path, "r") as json_file:
    alias_to_content = json.load(json_file)

In [None]:
# Attach the scraped reviews to each restaurant record, one reviews file at a
# time (a-z plus the "0" file).
for letter in list(string.ascii_lowercase) + ['0']:
    path = os.getcwd() + "/reviews/" + letter + ".json"
    with open(path, "r") as json_file:
        alias_to_reviews = json.load(json_file)
    for alias in alias_to_reviews:
        alias_to_content[alias]['reviews'] = alias_to_reviews[alias]

# Subsetting Michelin vs Non-Michelin

Great! We now have all our basic restaurant details + review data. Since we are interested in subsetting our data to compare Michelin vs non-Michelin properties, let’s identify Yelp’s aliases/ids for 2023 Michelins in Manhattan proper. 

In [6]:
# 2023 Michelin-starred restaurants >> https://www.bonappetit.com/story/nyc-michelin-guide-restaurants-list
# Each name is used verbatim as the Yelp search term, so spelling matters.
# Fixed two misspellings: "Family Meal at Blue Hill" (was "Family Hill at
# Blue Hill") and "The Four Horsemen" (was "The Four Horseman").

michelins = [
    "Eleven Madison Park",
    "Le Bernardin",
    "Masa",
    "Per Se",
    "Al Coro",
    "Aquavit",
    "Aska",
    "Atera",
    "Atomix",
    "Blue Hill at Stone Barns",
    "Daniel",
    "Gabriel Kreuther",
    "Jean-Georges",
    "Jungsik",
    "The Modern",
    "Odo",
    "Saga",
    "Sushi Noz",
    "Sixty Three Clinton",
    "Bōm",
    "Casa Mono",
    "Caviar Russe",
    "Clover Hill",
    "Cote",
    "Crown Shy",
    "Dirt Candy",
    "Essential by Christophe",
    "Estela",
    "Family Meal at Blue Hill",
    "The Four Horsemen",
    "Francie",
    "Frevo",
    "Gramercy Tavern",
    "Hirohisa",
    "Icca",
    "Jeju Noodle Bar",
    "Jōji",
    "Joomak Banjum",
    "Jua",
    "Kochi",
    "Kosaka",
    "L'Abeille",
    "Le Coucou",
    "Le Jardinier",
    "Le Pavillon",
    "Mari",
    "Meju",
    "The Musket Room",
    "Noda",
    "Noz 17",
    "Oiji Mi",
    "One White Street",
    "Oxalis",
    "Oxomoco",
    "Red Paper Clip",
    "Restaurant Yuu",
    "Rezdôra",
    "Semma",
    "Shion 69 Leonard Street",
    "Shmoné",
    "Sushi Amane",
    "Sushi Ichimura",
    "Sushi Nakazawa",
    "Sushi Yasuda",
    "Tempura Matsui",
    "Torien",
    "Torrisi",
    "Tsukimi",
    "Tuome",
    "Vestry",
    "Yoshino"
]

In [7]:
# Resolve each Michelin restaurant name to a Yelp alias by taking the top
# "best_match" hit of a business search in New York City.
michelin_to_alias = {}

# SECURITY: read the API key from the environment instead of hardcoding it.
# YELP_API_KEYS holds one or more comma-separated keys; the first is used here.
# Any key that was previously committed in this cell should be revoked.
yelp_api_keys = [
    key.strip()
    for key in os.environ.get("YELP_API_KEYS", "").split(",")
    if key.strip()
]
if not yelp_api_keys:
    raise RuntimeError("Set the YELP_API_KEYS environment variable before running this cell.")

headers = {
    "accept": "application/json",
    "Authorization": "Bearer " + yelp_api_keys[0],
}
url = "https://api.yelp.com/v3/businesses/search"

for michelin in michelins:
    # Send the query through `params` so requests URL-encodes names that
    # contain spaces, apostrophes or accented characters.
    search_params = {
        "location": "new york city",
        "term": michelin,
        "sort_by": "best_match",
        "limit": 20,
    }
    response = requests.get(url, headers=headers, params=search_params)

    # Report failures (rate limit, bad request, ...) and move on instead of
    # crashing on a body that has no "businesses" key.
    if response.status_code != 200:
        print("{}: HTTP {}".format(michelin, response.status_code))
        continue

    businesses = response.json().get("businesses", [])
    if not businesses:
        print("{}: no Yelp match found".format(michelin))
        continue

    # NOTE(review): the top hit is not guaranteed to be the intended
    # restaurant -- the recorded output maps 'Blue Hill at Stone Barns' to
    # 'family-meal-at-blue-hill-new-york'. Verify the mapping by hand.
    michelin_to_alias[michelin] = businesses[0]["alias"]

In [8]:
# Display the restaurant name -> Yelp alias mapping for a manual sanity check.
michelin_to_alias

{'Eleven Madison Park': 'eleven-madison-park-new-york',
 'Le Bernardin': 'le-bernardin-new-york',
 'Masa': 'masa-new-york',
 'Per Se': 'per-se-new-york',
 'Al Coro': 'al-coro-new-york',
 'Aquavit': 'aquavit-new-york',
 'Aska': 'aska-brooklyn-3',
 'Atera': 'atera-new-york',
 'Atomix': 'atomix-new-york',
 'Blue Hill at Stone Barns': 'family-meal-at-blue-hill-new-york',
 'Daniel': 'daniel-new-york-4',
 'Gabriel Kreuther': 'gabriel-kreuther-new-york',
 'Jean-Georges': 'jean-georges-new-york-2',
 'Jungsik': 'jungsik-new-york',
 'The Modern': 'the-modern-new-york-3',
 'Odo': 'odo-new-york',
 'Saga': 'saga-new-york-6',
 'Sushi Noz': 'sushi-noz-new-york',
 'Sixty Three Clinton': 'sixty-three-clinton-new-york',
 'Bōm': 'bōm-new-york-3',
 'Casa Mono': 'casa-mono-new-york',
 'Caviar Russe': 'caviar-russe-new-york',
 'Clover Hill': 'clover-hill-brooklyn',
 'Cote': 'cote-korean-steakhouse-new-york',
 'Crown Shy': 'crown-shy-new-york-2',
 'Dirt Candy': 'dirt-candy-new-york-2',
 'Essential by Christo

In [9]:
# NOTE(review): this cell appears to be disabled -- its whole body is wrapped
# in a string literal, so running it writes nothing to disk. The quoted code
# would dump `michelin_to_alias` to michelin_alias_michelin_stars.json in the
# current working directory. Either re-enable it or delete the cell.
"""current_directory = os.getcwd()

# Define the filename for the JSON file
json_filename = 'michelin_alias_michelin_stars.json'
json_filepath = os.path.join(current_directory, json_filename)

with open(json_filepath, 'w') as json_file:
    json.dump(michelin_to_alias, json_file)"""