In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, math, json, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [2]:
# Read the API key from a local JSON file so it is never typed into the notebook
with open('yelp_api.json') as cred_file:
    login = json.load(cred_file)

# Build the client used for every request below (timeout_s is set to 5.0)
yelp_api = YelpAPI(
    login['api-key'],
    timeout_s = 5.0,
)

In [3]:
# Name of the file that accumulates the raw search records
# (a plain string literal: there is nothing to interpolate)
json_file = 'yelp_records.json'
json_file

'yelp_records.json'

In [4]:
def create_json_file(json_file, delete_if_exists = False):
    """Make sure `json_file` exists and holds a JSON list.

    Parameters
    ----------
    json_file : str
        Path of the JSON file. Any missing parent folders are created.
    delete_if_exists : bool, default False
        If True and the file already exists, the old file is removed and
        replaced by a fresh file containing an empty list. If False, an
        existing file is left untouched.

    Returns
    -------
    None
        The function only prints a status message and touches the file system.
    """
    # Check if the file exists
    file_exists = os.path.isfile(json_file)

    # If the file DOES exist
    if file_exists:
        if not delete_if_exists:
            # Keep the existing records exactly as they are
            print(f'[i] {json_file} already exists.')
            return

        # The user asked for a clean start: remove the old file, then fall
        # through so a fresh empty file is written in its place
        print(f'[!] {json_file} already exists. Deleting the previous file.')
        os.remove(json_file)
    # If the file does NOT exist
    else:
        print(f'[i] {json_file} not found. Saving empty list to new file.')

    # Create any needed folders (dirname is '' when json_file is a bare file name)
    folder = os.path.dirname(json_file)
    if len(folder) > 0:
        os.makedirs(folder, exist_ok = True)

    # Save the empty list to start the file
    with open(json_file, 'w') as f:
        json.dump([], f)

In [5]:
# Start from a clean file: any earlier records file is deleted first
create_json_file(json_file, delete_if_exists = True)

[i] yelp_records.json not found. Saving empty list to new file.


In [6]:
# Load whatever has been saved to the records file so far
with open(json_file) as saved:
    previous_results = json.load(saved)

# The number of saved records is reused below as the offset of the next query
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


In [7]:
# Run the first search, starting at the offset given by the saved record count
results = yelp_api.search_query(
    location = 'Seattle, WA',
    term = 'Vegetarian',
    offset = n_results,
)

# Show which top-level fields the response contains
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
# See how many results there are:
# the 'total' field of the search response holds the overall match count
total_results = results['total']
total_results

4300

In [9]:
# Find how many results we get per page:
# the length of the 'businesses' list in this first response is used as the page size
results_per_page = len(results['businesses'])
results_per_page

20

In [10]:
# Find how many pages we need for our search.
# Records already on disk (n_results) are subtracted so a resumed run only
# counts what is still missing.
remaining_results = results['total'] - n_results

# An empty first page gives results_per_page == 0; report zero pages instead
# of raising ZeroDivisionError.
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
n_pages

215

In [11]:
# Append the businesses from the first query to the saved list,
# then write the whole list back to the records file
previous_results += results['businesses']
with open(json_file, 'w') as out_file:
    json.dump(previous_results, out_file)

In [12]:
# Confirm how many records are now held in the in-memory list
len(previous_results)

20

In [13]:
# Make a loop to get all of our queries

# Stop before one more page would push the saved total past this many records
MAX_RESULTS = 1000

for i in tqdm_notebook(range(1, n_pages + 1)):
    # Re-read the file on every pass so the offset always matches what is on disk
    with open(json_file, 'r') as f:
        previous_results = json.load(f)
    n_results = len(previous_results)

    # NOTE: this check is on the number of saved results, not on the number of
    # API calls; the message text is kept as-is so it matches the recorded output.
    if (n_results + results_per_page) > MAX_RESULTS:
        print('Exceeded 1000 API calls. Terminating loop.')
        break

    # Use n_results as the offset
    results = yelp_api.search_query(location = 'Seattle, WA',
                                    term = 'Vegetarian',
                                    offset = n_results)

    # An empty page means the offset did not advance; every further pass would
    # repeat the same request, so stop instead of spending more API calls.
    new_businesses = results['businesses']
    if not new_businesses:
        print(f'- No more results returned at offset {n_results}. Terminating loop.')
        break

    # Append the new results and save the file
    previous_results.extend(new_businesses)
    with open(json_file, 'w') as f:
        json.dump(previous_results, f)

    # Add a 200 ms pause
    time.sleep(0.2)

  0%|          | 0/215 [00:00<?, ?it/s]

Exceeded 1000 API calls. Terminating loop.


In [18]:
# Load the saved records into a data frame
df = pd.read_json(json_file)

# Preview the first and last rows
first_rows, last_rows = df.head(), df.tail()
display(first_rows, last_rows)

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,djjEdAZkY4eyzI3_8TBLVA,plum-bistro-seattle-2,Plum Bistro,https://s3-media3.fl.yelpcdn.com/bphoto/DlnguN...,False,https://www.yelp.com/biz/plum-bistro-seattle-2...,1802,"[{'alias': 'vegan', 'title': 'Vegan'}]",4.0,"{'latitude': 47.6137675, 'longitude': -122.317...","[delivery, pickup]",$$,"{'address1': '1429 12th Ave', 'address2': '', ...",12068385333,(206) 838-5333,1888.19153
1,9nJlVsm9GEz6DFhK5ckaYA,cafe-flora-seattle,Cafe Flora,https://s3-media1.fl.yelpcdn.com/bphoto/M33I7s...,False,https://www.yelp.com/biz/cafe-flora-seattle?ad...,1433,"[{'alias': 'vegetarian', 'title': 'Vegetarian'...",4.0,"{'latitude': 47.62397, 'longitude': -122.29521}",[delivery],$$,"{'address1': '2901 E Madison St', 'address2': ...",12063259100,(206) 325-9100,3033.471253
2,0jZIOA_9H3YOsbiyoyRWrA,seattle-lotus-vegetarian-restaurant-seattle,Seattle Lotus Vegetarian Restaurant,https://s3-media1.fl.yelpcdn.com/bphoto/GWvM2a...,False,https://www.yelp.com/biz/seattle-lotus-vegetar...,176,"[{'alias': 'vegetarian', 'title': 'Vegetarian'...",4.5,"{'latitude': 47.5088066528403, 'longitude': -1...",[pickup],$$,"{'address1': '10439 16th Ave SW', 'address2': ...",12064666345,(206) 466-6345,13054.220036
3,mOybprI4rEcRQUxBqAhupw,harvest-beat-seattle,Harvest Beat,https://s3-media3.fl.yelpcdn.com/bphoto/KVq5tn...,False,https://www.yelp.com/biz/harvest-beat-seattle?...,230,"[{'alias': 'vegetarian', 'title': 'Vegetarian'...",4.5,"{'latitude': 47.66115, 'longitude': -122.33678}",[],$$$,"{'address1': '1711 N 45th St', 'address2': '',...",12065471348,(206) 547-1348,3976.167517
4,7eoMEbus8ANI15VuYzUNkQ,moonlight-cafe-seattle,Moonlight Cafe,https://s3-media4.fl.yelpcdn.com/bphoto/J2m1Jy...,False,https://www.yelp.com/biz/moonlight-cafe-seattl...,451,"[{'alias': 'vegetarian', 'title': 'Vegetarian'...",4.0,"{'latitude': 47.59901, 'longitude': -122.30672}","[delivery, pickup]",$,"{'address1': '1919 S Jackson St', 'address2': ...",12064857313,(206) 485-7313,3654.663113


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
995,wOzJie9tDawjt52-Tc4i5A,mammoth-seattle-4,Mammoth,https://s3-media3.fl.yelpcdn.com/bphoto/V22hJ7...,False,https://www.yelp.com/biz/mammoth-seattle-4?adj...,13,"[{'alias': 'sandwiches', 'title': 'Sandwiches'...",4.5,"{'latitude': 47.66622504829018, 'longitude': -...",[delivery],,"{'address1': '5239 Ballard Ave NW', 'address2'...",12069461065,(206) 946-1065,5780.690653
996,_4rv6ssRb9RpYdXQZgEQOw,spice-king-renton,Spice King,https://s3-media2.fl.yelpcdn.com/bphoto/BrfnJ1...,False,https://www.yelp.com/biz/spice-king-renton?adj...,486,"[{'alias': 'indpak', 'title': 'Indian'}]",4.0,"{'latitude': 47.47316, 'longitude': -122.22257}",[pickup],$,"{'address1': '720 Lind Ave SW', 'address2': ''...",14252266700,(425) 226-6700,18930.939469
997,Bc0odaNt6wsRPzRQQ63QEw,bakery-nouveau-seattle,Bakery Nouveau,https://s3-media2.fl.yelpcdn.com/bphoto/JizGW2...,False,https://www.yelp.com/biz/bakery-nouveau-seattl...,1575,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",4.5,"{'latitude': 47.5600625713343, 'longitude': -1...",[delivery],$,"{'address1': '4737 California Ave SW', 'addres...",12069230534,(206) 923-0534,8227.186162
998,ngmgbNUMBO576sbYiH5v0w,salt-district-seattle-3,Salt District,https://s3-media3.fl.yelpcdn.com/bphoto/ZmWq2r...,False,https://www.yelp.com/biz/salt-district-seattle...,37,"[{'alias': 'italian', 'title': 'Italian'}]",3.5,"{'latitude': 47.60454, 'longitude': -122.33901}",[],,"{'address1': '1101 Alaskan Way', 'address2': N...",12067095763,(206) 709-5763,2317.428864
999,vKgyKPKRuveMB23UPwk97Q,leenas-cafe-shoreline,Leena's Cafe,https://s3-media3.fl.yelpcdn.com/bphoto/M_L6Kr...,False,https://www.yelp.com/biz/leenas-cafe-shoreline...,328,"[{'alias': 'breakfast_brunch', 'title': 'Break...",3.5,"{'latitude': 47.75846, 'longitude': -122.3131}",[delivery],$$,"{'address1': '17732 15th Ave NE', 'address2': ...",12063644919,(206) 364-4919,14887.447557


In [21]:
# Count rows whose business id repeats an earlier row
df['id'].duplicated().sum()

10

In [24]:
# Remove rows with a repeated business id (rebinding df rather than mutating in place)
df = df.drop_duplicates(subset = 'id')

# Confirm that no repeated ids remain
df['id'].duplicated().sum()

0

In [26]:
# Write the de-duplicated frame to a gzip-compressed csv, leaving out the index column
df.to_csv('final_results_seattle_vegetarian.csv.gz', index = False, compression = 'gzip')