# Efficient API Calls (Core)
Name: Mike McCann <br>
Date: 26 APR 2022

## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

## API Setup

In [4]:
# load creds
api_path = '../../.secret/yelp_api.json'

with open (api_path, 'r') as f:
    login = json.load(f)
    
login.keys()

dict_keys(['client-id', 'api-key'])

In [6]:
# instantiate yelpAPI
yelp_api = YelpAPI(login['api-key'], timeout_s = 3)
yelp_api

<yelpapi.yelpapi.YelpAPI at 0x206abddd808>

In [7]:
# Define terms for yelp search (LOCATION and TERM)
LOCATION = "Charlottesville, VA"
TERM = "taco"

## Create/Confirm Data Location

In [9]:
# Create folder and filepath for saving work
FOLDER = "Data/"

JSON_FILE = f"{FOLDER}results_in_progress_{LOCATION.split(',')[0]}_{TERM}.json"
JSON_FILE

'Data/results_in_progress_Charlottesville_taco.json'

In [12]:
# Check for existing JSON FILE
file_exists = os.path.isfile(JSON_FILE)

if file_exists == False:
    folder = os.path.dirname(JSON_FILE)
    if len(folder) > 0:
        os.makedirs(folder, exist_ok = True)
    
    print(f"{JSON_FILE} not found. Saving empty list to file.")
    
    with open(JSON_FILE, 'w') as f:
        json.dump([],f)

else: 
    print(f"{JSON_FILE} aleady exists.")

Data/results_in_progress_Charlottesville_taco.json aleady exists.


In [15]:
# Check if results are already saved..
with open(JSON_FILE, 'r') as f:
    previous_results = json.load(f)
    
# Set offset based on results
n_results = len(previous_results)
print(f"There are {n_results} previous results")

There are 0 previous results


## Initial Pull

In [16]:
# First pull
results = yelp_api.search_query(location = LOCATION,
                               term = TERM,
                               offset = n_results+1)
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [18]:
total_results = results['total']
results_per_page = len(results['businesses'])
n_pages = math.ceil((total_results - n_results) / results_per_page)

print(f"Searching for {TERM} in {LOCATION} has yielded {total_results} results")
print(f"There are {results_per_page} results per page/search")
print(f"We will need {n_pages} pages to complete our API call")

Searching for taco in Charlottesville, VA has yielded 112 results
There are 20 results per page/search
We will need 6 pages to complete our API call


In [19]:
# Save out our current results
previous_results.extend(results['businesses'])
with open(JSON_FILE, 'w') as f:
    json.dump(previous_results, f)

## Loop Pull

In [20]:
for i in tqdm_notebook(range(1,n_pages+1)):
    time.sleep(.2)
    
    #check for previous results
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)

    # Create offset    
    n_results = len(previous_results)
    
    # Run API call
    results = yelp_api.search_query(location = LOCATION,
                                   term = TERM,
                                   offset = n_results+1)
    
    # extract and extend businesses to our previous results
    previous_results.extend(results['businesses'])
    
    # Save current iteration
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

  0%|          | 0/6 [00:00<?, ?it/s]

## Check and Save Results

In [21]:
# Save to DF.
final_df = pd.read_json(JSON_FILE)

display(final_df.head(3))
print("\n\n")
final_df.info()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,kViH5j4Z4S3OlTSwBEm-Nw,la-michoacana-taqueria-and-restaurant-charlott...,La Michoacana Taqueria & Restaurant,https://s3-media2.fl.yelpcdn.com/bphoto/edLG_l...,False,https://www.yelp.com/biz/la-michoacana-taqueri...,240,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,"{'latitude': 38.0317039489746, 'longitude': -7...","[pickup, delivery]",$,"{'address1': '1138 E High St', 'address2': '',...",14342021336,(434) 202-1336,1750.712216
1,eI_GjUo-Ux8zq2xewUtXtA,brazos-tacos-charlottesville,Brazos Tacos,https://s3-media3.fl.yelpcdn.com/bphoto/pvv_pW...,False,https://www.yelp.com/biz/brazos-tacos-charlott...,404,"[{'alias': 'tex-mex', 'title': 'Tex-Mex'}, {'a...",4.0,"{'latitude': 38.0247182105962, 'longitude': -7...",[delivery],$,"{'address1': '925 2nd St SE', 'address2': '', ...",14349841163,(434) 984-1163,2274.659108
2,a3h3Q6A50DVOmppznCaVmA,el-tako-nako-charlottesville,El Tako Nako,https://s3-media4.fl.yelpcdn.com/bphoto/DndjKO...,False,https://www.yelp.com/biz/el-tako-nako-charlott...,50,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",5.0,"{'latitude': 38.065467, 'longitude': -78.495929}",[delivery],$,"{'address1': '2405 Hydraulic Rd', 'address2': ...",14343058918,(434) 305-8918,2778.539862





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             111 non-null    object 
 1   alias          111 non-null    object 
 2   name           111 non-null    object 
 3   image_url      111 non-null    object 
 4   is_closed      111 non-null    bool   
 5   url            111 non-null    object 
 6   review_count   111 non-null    int64  
 7   categories     111 non-null    object 
 8   rating         111 non-null    float64
 9   coordinates    111 non-null    object 
 10  transactions   111 non-null    object 
 11  price          84 non-null     object 
 12  location       111 non-null    object 
 13  phone          111 non-null    object 
 14  display_phone  111 non-null    object 
 15  distance       111 non-null    float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 13.2+ KB


In [22]:
# Check for dupes
final_df.duplicated(subset = 'id').sum()

0

In [23]:
# Save to compressed csv
final_df.to_csv(f"{FOLDER}results_final_{LOCATION.split(',')[0]}_{TERM}.csv.gz",
               compression='gzip', index = False)