In [1]:
#Load the modules for retrieving data from the Yelp API.
import requests
import json
#Load the modules for data manipulation.
import pandas as pd
import numpy as np
import itertools

In [2]:
#Building the function that will be used for retrieving data from Yelp's Business Search API.
#The API returns a maximum of 1000 shops, and most of the search results return that many.
#However, two of the results return only 750 (where yelp_pages=15).

#Endpoint of Yelp's Business Search API; every search request is sent here.
url = 'https://api.yelp.com/v3/businesses/search'

#api_key is the alias for my personal Yelp API Key; its definition has been omitted from this notebook.
#The key is sent as a bearer token in the Authorization header of every request.
headers = dict(Authorization='Bearer %s' % api_key)

def coffee_shops(term, location, yelp_pages=20):
    """
    Retrieve coffee shops from Yelp's Business Search API for a single (term, location) search.

    Inputs: search term, geographical location, maximum number of result pages to request
            (maximum 50 results per page)
    Output: dataframe with at most 50 * yelp_pages rows of Yelp data where each row corresponds to a
            coffee shop; columns are shop_id, name, review_count, rating, latitude, longitude, price
    Terms: 'coffee', 'cafe', 'espresso', 'latte', 'cappuccino', 'macchiato', 'americano', 'mocha', 'decaf'
    Locations: 'NYC', 'Manhattan', 'Brooklyn', 'Bronx', 'Queens', 'Staten Island'

    Paging stops as soon as a page comes back empty, and pages holding fewer than 50 shops are
    accepted, so a search with fewer results than requested no longer fails.
    Raises requests.HTTPError when Yelp answers with an error status code.
    """
    #The Yelp API allows a maximum retrieval of 50 rows per page.
    page_size = 50
    columns = ['shop_id', 'name', 'review_count', 'rating', 'latitude', 'longitude', 'price']
    rows = []

    for page in range(yelp_pages):
        params = {'term': term, 'location': location, 'limit': page_size,
                  'offset': page_size * page, 'radius': 0}
        response = requests.get(url, params=params, headers=headers)
        #Surface HTTP errors explicitly instead of failing later on a missing 'businesses' key.
        response.raise_for_status()

        businesses = json.loads(response.text)['businesses']
        if not businesses:
            #An empty page means the search has run out of results; later offsets would be empty too.
            break

        #Iterate over what was actually returned: the last page may hold fewer than 50 shops.
        for business in businesses:
            rows.append((
                business['id'],
                business['name'],
                business['review_count'],
                business['rating'],
                business['coordinates']['latitude'],
                business['coordinates']['longitude'],
                #To accommodate the fact that some businesses do not have a 'price' value recorded.
                business.get('price', 'None'),
            ))

    #The frame is still built from one NumPy array, so the mixed values are coerced to a common
    #dtype exactly as before (numeric fields typically come back as strings and are converted by
    #the caller). The reshape keeps the seven columns even when no shop was retrieved.
    output = pd.DataFrame(np.array(rows).reshape(-1, len(columns)), columns=columns)

    return output

In [3]:
#Building a function that retrieves data from the Yelp API and generates a dataframe for each (search term, location) pair.

def term_location_dictionary(term_list, location_list):
    """
    Query Yelp once for every (search term, location) combination.

    Inputs: a list of search terms, a list of geographical locations (underscores stand in for spaces)
    Output: a dictionary keyed by '[term]_[location]'; each value is the dataframe that coffee_shops
    returns for that term and location (750 or 1000 rows of coffee shops in the original searches).
    """
    #Searches on Staten Island for these terms are only requested for 15 pages (750 shops).
    short_result_terms = ('macchiato', 'decaf')

    results = {}
    for term, location in itertools.product(term_list, location_list):
        #The API expects 'Staten Island', while the dictionary key keeps 'Staten_Island'.
        search_location = location.replace('_', ' ')
        if location == 'Staten_Island' and term in short_result_terms:
            page_args = (15,)
        else:
            page_args = ()   #fall back on the default page count of coffee_shops
        results[term + '_' + location] = coffee_shops(term, search_location, *page_args)
    return results

In [4]:
###Creating the preliminary dataframe for this project: 'shops'.

#Search parameters for the Yelp API.
terms = ['coffee', 'cafe', 'espresso', 'latte', 'cappuccino', 'macchiato', 'americano', 'mocha', 'decaf']
locations = ['NYC', 'Manhattan', 'Brooklyn', 'Bronx', 'Queens', 'Staten_Island']

#A dictionary where each key is of the form '[term]_[location]' and the corresponding value is a dataframe of Yelp data.
term_location_dict = term_location_dictionary(terms, locations)

#Creating a list of all '[term]_[location]' strings, which are the keys to term_location_dict.
term_location_list = [term + '_' + location for term, location in itertools.product(terms, locations)]

#Combining the dataframes in term_location_dict into a single dataframe called 'shops' (with duplicate rows removed).
#The frames are stacked in search order ('coffee_NYC' first) and only the first occurrence of each shop_id is kept.
#This replaces the row-by-row DataFrame.append loop: append was removed in pandas 2.0, grew the frame
#quadratically, and missed shops that were listed twice within the same search result.
#The numeric variables arrive as str (see coffee_shops), so they are converted to the appropriate types at the end.
shops = (
    pd.concat([term_location_dict[term_location] for term_location in term_location_list], ignore_index=True)
    .drop_duplicates(subset='shop_id', keep='first')
    .reset_index(drop=True)
    .astype({'review_count': int, 'rating': float, 'latitude': float, 'longitude': float})
)

In [5]:
#Write the combined shops dataframe (index included) to 'shops.csv' in the working directory.
shops.to_csv(path_or_buf='shops.csv')