In [20]:
import requests
import json
import sys
import pandas as pd
import csv
from  keys  import  client_id, api_key

In [21]:
# Configure the Yelp business-search request.
# Running this cell prompts the user for the search term, location, and
# categories that the later API calls in this notebook reuse.
url = 'https://api.yelp.com/v3/businesses/search'  # Yelp v3 business-search endpoint
headers = {
    'Authorization': 'Bearer {}'.format(api_key),
}
term = input('What sort of business do you think you can do? (plumbers, takeout, etc.): ')
location = input('Near where?: ')
categories = input('What sort of industry would you like to disrupt? You may list multiple categories, separated with a comma: ')
spec = f'{term}_{location}_{categories}_data'
# Note on categories: they appear to be optional. You may separate entries with a
# comma to search multiple categories, search 'All' for all, or leave the field blank.
url_params = {
    # Values are passed through unmodified: requests URL-encodes query
    # parameters itself, so pre-replacing spaces with '+' would send a
    # literal plus sign (encoded as %2B) instead of a space.
    "term": term,
    "location": location,
    "categories": categories,
    "limit": 50,
    "offset": 0,
}

What sort of business do you think you can do? (plumbers, takeout, etc.): Film Production
Near where?: NYC
What sort of industry would you like to disrupt? You may list multiple categories, separated with a comma: Video/Film Production, Recording & Rehearsal Studios, Advertising, Studio Taping


In [22]:
# Echo the raw categories string entered above as a quick sanity check.
print(categories)

Video/Film Production, Recording & Rehearsal Studios, Advertising, Studio Taping


In [23]:
def yelp_call(headers, url_params, timeout=10):
    """
    Send one GET request to the module-level ``url`` and return the decoded JSON body.

    Args:
        headers: HTTP headers for the request (carries the Authorization bearer token).
        url_params: Query parameters for the search. Pass a fresh/updated dict
            (e.g. a new ``offset``) for every page you want to retrieve.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The parsed JSON response. HTTP error statuses are not raised here; the
        decoded body is returned whatever the status code was.

    Raises:
        requests.exceptions.Timeout: if the server does not respond within ``timeout``.
    """
    response = requests.get(url, headers=headers, params=url_params, timeout=timeout)
    return response.json()

In [24]:
def parse_data(list_of_data):
    """
    Flatten business records (the ``data['businesses']`` list) into tuples.

    Each returned tuple holds, in order: name, display address, city, rating,
    review count, coordinates dict, price level, business id, categories,
    latitude, longitude.

    The price level is ``len(business['price'])`` when a ``'price'`` key is
    present and ``None`` otherwise (presumably the price is a string such as
    ``'$$'`` -- confirm against the API response).
    """
    def _as_row(entry):
        # Resolve the optional price first, then read the required fields in order.
        price_level = len(entry['price']) if 'price' in entry else None
        return (
            entry['name'],
            entry['location']['display_address'],
            entry['location']['city'],
            entry['rating'],
            entry['review_count'],
            entry['coordinates'],
            price_level,
            entry['id'],
            entry['categories'],
            entry['coordinates']['latitude'],
            entry['coordinates']['longitude'],
        )

    return [_as_row(entry) for entry in list_of_data]

In [25]:
def call_1000():
    """
    Page through the Yelp business search and save the results to a CSV file.

    Uses the module-level ``term``, ``location``, ``headers`` and ``url_params``
    from the configuration cell. Pages are requested through ``yelp_call`` and
    flattened with ``parse_data`` until one of the following happens:

    * the number of collected businesses reaches the reported ``total``,
    * a response has no ``'businesses'`` key or an empty page,
    * the offset reaches the 1000-result cap.

    The rows are appended to ``../database/{term}_{location}_database.csv``;
    the header row is written only when that file is empty, so re-running adds
    rows without injecting a second header into the data.

    Returns:
        pandas.DataFrame: the first three rows of the collected businesses.

    Raises:
        RuntimeError: if the very first request comes back without a
            ``'businesses'`` key (for example an API error payload).
    """
    max_results = 1000  # paging stops once the offset would reach this cap
    csv_filepath = f'../database/{term}_{location}_database.csv'
    columns = ['Name', 'Address', 'City', 'Rating', 'Review Count', 'Coordinates',
               'Price', 'Id', 'Categories', 'Latitude', 'Longitude']
    # Step by the same page size that is sent to the API so no results are skipped.
    page_size = int(url_params.get('limit') or 50)

    biz_list = []
    offset = 0
    while offset < max_results:
        # Work on a copy so the shared url_params dict is not left at a stale offset.
        page_params = dict(url_params, limit=page_size, offset=offset)
        results = yelp_call(headers, page_params)
        if 'businesses' not in results:
            if offset == 0:
                raise RuntimeError(f'Yelp search returned no businesses: {results}')
            break
        page = parse_data(results['businesses'])  # list of businesses in tuples
        if not page:
            break
        biz_list.extend(page)
        total = results.get('total')
        if total is not None and len(biz_list) >= total:
            break
        offset += page_size

    # Create the data frame from the gathered information
    df = pd.DataFrame(biz_list, columns=columns)
    # Save the data frame as a CSV file. f.tell() is 0 only for a new/empty
    # file, which is the only time the header row should be emitted.
    with open(csv_filepath, "a", newline="", encoding="utf-8") as f:
        df.to_csv(f, index=False, header=f.tell() == 0)
    print(f'CSV file written to {csv_filepath}.')
    return df.head(3)

### Run below to write your business data to a csv file.

In [26]:
# Fetch the search results, write them to CSV, and display the first three rows.
call_1000()

CSV file written to ../database/Film Production_NYC_database.csv.


Unnamed: 0,Name,Address,City,Rating,Review Count,Coordinates,Price,Id,Categories,Latitude,Longitude
0,DiJiFi,"[1166 Manhattan Ave, Ste 201, Brooklyn, NY 11222]",Brooklyn,4.5,165,"{'latitude': 40.7382584, 'longitude': -73.9549...",2.0,CwBYC8nVLbG7KjVU6nwR4A,"[{'alias': 'photographystores', 'title': 'Phot...",40.738258,-73.954948
1,Indigo Productions,"[250 W 90th St, New York, NY 10024]",New York,5.0,5,"{'latitude': 40.7908706665039, 'longitude': -7...",,8v0pPzCHOgJKGmydp2_xCA,"[{'alias': 'videographers', 'title': 'Videogra...",40.790871,-73.975197
2,Bizvidmedia Productions,"[99 Wallstreet, New York, NY 10005]",New York,5.0,5,"{'latitude': 40.70479, 'longitude': -74.00726}",,dQeD7LOvK4owK-8ysfu9gQ,"[{'alias': 'videofilmproductions', 'title': 'V...",40.70479,-74.00726


#### With that file, you can use the functions below to call the Yelp API for the associated business reviews.

In [27]:
# Path of the business CSV; uses the same f-string pattern as csv_filepath in call_1000.
term_loc = f'../database/{term}_{location}_database.csv'
# Load the saved businesses into a DataFrame for the review functions below.
biz_data = pd.read_csv(term_loc)

In [28]:
def call_reviews(biz_id, timeout=10):
    """
    Request the reviews endpoint once for every business ID and collect the responses.

    Args:
        biz_id: Iterable of Yelp business IDs.
        timeout: Seconds to wait for each response before giving up, so one
            stalled request cannot hang the whole run.

    Returns:
        list: one decoded JSON body per ID, in the same order as ``biz_id``.
        Bodies are returned as-is whatever the HTTP status was.

    Raises:
        requests.exceptions.Timeout: if a request exceeds ``timeout``.
    """
    list_of_reviews = []
    for biz in biz_id:
        response = requests.get(
            f'https://api.yelp.com/v3/businesses/{biz}/reviews',
            headers=headers,
            timeout=timeout,
        )
        list_of_reviews.append(response.json())
    return list_of_reviews

In [29]:
def call_all_reviews(b_data):
    """
    Fetch the review responses for every business in ``b_data``.

    Collects the values of the ``'Id'`` column (e.g. from a data frame built
    out of the written CSV file) into a list, hands that list to
    ``call_reviews``, and returns whatever ``call_reviews`` returns.
    """
    id_values = list(b_data['Id'])  # column values in their original order
    return call_reviews(id_values)

In [30]:
def format_reviews(b_data):
    """
    Build one dictionary of review texts per business in ``b_data``.

    Calls ``call_all_reviews`` to get one API response per row of
    ``b_data['Id']`` and pairs each response with its ID by position.

    Returns:
        list[dict]: for each business, ``Review_0``, ``Review_1``, ... holding
        the review texts, plus ``Id``. A response containing an ``'error'`` key
        yields a dictionary with only ``Id``.
    """
    responses = call_all_reviews(b_data)
    list_of_reviews = []
    # zip keeps every response paired with its own business ID, including the
    # ones that follow an error response.
    for biz_id, response in zip(b_data['Id'], responses):
        reviews = {}
        if 'error' not in response:
            for count, review in enumerate(response.get('reviews', [])):
                reviews[f'Review_{count}'] = review['text']
        reviews['Id'] = biz_id
        list_of_reviews.append(reviews)
    return list_of_reviews

In [31]:
def reviews_to_csv(b_data):
    """
    Fetch the reviews for every business in ``b_data`` and append them to the reviews CSV.

    Runs ``format_reviews`` on the business data frame, converts the resulting
    list of dictionaries to a data frame, and appends it to
    ``../database/{term}_{location}_reviews.csv``. The header row is written
    only when the file is empty, so re-running does not inject a second header
    into the data (appended rows are assumed to share the existing column layout).

    Returns:
        pandas.DataFrame: the reviews that were written.
    """
    csv_filepath = f'../database/{term}_{location}_reviews.csv'
    formatted_reviews = format_reviews(b_data)
    df = pd.DataFrame(formatted_reviews)
    # f.tell() is 0 only for a new/empty file, the only time a header is needed.
    with open(csv_filepath, "a", newline="", encoding="utf-8") as f:
        df.to_csv(f, index=False, header=f.tell() == 0)
    return df

In [32]:
# Pass your business DataFrame as the argument to write the reviews that correspond to those businesses to CSV.
# Note: this makes one API request per business, so it can take a while.
reviews_to_csv(biz_data) 

KeyboardInterrupt: 

### The variables below are built into the functions above; however, they are also useful for reference outside of them.

In [None]:
# Path of the reviews CSV; uses the same f-string pattern as csv_filepath in reviews_to_csv.
term_loc_review = f'../database/{term}_{location}_reviews.csv'

# Load the saved reviews back into a DataFrame via read_csv.
biz_reviews = pd.read_csv(term_loc_review)

In [None]:
# Reference a sample review from your output CSV: the 'Review_0' text of the row at position 525.
# NOTE(review): 525 is a hard-coded position; this fails if the CSV has fewer rows.
biz_reviews.iloc[525]['Review_0'] #reference a sample review from your output csv.

<br><br><br><br><br><br><br><br><br><br><br><br>