In [1]:
#Dependencies 
import requests
import json
import pandas as pd
from api_key import yelp_api_key, mapbox_api_key
# C:\Users\nwf91\OneDrive\Desktop\VandyDataCourse\project-3\project-3\api_key.py
# C:\Users\nwf91\OneDrive\Desktop\VandyDataCourse\project-3\project-3\working_doc.ipynb

from pprint import pprint

In [2]:
# Test if the API keys are imported correctly
print(f"Yelp API Key: {yelp_api_key}")
print(f"Mapbox API Key: {mapbox_api_key}")

Yelp API Key: GfAgyvCCqzBJJrefnJOUX_fcJI02oK4PgB2mMkTwNmWSXDEPhJPU9kYctePGP49D0ujK3QqYE4_Iic-RfsWG_ULgbOWv8yx1jRUrgVpj50Vq6qws4Y1Nkn-n2_TgZnYx
Mapbox API Key: pk.eyJ1IjoibmJpb3JnOTkiLCJhIjoiY20weDd5OG43MDFxbDJzcHBwN2Y2aXUxYiJ9.U3TN-LcnCc7RnK25R74N5w


In [3]:
# Yelp API 

# Yelp API base URL for searching businesses
yelp_url = "https://api.yelp.com/v3/businesses/search"

# Location and other parameters
location = "san francisco"
term = "restaurants"  
limit = 40 # Maximum results per request 
offset = 0 
rounds = 6 

yelp_data = []

for round in range(rounds): 

    # Define the headers to include the API key
    headers = {
        "Authorization": f"Bearer {yelp_api_key}"
    }

    # Define the parameters for the API call
    yelp_params = {
        "location": location,
        "term": term,
        "limit": limit,
        "offset": offset 
    }

    # Make the GET request to the Yelp API
    yelp_response = requests.get(yelp_url, headers=headers, params=yelp_params)
        
    #Saving Yelp Data 
    # Check if the Yelp request was successful
    if yelp_response.status_code == 200:
        yelp_data_json = yelp_response.json()  # Convert the response to JSON format
        businesses = yelp_data_json.get("businesses", [])  # Extract the businesses from the response
        

        # Loop over the Yelp businesses and call WalkScore API for each business
        for business in businesses:
            business_name = business["name"]
            business_id = business["id"]
            latitude = business["coordinates"]["latitude"]
            longitude = business["coordinates"]["longitude"]
            address = ", ".join(business["location"]["display_address"])
            rating = business["rating"]
            review_count = business["review_count"]
            price = business.get("price", "N/A")  # Some businesses may not have a price listed
            bus_categories = ", ".join([category["title"] for category in business["categories"]]),
            transactions = business["transactions"]
            
            yelp_data.append({
                "Name" : business_name,
                "Business_ID": business_id,
                "Latitude": latitude, 
                "Longitude": longitude, 
                "Address": address,
                "Rating": rating,
                "Review_count": review_count,
                "Price": price, 
                "Categories":bus_categories,
                "Transactions": transactions
            })
        
    else:
        print(f"Error: {yelp_response.status_code} - {yelp_response.text}") 

    offset += 40



Extracting and Transforming Yelp Data

In [36]:
# Transform Yelp data and convert to CSV 
yelp_pd = pd.DataFrame(yelp_data)
yelp_pd

# Take ZIP codes out of address 
yelp_pd['ZIP'] = yelp_pd['Address'].str.extract(r'(\d{5})')

# Convert category column 'Transactions' into three boolean columns
yelp_pd['Pickup'] = yelp_pd['Transactions'].apply(lambda x: 'pickup' in x)
yelp_pd['Delivery'] = yelp_pd['Transactions'].apply(lambda x: 'delivery' in x)
yelp_pd['Restaurant_Reservation'] = yelp_pd['Transactions'].apply(lambda x: 'restaurant_reservation' in x)

#Create coordinates "LatLong" column from latitude & longitude 
yelp_pd['LatLong'] = yelp_pd['Latitude'].astype(str) + ',' + yelp_pd['Longitude'].astype(str)


yelp_pd = yelp_pd[['Name', 'Business_ID', 'Latitude', 'Longitude', 'LatLong', 'Address','Rating', 'Review_count', 'Price', 'ZIP', 'Pickup', 'Delivery', 'Restaurant_Reservation']]

# Dropping duplicate LatLong
yelp_pd = yelp_pd.drop(80)

yelp_pd['LatLong'].duplicated().sum()

# convert to CSV 
yelp_pd.to_csv("CSV_Outputs/Businesses.csv", index=False)

# yelp_pd.head(5)



In [41]:
yelp_pd.dtypes


Name                       object
Business_ID                object
Latitude                  float64
Longitude                 float64
LatLong                    object
Address                    object
Rating                    float64
Review_count                int64
Price                      object
ZIP                        object
Pickup                       bool
Delivery                     bool
Restaurant_Reservation       bool
dtype: object

Pulling Categories from Yelp 

In [117]:
#TODO # Pull unique categories from Yelp_pd['Categories'] Column and create new table with those categories and IDs 
# Category_ID Category_Name 
categories_copy = yelp_pd['Categories'].copy()

# Step 2: Remove parentheses and split by ', ' to create a list of categories
categories_copy = categories_copy.apply(lambda x: x.strip('()').split(', '))

# Step 3: Flatten the list of all categories
all_categories = [category for sublist in categories_copy for category in sublist if category]

# Step 4: Get unique categories
unique_categories = set(all_categories)

# Display the unique categories
print(unique_categories)

AttributeError: 'tuple' object has no attribute 'strip'

MapBox API Call and Directions Estimate Loop 

In [16]:
#MapBox API 

#Loop for driving
#base url for mapbox 
mapbox_url = "https://api.mapbox.com/directions/v5/mapbox/"
st_coordinates = (37.787937, -122.407677) #Latitude, Longitude in Union Square, SF 
method = "driving"
driving_data = []

for row in yelp_data:
    bs_coordinates = (row['Longitude'], row['Latitude'])
    
    coordinates = f"{st_coordinates[1]},{st_coordinates[0]};{row["Longitude"]},{row["Latitude"]}"
    
    query_url = f"{mapbox_url}{method}/{coordinates}.json"

    #Mapbox paramaters 
    driving_params = {
        "access_token" : mapbox_api_key,
        "overview" : "full",
        # "annotations": "distance, duration"
    }
    
    #Get request for MapBox API 
    mapbox_response = requests.get(query_url, params = driving_params)
    # print(mapbox_response.json()) 
    #if successful, extract duration and distance 
    if mapbox_response.status_code == 200: 
        mapbox_data = mapbox_response.json()

        if 'routes' in mapbox_data and len(mapbox_data['routes']) > 0:
            distance = mapbox_data['routes'][0]['distance']  # Distance in meters
            duration = mapbox_data['routes'][0]['duration']  # Duration in seconds
            route_id = mapbox_data['uuid'] # route_id
            
            driving_data.append({
                "Route_ID": route_id,
                "Distance_Meters": distance, 
                "Duration_Seconds" : duration,
                "Latitude": bs_coordinates[1],
                "Longitude": bs_coordinates[0]
            })
            
        else:
            print(f"Error with MapBox API for {row['name']}: {mapbox_response.status_code}")




In [26]:
driving_pd = pd.DataFrame(driving_data)
driving_pd

# Cleaning up columns and rearranging 
driving_pd["Duration_Minutes"] = (driving_pd['Duration_Seconds']/60).round()
driving_pd['LatLong'] = driving_pd['Latitude'].astype(str) + ',' + driving_pd['Longitude'].astype(str)

driving_pd = driving_pd[['Route_ID', 'Distance_Meters', 'Duration_Seconds', 'Duration_Minutes', 'Latitude', 'Longitude', 'LatLong']]

driving_pd.head(5)

driving_pd = driving_pd.drop(80)

driving_pd['LatLong'].duplicated().sum()

# Converting to CSV 
driving_pd.to_csv("CSV_Outputs/Driving_Estimate.csv", index=False)


In [46]:
yelp_pd = yelp_pd.merge(driving_pd[['LatLong']], on='LatLong', how='inner')
yelp_pd['LatLong'].nunique()
driving_pd['LatLong'].nunique()

239