In [19]:
import pandas as pd
import os #env variables
import requests #call the APIs
import json
from IPython.display import JSON

# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [20]:
FOURSQUARE_KEY = os.getenv('FOURSQUARE_API_KEY')

In [21]:
csv_file_path = '..\data\df_bikes.csv'

# Read the CSV file into a DataFrame
df_bikes = pd.read_csv(csv_file_path)

In [22]:
df_bikes.head()

Unnamed: 0,Station_No,Station_ID,City,Bike_Lat,Bike_Lon,Free_Bikes,Empty_Slots,Total_Bikes
0,0,d0e8f4f1834b7b33a3faf8882f567ab8,"San Francisco Bay Area, CA",37.849735,-122.270582,1,14,15
1,1,983514094dd808b1604da2dcfc2d09af,"San Francisco Bay Area, CA",37.336188,-121.889277,6,5,11
2,2,da17603652106fda93da4e255a5b0a22,"San Francisco Bay Area, CA",37.322125,-121.88109,16,7,23
3,3,7a21c92b3b4cd2f7759107b4fdebf869,"San Francisco Bay Area, CA",37.323678,-121.874119,7,7,14
4,4,ce34d38fb230a23c1ced12d1e16df294,"San Francisco Bay Area, CA",37.325998,-121.87712,4,23,27


In [17]:
# This method accepts a station no/id from bike data
def getDetails(no,id,lat,long):
    url = "https://api.foursquare.com/v3/places/search"
    params = {
        "query": "bars",
        "ll": f"{lat},{long}",  # Using latitude and longitude from the CityBikes df
        "radius": "1000",
        "sort": "DISTANCE",
        #Grab 5 closest POIs per bike station
        'limit': 5
    }

    headers = {
        "Accept": "application/json",
        "Authorization": FOURSQUARE_KEY
    }

    # API request
    response = requests.get(url, params=params, headers=headers)
    # Parse the JSON response
    data = response.json().get("results", [])
    
    # Extracting relevant data
    poi_data = []
    for result in data:
        name = result['name']
        distance = result['distance']
        poi_data.append([no,id, lat, long, name, distance])
    
    
    return poi_data

In [8]:
#Get Fsq data foreach lat/long
foursquare_data = []
for _, row in df_bikes.iterrows():
    foursquare_data.extend(getDetails(row["Station_No"],row["Station_ID"],row["Bike_Lat"], row["Bike_Lon"]))

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

Put your parsed results into a DataFrame

In [12]:
# Create a FSQ dataframe
df_fsq = pd.DataFrame(foursquare_data, columns=["Station_No","Station_ID", "Bike_Lat", "Bike_Lon", "name", "distance"])

In [10]:
# Display or save/export the dataframe as needed
df_fsq.to_csv('..\data\df_fsq.csv', index=False)

In [None]:
df_fsq.head()

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [23]:
YELP_API_KEY = os.getenv('YELP_API_KEY')
YELP_CLIENT_ID = os.getenv('YELP_CLIENT_ID')

In [25]:
# Yelp data for each latitude and longitude pertaining to each bike station
def getYelpDetails(no, id, lat, long):
    yelp_url = "https://api.yelp.com/v3/businesses/search"

    headers = {
        "Authorization": f"Bearer {YELP_API_KEY}",
    }

    params = {
        "term": "restaurant",
        "radius": 1000,
        #This limit is due to Free API tier -  HTTP 429 Too Many Requests
        "limit": 3,
        "sort_by": "distance"
    }

    url = yelp_url

    # Add latitude and longitude to the params dictionary
    params["latitude"] = lat
    params["longitude"] = long

    response = requests.get(url, params=params, headers=headers)

    # Check the response status code
    if response.status_code != 200:
        print(f"Warning: Yelp API request failed with status code {response.status_code}")
        return []

    pois = response.json().get("businesses", [])
   
    extracted_data = []

    for poi in pois:
        name = poi["name"]
        review_count = poi["review_count"]
        categories = ', '.join([category["title"] for category in poi["categories"]])
        rating = poi["rating"]
        distance = poi["distance"]

        extracted_data.append([no, id, lat, long, name, review_count, categories, rating, distance])

    return extracted_data 

In [None]:
# Yelp data for each set of latitude and longitude
yelp_data = []
for _, row in df_bikes.iterrows():
    yelp_data.extend(getYelpDetails(row["Station_No"], row["Station_ID"], row["Bike_Lat"], row["Bike_Lon"]))

In [37]:
# Spot check list
for i in range(6):
    print(yelp_data[i])

[0, 'd0e8f4f1834b7b33a3faf8882f567ab8', 37.849735, -122.270582, 'The Fat Fish', 190, 'Fish & Chips', 4.0, 29.9305557971102]
[0, 'd0e8f4f1834b7b33a3faf8882f567ab8', 37.849735, -122.270582, 'El Tiny Cafe', 57, 'Cafes', 5.0, 31.707767038858425]
[0, 'd0e8f4f1834b7b33a3faf8882f567ab8', 37.849735, -122.270582, '808 deli and cafe', 0, 'Delis', 0.0, 79.03748686104271]
[1, '983514094dd808b1604da2dcfc2d09af', 37.33618830029063, -121.8892765045166, 'Scratch Cookery', 495, 'Food Trucks, Chicken Shop', 4.5, 28.365905869786513]
[1, '983514094dd808b1604da2dcfc2d09af', 37.33618830029063, -121.8892765045166, 'Goodtime Bar', 73, 'Wine Bars, Tapas/Small Plates, Seafood', 5.0, 33.119119462517794]
[1, '983514094dd808b1604da2dcfc2d09af', 37.33618830029063, -121.8892765045166, 'CommonGrounds Workspace', 2, 'Shared Office Spaces, Venues & Event Spaces, Cafes', 3.5, 39.069892422608945]


Put your parsed results into a DataFrame

In [38]:
df_yelp = pd.DataFrame(yelp_data, columns=["Station_No","Station_ID", "Bike_Lat", "Bike_Long", "name", "review_count", "categories", "rating","distance"])
# Export the dataframe
df_yelp.to_csv('..\data\df_yelp.csv', index=False)

In [39]:
df_yelp.head()

Unnamed: 0,Station_No,Station_ID,Bike_Lat,Bike_Long,name,review_count,categories,rating,distance
0,0,d0e8f4f1834b7b33a3faf8882f567ab8,37.849735,-122.270582,The Fat Fish,190,Fish & Chips,4.0,29.930556
1,0,d0e8f4f1834b7b33a3faf8882f567ab8,37.849735,-122.270582,El Tiny Cafe,57,Cafes,5.0,31.707767
2,0,d0e8f4f1834b7b33a3faf8882f567ab8,37.849735,-122.270582,808 deli and cafe,0,Delis,0.0,79.037487
3,1,983514094dd808b1604da2dcfc2d09af,37.336188,-121.889277,Scratch Cookery,495,"Food Trucks, Chicken Shop",4.5,28.365906
4,1,983514094dd808b1604da2dcfc2d09af,37.336188,-121.889277,Goodtime Bar,73,"Wine Bars, Tapas/Small Plates, Seafood",5.0,33.119119


In [40]:
df_yelp.columns

Index(['Station_No', 'Station_ID', 'Bike_Lat', 'Bike_Long', 'name',
       'review_count', 'categories', 'rating', 'distance'],
      dtype='object')

In [41]:
df_yelp.dtypes

Station_No        int64
Station_ID       object
Bike_Lat        float64
Bike_Long       float64
name             object
review_count      int64
categories       object
rating          float64
distance        float64
dtype: object

# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

They both were comparable in the data they provided. Foursquare focuses more on location data and Yelp has a more details on the Points of Interest based on ratings/customer reviews. However, Yelp also has some location info and data relative to one another which is useful. Depending on the use case you could select either or both.

Get the top 10 restaurants according to their rating

Using the Yelp API below as it has ratings provided:

In [42]:
top_10_resto = df_yelp.sort_values(by='rating', ascending=False)

In [43]:
top_10_resto.head(10)

Unnamed: 0,Station_No,Station_ID,Bike_Lat,Bike_Long,name,review_count,categories,rating,distance
489,163,e69e4dce53bdbe1ffe17185ee72fdcae,37.858868,-122.291209,Tarocco,54,"Mediterranean, Gluten-Free, Venues & Event Spaces",5.0,27.210341
406,135,4a2e731c87e9a1e07ad0f5c8a91efd4b,37.866418,-122.253799,Pizzeria 1868,1,Pizza,5.0,228.298703
1321,440,7201ad6fe2a1e03ff9ea5749d84c354d,37.80303,-122.270496,Pierre Pierre,4,Cajun/Creole,5.0,22.369493
490,163,e69e4dce53bdbe1ffe17185ee72fdcae,37.858868,-122.291209,Cafe Buenos Aires,4,"Empanadas, Coffee & Tea, Breakfast & Brunch",5.0,112.315716
1315,438,b8644a87f64a3672609640fee7a2adc2,37.725372,-122.393954,ChiliCali,25,"Pop-Up Restaurants, Caterers, Indonesian",5.0,139.585572
480,160,03ec647221434cdad0abf4907706e949,37.32673,-121.889273,Almaden Crossing,2,"Coffee & Tea, Lounges, Delis",5.0,75.895164
910,303,6d41d45711456aef7f31b60c4f9d2fa1,37.780265,-122.406644,Bini’s Kitchen,5,Himalayan/Nepalese,5.0,96.530427
473,157,a4d58de82f6475801e4bf6cf97ecd387,37.337122,-121.883215,Cowboy Pho,11,"Vietnamese, Barbeque, Soup",5.0,139.347195
463,154,8ecedfe82a3e35413e8a45da10ace67b,37.827757,-122.256716,Aman Cafe,87,"Malaysian, Coffee & Tea, Cafes",5.0,25.353186
458,152,3367402e41d84a3c70c5056e5312bb24,37.729279,-122.392896,Bloomstock,45,"Coffee & Tea, Waffles, Cafes",5.0,133.347626


### Data Audit

In [21]:
null_values = df_yelp['rating'].isnull()

print(f"Number of null ratings: {null_values.sum()}")

Number of null ratings: 0


In [22]:
df_bikes.rename(columns={'location': 'City'}, inplace=True)

In [23]:
df_bikes.head()

Unnamed: 0,Station_No,Station_ID,City,Bike_Lat,Bike_Lon,Free_Bikes,Empty_Slots,Total_Bikes
0,0,d0e8f4f1834b7b33a3faf8882f567ab8,"San Francisco Bay Area, CA",37.849735,-122.270582,1,14,15
1,1,983514094dd808b1604da2dcfc2d09af,"San Francisco Bay Area, CA",37.336188,-121.889277,6,5,11
2,2,da17603652106fda93da4e255a5b0a22,"San Francisco Bay Area, CA",37.322125,-121.88109,16,7,23
3,3,7a21c92b3b4cd2f7759107b4fdebf869,"San Francisco Bay Area, CA",37.323678,-121.874119,7,7,14
4,4,ce34d38fb230a23c1ced12d1e16df294,"San Francisco Bay Area, CA",37.325998,-121.87712,4,23,27


In [3]:
df_yelp = pd.read_csv('..\data\df_yelp.csv')

### Describe function on the numerical columns:

df_yelp['review_count'].describe()

In [6]:
df_yelp['rating'].describe()

count    1494.000000
mean        4.317269
std         0.321807
min         3.000000
25%         4.000000
50%         4.500000
75%         4.500000
max         5.000000
Name: rating, dtype: float64

In [7]:
df_yelp['distance'].describe()

count    1494.000000
mean      440.492811
std       267.348585
min         8.098200
25%       225.239767
50%       406.597981
75%       623.936574
max      1375.693927
Name: distance, dtype: float64

#### Check for erroneous ratings in Yelp table

In [17]:
# Specify the threshold
threshold_rating = 5.0
below_zero_rating = 0

# .loc to filter rows based on the condition
AboveFive_df = df_yelp.loc[df_yelp['rating'] > threshold_rating]
BelowZero_df = df_yelp.loc[df_yelp['rating'] < below_zero_rating]

# Filtered dataframe
print(f"these ratings are above 5: {AboveFive_df}")
print(f"these ratings are below 0: {BelowZero_df}")

these ratings are above 5: Empty DataFrame
Columns: [Station_ID, Bike_Lat, Bike_Long, name, review_count, categories, rating, distance]
Index: []
these ratings are below 0: Empty DataFrame
Columns: [Station_ID, Bike_Lat, Bike_Long, name, review_count, categories, rating, distance]
Index: []
