In [94]:
# Import Dependencies
import requests
import json
import pandas as pd
import time
import numpy as np
from config import yelp_key

In [95]:
# API Base URL
base_url = "https://api.yelp.com/v3/businesses/search?"

In [96]:
# API Headers
headers = {
    "accept" : "application/json",
    "Authorization" : "Bearer " + yelp_key 
}

In [97]:
# Create List of Categories (manually located)
categories = ['beaches','hanggliding','horsebackriding',
             'hiking','hot_air_balloons',
              'paddleboarding','parasailing',
              'sailing','snorkeling','ziplining', 
              'spas','hotsprings','massage','tours',
              'skiing', 'nightlife','sledding','zoos','wineries','casinos'
             ]

len(categories)

20

In [98]:
# Import Lat/Long Database
csv_path = "../Resources/CitiesWGeolocation.csv"
locations_df = pd.read_csv(csv_path)
locations_df

Unnamed: 0.1,Unnamed: 0,City,State/Province,Country,Latitude,Longitude
0,0,Lexington,Kentucky,United States,38.046407,-84.497039
1,1,San Diego,California,United States,32.717420,-117.162773
2,2,Cook Islands,,Cook Islands,-19.996972,-157.785871
3,3,Park City,Utah,United States,40.646092,-111.497996
4,4,Newcastle Upon Tyne,England,United Kingdom,54.973847,-1.613157
...,...,...,...,...,...,...
223,223,Beirut,,Lebanon,33.895920,35.478430
224,224,Zurich,,Switzerland,47.374449,8.541042
225,225,Geneva,,Switzerland,46.201756,6.146601
226,226,Valletta,,Malta,35.898982,14.513676


In [99]:
# Normalize capitalization
locations_df = locations_df.rename(columns = {"Latitude": "latitude","Longitude":"longitude","Unnamed: 0":"locationID"})

In [100]:
# Gather Lists of Latitudes and Longitudes
latitudes_list = []
longitudes_list = []
location_list = []
for i in range(locations_df.shape[0]):
    latitudes_list.append(locations_df.loc[i].at['latitude'])
    longitudes_list.append(locations_df.loc[i].at['longitude'])
    location_list.append(locations_df.loc[i].at['locationID'])
print(latitudes_list)
print(longitudes_list)

[38.0464066, 32.7174202, -19.99697155, 40.646092100000004, 54.97384739999999, -27.468968199999996, -28.0402165, 34.244058700000004, -36.718804999999996, 49.5041747, 42.64873625, 37.2395367, -32.728465, 17.223472100000002, 25.265347100000003, 50.67108245, 21.721746, 36.508976000000004, 32.079007399999995, -16.484598300000002, -33.928992, -19.1421421, -33.934444, -28.648333299999997, -45.0321923, 33.77217945, -6.166490799999999, -20.2759451, 45.437190799999996, 19.70318225, -4.6574976999999995, -28.002373100000003, 44.4643768, 31.6258257, 37.9374939, 49.8879177, 43.10656029999999, 30.2711224, 15.2214956, 43.4832523, -33.953177600000004, 26.9154576, 46.603353999999996, 13.1500331, 20.169626800000003, 51.53882410000001, 60.39430550000001, 24.578720999999998, -34.61341495, 34.862942600000004, 18.185050699999998, 12.51756625, -35.14181285, -34.4175, 46.7985624, -33.8611665, -16.9206657, 44.958452799999996, -26.6544338, -34.427808299999995, 51.08668970000001, 44.279621, -37.8142454, -31.95589

In [101]:
# Gather List of URLs
url_list = []
lat_lon_list = []
cat_list = []
location_ID_list = []
for i in range(len(latitudes_list)):
        for x in range(len(categories)):
            latitude = latitudes_list[i]
            longitude = longitudes_list[i]
            category = categories[x]
            location = location_list[i]
            # Limit radius to appx 15 miles from the coordinates
            complete_url = base_url + "latitude=" + str(latitude) + "&longitude=" + str(longitude) + "&radius=40000&categories=" + category + "&sort_by=rating"
            url_list.append(complete_url)
            lat_lon_list.append([latitude,longitude])
            cat_list.append(category)
            location_ID_list.append(location)
# Confirm list of API calls is less than 5,000 daily limit
print(len(url_list))
url_list

4560


['https://api.yelp.com/v3/businesses/search?latitude=38.0464066&longitude=-84.4970393&radius=40000&categories=beaches&sort_by=rating',
 'https://api.yelp.com/v3/businesses/search?latitude=38.0464066&longitude=-84.4970393&radius=40000&categories=hanggliding&sort_by=rating',
 'https://api.yelp.com/v3/businesses/search?latitude=38.0464066&longitude=-84.4970393&radius=40000&categories=horsebackriding&sort_by=rating',
 'https://api.yelp.com/v3/businesses/search?latitude=38.0464066&longitude=-84.4970393&radius=40000&categories=hiking&sort_by=rating',
 'https://api.yelp.com/v3/businesses/search?latitude=38.0464066&longitude=-84.4970393&radius=40000&categories=hot_air_balloons&sort_by=rating',
 'https://api.yelp.com/v3/businesses/search?latitude=38.0464066&longitude=-84.4970393&radius=40000&categories=paddleboarding&sort_by=rating',
 'https://api.yelp.com/v3/businesses/search?latitude=38.0464066&longitude=-84.4970393&radius=40000&categories=parasailing&sort_by=rating',
 'https://api.yelp.com/v

In [102]:
# Create API Calls (separate the blank list to a new cell so we can do numerous calls if needed)
data_list=[]

In [106]:
record_count = 1
set_count = 1
url_errors = []
location_len = len(locations_df['locationID'])

for i in range(len(url_list)):
    if (i%len(categories) == 0 and i >=len(categories)):
#         set_count += 1
        record_count = 1
        
    print("processing Record %s | Location: %s / %s" %(record_count,location_ID_list[i],location_len))    
    
    record_count += 1

    response = requests.get(url_list[i],headers=headers)
    data = response.json()
    try:
        # Calculate Average Rating and Review Count
        rating_sum = 0
        review_sum = 0
        # setting the range with an initial number so it can be changed if an error pops up midway through
        for x in range(len(data["businesses"])):
            rating_sum += data["businesses"][x]["rating"]
            review_sum += data["businesses"][x]["review_count"]
        rating_avg = rating_sum / len(data["businesses"])
        review_avg = review_sum / len(data["businesses"])
    
        # Add the desired data to a dictionary
        data_dict = {
            "locationID" : location_ID_list[i],
            "latitude" : lat_lon_list[i][0] ,
            "longitude": lat_lon_list[i][1],
            "category" : cat_list[i],
            "results_total" : len(data["businesses"]),
            "avg_rating" : rating_avg,
            "avg_review_count" : review_avg
        }
        # Append the dictionary to a list
        data_list.append(data_dict)
        time.sleep(1)
    except:
        print("********URL error: Skipping...********")
        print(data)
        try:
            url_errors.append({"locationID":location_ID_list[i],"url":url_list[i],"latitude":data["region"]["center"]["latitude"],
                           "longitude":data["region"]["center"]["longitude"]})
        except:
            url_errors.append({"locationID":location_ID_list[i],"url":url_list[i],"latitude":"NAN",
                           "longitude":"NAN"})
        pass

processing Record 1 | Location: 0 / 228
processing Record 2 | Location: 0 / 228
********URL error: Skipping...********
{'businesses': [], 'total': 0, 'region': {'center': {'longitude': -84.4970393, 'latitude': 38.0464066}}}
processing Record 3 | Location: 0 / 228
processing Record 4 | Location: 0 / 228
processing Record 5 | Location: 0 / 228
********URL error: Skipping...********
{'businesses': [], 'total': 0, 'region': {'center': {'longitude': -84.4970393, 'latitude': 38.0464066}}}
processing Record 6 | Location: 0 / 228
processing Record 7 | Location: 0 / 228
********URL error: Skipping...********
{'businesses': [], 'total': 0, 'region': {'center': {'longitude': -84.4970393, 'latitude': 38.0464066}}}
processing Record 8 | Location: 0 / 228
********URL error: Skipping...********
{'businesses': [], 'total': 0, 'region': {'center': {'longitude': -84.4970393, 'latitude': 38.0464066}}}
processing Record 9 | Location: 0 / 228
********URL error: Skipping...********
{'businesses': [], 'total

KeyboardInterrupt: 

In [18]:
# convert the list of dictionaries to a dataframe
yelp_df = pd.DataFrame(data_list)   
yelp_df.head()

Unnamed: 0,locationID,latitude,longitude,category,results_total,avg_rating,avg_review_count
0,0,38.046407,-84.497039,beaches,2,4.75,3.5
1,0,38.046407,-84.497039,horsebackriding,16,4.5,13.375
2,0,38.046407,-84.497039,hiking,8,4.5,8.5
3,0,38.046407,-84.497039,paddleboarding,2,4.75,16.5
4,0,38.046407,-84.497039,ziplining,20,5.0,7.9


In [19]:
# Convert the list of errors into a dataframe to view any similarities
errors_df = pd.DataFrame(url_errors)
errors_df.head()

Unnamed: 0,locationID,url,latitude,longitude
0,0,https://api.yelp.com/v3/businesses/search?lati...,38.046407,-84.497039
1,0,https://api.yelp.com/v3/businesses/search?lati...,38.046407,-84.497039
2,0,https://api.yelp.com/v3/businesses/search?lati...,38.046407,-84.497039
3,0,https://api.yelp.com/v3/businesses/search?lati...,38.046407,-84.497039
4,0,https://api.yelp.com/v3/businesses/search?lati...,38.046407,-84.497039


# Clean the Yelp Data and Prepare for Export

In [41]:
# determine if there are any missing reviews. All 14 categories are accounted for. No extraneous values
yelp_df["category"].value_counts()

ziplining           168
tours               145
massage             141
spas                140
beaches             125
hiking              124
horsebackriding     115
sailing              77
paddleboarding       57
hot_air_balloons     47
snorkeling           29
hanggliding          19
hotsprings           13
parasailing          11
Name: category, dtype: int64

In [42]:
# Check how many categories successfully returned for each of the locations that returned.
# We are missing many locations, as the length should be 227
yelp_df["locationID"].nunique()

168

In [43]:
# Check how many values are located for each location
# None of the locations have all 14 categories
yelp_df['locationID'].value_counts()

76     13
1      13
119    13
139    13
127    13
       ..
150     1
91      1
97      1
101     1
226     1
Name: locationID, Length: 168, dtype: int64

In [44]:
# Add total ranking column
yelp_df['total_rating'] = yelp_df['avg_rating'] * yelp_df['avg_review_count']
yelp_df

Unnamed: 0,locationID,latitude,longitude,category,results_total,avg_rating,avg_review_count,total_rating
0,0,38.046407,-84.497039,beaches,2,4.750000,3.500000,16.625000
1,0,38.046407,-84.497039,horsebackriding,16,4.500000,13.375000,60.187500
2,0,38.046407,-84.497039,hiking,8,4.500000,8.500000,38.250000
3,0,38.046407,-84.497039,paddleboarding,2,4.750000,16.500000,78.375000
4,0,38.046407,-84.497039,ziplining,20,5.000000,7.900000,39.500000
...,...,...,...,...,...,...,...,...
1206,225,46.201756,6.146601,ziplining,20,5.000000,7.850000,39.250000
1207,225,46.201756,6.146601,spas,15,4.033333,1.733333,6.991111
1208,225,46.201756,6.146601,massage,20,4.625000,1.150000,5.318750
1209,225,46.201756,6.146601,tours,16,3.812500,2.812500,10.722656


In [45]:
# Separate out all of the values that have less than 5 actual activities returned successfully
value_counts = pd.DataFrame(yelp_df[["locationID","latitude","longitude"]].groupby(["locationID"]).size().sort_values(ascending=False))
value_counts = value_counts.rename(columns={0:"activityCount","longitude":"test"})
value_counts = value_counts.loc[(value_counts["activityCount"] < 5)]
value_counts = value_counts.reset_index(inplace=False)
value_counts

Unnamed: 0,locationID,activityCount
0,21,4
1,40,4
2,9,4
3,51,3
4,52,3
5,197,2
6,13,2
7,42,2
8,55,2
9,8,2


In [52]:
# # Filter the list of results to only include locations with five or more actual activities
# location_list = value_counts["locationID"].tolist()
# five_activities = yelp_df[~yelp_df["locationID"].isin(location_list)]
# five_activities.reset_index(inplace=True,drop=True)
# five_activities

Unnamed: 0,locationID,latitude,longitude,category,results_total,avg_rating,avg_review_count,total_rating
0,0,38.046407,-84.497039,beaches,2,4.750000,3.500000,16.625000
1,0,38.046407,-84.497039,horsebackriding,16,4.500000,13.375000,60.187500
2,0,38.046407,-84.497039,hiking,8,4.500000,8.500000,38.250000
3,0,38.046407,-84.497039,paddleboarding,2,4.750000,16.500000,78.375000
4,0,38.046407,-84.497039,ziplining,20,5.000000,7.900000,39.500000
...,...,...,...,...,...,...,...,...
1156,225,46.201756,6.146601,paddleboarding,1,4.500000,2.000000,9.000000
1157,225,46.201756,6.146601,ziplining,20,5.000000,7.850000,39.250000
1158,225,46.201756,6.146601,spas,15,4.033333,1.733333,6.991111
1159,225,46.201756,6.146601,massage,20,4.625000,1.150000,5.318750


In [53]:
# # Create a dataframe of all null values in the yelp dataframe
# null_df = pd.merge(five_activities,locations_df,how="right",on=["latitude","longitude","locationID"])
# null_df = null_df[null_df["category"].isna()]
# null_df

Unnamed: 0,locationID,latitude,longitude,category,results_total,avg_rating,avg_review_count,total_rating,City,State/Province,Country
1161,2,-19.996972,-157.785871,,,,,,Cook Islands,,Cook Islands
1162,8,-36.718805,142.220951,,,,,,Grampians,Victoria,Australia
1163,9,49.504175,-115.062867,,,,,,Fernie,British Columbia,Canada
1164,10,42.648736,18.094680,,,,,,Dubrovnik,Croatia,Croatia
1165,13,17.223472,-61.955461,,,,,,Antigua and Barbuda,,Antigua and Barbuda
...,...,...,...,...,...,...,...,...,...,...,...
1245,217,23.588202,58.382945,,,,,,Muscat,,Oman
1246,221,29.379653,47.973417,,,,,,Kuwait City,,Kuwait
1247,223,33.895920,35.478430,,,,,,Beirut,,Lebanon
1248,226,35.898982,14.513676,,,,,,Valletta,,Malta


In [54]:
# null_df = null_df[['locationID','City','State/Province','Country']]
# null_df.reset_index(inplace=True, drop=True)
# null_df

Unnamed: 0,locationID,City,State/Province,Country
0,2,Cook Islands,,Cook Islands
1,8,Grampians,Victoria,Australia
2,9,Fernie,British Columbia,Canada
3,10,Dubrovnik,Croatia,Croatia
4,13,Antigua and Barbuda,,Antigua and Barbuda
...,...,...,...,...
84,217,Muscat,,Oman
85,221,Kuwait City,,Kuwait
86,223,Beirut,,Lebanon
87,226,Valletta,,Malta


# Export Cleaned CSVs

In [26]:
# Export list of all locations with 5+ activities
yelp_df.to_csv("../Resources/yelpdata.csv")

In [27]:
# Export list of all locations with 0-4 activities
null_df.to_csv("../Resources/missingActivities.csv")