# API and web scraping for Foursquare

Foursquare is a local search-and-discovery mobile app. The app provides personalized recommendations of places to go near a user's current location, based on the user's previous browsing history and check-in history.

This data was collected using the venues endpoint of the Foursquare API. It contains detailed data about the venues in Riyadh city (Nov-2019), including the name, address, likes, rating, categories and more.

See here for more details about venues endpoint : https://developer.foursquare.com/docs/api/venues/details


There is a lot you can do with this data set. You can see what types of venues get more ratings and likes, which categories of venues are more likely to be visited at a certain time, or study the overall behavior of people in Riyadh city.

In [1]:
#import
import pandas as pd
import foursquare

In [15]:
# Construct the client object (the credentials are blank here and must be filled in)
credentials = {
    'client_id': '',
    'client_secret': '',
    'redirect_uri': 'http://fondu.com/oauth/authorize',
}
client = foursquare.Foursquare(**credentials)

# Build the authorization url for your app
auth_uri = client.oauth.auth_url()

In [3]:
# "lat,lng" grid points used to cover Riyadh city.
# Each point is listed once: '24.6679739,46.7319644' used to appear twice,
# which only produced redundant API requests (the collected ids are
# de-duplicated afterwards, so the repeated point added nothing).
lls = [
    '24.774265,46.738586',
    '24.8260967,46.7484399',
    '24.8095071,46.6729062',
    '24.7814533,46.6035979',
    '24.7988011,46.7850383',
    '24.7619308,46.6933349',
    '24.7355036,46.6260983',
    '24.7611459,46.8004901',
    '24.7128417,46.7100004',
    '24.6854466,46.6243770',
    '24.7051727,46.8215656',
    '24.6679739,46.7319644',
    '24.6298277,46.7387158',
]

# Sections of places passed as the 'section' parameter of the explore endpoint
sections = ['food', 'drinks', 'coffee', 'shops', 'arts', 'outdoors', 'sights']

In [231]:
# Collect the venue ids returned by the explore endpoint for every
# (section, location) combination, at most 50 results per request
venues_ids = []
for section in sections:
    for ll in lls:
        query = {'section': section, 'll': ll, 'limit': '50'}
        explored = client.venues.explore(params=query)
        # only the first result group is read
        venues_ids.extend(item['venue']['id'] for item in explored['groups'][0]['items'])

In [232]:
# Delete duplicates: dict keys are unique and keep first-seen order
unique_ids = dict.fromkeys(venues_ids)
venues_ids = list(unique_ids)

In [233]:
# Second pass: ask the search endpoint for 'coffee' around every grid point
venues_ids2 = []
for ll in lls:
    query = {'query': 'coffee', 'll': ll, 'limit': '50'}
    found = client.venues.search(params=query)
    venues_ids2.extend(match['id'] for match in found['venues'])

In [234]:
# Drop repeated ids from the search results while keeping first-seen order
venues_ids2 = [*dict.fromkeys(venues_ids2)]

In [235]:
# ids returned by the search endpoint that the explore pass did not find.
# Membership is tested against a set so each lookup is O(1) instead of
# scanning the whole venues_ids list for every candidate.
known_ids = set(venues_ids)
missing = [venue_id for venue_id in venues_ids2 if venue_id not in known_ids]

In [236]:
# Append the search-only ids after the ids found by explore
venues_ids = [*venues_ids, *missing]

In [252]:
# Create a single-column dataframe holding the venue ids
ids_df = pd.DataFrame(venues_ids, columns=['venues_id'])

In [254]:
# Save the venue ids to a csv file (without the index) so they can be reused later
ids_df.to_csv(path_or_buf='venues_ids.csv', index=False)

In [4]:
# Errors that mean "this field is absent or malformed" for one venue.
# Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
_MISSING_FIELD_ERRORS = (LookupError, TypeError)


def _field(*keys):
    """Return an extractor that reads ``venue[keys[0]][keys[1]]...``."""
    def extract(venue):
        value = venue
        for key in keys:
            value = value[key]
        return value
    return extract


def _quoted_names(items):
    """Concatenate the 'name' of every item, each wrapped in double quotes."""
    return ''.join('"' + item['name'] + '"' for item in items)


def _category_names(venue):
    """Return the venue's category names as '"A""B"...'."""
    return _quoted_names(venue['categories'])


def _list_names(venue):
    """Return the names of all lists the venue appears on, over all groups."""
    return _quoted_names(
        item for group in venue['listed']['groups'] for item in group['items']
    )


def _timeframes(key):
    """Return an extractor rendering ``venue[key]['timeframes']`` as text.

    Each timeframe becomes '"<days> <renderedTime> <renderedTime> ... "'.
    """
    def extract(venue):
        rendered = []
        for frame in venue[key]['timeframes']:
            slots = ''.join(slot['renderedTime'] + ' ' for slot in frame['open'])
            rendered.append('"' + frame['days'] + ' ' + slots + '"')
        return ''.join(rendered)
    return extract


def _attribute_pairs(venue):
    """Return every attribute as '"<displayName> : <displayValue>"'."""
    return ''.join(
        '"' + item['displayName'] + ' : ' + item['displayValue'] + '"'
        for group in venue['attributes']['groups']
        for item in group['items']
    )


# Output columns, in order, with the extractor that fills each of them
_VENUE_COLUMNS = (
    ('venues_id', _field('id')),
    ('name', _field('name')),
    ('address', _field('location', 'address')),
    ('crossStreet', _field('location', 'crossStreet')),
    ('lat', _field('location', 'lat')),
    ('lng', _field('location', 'lng')),
    ('cc', _field('location', 'cc')),
    ('city', _field('location', 'city')),
    ('country', _field('location', 'country')),
    ('categories', _category_names),
    ('verified', _field('verified')),
    ('tip_count', _field('stats', 'tipCount')),
    ('url', _field('url')),
    ('price_tier', _field('price', 'tier')),
    ('price_message', _field('price', 'message')),
    ('price_currency', _field('price', 'currency')),
    ('likes', _field('likes', 'count')),
    ('rating', _field('rating')),
    ('photos', _field('photos', 'count')),
    ('description', _field('description')),
    ('lists', _list_names),
    ('hours', _timeframes('hours')),
    ('popular_hours', _timeframes('popular')),
    ('attributes', _attribute_pairs),
)


def get_venues(venues_ids, api_client=None):
    """Fetch the details of each venue id and return them as a dataframe.

    One request is made per id through ``api_client.venues(_id)``; the
    'venue' entry of each response is flattened into one row.

    Parameters
    ----------
    venues_ids : iterable
        Venue ids to look up, processed in order.
    api_client : optional
        Object whose ``venues(venue_id)`` call returns a mapping with a
        'venue' entry. Defaults to the module-level ``client``.

    Returns
    -------
    pandas.DataFrame
        One row per id with the columns venues_id, name, address,
        crossStreet, lat, lng, cc, city, country, categories, verified,
        tip_count, url, price_tier, price_message, price_currency, likes,
        rating, photos, description, lists, hours, popular_hours and
        attributes. A field that is missing or malformed for a venue is
        stored as an empty string.

    Errors raised by the request itself are not caught and abort the
    whole batch.
    """
    if api_client is None:
        api_client = client

    columns = {column: [] for column, _ in _VENUE_COLUMNS}

    for _id in venues_ids:
        venue = api_client.venues(_id)['venue']

        for column, extract in _VENUE_COLUMNS:
            try:
                value = extract(venue)
            except _MISSING_FIELD_ERRORS:
                # keep the row aligned: every column gets exactly one value
                value = ''
            columns[column].append(value)

    return pd.DataFrame(columns)

In [5]:
# Read back the csv file of ids that was saved earlier
ids_df = pd.read_csv(filepath_or_buffer='venues_ids.csv')

In [6]:
# Pull the id column out of the dataframe as a Series
all_venues_ids = ids_df.loc[:, 'venues_id']

In [7]:
# Number of ids to fetch. The Foursquare API has a limit of 99,500 regular
# API calls per day and 500 premium API calls per day.
# NOTE(review): presumably each venue-details request counts as a premium
# call, which is why the download is split into batches of 500 — confirm.
len(all_venues_ids)

2618

In [12]:
# The ids are split into batches of 500; 6 days are needed to get all the data
batch_size = 500
(venues_ids_1,
 venues_ids_2,
 venues_ids_3,
 venues_ids_4,
 venues_ids_5) = (all_venues_ids[start:start + batch_size]
                  for start in range(0, 5 * batch_size, batch_size))
# the last batch takes whatever is left after the first 2500 ids
venues_ids_6 = all_venues_ids[5 * batch_size:]

In [13]:
# Sanity check: number of ids in each batch
tuple(
    len(batch)
    for batch in (venues_ids_1, venues_ids_2, venues_ids_3,
                  venues_ids_4, venues_ids_5, venues_ids_6)
)

(500, 500, 500, 500, 500, 118)

In [311]:
# Batch 1: fetch the venue details and write them to disk right away
venues_1 = get_venues(venues_ids_1)
venues_1.to_csv(path_or_buf='venues_1.csv', index=False)

In [39]:
# Print a summary of the first batch: columns, non-null counts and dtypes
venues_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 24 columns):
venues_id         500 non-null object
name              500 non-null object
address           443 non-null object
crossStreet       366 non-null object
lat               500 non-null float64
lng               500 non-null float64
cc                500 non-null object
city              481 non-null object
country           500 non-null object
categories        500 non-null object
verified          500 non-null bool
tip_count         500 non-null int64
url               360 non-null object
price_tier        485 non-null object
price_message     485 non-null object
price_currency    485 non-null object
likes             500 non-null int64
rating            500 non-null float64
photos            500 non-null int64
description       309 non-null object
lists             466 non-null object
hours             360 non-null object
popular_hours     450 non-null object
attributes        497 non-null object

In [16]:
# Batch 2: fetch the venue details and write them to disk right away
venues_2 = get_venues(venues_ids_2)
venues_2.to_csv(path_or_buf='venues_2.csv', index=False)

In [25]:
# Batch 3: fetch the venue details and write them to disk right away
venues_3 = get_venues(venues_ids_3)
venues_3.to_csv(path_or_buf='venues_3.csv', index=False)

In [None]:
# Batch 4: fetch the venue details and write them to disk right away
venues_4 = get_venues(venues_ids_4)
venues_4.to_csv(path_or_buf='venues_4.csv', index=False)

In [None]:
# Batch 5: fetch the venue details and write them to disk right away
venues_5 = get_venues(venues_ids_5)
venues_5.to_csv(path_or_buf='venues_5.csv', index=False)

In [None]:
# Batch 6 (the remainder): fetch the venue details and write them to disk right away
venues_6 = get_venues(venues_ids_6)
venues_6.to_csv(path_or_buf='venues_6.csv', index=False)

In [26]:
# Load the results accumulated so far.
# NOTE(review): 'venues_all.csv' is not written anywhere in this notebook;
# presumably it holds batches 1 and 2 combined — confirm.
venues_all = pd.read_csv(filepath_or_buffer='venues_all.csv')

In [27]:
# (rows, columns) of the data loaded from venues_all.csv
venues_all.shape

(1000, 24)

In [28]:
# Stack batch 3 under the earlier results and renumber the rows from 0
all_venues = pd.concat([venues_all, venues_3], ignore_index=True)

In [29]:
# (rows, columns) after appending batch 3
all_venues.shape

(1500, 24)

In [30]:
# Save the combined venues (without the index) to a csv file
all_venues.to_csv(path_or_buf='venues_1500.csv', index=False)