## This notebook processes raw Foursquare data into a DataFrame for later urban profiling, trip arrival estimation, zone correlation, and dynamic OD estimation

In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize    
from tqdm import tqdm
import os
import pickle
import geopandas as gpd
import requests
from shapely.geometry import Point
import foursquare

### We need to collect POI category information from the Foursquare Venue Hierarchy

In [2]:
cd C:\Users\Nicholas\Dropbox\2017.09.19 18PKDD Urban Zonal Correlation\github page

C:\Users\Nicholas\Dropbox\2017.09.19 18PKDD Urban Zonal Correlation\github page


In [3]:
# client_id & client_secret are obtained from the Foursquare API home by registering a client.
client_id = ''
client_secret = ''

client = foursquare.Foursquare(client_id=client_id, client_secret=client_secret)

# Fetch the category tree BEFORE opening the output file: if the request fails,
# an existing 'foursquare-categories.json' is not truncated to an empty file.
category_tree = client.venues.categories()

# The context manager closes the handle even when serialisation or the write raises.
with open('foursquare-categories.json', 'w') as out:
    json.dump(category_tree, out, sort_keys=True, indent=4)

In [4]:
# Read the cached category tree back from disk into `datastore`.
filename = 'foursquare-categories.json'
with open(filename) as category_file:
    datastore = json.loads(category_file.read())

### The following code maps every category to one chosen level of the hierarchy. As far as I know, the Foursquare hierarchy contains at least five levels. For my research, I only map to the first level (9 types).

In [5]:
def _map_ids_to_top_level(categories):
    """Map every category id in the tree to the name of its top-level ancestor.

    The tree is walked to whatever depth it actually has, so sub-categories
    nested deeper than a fixed number of levels are not silently skipped.

    Parameters
    ----------
    categories : list of dict
        Top-level category nodes. Each node provides 'id' and 'name'; child
        nodes, if any, are listed under 'categories'.

    Returns
    -------
    dict
        {category id: name of the top-level category it belongs to}; the
        top-level categories map to their own name.
    """
    mapping = {}
    for top in categories:
        top_name = top['name']
        pending = [top]
        while pending:
            node = pending.pop()
            mapping[node['id']] = top_name
            # Push children reversed so they are visited in their listed
            # order (pre-order), matching a nested-loop traversal.
            pending.extend(reversed(node.get('categories', [])))
    return mapping


hierarchical = _map_ids_to_top_level(datastore['categories'])

In [6]:
cd C:\Users\Nicholas\Dropbox\2017.09.19 18PKDD Urban Zonal Correlation\github page\sampleData

C:\Users\Nicholas\Dropbox\2017.09.19 18PKDD Urban Zonal Correlation\github page\sampleData


In [7]:
# Folder that contains all the Foursquare data. In this study the data is archived in JSON format.
path_to_json = '.'

# os.listdir returns names in arbitrary order; sorting makes the order in which
# files are processed (and therefore the row order of the final table) reproducible.
json_files = sorted(name for name in os.listdir(path_to_json) if name.endswith('.json'))

In [8]:
# Initialise the 'firehose' pickle with an empty check-in table, then load it back
# so `df` refers to exactly what is stored on disk.
pd.DataFrame(columns=['time', 'lat', 'lng', 'cate']).to_pickle('firehose')
df = pd.read_pickle('firehose')

In [9]:
# Bounding box of the study area, stored as [lng range, lat range].
# In this study it is the Manhattan Island area of New York City.
bb = [
    [-74.0516, -73.9038],  # bb[0]: longitude bounds
    [40.6818, 40.8832],    # bb[1]: latitude bounds
]

In [10]:
# Build the check-in table (time, lat, lng, cate) from every raw JSON file.
#
# Cleaned per-file tables are collected in memory and the 'firehose' pickle is
# written once at the end, instead of re-reading and re-writing the whole
# pickle for every input file.
frames = [pd.read_pickle('firehose')]  # start from whatever is already stored
for js in json_files:
    print(js)
    # Resolve the name against the data folder rather than the working directory.
    json_text = pd.read_json(os.path.join(path_to_json, js), lines=True)
    if json_text.empty:
        continue

    # createdAt (seconds) shifted by timeZoneOffset * 60.
    # NOTE(review): presumably the offset is in minutes, giving local time - confirm.
    time = pd.to_datetime(
        json_text['createdAt'] + json_text['timeZoneOffset'] * 60, unit='s'
    ).rename("time")

    location = json_normalize(json_text['venue'])

    # Keep only coordinates inside the study-area bounding box; rows that fall
    # outside end up with NaN after the column-wise concat and are dropped below.
    lat = location['location.lat'].rename("lat")
    lat = lat[lat.between(bb[1][0], bb[1][1])]
    lng = location['location.lng'].rename("lng")
    lng = lng[lng.between(bb[0][0], bb[0][1])]

    # First listed category of each venue, mapped to its top-level category.
    # The original row labels are kept on purpose: rebuilding this column from
    # the filtered values would renumber it from 0 and pair categories with the
    # wrong check-ins as soon as one venue has no category.
    first_category = location['categories'].str[0].dropna()
    cate = (
        first_category
        .map(lambda category: category.get('id'))
        .map(hierarchical)
        .rename("cate")
    )

    tmp = pd.concat([time, lat, lng, cate], axis=1)
    tmp = tmp.dropna().reset_index(drop=True)
    frames.append(tmp)

# Empty tables carry no rows and would only blur the resulting dtypes.
non_empty = [frame for frame in frames if not frame.empty]
if non_empty:
    # ignore_index gives every record a unique row label; per-file labels would
    # otherwise repeat (0, 1, 2, ... for each file).
    df = pd.concat(non_empty, axis=0, ignore_index=True)
else:
    df = frames[0]

### now everything is stored in one dataframe and one pickle file
df.to_pickle('firehose')

firehose.20170101-00.json


In [11]:
df.head() ### every record now carries its first-level (top-level) category in 'cate'

Unnamed: 0,time,lat,lng,cate
0,2017-01-01 00:02:56,40.747904,-73.986685,Food
1,2017-01-01 00:03:04,40.705628,-73.921426,Shop & Service
2,2017-01-01 00:03:31,40.751739,-73.986051,Travel & Transport
3,2017-01-01 00:03:31,40.751739,-73.986051,Shop & Service
4,2017-01-01 00:03:34,40.743954,-73.99694,Food


### Now we need to do a spatial join to get aggregate-level check-ins. In this study, we use the census-tract level as the spatial resolution.

In [12]:
# One shapely Point (x = lng, y = lat) per record, in row order.
# Built by zipping the two columns instead of a row-wise DataFrame.apply: this is
# much faster on large tables and still yields a (possibly empty) list of Points
# when `df` has no rows, where apply(axis=1) would return an empty DataFrame.
geom = [Point(lng, lat) for lng, lat in zip(df['lng'], df['lat'])]

In [13]:
# Attach the point geometries to the table, turning it into a GeoDataFrame.
df = gpd.GeoDataFrame(df, geometry=geom)
# Tag the coordinates as EPSG:4326 (WGS84 longitude/latitude).
df.crs = {'init' :'epsg:4326'}
df.head()

Unnamed: 0,time,lat,lng,cate,geometry
0,2017-01-01 00:02:56,40.747904,-73.986685,Food,POINT (-73.9866845613309 40.74790388657718)
1,2017-01-01 00:03:04,40.705628,-73.921426,Shop & Service,POINT (-73.92142641765564 40.70562774921207)
2,2017-01-01 00:03:31,40.751739,-73.986051,Travel & Transport,POINT (-73.986051 40.751739)
3,2017-01-01 00:03:31,40.751739,-73.986051,Shop & Service,POINT (-73.986051 40.751739)
4,2017-01-01 00:03:34,40.743954,-73.99694,Food,POINT (-73.99693994448383 40.74395351562067)


In [14]:
cd C:\Users\Nicholas\Dropbox\2017.09.19 18PKDD Urban Zonal Correlation\github page

C:\Users\Nicholas\Dropbox\2017.09.19 18PKDD Urban Zonal Correlation\github page


In [15]:
# Load the zone polygons and keep only the zone id and its geometry.
geojson_file = 'nycGeo.geojson'
TAZ = gpd.read_file(geojson_file)
TAZ = TAZ[['TAZ', 'geometry']]
TAZ.head()

Unnamed: 0,TAZ,geometry
0,1,"(POLYGON ((-74.01683300000001 40.705454, -74.0..."
1,2,"(POLYGON ((-74.01317400000001 40.719443, -74.0..."
2,3,"(POLYGON ((-74.027877 40.684782, -74.0278 40.6..."
3,4,"(POLYGON ((-74.008641 40.703439, -74.009623 40..."
4,5,"(POLYGON ((-74.005233 40.705619, -74.005645 40..."


In [16]:
# Spatially join each check-in point to the zone polygon it lies within, then
# keep only the time, category and zone id with a fresh 0..n-1 row index.
dfTAZ = (
    gpd.sjoin(df, TAZ, op='within')[['time', 'cate', 'TAZ']]
    .reset_index(drop=True)
)

In [17]:
# Preview the zone-tagged check-ins (columns: time, cate, TAZ).
dfTAZ.head()

Unnamed: 0,time,cate,TAZ
0,2017-01-01 00:02:56,Food,95
1,2017-01-01 00:07:38,Professional & Other Places,95
2,2017-01-01 00:19:20,Food,95
3,2017-01-01 00:19:25,Food,95
4,2017-01-01 00:21:18,Nightlife Spot,95


In [18]:
# Persist the zone-tagged check-ins to the 'firehoseTAZ' pickle file.
dfTAZ.to_pickle('firehoseTAZ')

### Now the DataFrame has been processed.