## Imports

In [3]:
#data import
import wget
import urllib, json
import zipfile

#string regex
import re

#time
import pytz as tz
from datetime import datetime

#data explore
import numpy as np
np.random.seed(1337)

import pandas as pd


## Data Acquisition
### The Gowalla Dataset

The Gowalla dataset records anonymized users checking in at specific geolocated venues at specific timestamps.   
The dataset [1] is available from https://snap.stanford.edu/data/loc-gowalla.html

[1] E. Cho, S. A. Myers, J. Leskovec. Friendship and Mobility: User Movement in Location-Based Social Networks. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2011.


In [4]:
import os
import shutil

# Directory that holds every downloaded and derived data file.
datadir = './data'
# exist_ok=True makes this a no-op when the directory is already present,
# without a separate existence check that could race with its creation.
# If the path exists but is not a directory, this raises instead of
# silently continuing.
os.makedirs(datadir, exist_ok=True)

In [5]:
# users events (check-ins)

filename = os.path.join(datadir, 'gowalla_events.csv.gz')
url = 'https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz'

# Fetch the archive only when no local copy is present yet.
if not os.path.isfile(filename):
    wget.download(url, out=filename)

# The archive is deliberately left gzipped: pandas infers the compression
# type from the filename extension when it reads the csv.

In [6]:
# users graph (edges between users)

filename = os.path.join(datadir, 'gowalla_users.csv.gz')
url = 'https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz'

# Fetch the archive only when no local copy is present yet.
if not os.path.isfile(filename):
    wget.download(url, out=filename)

# The archive is deliberately left gzipped: pandas infers the compression
# type from the filename extension when it reads the csv.

The venue names [2] for New York City are fetched from:   
https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/locrec/gowalla-dataset.zip   

[2] The project is being developed in the context of the SInteliGIS project financed by the Portuguese Foundation for Science and Technology (FCT) through project grant PTDC/EIA-EIA/109840/2009. 


In [7]:
# Venues table: download the zip archive, pull out the spots file and rewrite
# it into a tab-separated file with separate coordinate columns.
url = 'https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/locrec/gowalla-dataset.zip'

src_filename = os.path.join(datadir, 'gowalla-dataset.zip')
trg_filename = os.path.join(datadir, 'gowalla_venues.csv')

# The preprocessed target doubles as the cache marker: when it exists, the
# whole download/extract/preprocess pipeline is skipped.
if not os.path.isfile(trg_filename):
    # download
    wget.download(url, out=src_filename)

    # extract
    # The context manager closes the archive handle before the archive is
    # deleted in the cleanup step; an open handle leaks a file descriptor
    # and prevents deletion on platforms that lock open files.
    with zipfile.ZipFile(src_filename, 'r') as zf:
        extracted_file = zf.extract('gowalla-dataset/spots.txt', datadir)

    # Copy the extracted file directly into datadir, then remove the
    # directory that the extraction created for the archive member.
    tmp_filename = shutil.copy(extracted_file, datadir)
    shutil.rmtree(os.path.dirname(extracted_file))

    # preprocess
    # Matches 'POINT(<a> <b>)' with two decimal numbers and rewrites it as
    # '<b>\t<a>', i.e. the two coordinates are swapped and tab-separated.
    regex = re.compile(r'POINT\((\-?\d+\.\d+)\s+(\-?\d+\.\d+)\)')

    with open(tmp_filename, "r") as src, \
         open(trg_filename, "w") as dst:

        # Skip the first line (presumably a header row; the result is later
        # read with header=None).
        next(src)
        for line in src:
            dst.write(regex.sub(r'\2\t\1', line))

    # cleanup
    os.remove(src_filename)
    os.remove(tmp_filename)
    

## Data preparation

### Loading the events data
This dataset contains just 5 columns. Although the dataset contains datapoints spread out all over the world, we are going to limit our analysis to the city of New York. Let's start first by importing the data in a pandas dataframe.

In [16]:
# Compressed check-ins file; pandas decompresses it while reading.
filename = os.path.join(datadir, 'gowalla_events.csv.gz')

# The file carries no header row, so the column names are given explicitly.
dfe = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['uid', 'timestamp', 'lat', 'lon', 'vid'],
)
dfe.head(3)

Unnamed: 0,uid,timestamp,lat,lon,vid
0,0,2010-10-12T00:21:28Z,40.643885,-73.782806,23261
1,0,2010-10-11T20:21:20Z,40.741374,-73.988105,16907
2,0,2010-10-11T20:20:42Z,40.741388,-73.989455,12973


### Filtering New York City venues
Now let's filter the dataset to the area of New York City.   

The Flickr API provides us with the bounding box of the city.    
`https://www.flickr.com/places/info/2459115`

`
Bounding Box:	-74.2589, 40.4774, -73.7004, 40.9176
Centroid:	    -74.0071, 40.7146
`

In [17]:
# Bounding box of New York City, in degrees.
lon_min = -74.2589
lat_min = 40.4774
lon_max = -73.7004
lat_max = 40.9176

# Keep only the events lying strictly inside the box.
dfe = dfe.loc[
    dfe['lon'].gt(lon_min)
    & dfe['lon'].lt(lon_max)
    & dfe['lat'].gt(lat_min)
    & dfe['lat'].lt(lat_max)
]

dfe.head(3)

Unnamed: 0,uid,timestamp,lat,lon,vid
0,0,2010-10-12T00:21:28Z,40.643885,-73.782806,23261
1,0,2010-10-11T20:21:20Z,40.741374,-73.988105,16907
2,0,2010-10-11T20:20:42Z,40.741388,-73.989455,12973


In [18]:
# Overwrite with nyc only (for the tutorial - smaller file)
# filename = os.path.join(datadir, 'gowalla_events.csv.gz')
# dfe.to_csv(filename, sep='\t', index=False, header=None, compression='gzip')

### Parse and localize Date and Time

In [19]:
def parse_datetime(s):
    """Turn a UTC timestamp string into a New York local datetime.

    Parameters
    ----------
    s : str
        Timestamp in the form '%Y-%m-%dT%H:%M:%SZ',
        e.g. '2010-10-12T00:21:28Z'.

    Returns
    -------
    datetime
        Timezone-aware datetime expressed in the "America/New_York" zone.

    Raises
    ------
    ValueError
        If `s` does not match the expected format (raised by `strptime`).
    """
    naive = datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ')
    # Tag the naive value as UTC first, then convert it to local time.
    aware_utc = tz.utc.localize(naive)
    return aware_utc.astimezone(tz.timezone("America/New_York"))

# Replace the raw UTC strings with localized datetimes in a new `ts` column.
# The guard keeps the cell re-runnable: once `timestamp` has been dropped, a
# second execution is a no-op instead of failing with a KeyError.
if 'timestamp' in dfe.columns:
    dfe = (
        dfe
        # The function is passed directly; no lambda wrapper is needed.
        .assign(ts=dfe['timestamp'].apply(parse_datetime))
        .drop('timestamp', axis=1)
    )

dfe.head(3)

Unnamed: 0,uid,lat,lon,vid,ts
0,0,40.643885,-73.782806,23261,2010-10-11 20:21:28-04:00
1,0,40.741374,-73.988105,16907,2010-10-11 16:21:20-04:00
2,0,40.741388,-73.989455,12973,2010-10-11 16:20:42-04:00


### Loading the venues data
This dataset contains just 4 columns. Let's start first by importing the data in a pandas dataframe.

In [20]:
# Preprocessed, tab-separated venues file.
filename = os.path.join(datadir, 'gowalla_venues.csv')

# The file carries no header row, so the column names are given explicitly.
dfv = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['vid', 'name', 'lat', 'lon'],
)
dfv.head(3)

Unnamed: 0,vid,name,lat,lon
0,1391604,Conference House Park,40.501759,-74.252343
1,1391611,Almer G. Russell Pavilion,40.502265,-74.254264
2,3612422,Conference House,40.500064,-74.249042


### Loading the user graph data
This dataset contains 2 columns, namely who follows whom.

In [21]:
# Compressed user-graph file; pandas decompresses it while reading.
filename = os.path.join(datadir, 'gowalla_users.csv.gz')

# The file carries no header row, so the column names are given explicitly.
dfu = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['uid', 'fid'],
)
dfu.head(3)

Unnamed: 0,uid,fid
0,0,1
1,0,2
2,0,3


### Merge events and venues data
Let's consider only events for which the venue has a known name

In [22]:
# Inner join on the venue id: events whose venue has no entry in the venues
# table are dropped, and each remaining event gains the venue name.
df = dfe.merge(dfv[['vid', 'name']], how="inner", on='vid')

In [23]:
# Chronologically first five merged events of user 0.
(
    df.loc[df['uid'].eq(0)]
      .sort_values('ts')
      .head(5)
)

Unnamed: 0,uid,lat,lon,vid,ts,name
1,0,40.643885,-73.782806,23261,2010-10-07 11:27:40-04:00,JFK John F. Kennedy International
5080,0,40.751508,-73.9755,34484,2010-10-07 16:14:44-04:00,Chrysler Building
4690,0,40.748444,-73.985732,12313,2010-10-07 16:31:48-04:00,Empire State Building
4412,0,40.74581,-73.988221,60450,2010-10-07 17:02:01-04:00,Ace Hotel
4153,0,40.742201,-73.987995,17710,2010-10-07 17:58:31-04:00,Madison Square Park


In [24]:
#filter venues, users and events according to the NYC area
dfe = df[['uid','ts','lat','lon','vid']]
dfv = dfv[dfv['vid'].isin(df['vid'].unique())]

dfu = dfu[dfu['uid'].isin(df['uid'].unique())]
dfu = dfu[dfu['fid'].isin(df['uid'].unique())]

### Save data for New York City
Let's consider only events and users from the given geographical area.  
For this tutorial, we will save it in the following three tables
  - events
  - venues
  - users

In [25]:
# Write the three NYC tables as tab-separated files with a header row and
# without the index column.
for basename, frame in (
    ('gowalla_events.nyc.csv', dfe),
    ('gowalla_venues.nyc.csv', dfv),
    ('gowalla_users.nyc.csv', dfu),
):
    filename = os.path.join(datadir, basename)
    frame.to_csv(filename, sep='\t', index=False)
