# Tokyo Foursquare Dataset EDA  

Foursquare check-ins in Tokyo from around 2012 - 2013. Goal is to come up with a visualization for subway / metro travel in this time.

In [3]:
import pandas as pd
import numpy as np

In [103]:
import json

In [4]:
df_tokyo = pd.read_csv('../_data/foursquare-nyc-and-tokyo-check-ins/dataset_TSMC2014_TKY.csv')

In [5]:
df_trains = df_tokyo[df_tokyo['venueCategory'] == 'Train Station']

In [6]:
df_subway = df_tokyo[df_tokyo['venueCategory'] == 'Subway']

In [7]:
df_trains.head(2)

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp
7,114,4b3eae5cf964a520b4a025e3,4bf58dd8d48988d129951735,Train Station,35.700253,139.480255,540,Tue Apr 03 19:35:36 +0000 2012
15,2290,4b53b05ef964a520e8a727e3,4bf58dd8d48988d129951735,Train Station,35.749538,139.58654,540,Tue Apr 03 20:14:18 +0000 2012


### Convert utc to regular time

In [8]:
timezone_offset = pd.to_timedelta('6 hours')

def convert_time(row):
    return pd.to_datetime(row['utcTimestamp']) + timezone_offset

In [9]:
df_trains['localTimestamp'] = df_trains.apply(convert_time, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
df_subway['localTimestamp'] = df_subway.apply(convert_time, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [69]:
df_trains[0:1].reset_index()

0   2012-04-04 01:35:36
Name: localTimestamp, dtype: datetime64[ns]

## Function to convert to geojson

In [11]:
def row_to_geojson_full(row, json_list=[]):
    headers = row.columns
    properties = {}
    for h in headers:
        properties[h] = row[h].values[0]
    dict = {"type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [row['longitude'].values[0], row['latitude'].values[0]]
            },
            "properties": properties}
    json_list.append(dict)

In [12]:
def row_to_geojson(row, json_list=[]):
#     print(row)
#     dict = {"type": "Feature",
#             "geometry": {
#                 "type": "Point",
#                 "coordinates": [row['longitude'].values[0], row['latitude'].values[0]]
#             },
#             "properties": {
#                 "localTimestamp": row['localTimestamp'].values[0]
#             }}
    dict = {"type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [row['longitude'], row['latitude']]
            },
            "properties": {
                "localTimestamp": row['localTimestamp']
            }}
    json_list.append(dict)

In [13]:
# json.dumps(dict, default=timestamp_to_json)
def timestamp_to_json(obj):
    if isinstance(obj, pd.datetime):
        return obj.__str__()

### Test Subway JSON collection

In [15]:
list_json_subways = []
df_subway[:10].apply(lambda x: row_to_geojson(x, json_list=list_json_subways), axis=1)

32     None
34     None
54     None
65     None
75     None
92     None
93     None
105    None
108    None
125    None
dtype: object

In [16]:
list_json_subways

[{'geometry': {'coordinates': [139.7195989, 35.74880451], 'type': 'Point'},
  'properties': {'localTimestamp': Timestamp('2012-04-04 02:59:52')},
  'type': 'Feature'},
 {'geometry': {'coordinates': [139.798767, 35.68220662], 'type': 'Point'},
  'properties': {'localTimestamp': Timestamp('2012-04-04 03:04:04')},
  'type': 'Feature'},
 {'geometry': {'coordinates': [139.633677, 35.55002896], 'type': 'Point'},
  'properties': {'localTimestamp': Timestamp('2012-04-04 03:39:12')},
  'type': 'Feature'},
 {'geometry': {'coordinates': [139.7113779, 35.73027306], 'type': 'Point'},
  'properties': {'localTimestamp': Timestamp('2012-04-04 03:52:17')},
  'type': 'Feature'},
 {'geometry': {'coordinates': [139.7106913, 35.68859747], 'type': 'Point'},
  'properties': {'localTimestamp': Timestamp('2012-04-04 03:56:43')},
  'type': 'Feature'},
 {'geometry': {'coordinates': [139.8175974, 35.66959007], 'type': 'Point'},
  'properties': {'localTimestamp': Timestamp('2012-04-04 04:06:34')},
  'type': 'Featu

## Reduce coordinate noise

Basically try to map check-ins to a reasonable number. Check-ins might be referring to the same station, but slightly off in GPS coordinates, so experiment with how precise the geocoordinates should be. Assuming there are about 140 stations to work with.

In [17]:
len(df_subway['latitude'].apply(lambda x: np.round_(x, decimals=3)).value_counts())

136

In [18]:
len(df_subway['longitude'].apply(lambda x: np.round_(x, decimals=3)).value_counts())

164

In [23]:
df_subway_small = df_subway[['venueId', 'latitude', 'longitude', 'localTimestamp']]
df_subway_small['latitude'] = df_subway_small['latitude'].apply(lambda x: np.round_(x, decimals=3))
df_subway_small['longitude'] = df_subway_small['longitude'].apply(lambda x: np.round_(x, decimals=2))
len((df_subway_small['latitude'].map(str) + ":" + df_subway_small['longitude'].map(str)).unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


292

In [290]:
df_subway_small['localTimestamp'].diff().std()

Timedelta('0 days 03:17:35.704839')

### Save CSV

In [25]:
df_subway_small.to_csv('../_data/df_subway_station_checkins')

## Extra stuff (ignorable):

### Find average time difference

Want to calculate the average time difference between check-ins, accounting for outliers

In [298]:
df_timedeltas_subway = df_subway_small['localTimestamp'].diff()[1:]

In [305]:
df_timedeltas_subway.std()

Timedelta('0 days 03:17:35.704839')

In [319]:
df_timedeltas_subway[df_timedeltas_subway <= df_timedeltas_subway.std()].mean()

Timedelta('0 days 00:04:39.152909')

In [274]:
len(df_subway_small['venueId'].unique())

431

In [258]:
row_to_geojson_full(df_subway_small[0:1])

{'geometry': {'coordinates': [139.72, 35.75], 'type': 'Point'},
 'properties': {'latitude': 35.75,
  'localTimestamp': numpy.datetime64('2012-04-04T02:59:52.000000000'),
  'longitude': 139.72},
 'type': 'Feature'}

In [321]:
json_list = []
df_subway_small[:100].apply(lambda x: row_to_geojson(x, json_list=json_list), axis=1)

32     None
34     None
54     None
65     None
75     None
92     None
93     None
105    None
108    None
125    None
126    None
130    None
138    None
155    None
156    None
166    None
181    None
190    None
191    None
192    None
217    None
224    None
235    None
236    None
237    None
239    None
240    None
242    None
251    None
254    None
       ... 
489    None
493    None
494    None
498    None
500    None
505    None
517    None
532    None
536    None
538    None
539    None
552    None
555    None
560    None
566    None
578    None
580    None
599    None
603    None
607    None
608    None
619    None
642    None
643    None
645    None
648    None
659    None
663    None
665    None
669    None
Length: 100, dtype: object

In [322]:
json.dumps(json_list, default=timestamp_to_json)

'[{"type": "Feature", "geometry": {"type": "Point", "coordinates": [139.72, 35.75]}, "properties": {"localTimestamp": "2012-04-04 02:59:52"}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [139.8, 35.68]}, "properties": {"localTimestamp": "2012-04-04 03:04:04"}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [139.63, 35.55]}, "properties": {"localTimestamp": "2012-04-04 03:39:12"}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [139.71, 35.73]}, "properties": {"localTimestamp": "2012-04-04 03:52:17"}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [139.71, 35.69]}, "properties": {"localTimestamp": "2012-04-04 03:56:43"}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [139.82, 35.67]}, "properties": {"localTimestamp": "2012-04-04 04:06:34"}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [139.86, 35.66]}, "properties": {"localTimestamp": "2012-04-04 04:06:57"}}, {"type": "Fe

In [318]:
df_subway_small[10:1000]['localTimestamp'].diff().std()

Timedelta('0 days 01:02:11.222440')

In [24]:
df_subway_small

Unnamed: 0,venueId,latitude,longitude,localTimestamp
32,4b5da9e0f964a520bb6529e3,35.749,139.72,2012-04-04 02:59:52
34,4b8c5418f964a520e3ca32e3,35.682,139.80,2012-04-04 03:04:04
54,4b5bac4af964a520200f29e3,35.550,139.63,2012-04-04 03:39:12
65,4b7481e1f964a520bde02de3,35.730,139.71,2012-04-04 03:52:17
75,4b2692f9f964a520be7d24e3,35.689,139.71,2012-04-04 03:56:43
92,4b5599caf964a5209be827e3,35.670,139.82,2012-04-04 04:06:34
93,4bd6b3024e32d13a50f6c280,35.665,139.86,2012-04-04 04:06:57
105,4b9b8cd2f964a520c30e36e3,35.676,139.76,2012-04-04 04:12:01
108,4b66c862f964a520aa2a2be3,35.605,139.71,2012-04-04 04:13:27
125,4b5e34f8f964a520bb8329e3,35.696,139.75,2012-04-04 04:21:13
