In [1]:
import pandas as pd
from datetime import timedelta, date
from sodapy import Socrata
import requests

In [2]:
import os, sys
# Add the parent of the current working directory to the import search path
# (presumably where app_tokens.py lives -- TODO confirm) before importing it.
sys.path.append(os.path.dirname(os.path.abspath('.')))
import app_tokens

In [3]:
def getData(data_key, timeout=1):
    """Fetch the distinct (sourceelementkey, location) pairs of a dataset.

    Parameters
    ----------
    data_key : str
        Socrata dataset identifier on data.seattle.gov.
    timeout : int or float, optional
        Client timeout in minutes; it is multiplied by 60 before being
        handed to the Socrata client. Defaults to 1.

    Returns
    -------
    The records returned by ``client.get`` for the query, capped at
    100000 rows.

    Raises
    ------
    Any exception raised by ``client.get`` propagates unchanged. The
    client is closed in every case, so a failed request (for example a
    timeout that the caller retries) does not leak the connection.
    """
    # Authenticated client (needed for non-public datasets)
    client = Socrata('data.seattle.gov',
                     app_tokens.getAppTokens()['seattle_gov'],
                     timeout=timeout * 60)
    try:
        return client.get(data_key,
                          select='distinct sourceelementkey,location',
                          limit=100000)
    finally:
        # Runs on success and on error alike
        client.close()

In [24]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
#client = Socrata("data.seattle.gov", None)

# Socrata data keys for the parking data (the data is delayed 48 hrs)
# 2019 ytd
data_ytd = 'qktt-2bsy'
# last 30 days
data_mtd = 'rke9-rsvs'
# last 48 hours
data_48hrs = 'hiyf-7edq'

# Fetch the 30-day table, retrying on timeout. The timeout passed to getData
# grows by one on every attempt (1, 2, ... MAX_ATTEMPTS).
MAX_ATTEMPTS = 5
for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        results = getData(data_mtd, timeout=attempt)
        break
    except requests.exceptions.Timeout:
        if attempt == MAX_ATTEMPTS:
            print('too many timeouts')
            # Bare raise re-raises the caught Timeout with its original
            # message and traceback instead of a new, empty exception.
            raise
        print('Timeout, retrying...')


In [33]:
# Convert the results to a DataFrame, change sourceelementkey to a numeric
# datatype and sort by it. pd.to_numeric converts the whole column in one
# vectorised call rather than being applied element by element.
df = pd.DataFrame.from_records(results)
df['sourceelementkey'] = pd.to_numeric(df['sourceelementkey'])
df = df.sort_values(by='sourceelementkey')

In [34]:
# take a peek at the first rows of the data
df.head()

Unnamed: 0,location,sourceelementkey
0,"{'type': 'Point', 'coordinates': [-122.3346935...",1001
1,"{'type': 'Point', 'coordinates': [-122.3345126...",1002
2,"{'type': 'Point', 'coordinates': [-122.3351432...",1006
3,"{'type': 'Point', 'coordinates': [-122.3366575...",1009
4,"{'type': 'Point', 'coordinates': [-122.3364474...",1010


In [35]:
# Row count, then number of distinct sourceelementkey values (one per line).
# Equal numbers mean no key appears in more than one row.
print(len(df), df['sourceelementkey'].nunique(), sep='\n')

1461
1461


In [36]:
def unpackCoordinates(location_obj):
    """Unpack a location object into a (latitude, longitude) pair.

    Parameters
    ----------
    location_obj : dict
        Point object with a two-element ``'coordinates'`` entry. The sample
        rows list the negative (longitude) value first, but some records
        have the two values the other way round.

    Returns
    -------
    pandas.Series
        Two values, ``[latitude, longitude]``. The larger coordinate is
        taken to be the latitude: all Seattle coordinates should be approx.
        47, -122, so this heuristic only holds for such locations. A pair
        of NaN is returned when ``location_obj`` is not a dict or carries
        fewer than two coordinates (e.g. a row without a location).
    """
    coordinates = (location_obj.get('coordinates')
                   if isinstance(location_obj, dict) else None)
    if coordinates is None or len(coordinates) < 2:
        # Missing location: keep the row but leave both values empty
        return pd.Series([float('nan'), float('nan')])

    first, second = coordinates[0], coordinates[1]
    # Put the larger value (latitude, ~47) first regardless of input order
    if first < second:
        return pd.Series([second, first])
    return pd.Series([first, second])

In [37]:
# Split every location object into separate latitude / longitude columns
df[['latitude', 'longitude']] = df['location'].apply(unpackCoordinates)

In [38]:
# check the first rows, including the latitude and longitude columns
df.head()

Unnamed: 0,location,sourceelementkey,latitude,longitude
0,"{'type': 'Point', 'coordinates': [-122.3346935...",1001,47.602873,-122.334694
1,"{'type': 'Point', 'coordinates': [-122.3345126...",1002,47.602949,-122.334513
2,"{'type': 'Point', 'coordinates': [-122.3351432...",1006,47.603674,-122.335143
3,"{'type': 'Point', 'coordinates': [-122.3366575...",1009,47.605018,-122.336658
4,"{'type': 'Point', 'coordinates': [-122.3364474...",1010,47.605101,-122.336447


In [39]:
# number of unique parking segments
print(df['sourceelementkey'].nunique())
# Drop the raw location column. errors='ignore' keeps this cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
df = df.drop(columns=['location'], errors='ignore')

1461


Looks like all location-sourceelementkey pairs are the same. However, there may be additional sourceelementkeys in the year-to-date table that are not in the monthly data, so I will loop through a range of sourceelementkeys to make sure that we aren't missing anything.

In [57]:
# Uh oh, it looks like some of the coordinates have latitude and longitude swapped :grimacing:
# print(df.iloc[0]['location'], df.iloc[1]['location'])

# df_max = df.groupby(['sourceelementkey'], as_index=False).max()
# df_min = df.groupby(['sourceelementkey'], as_index=False).min()
# df_max.head()

# merged = df_max.merge(df_min, indicator=True, how='outer')
# merged['_merge'].value_counts()
# merged.head()

# df.sort_values(by=['sourceelementkey'])

In [65]:
# inspect the last rows of df
df.tail()

Unnamed: 0,sourceelementkey,latitude,longitude
1456,134962,47.627591,-122.342213
1457,134965,47.629286,-122.342198
1458,135261,47.613963,-122.313879
1459,136041,47.609676,-122.336325
1460,136322,47.653648,-122.313881


In [64]:
# inspect the first rows of df
df.head()

Unnamed: 0,sourceelementkey,latitude,longitude
0,1001,47.602873,-122.334694
1,1002,47.602949,-122.334513
2,1006,47.603674,-122.335143
3,1009,47.605018,-122.336658
4,1010,47.605101,-122.336447


In [59]:
def getOneLocation(client, data_key, elementkey):
    """Query one dataset for a single sourceelementkey.

    Parameters
    ----------
    client
        Object exposing ``get(data_key, **filters)``, e.g. an open Socrata
        client.
    data_key : str
        Identifier of the dataset to query.
    elementkey
        Value the ``sourceelementkey`` field must match.

    Returns
    -------
    Whatever ``client.get`` returns for the query, limited to one row with
    the ``sourceelementkey`` and ``location`` fields selected.
    """
    query = {
        'select': 'sourceelementkey,location',
        'sourceelementkey': elementkey,
        'limit': 1,
    }
    return client.get(data_key, **query)

In [67]:
all_results = []

# Range of candidate sourceelementkey values to probe (stop is exclusive)
start = 1000
stop = 137000

# initialize client
client = Socrata('data.seattle.gov',
                 app_tokens.getAppTokens()['seattle_gov'],
                 timeout=60)
try:
    # One request per candidate key against the year-to-date table; keys
    # that return no rows are skipped. Loop-local names are used so the
    # `results` fetched earlier in the notebook is not overwritten.
    for elementkey in range(start, stop):
        print('i: %5d\tpercent done : %.3f'
              % (elementkey, 100*(elementkey - start) /(stop-start)), end='\r')
        matches = getOneLocation(client=client, data_key=data_ytd,
                                 elementkey=elementkey)
        if len(matches) > 0:
            all_results.append(matches[0])
finally:
    # close connection, even if a request fails part-way through the loop
    client.close()

i: 136999	percent done : 99.999

In [68]:
# Build a DataFrame from every record collected in all_results, make
# sourceelementkey numeric (one vectorised pd.to_numeric call instead of a
# per-element apply), sort by it and unpack the location objects.
df_all = pd.DataFrame.from_records(all_results)
df_all['sourceelementkey'] = pd.to_numeric(df_all['sourceelementkey'])
df_all = df_all.sort_values(by='sourceelementkey')
df_all[['latitude', 'longitude']] = df_all['location'].apply(unpackCoordinates)

In [69]:
# summary of df_all: number of entries, per-column non-null counts and dtypes
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1505 entries, 0 to 1504
Data columns (total 4 columns):
location            1505 non-null object
sourceelementkey    1505 non-null int64
latitude            1505 non-null float64
longitude           1505 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 58.8+ KB


In [70]:
# Write the block locations to disk. The DataFrame index is written as the
# first column (pandas default, spelled out here).
df_all.to_csv('block_locations.csv', index=True)