# Load the modules

In [None]:
import cudf
from collections import OrderedDict
import numpy as np
import datetime as dt

In [None]:
# Report the installed cuDF version.
print(cudf.__version__)

# Read the data

In [None]:
# The transactions are stored as a partitioned parquet dataset: build one
# path per partition directory by filling in the partition index.
transactions_partitions_cnt = 10
transactions_path = 's3://bsql/data/seattle_parking/parking_MayJun2019.parquet/partition_idx={partition}/'
transactions_parq = [
    transactions_path.format(partition=partition_idx)
    for partition_idx in range(transactions_partitions_cnt)
]

# The parking locations are addressed with a single dataset path.
locations_parq = 's3://bsql/data/seattle_parking/parking_locations.parquet/'

In [None]:
# Load both datasets into cuDF dataframes. The same storage options are
# passed for each read (presumably anonymous S3 access -- confirm against
# the filesystem backend in use).
transactions = cudf.read_parquet(
    transactions_parq,
    storage_options={'anon': True},
)
locations = cudf.read_parquet(
    locations_parq,
    storage_options={'anon': True},
)

In [None]:
# Report the size of each dataset; shape is read as (records, columns).
print('The transactions dataset has {0} records and {1} columns.'.format(
    transactions.shape[0], transactions.shape[1]))
print('The locations dataset has {0} records and {1} columns.'.format(
    locations.shape[0], locations.shape[1]))

In [None]:
# List the column names of the transactions dataset.
print(transactions.columns)

In [None]:
# List the column names of the locations dataset.
print(locations.columns)

In [None]:
# Preview the first rows of the transactions dataset.
transactions.head()

In [None]:
# Preview the first rows of the locations dataset.
locations.head()

# Extract date information

In [None]:
# Split the occupancy timestamp into its calendar and clock components.
# The public `.dt` accessor is used instead of the private `_column`
# attribute, which is an implementation detail and not a stable API.
transactions['year'] = transactions['OccupancyDateTime'].dt.year
transactions['month'] = transactions['OccupancyDateTime'].dt.month
transactions['day'] = transactions['OccupancyDateTime'].dt.day

transactions['hour'] = transactions['OccupancyDateTime'].dt.hour
transactions['minute'] = transactions['OccupancyDateTime'].dt.minute

# Show the original timestamp next to the derived columns as a sanity check.
transactions[['OccupancyDateTime','year','month','day','hour', 'minute']].head()

In [None]:
# Count the transactions recorded on each calendar day.
counts = (
    transactions
    .groupby(['year', 'month', 'day'])
    .agg({'OccupancyDateTime': 'count'})
)
counts

In [None]:
# Mean of the per-day transaction counts, printed with no decimal places.
print('Average number of transactions per day: {0:.0f}'.format(counts['OccupancyDateTime'].mean()))

# All parking locations

In [None]:
def extractLon(location):
    """Pull the longitude out of a text column holding '<lon> <lat>' pairs.

    Args:
        location: string Series exposing ``.str.extract``; each value is
            expected to contain two space-separated numbers.

    Returns:
        The first captured number (the longitude) for every row, still as
        text; the caller is responsible for casting it to a numeric type.
    """
    # Raw string literal: '\.' and '\-' are not valid escape sequences in a
    # normal string and trigger a warning on current Python versions.
    coords = location.str.extract(r'([0-9\.\-]+) ([0-9\.]+)')
    return coords[0]

def extractLat(location):
    """Pull the latitude out of a text column holding '<lon> <lat>' pairs.

    Args:
        location: string Series exposing ``.str.extract``; each value is
            expected to contain two space-separated numbers.

    Returns:
        The second captured number (the latitude) for every row, still as
        text; the caller is responsible for casting it to a numeric type.
    """
    # Raw string literal avoids the invalid '\.' / '\-' escape warnings.
    # The second group also accepts '-' so negative (southern) latitudes
    # are captured instead of producing a missing value.
    coords = location.str.extract(r'([0-9\.\-]+) ([0-9\.\-]+)')
    lat = coords[1]
    return lat
    
# Parse the textual 'Location' column into numeric coordinate columns.
locations['longitude'] = extractLon(locations['Location']).astype(np.float64)
locations['latitude'] = extractLat(locations['Location']).astype(np.float64)

# Compare the raw text with the parsed coordinates.
locations[['Location', 'longitude', 'latitude']].head()

# Average occupancy

In [None]:
def avgOccupancy(PaidOccupancy, ParkingSpaceCount, AvgOccupancy):
    """Row-wise occupancy ratio, capped at 1.0.

    Used as an ``apply_rows`` kernel: the first two arguments are input
    columns and ``AvgOccupancy`` is the output column written in place.

    Args:
        PaidOccupancy: paid occupancy per row.
        ParkingSpaceCount: number of parking spaces per row.
        AvgOccupancy: output buffer; receives ``min(1.0, paid / spaces)``.
    """
    for row, (occupied, spaces) in enumerate(zip(PaidOccupancy, ParkingSpaceCount)):
        ratio = occupied / spaces
        # Paid occupancy sometimes exceeds the available spaces, so the
        # ratio is capped at 100%.
        AvgOccupancy[row] = min(1.0, ratio)
        
# Bring the number of spaces per SourceElementKey onto every transaction,
# then compute the capped occupancy ratio row by row.
transactions = (
    transactions
    .merge(
        locations[['SourceElementKey', 'ParkingSpaceCount']],
        on='SourceElementKey',
    )
    .apply_rows(
        avgOccupancy,
        incols=['PaidOccupancy', 'ParkingSpaceCount'],
        outcols={'AvgOccupancy': np.float64},
        kwargs={},
    )
)
transactions.head()

In [None]:
# Inspect one hour of transactions for a single SourceElementKey.
search_date_f = dt.datetime.strptime('2019-05-24T10:00:00', '%Y-%m-%dT%H:%M:%S')
search_date_t = dt.datetime.strptime('2019-05-24T10:59:59', '%Y-%m-%dT%H:%M:%S')
(
    transactions
    .query('''SourceElementKey == 35889 and OccupancyDateTime >= @search_date_f and OccupancyDateTime <= @search_date_t''')
    .sort_values(by='OccupancyDateTime')
    .head(5)
    .to_pandas()
)

In [None]:
def calcMean(AvgOccupancy, ParkingSpaceCount, MeanOccupancy):
    """Divide a summed occupancy by its record count, row by row.

    Used as an ``apply_rows`` kernel on the aggregated frame, where
    ``AvgOccupancy`` holds a per-group sum and ``ParkingSpaceCount`` holds
    the per-group count.

    Args:
        AvgOccupancy: summed occupancy per row.
        ParkingSpaceCount: number of records behind each sum.
        MeanOccupancy: output buffer; receives ``sum / count``.
    """
    for row, (occupancy_total, n_records) in enumerate(zip(AvgOccupancy, ParkingSpaceCount)):
        numerator = float(occupancy_total)
        MeanOccupancy[row] = numerator / n_records

# Aggregate per SourceElementKey, day of week and hour: the number of
# records and the summed occupancy.
# NOTE(review): 'dow' is not created anywhere in the visible notebook --
# presumably it ships with the dataset; confirm before running.
df_agg_dt = (
    transactions
    .groupby(['SourceElementKey', 'dow','hour'])
    .agg({
        'ParkingSpaceCount': 'count',
        'AvgOccupancy': 'sum',
    })
    .reset_index()
)

# Turn the (sum, count) pair into a mean occupancy per group.
df_agg_dt = df_agg_dt.apply_rows(
    calcMean,
    incols=['AvgOccupancy', 'ParkingSpaceCount'],
    outcols={'MeanOccupancy': np.float64},
    kwargs={},
)

# The intermediate sum and count columns are no longer needed.
df_agg_dt = df_agg_dt.drop(columns=['AvgOccupancy', 'ParkingSpaceCount'])

df_agg_dt.head()

# Find the best parking

In [None]:
# Store the coordinates of the reference point on every row; the distance
# kernel reads them through its LAT_Ref / LON_Ref input columns.
locations['LON_Ref'] = -122.349358
locations['LAT_Ref'] = 47.620422

In [None]:
from math import sin, cos, sqrt, asin, pi

def calculateDistance(latitude, longitude, LAT_Ref, LON_Ref, Distance):
    """Haversine (great-circle) distance in feet from each point to a reference.

    Used as an ``apply_rows`` kernel: the first four arguments are input
    columns and ``Distance`` is the output column written in place.

    Args:
        latitude: latitude of each point, in degrees.
        longitude: longitude of each point, in degrees.
        LAT_Ref: latitude of the reference point, in degrees.
        LON_Ref: longitude of the reference point, in degrees.
        Distance: output buffer; receives the distance in feet.
    """
    R = 3958.8 # Earth's radius in miles

    for i, (lt, ln, lt_r, ln_r) in enumerate(zip(latitude, longitude, LAT_Ref, LON_Ref)):
        lt_rad = lt / 180.0 * pi
        lt_r_rad = lt_r / 180.0 * pi

        dlon = (ln_r - ln) / 180.0 * pi
        dlat = (lt_r - lt) / 180.0 * pi

        # Haversine: the cosine factors are the two *latitudes* (point and
        # reference), not the point's latitude and longitude.
        a = (sin(dlat/2.0))**2 + cos(lt_rad) * cos(lt_r_rad) * (sin(dlon/2.0))**2
        # Clamp so rounding error can never push asin outside its domain.
        c = 2 * asin(min(1.0, sqrt(a)))
        distance = R * c
        Distance[i] = distance * 5280 # in feet
        
# Compute the distance from every location to the reference point and
# store it in a new 'Distance' column.
locations = locations.apply_rows(
    calculateDistance,
    incols=['latitude', 'longitude', 'LAT_Ref', 'LON_Ref'],
    outcols={'Distance': np.float64},
    kwargs={},
)

locations.head().to_pandas()

In [None]:
# Keep only the meters within 1000 ft of the reference point.
closest = locations.query('Distance < 1000')

# Attach the mean occupancy and keep a single day-of-week / hour slot,
# then order so the least occupied locations come first.
closest = (
    closest
    .merge(df_agg_dt, how='inner', on=['SourceElementKey'])
    .query('dow == 3 and hour == 13')
)
closest = closest.sort_values(by='MeanOccupancy')

# Select the descriptive columns for the top candidates.
closest_host = closest[[
    'BlockfaceName',
    'SideOfStreet',
    'ParkingTimeLimitCategory',
    'ParkingSpaceCount',
    'PaidParkingArea',
    'PaidParkingSubArea',
    'ParkingCategory',
    'Location',
    'Distance',
    'dow',
    'hour',
    'MeanOccupancy',
    'longitude',
    'latitude',
]].head()
closest_host