## Data Preparation

In [1]:
# Import necessary libraries

import os
from datetime import timedelta
import pandas as pd
import numpy as np
import re

# Absolute path of this notebook, resolved against the current working directory.
# Later cells take os.path.dirname() of it as the base folder for the raw data files.
notebook_path = os.path.abspath("Data_Preparation.ipynb")

### Load files and do initial cleaning

#### Parking meter transactions data

In [2]:
# Read the 2019 parking meter payments; both timestamp columns are parsed as datetimes
filename_csv = os.path.join(os.path.dirname(notebook_path),
                            "Data/Raw/treas_parking_payments_2019_datasd.csv")
transactions = pd.read_csv(filename_csv, parse_dates=['trans_start', 'meter_expire'])

# duration = time between the transaction start and the meter expiry, expressed in minutes
one_minute = np.timedelta64(1, 'm')
paid_time = transactions['meter_expire'] - transactions['trans_start']
transactions['duration'] = paid_time / one_minute

# day_of_week of the transaction start (pandas convention: Monday=0 ... Sunday=6),
# used later to pick out weekday transactions
transactions['day_of_week'] = transactions['trans_start'].dt.dayofweek

#### Parking meter locations data

In [3]:
# Read the parking meter location / configuration table
filename_csv = os.path.join(os.path.dirname(notebook_path),
                            "Data/Raw/treas_parking_meters_loc_datasd.csv")
locations = pd.read_csv(filename_csv)

# short_config_name = the part of config_name that precedes the first 'HR'
locations['short_config_name'] = locations['config_name'].str.split('HR', n=1, expand=True)[0]

# remove multi-vehicle spots, special Stadium parking:
# drop every meter whose short_config_name contains one of these labels
for unwanted_label in ('MSPM', 'San Diego', 'Single Space'):
    keep_row = ~locations['short_config_name'].str.contains(unwanted_label)
    locations = locations[keep_row]

# remove Z-Spares
locations = locations.loc[locations['zone'].ne('Z-Spares')]

# time_limit (in hours) looked up from the label found in short_config_name;
# the labels are applied in this order, exactly one assignment per label
time_limit_by_label = {
    '15 Min': 0.25,
    '30 Min': 0.50,
    '1 Hour': 1,
    '2 Hour': 2,
    '4 Hour': 4,
    '8 Hour': 8,
    '9 Hour': 9,
}
for label, hours in time_limit_by_label.items():
    locations.loc[locations['short_config_name'].str.contains(label), 'time_limit'] = hours

# hourly_rate = the first four characters after the '$' sign, read as a float
rate_text = locations['short_config_name'].str.split('$', n=1, expand=True)[1]
locations['hourly_rate'] = rate_text.astype(str).str[0:4].astype(float)

# Save a new locations file for easy plotting in Tableau
locations.to_csv('Data/meter_locations.csv', header=True)

#### Join transactions and locations dataframes

In [4]:
# Attach each transaction to its meter's location record (pole_id <-> pole).
# The inner join keeps only transactions whose meter is present in `locations`.
trans_locs_df = transactions.merge(locations, how='inner',
                                   left_on='pole_id', right_on='pole')

# discard the columns that are not needed downstream
trans_locs_df = trans_locs_df.drop(columns=['meter_type', 'trans_amt', 'pay_method', 'pole'])
trans_locs_df.head()

Unnamed: 0,uuid,pole_id,trans_start,meter_expire,duration,day_of_week,zone,area,sub_area,config_id,config_name,longitude,latitude,short_config_name,time_limit,hourly_rate
0,SSG80519010100102125,G-805,2019-01-01 00:10:21,2019-01-01 00:10:21,0.0,1,Downtown,East Village,800 G ST,12494,2 Hour Max $1.25 HR 8am-6pm Mon-Sat (Mobile Pay),-117.15711,32.712518,2 Hour Max $1.25,2.0,1.25
1,SSG80519010100102725,G-805,2019-01-01 00:10:27,2019-01-01 00:10:27,0.0,1,Downtown,East Village,800 G ST,12494,2 Hour Max $1.25 HR 8am-6pm Mon-Sat (Mobile Pay),-117.15711,32.712518,2 Hour Max $1.25,2.0,1.25
2,SSG805190101091329250,G-805,2019-01-01 09:13:29,2019-01-01 11:13:29,120.0,1,Downtown,East Village,800 G ST,12494,2 Hour Max $1.25 HR 8am-6pm Mon-Sat (Mobile Pay),-117.15711,32.712518,2 Hour Max $1.25,2.0,1.25
3,SSG805190101110113225,G-805,2019-01-01 11:01:13,2019-01-01 13:01:13,120.0,1,Downtown,East Village,800 G ST,12494,2 Hour Max $1.25 HR 8am-6pm Mon-Sat (Mobile Pay),-117.15711,32.712518,2 Hour Max $1.25,2.0,1.25
4,SSG805190101115917125,G-805,2019-01-01 11:59:17,2019-01-01 13:59:17,120.0,1,Downtown,East Village,800 G ST,12494,2 Hour Max $1.25 HR 8am-6pm Mon-Sat (Mobile Pay),-117.15711,32.712518,2 Hour Max $1.25,2.0,1.25


### Calculate meter occupancy
Meter occupancy is calculated for every hour of the day. As an example, consider a single meter 'CC-1003' during the one-hour time period from '2019-01-01 10:00:00' to '2019-01-01 10:59:00'. If there are any `trans_start` OR `meter_expire` events within that period, the meter is scored `is_occupied = 1` for that hour. The calculation also finds transactions whose duration is longer than 1 hour, and scores `is_occupied = 1` for the hours in between the `trans_start` and `meter_expire` times.

This analysis is limited to Tuesdays through Thursdays to capture parking behavior on 'typical' weekdays, and is further filtered to 8am-8pm to capture the full range of parking meter service times (although subgroups of meters will have different service times within that range).

**NOTE**: This takes a long time to run, so much of it is commented out. The final dataframe, `all_meter_counts`, is loaded at the end of this section from a stored .csv file.

In [5]:
def single_meter_occupancy(one_meter):
    """Build an hour-by-hour occupancy table for a single parking meter.

    An hour is scored as occupied when at least one payment started in it,
    at least one payment expired in it, or it is one of the follow-on hours
    of a payment longer than 60 minutes.

    Parameters
    ----------
    one_meter : pandas.DataFrame
        Transactions of one meter. The columns read here are ``uuid``,
        ``trans_start`` and ``meter_expire`` (datetimes) and ``duration``
        (minutes).

    Returns
    -------
    pandas.DataFrame
        One row per clock hour, continuous from the first to the last hour
        touched by any event, with the columns ``event_time``,
        ``trans_start_count``, ``meter_expire_count``, ``total_events`` and
        ``is_occupied`` (0 or 1). An empty input gives an empty table with
        the same columns.
    """
    columns = ['event_time', 'trans_start_count', 'meter_expire_count',
               'total_events', 'is_occupied']
    if one_meter.empty:
        # nothing to resample; hand back an empty table with the usual columns
        return pd.DataFrame(columns=columns)

    # number of payments that started / expired during each clock hour
    # ('h' is the hourly alias; the upper-case 'H' spelling is deprecated in pandas >= 2.2)
    start_count = one_meter.set_index('trans_start')['uuid'].resample('h').count()
    expire_count = one_meter.set_index('meter_expire')['uuid'].resample('h').count()

    # A payment longer than 60 minutes can cover hours in which it neither starts
    # nor expires. For such a payment, credit the k-th hour after its starting
    # hour for k = 1, 2, ... as long as k < duration / 60.
    long_stays = one_meter[one_meter['duration'] > 60]
    covered_hours = []
    for trans_start, duration in zip(long_stays['trans_start'], long_stays['duration']):
        first_hour = trans_start.floor('h')
        add_hour = 1
        while add_hour < duration / 60:
            covered_hours.append(first_hour + pd.Timedelta(hours=add_hour))
            add_hour += 1

    # One continuous hourly index spanning every hour touched by any event.
    # Building it up front (instead of outer-joining the two resampled ranges)
    # guarantees that each credited hour exists even when the start-hour and
    # expire-hour ranges do not overlap.
    hour_marks = [start_count.index.min(), start_count.index.max(),
                  expire_count.index.min(), expire_count.index.max()] + covered_hours
    all_hours = pd.date_range(min(hour_marks), max(hour_marks), freq='h', name='event_time')

    # Zero-fill BEFORE adding the long-stay credits: adding 1 to a missing (NaN)
    # count would stay NaN and the credit would be lost by a later fillna(0).
    start_expire_count = pd.DataFrame({
        'trans_start_count': start_count.reindex(all_hours, fill_value=0),
        'meter_expire_count': expire_count.reindex(all_hours, fill_value=0),
    }).rename_axis('event_time')

    for hour in covered_hours:
        start_expire_count.at[hour, 'trans_start_count'] += 1

    # move the hourly index into the 'event_time' column
    start_expire_count = start_expire_count.reset_index()

    # add 'total_events' and 'is_occupied' values for every single hour
    start_expire_count['total_events'] = (start_expire_count['trans_start_count']
                                          + start_expire_count['meter_expire_count'])
    start_expire_count['is_occupied'] = (start_expire_count['total_events'] > 0).astype(int)

    return start_expire_count

#### Loop through each meter 
This section loops through each individual meter and calls the `single_meter_occupancy` function to calculate whether at least one car has occupied that spot during each hour of the day (only weekdays from Tue-Thu are included). After calculating `is_occupied`, the dataframe is filtered to 8am-8pm only (to save space).

In [6]:
## NOTE: this whole cell is intentionally commented out because it is slow to run;
## its result is read back from 'Data/all_meter_counts.csv' in a later cell.
## When re-enabling it on pandas >= 2.0, replace the DataFrame.append call below
## with pd.concat (DataFrame.append was removed in pandas 2.0).
##
## limit day_of_week to Tues (1) through Thurs (3)
#weekday_df = trans_locs_df.copy()
#weekday_df = weekday_df[(weekday_df['day_of_week'] > 0) & (weekday_df['day_of_week'] < 4)]
#
## Create new dataframe
#all_meter_counts = pd.DataFrame()
#
## The actual loop
#meter_list = weekday_df['pole_id'].unique()
#for meter in meter_list:
#    one_meter = weekday_df[weekday_df['pole_id'] == meter]
#    start_expire_count = single_meter_occupancy(one_meter)
#
#    # make a dataframe that contains *all* information from a single meter
#    one_meter_counts = pd.DataFrame(index=range(len(start_expire_count)))
#    one_meter_counts['pole_id'] = one_meter.iloc[0].pole_id
#    one_meter_counts['zone'] = one_meter.iloc[0].zone
#    one_meter_counts['area'] = one_meter.iloc[0].area
#    one_meter_counts['sub_area'] = one_meter.iloc[0].sub_area
#    one_meter_counts['longitude'] = one_meter.iloc[0].longitude
#    one_meter_counts['latitude'] = one_meter.iloc[0].latitude
#    one_meter_counts['short_config_name'] = one_meter.iloc[0].short_config_name
#    one_meter_counts['time_limit'] = one_meter.iloc[0].time_limit
#    one_meter_counts['hourly_rate'] = one_meter.iloc[0].hourly_rate
#    one_meter_counts['event_time'] = start_expire_count['event_time']
#    one_meter_counts['total_events'] = start_expire_count['total_events']
#    one_meter_counts['is_occupied'] = start_expire_count['is_occupied']
#
#    # append that single meter df to the larger (all meters) dataframe
#    all_meter_counts = all_meter_counts.append(one_meter_counts)
#
## add day_of_week and hour information
#all_meter_counts['day_of_week'] = all_meter_counts.event_time.dt.dayofweek    
#all_meter_counts['hour'] = all_meter_counts.event_time.dt.hour
#    
## Filter data for 8am-8pm only (keeps hours 8 through 20, i.e. the 8pm hour is included)
#all_meter_counts = all_meter_counts[(all_meter_counts['hour'] >= 8) & (all_meter_counts['hour'] < 21)]
#
## save to .csv file
#all_meter_counts.to_csv('Data/all_meter_counts.csv', header=True)

#### Load saved all_meter_counts file (don't recalculate it)

In [7]:
# Load the stored per-meter hourly occupancy table instead of recomputing it.
# index_col=0 uses the first CSV column as the row index, and event_time is
# parsed to datetimes so that its .dt accessor can be used in later cells.
all_meter_counts = pd.read_csv('Data/all_meter_counts.csv', index_col=0, parse_dates=['event_time'])

  mask |= (ar1 == a)


#### some housekeeping
This code should have been included in the previous loop through the meters; it will be cleaned up later.

In [8]:
# calendar date (without the time of day) of each hourly record
all_meter_counts['date'] = all_meter_counts['event_time'].dt.date

# keep only rows whose day_of_week is strictly between 0 and 4
weekday_code = all_meter_counts['day_of_week']
all_meter_counts = all_meter_counts[weekday_code.gt(0) & weekday_code.lt(4)]

# write the updated table back to the same stored file
all_meter_counts.to_csv('Data/all_meter_counts.csv', header=True)

#### Use number of trans_start and meter_expire events to calculate block occupancy
First find the fraction of meters in a sub_area that are occupied during each hour of every day (stored as `prct_occupied`). Next, find the mean of that fraction for each sub_area and hour *across* days.

In [9]:
# Share of a sub_area's meters that are occupied, per date and hour:
# (occupied meters in that sub_area/date/hour) / (distinct meters seen in the sub_area)
occupied_meters = all_meter_counts.groupby(['sub_area', 'date', 'hour'])['is_occupied'].sum()
meters_in_block = all_meter_counts.groupby('sub_area')['pole_id'].nunique()
tmp_series1 = occupied_meters / meters_in_block
tmp_series2 = tmp_series1.to_frame(name='prct_occupied')

# average that share over all dates, leaving one value per sub_area and hour
block_occupancy = (
    tmp_series2
    .groupby(['sub_area', 'hour'])['prct_occupied']
    .mean()
    .to_frame()
    .reset_index()
)

# attach the occupancy values to every meter location of the sub_area so they can be plotted
block_occupancy = locations.merge(block_occupancy, on='sub_area', how='left')

block_occupancy.to_csv('Data/block_occupancy.csv', header=True)