# DOES NOT YET GIVE SAME RESULTS AS CORRECT PARKING_COST_APRIL22.IPYNB


# Estimating Parking Cost and Spatial Autocorrelation Analysis of Parking Data

Goals:
   1. Join csv cost data with spatial data for parking lots
   2. Estimate ratios of M to D, D to H, and M to H in order to estimate missing rate values.
   3. KNN for points to polygons (TAZs)

For inflation adjustment: https://www.inflationtool.com/us-dollar/2010-to-present-value    

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import pysal
from osgeo import gdal
import copy
import libpysal as lps
import scipy
from itertools import combinations

## Bring In Data

1. Lot Rates
2. Lot Points
3. Join Points and Rates
4. Filter Lots (must have at least one rate)
5. TAZs

In [2]:
# Bring in data.
# Raw strings keep the Windows backslashes literal instead of relying on
# Python passing unrecognised escapes (e.g. "\p", "\G") through unchanged,
# which emits a warning on recent Python versions.
base = r"J:\Shared drives\TMD_TSA\Data\Parking\WebScraped_ParkingCost\required_inputs"

# Parking cost table; the MR, DR and HR columns are the rates used below.
rates = pd.read_csv(base + r"\parking_cost_fullrec_NAP_F16.csv")

# Geocoded lot points; rows without a geometry cannot be placed, so drop them.
points = gpd.read_file(base + r"\GeocodedParkingLots\DKedits_parking_cost_fullrec_NAP.shp")
points = points.dropna(subset=["geometry"])

# Join cost to points on the lot URL. merge() defaults to an inner join, so
# lots present in only one of the two tables are dropped.
lots = points[['IN_SingleL', 'geometry', 'USER_month', 'USER_lot_u']].merge(
    rates[['IN_SingleLine', 'USER_lot_url', 'MR', 'DR', 'HR']],
    left_on='USER_lot_u',
    right_on='USER_lot_url',
)


In [3]:
# Bring in relevant TAZs.
# Raw strings: the original relied on "\S", "\T", "\D" and "\G" not being
# recognised escape sequences; the resulting path is unchanged.
base2 = r"J:\Shared drives\TMD_TSA\Data\GIS Data\TAZ"
alltazs = gpd.read_file(base2 + r"\candidate_CTPS_TAZ_STATEWIDE_2019_wgs84.shp")

## Estimate and Fill Missing Monthly Rates

Calculate the Monthly/Daily ratio per district by dividing the monthly column by the daily column to get lot level ratios and aggregate to the region. This region-wide ratio is multiplied by each lot's Daily Rate to calculate an estimated monthly rate. At this point, a new column is made that holds the observed monthly rate where it exists; where it is missing, the estimated monthly rate is used instead, if one exists (i.e., if the lot has a daily rate).

This will be conducted for M/D, D/H, and M/H - M/D is used as the example for the explanation for ease of understanding.

In [10]:
def _scale_by_regional_ratio(values, mean_ratio, reverse):
    """Apply a region-wide mean ratio: divide for a reverse ratio, else multiply."""
    return values / mean_ratio if reverse else values * mean_ratio


def estimate_lot_rates(tps, lots):
    """Fill in missing lot rates using the lot's other rates and regional ratios.

    Parameters
    ----------
    tps : dict
        Maps each rate column (the field being estimated) to
        ``[ratio_name, source_column]`` or
        ``[ratio_name, source_column, "reverse_ratio"]``. For a plain entry the
        lot-level ratio is ``key / source``. A "reverse_ratio" entry takes its
        lot-level ratio as the product of the first two ratio columns built in
        this call, so it must come after two plain entries; its estimate is
        ``source / mean(ratio)``.
    lots : DataFrame
        Lot table containing one column per key of ``tps``.

    Returns
    -------
    DataFrame
        A copy of ``lots`` restricted to rows with at least one rate, with
        these columns added per key: the ratio column, ``Est_<key>``,
        ``<key>_wEst`` (observed rate, else round-1 estimate), ``Est_<key>_2``
        and ``<key>_wEst2`` (observed rate, else round-2 estimate). ``lots``
        itself is not modified.
    """
    # filter out customer only parking (no rates for any category)
    has_any_rate = lots[list(tps)].notna().any(axis='columns')
    estmonth = lots[has_any_rate].copy()

    ratio_names = []
    # estimate round 1: lot rate estimations based on other observed values
    for key, spec in tps.items():
        if 'Est_' + key in estmonth.columns:
            continue  # already estimated, nothing to do for this rate
        ratio, source = spec[0], spec[1]
        reverse = len(spec) == 3 and spec[2] == "reverse_ratio"
        ratio_names.append(ratio)

        if reverse:
            # chain the two ratios computed so far (e.g. M/D * D/H = M/H)
            estmonth[ratio] = estmonth[ratio_names[0]] * estmonth[ratio_names[1]]
        else:
            # lot-level ratio; a zero on either side gives a ratio of 0, and a
            # missing value on either side leaves the ratio missing
            either_zero = (estmonth[source] == 0) | (estmonth[key] == 0)
            estmonth[ratio] = np.where(either_zero, 0, estmonth[key] / estmonth[source])

        # mean() skips missing ratios, so only lots with both values contribute.
        # A reverse ratio is source/key, so the estimate divides by it (the
        # same direction round 2 uses) rather than multiplying.
        estmonth['Est_' + key] = _scale_by_regional_ratio(
            estmonth[source], estmonth[ratio].mean(), reverse)

        # keep the observed rate where there is one, else the estimate
        estmonth[key + '_wEst'] = np.where(estmonth[key].isna(),
                                           estmonth['Est_' + key],
                                           estmonth[key])

    # estimate round 2: lot rate estimates based on the round-1 filled values
    for key, spec in tps.items():
        if 'Est_' + key + '_2' in estmonth.columns:
            continue  # already re-estimated, go to the next rate
        ratio, source = spec[0], spec[1]
        reverse = len(spec) == 3 and spec[2] == "reverse_ratio"

        # use this rate's own ratio column, not whichever ratio round 1
        # happened to process last
        estmonth['Est_' + key + '_2'] = _scale_by_regional_ratio(
            estmonth[source + '_wEst'], estmonth[ratio].mean(), reverse)

        # keep the observed rate where there is one, else the round-2 estimate
        estmonth[key + '_wEst2'] = np.where(estmonth[key].isna(),
                                            estmonth['Est_' + key + '_2'],
                                            estmonth[key])

    return estmonth

In [45]:
# key = field being calculated, value[0] = ratio, value[1] = value that the key field will be derived from
tps = {"MR":["MR_to_DR", "DR"], "DR":["DR_to_HR", "HR"], "HR":["MR_to_HR", "MR", "reverse_ratio"]}
K = 16  # number of nearest lots averaged per TAZ
all_lots = lots
all_tazs = alltazs


elots = estimate_lot_rates(tps, all_lots)

# Post Local Moran geojson - only needed when filtering outliers with
# identify_outliers. Raw string: "\e" is not a valid escape sequence.
postSAdf = gpd.read_file(base + r"\estmonth_April14_HR_DR_MR_LM_edited.geojson")
cluster_outlier_field = "COType_"
# NOTE: this replaces the freshly estimated lots with the edited file's rows.
elots = postSAdf

In [12]:
# EXPORTS
# Raw strings keep the backslashes literal (the originals relied on invalid
# escape sequences such as "\S" and "\e" being passed through unchanged).

elots.to_csv(r"J:\Shared drives\TMD_TSA\Data\Parking\WebScraped_ParkingCost\estmonth_April26a.csv")

# Export estmonth as geojson for use in ArcPro and QGIS for Local Moran's I and Getis Ord Gi*
# The driver is named explicitly so the output format does not depend on
# geopandas inferring it from the file extension.
elots.to_file(r"J:\Shared drives\TMD_TSA\Data\Parking\WebScraped_ParkingCost\estmonth_April26a.geojson",
              driver="GeoJSON")

# Calculate Average Rates per TAZ
### Post Local Spatial Autocorrelation (Local Moran's I)
The data is clustered - see versions of this analysis with Spatial Autocorrelation included for information on that.

In [46]:
def identify_outliers(lots, postSA, cof, tps):
    """Build, per rate, the list of lot ids that are not spatial outliers.

    Parameters
    ----------
    lots : DataFrame
        Lot table; returned unchanged when no outlier information is supplied.
    postSA : GeoDataFrame or other
        Lots with cluster/outlier type columns named ``cof + <rate>``.
    cof : str
        Prefix of the cluster/outlier type columns (empty disables filtering).
    tps : iterable of str
        Rate names, one cluster/outlier column and one inclusion list each.

    Returns
    -------
    (DataFrame, list of Series)
        The lot table to use downstream and, in ``tps`` order, the index
        labels of the lots to include for each rate.
    """
    # if importing new data with LM outliers, use it and exclude the outliers
    if isinstance(postSA, gpd.GeoDataFrame) and len(cof) > 0:
        estmonth = postSA.to_crs(26986)
        # NOTE: an earlier approach joined the outlier columns onto `lots` with
        # sjoin_nearest and dropped duplicated indexes; postSA is used directly now.

        # keep lot ids that are not HL or LH for each time period; the
        # excluded ones are left out of the average
        inin = [
            estmonth[~estmonth[cof + x].isin(["LH", "HL"])].reset_index()['index']
            for x in tps
        ]
        return estmonth, inin

    # No outlier information: previously this path hit unbound locals at the
    # return statement. Include every lot for every rate instead.
    all_ids = lots.reset_index()['index']
    return lots, [all_ids for _ in tps]

In [47]:
def euclidean_matrix(lots,tazs):
    '''Return a lot-by-TAZ matrix of straight-line distances in miles.

    Rows follow the index of ``lots``; columns are the TAZ ``id`` values.
    Each cell is the distance from the lot's centroid to the TAZ geometry,
    rounded to 5 decimal places.

    NOTE(review): the distance is measured to the TAZ geometry, not to the TAZ
    centroid; a centroid-based variant was previously commented out here.
    Confirm which one is intended.
    '''

    # reproject to Mass State Plane (meters) so that distance is correct
    # (uses the `tazs` argument; the original read the global `alltazs`)
    rdg83 = tazs.to_crs("EPSG:26986").set_index("id") # TAZ ids are now the column names
    lots83 = lots.to_crs("EPSG:26986")

    # one row per lot: distance from that lot's centroid to every TAZ
    eucdist = lots83.centroid.geometry.apply(lambda g: rdg83.distance(g))

    # convert meters to miles, then round (inches don't matter)
    eucdistmi = (eucdist / 1609.34).round(5)

    return eucdistmi

In [48]:
def knn_average(tps, lots, eucdistmi,tazs,inin=None):
    """Average the K nearest lots' rates for each TAZ.

    Parameters
    ----------
    tps : iterable of str
        Rate names; ``lots`` must have a ``<rate>_wEst2`` column for each.
    lots : DataFrame
        Lot table indexed like the rows of ``eucdistmi``.
    eucdistmi : DataFrame
        Lot-by-TAZ distance matrix (rows = lots, columns = TAZ ids).
    tazs : DataFrame
        TAZ table with ``id`` and ``town`` columns.
    inin : list, optional
        One collection of lot index labels per rate, in ``tps`` order, naming
        the lots to include. ``None`` or an empty list includes every lot.

    Returns
    -------
    DataFrame
        ``tazs`` indexed by ``id`` joined with ``TotalNN`` and, per rate,
        ``<rate>_SumNN``, ``<rate>_Avg_NN`` and ``<rate>_Avg_NN_2010``.

    Raises
    ------
    ValueError
        If ``tps`` is empty.

    The neighbour count is read from the module-level ``K``.
    """
    tps = list(tps)  # allows a dict or keys view as well as a list
    if not tps:
        raise ValueError("tps must name at least one rate")

    # tazs to report rates for; every other TAZ gets a sum of 0
    tazids = tazs[(tazs['town'].isin(["BOSTON","CAMBRIDGE","SOMERVILLE",
                                        "BROOKLINE","NEWTON"])) & (tazs['id'] < 200000)]["id"].tolist()

    # if there are no outliers to be filtered - have the inclusion list be everything.
    # A new list is built so neither a default nor the caller's list is mutated.
    if inin is None or len(inin) == 0:
        all_ids = lots.reset_index()['index']
        inin = [all_ids for _ in tps]

    # weights are all 1 for the K closest included lots, so multiplying the
    # weights by the rates is essentially a mask
    msums = []
    for pos, z in enumerate(tps):
        # if outlier rate value, filter out
        noOutcol = lots[z+'_wEst2'].filter(items = inin[pos], axis=0)
        # same filter on the distances; the K closest lots get weight 1
        eucdistfit = temp_matrix_filter(K, eucdistmi, inin, z, tps)
        # multiply the weights by the rates, row by row
        m = eucdistfit.multiply(noOutcol, axis="index")
        # get sum per TAZ
        msum = m.sum()
        msum.name = z+"_SumNN"
        # save so can merge together later
        msums.append(msum)

    # sum weights by TAZ - should be K from KNN
    # NOTE(review): this uses the weights of the last rate processed as the
    # denominator for every rate, and counts lots whose rate is missing.
    # Confirm that a single shared TotalNN is intended.
    wsum = eucdistfit.sum()
    wsum.name = "TotalNN"

    # join rate sums by taz and weight sums by taz together
    for q in msums:
        prefix = q.name.split("_")[0]
        # merge the count of lots used per TAZ with the TAZ-level sum of their rates
        wsum = pd.merge(wsum,q, left_index=True, right_index=True)
        # if not in list of TAZs in Cambridge, Boston, Somerville, Newton or Brookline, set to 0
        wsum[q.name] = np.where(~wsum.index.isin(tazids), 0, wsum[q.name])
        # average rate of the nearest lots
        wsum[prefix+"_Avg_NN"] = wsum[q.name]/wsum["TotalNN"]

        # Convert to 2010
        wsum[prefix+"_Avg_NN_2010"] = wsum[prefix+"_Avg_NN"] * 0.69

    # final spatial result
    tazs_avg_rates = pd.merge(tazs.set_index("id"),wsum, left_index=True, right_index=True)

    return tazs_avg_rates

In [49]:
def temp_matrix_filter(K, eucdistmi, inin, z, tps):
    '''Turn the lot-to-TAZ distance matrix into a K-nearest-lots weight mask.

    1. keep only the rows (lots) in the inclusion list for rate ``z``, i.e.
       ``inin[position of z in tps]``
    2. per column (TAZ), keep the K smallest distances; lots tied with the
       K-th smallest distance are all kept
    3. return a new matrix with 1.0 for kept lots and NaN everywhere else

    ``eucdistmi`` itself is not modified. A column with no distances at all
    comes back all NaN instead of raising.
    '''
    # 1. filter() returns a new frame, so the caller's matrix is left alone
    included = inin[list(tps).index(z)]
    subset = eucdistmi.filter(items=included, axis=0)

    # 2. rank with method="min" gives every value 1 + (number of strictly
    # smaller values in its column), so "rank <= K" selects exactly the values
    # that are <= the K-th smallest, ties included; NaN ranks compare False
    ranks = subset.rank(axis=0, method="min")
    in_knn = ranks <= K

    # 3. equal weights: 1.0 for the nearest lots, NaN for everything else
    return in_knn.astype(float).where(in_knn)

In [50]:
# Lots to use downstream plus, per rate, the ids of the lots to include.
noOutlots, noOutlist = identify_outliers(
    lots=elots,
    postSA=postSAdf,
    cof=cluster_outlier_field,
    tps=list(tps),
)

In [51]:
# Lot-by-TAZ distance matrix used for the nearest-lot search.
distmatrix = euclidean_matrix(lots=noOutlots, tazs=all_tazs)

In [52]:
# Average rates per TAZ, then display only the TAZs with a monthly average.
tar = knn_average(
    tps=list(tps),
    lots=noOutlots,
    eucdistmi=distmatrix,
    tazs=all_tazs,
    inin=noOutlist,
)
tar[tar["MR_Avg_NN_2010"] > 0]

Unnamed: 0_level_0,OBJECTID,taz,type,town,state,town_state,mpo,in_brmpo,subregion,Shape_Leng,...,TotalNN,MR_SumNN,MR_Avg_NN,MR_Avg_NN_2010,DR_SumNN,DR_Avg_NN,DR_Avg_NN_2010,HR_SumNN,HR_Avg_NN,HR_Avg_NN_2010
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
268,59,5659,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,1630.536299,...,16.0,6354.254979,397.140936,274.027246,433.000000,27.06250,18.673125,178.518387,11.157399,7.698605
94,121,5660,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,978.447690,...,16.0,9634.371243,602.148203,415.482260,649.000000,40.56250,27.988125,284.617003,17.788563,12.274108
266,172,5661,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,973.091663,...,16.0,8430.462383,526.903899,363.563690,558.081274,34.88008,24.067255,254.720877,15.920055,10.984838
267,179,5662,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,900.917147,...,16.0,7683.852734,480.240796,331.366149,499.081274,31.19258,21.522880,230.294761,14.393423,9.931462
269,183,4165,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,1551.611054,...,16.0,5980.691569,373.793223,257.917324,418.000000,26.12500,18.026250,180.539798,11.283737,7.785779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,5724,5721,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,2751.624064,...,16.0,3941.596787,246.349799,169.981361,262.500000,16.40625,11.320312,104.921406,6.557588,4.524736
189,5725,5722,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,2849.580127,...,16.0,3941.596787,246.349799,169.981361,262.500000,16.40625,11.320312,104.921406,6.557588,4.524736
190,5726,5723,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,3360.600828,...,16.0,3941.596787,246.349799,169.981361,262.500000,16.40625,11.320312,104.921406,6.557588,4.524736
177,5728,5725,I,BOSTON,MA,"BOSTON,MA",BRMPO,1,ICC,1612.890762,...,16.0,4435.576641,277.223540,191.284243,304.000000,19.00000,13.110000,136.512184,8.532012,5.887088


# Exports

In [53]:
# Tabular export without the geometry column.
tar.drop("geometry",axis=1).to_csv(r"J:\Shared drives\TMD_TSA\Data\Parking\WebScraped_ParkingCost\tazs_avg_rates2010_Apr26a.csv")

# Spatial export. The raw string replaces a mix of escaped and unescaped
# backslashes (same path); the driver is named explicitly so the format does
# not depend on geopandas inferring it from the extension.
tar.to_file(r"J:\Shared drives\TMD_TSA\Data\Parking\WebScraped_ParkingCost\tazs_avg_ratesApr26a.geojson",
            driver="GeoJSON")

In [23]:
# Save the lot-by-TAZ distance matrix.
distmatrix.to_csv(
    r"J:\Shared drives\TMD_TSA\Data\Parking\WebScraped_ParkingCost\distmatrix.csv"
)
