# Model

In [109]:
import geopandas as gpd
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu
from shapely import wkt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


## Import Datasets

In [110]:
def df_to_gdf(df: pd.DataFrame, geometry_key: str = "Geometry") -> gpd.GeoDataFrame:
    df[geometry_key] = df[geometry_key].apply(wkt.loads)
    return gpd.GeoDataFrame(df, geometry=geometry_key)

In [111]:
tickets_df = df_to_gdf(pd.read_csv("../data/cleaned_data/parking_tickets.csv"))
tickets_df["lat"] = tickets_df.geometry.x
tickets_df["lon"] = tickets_df.geometry.y
tickets_df.head()

Unnamed: 0,Block,Street,EntryDate,Year,dayofweek,Geometry,Neighbourhood,lat,lon
0,1100,DAVIE ST,2023-05-01,2023,0,POINT (49.28 -123.132),West End,49.280418,-123.13153
1,1500,COAL HARBOUR QUAY,2023-05-01,2023,0,POINT (49.291 -123.128),Downtown,49.291487,-123.128072
2,1500,COAL HARBOUR QUAY,2023-05-01,2023,0,POINT (49.291 -123.128),Downtown,49.291487,-123.128072
3,1000,ROBSON ST,2023-05-01,2023,0,POINT (49.283 -123.123),West End,49.283312,-123.123159
4,1100,ROBSON ST,2023-05-01,2023,0,POINT (49.285 -123.125),West End,49.284606,-123.125148


In [112]:
meters_df = df_to_gdf(pd.read_csv("../data/cleaned_data/parking_meters.csv"))
meters_df["lat"] = meters_df.geometry.x
meters_df["lon"] = meters_df.geometry.y
meters_df.head()

Unnamed: 0,METERHEAD,CREDITCARD,Geo Local Area,METERID,Geometry,lat,lon
0,Twin,0.0,Mount Pleasant,990002,POINT (49.264 -123.1),49.263514,-123.100273
1,Pay Station,1.0,Grandview-Woodland,6C1306,POINT (49.273 -123.069),49.272764,-123.06946
2,Pay Station,1.0,Kensington-Cedar Cottage,6C2619,POINT (49.261 -123.07),49.260636,-123.069919
3,Pay Station,1.0,Grandview-Woodland,6C2016,POINT (49.266 -123.07),49.266275,-123.069567
4,Pay Station,1.0,Kensington-Cedar Cottage,6C2711,POINT (49.26 -123.07),49.259947,-123.069934


In [113]:
# Spatial join to find nearest meter to each ticket
tickets_df = gpd.sjoin_nearest(
    tickets_df,
    meters_df,
    how="inner",
    max_distance=0.1, # meters
    distance_col="distance_to_meter"
)

tickets_df.rename(
    columns={
        "lat_right": "lat",     
        "lon_right": "lon"
    }, 
    inplace=True
)    

tickets_df = tickets_df.drop(columns=["lat_left", "lon_left", "index_right"])
tickets_df.head()

Unnamed: 0,Block,Street,EntryDate,Year,dayofweek,Geometry,Neighbourhood,METERHEAD,CREDITCARD,Geo Local Area,METERID,lat,lon,distance_to_meter
0,1100,DAVIE ST,2023-05-01,2023,0,POINT (49.28 -123.132),West End,Single,1.0,West End,651205,49.280286,-123.131477,0.000142
1,1500,COAL HARBOUR QUAY,2023-05-01,2023,0,POINT (49.291 -123.128),Downtown,Pay Station,1.0,Downtown,331517,49.291543,-123.128028,7.1e-05
2,1500,COAL HARBOUR QUAY,2023-05-01,2023,0,POINT (49.291 -123.128),Downtown,Pay Station,1.0,Downtown,331517,49.291543,-123.128028,7.1e-05
3,1000,ROBSON ST,2023-05-01,2023,0,POINT (49.283 -123.123),West End,Twin,0.0,West End,121010,49.28359,-123.123394,0.000364
3,1000,ROBSON ST,2023-05-01,2023,0,POINT (49.283 -123.123),West End,Twin,0.0,West End,121012,49.28359,-123.123394,0.000364


## Features

In [114]:
# Select features
features = [
    "CREDITCARD",       
    "Street",
    "METERHEAD",
    "Neighbourhood",
    "dayofweek",
    "Year"
]

## Statistical Testing

In [115]:
meter_stats = tickets_df.groupby(features).size().reset_index(name="ticket_count")
meter_stats.head()

Unnamed: 0,CREDITCARD,Street,METERHEAD,Neighbourhood,dayofweek,Year,ticket_count
0,0.0,ALBERNI ST,Single / Disability,West End,0,2020,66
1,0.0,ALBERNI ST,Single / Disability,West End,0,2021,56
2,0.0,ALBERNI ST,Single / Disability,West End,0,2022,45
3,0.0,ALBERNI ST,Single / Disability,West End,0,2023,63
4,0.0,ALBERNI ST,Single / Disability,West End,0,2024,37


In [116]:
print(f"With CC:\t{meter_stats[meter_stats['CREDITCARD'] == 1].shape[0]}")
print(f"Without CC:\t{meter_stats[meter_stats['CREDITCARD'] == 0].shape[0]}")

With CC:	5994
Without CC:	2224


In [117]:
# standardize ticket count
meter_stats["tickets_per_loc"] = (meter_stats["ticket_count"] / meter_stats["ticket_count"].max()) * 100

In [118]:
# Compare ticket rates between credit card vs. non-credit card meters
cc_tickets = meter_stats[meter_stats["CREDITCARD"] == 1]["tickets_per_loc"]
no_cc_tickets = meter_stats[meter_stats["CREDITCARD"] == 0]["tickets_per_loc"]

print(f"Mean tickets for CC meters: {np.mean(cc_tickets):.1f}")
print(f"Mean tickets for non-CC meters: {np.mean(no_cc_tickets):.1f}")

t_stat, p_value = ttest_ind(cc_tickets, no_cc_tickets)
print(f"\nStatistical significance:")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value} (significant if < 0.05)")

stat, p = mannwhitneyu(cc_tickets, no_cc_tickets, alternative='less')
print(f"Mann-Whitney U test p-value: {p}")

Mean tickets for CC meters: 3.8
Mean tickets for non-CC meters: 5.6

Statistical significance:
t-statistic: -8.33187700606638
p-value: 9.244768566984192e-17 (significant if < 0.05)
Mann-Whitney U test p-value: 1.3492288493980663e-29


## Predictive Model

### Training

In [119]:
# Calculate median ticket count to define 'high-risk'
median_tickets = meter_stats["ticket_count"].median()
meter_stats['high_risk'] = (meter_stats["ticket_count"] > median_tickets).astype(int)

In [120]:
X = meter_stats[features]
y = meter_stats["high_risk"]

# Convert categorical features
X = pd.get_dummies(X, columns=[
    "Street",
    "METERHEAD", 
    "Neighbourhood"
])

In [121]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,stratify=y
)

In [122]:
print(f"X_train shape: {X_train.shape[0]}")
print(f"X_test shape: {X_test.shape[0]}")

X_train shape: 6574
X_test shape: 1644


In [123]:
model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5
)
model.fit(X_train, y_train)

### Testing

In [124]:
y_pred = model.predict(X_test)

In [125]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       828
           1       0.87      0.90      0.89       816

    accuracy                           0.89      1644
   macro avg       0.89      0.89      0.89      1644
weighted avg       0.89      0.89      0.89      1644



### Analysis

In [126]:
importances = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False, ignore_index=True)

importances.head(10)

Unnamed: 0,feature,importance
0,Neighbourhood_Downtown,0.071982
1,Neighbourhood_West End,0.061312
2,Neighbourhood_Fairview,0.037608
3,Street_W 4TH AVE,0.030876
4,Neighbourhood_Kensington-Cedar Cottage,0.027311
5,METERHEAD_Single / Disability,0.026343
6,Street_COMMERCIAL DRIVE,0.021875
7,Neighbourhood_Grandview-Woodland,0.020344
8,METERHEAD_Single,0.018033
9,Street_W GEORGIA ST,0.017096


In [127]:
print(f"total features: {importances.shape[0]}")
credit_card_effect = importances[
    importances['feature'].str.contains('CREDITCARD')
]
credit_card_effect

total features: 233


Unnamed: 0,feature,importance
29,CREDITCARD,0.010087


In [128]:
priority_meters = meter_stats[
    (meter_stats['CREDITCARD'] == 0) & 
    (meter_stats["tickets_per_loc"] > meter_stats['tickets_per_loc'].quantile(0.75))
].sort_values('tickets_per_loc', ascending=False)

print("Top Priority Meters for CC Installation:")
priority_meters = priority_meters[["Street", 'tickets_per_loc', 'Neighbourhood']]
priority_meters["tickets_per_loc"] = priority_meters["tickets_per_loc"] / 100
priority_meters.head(10)

Top Priority Meters for CC Installation:


Unnamed: 0,Street,tickets_per_loc,Neighbourhood
2011,W 8TH AVE,1.0,Fairview
2010,W 8TH AVE,0.958015,Fairview
2021,W 8TH AVE,0.942112,Fairview
2016,W 8TH AVE,0.905216,Fairview
2005,W 8TH AVE,0.85687,Fairview
2009,W 8TH AVE,0.844148,Fairview
2015,W 8TH AVE,0.840967,Fairview
2020,W 8TH AVE,0.795802,Fairview
2036,W 8TH AVE,0.775445,Fairview
2014,W 8TH AVE,0.767176,Fairview
